Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 47 additions & 43 deletions riva/proto/riva_asr.proto
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ message RecognitionConfig {
int32 sample_rate_hertz = 2;

// Required. The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
// Example: "en-US".
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
// Example: "en-US".
string language_code = 3;

// Maximum number of recognition hypotheses to be returned.
Expand All @@ -132,39 +132,39 @@ message RecognitionConfig {
// information, see SpeechContext section
repeated SpeechContext speech_contexts = 6;

// The number of channels in the input audio data.
// ONLY set this for MULTI-CHANNEL recognition.
// Valid values for LINEAR16 and FLAC are `1`-`8`.
// Valid values for OGG_OPUS are '1'-'254'.
// Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
// If `0` or omitted, defaults to one channel (mono).
// Note: We only recognize the first channel by default.
// To perform independent recognition on each channel set
// `enable_separate_recognition_per_channel` to 'true'.
int32 audio_channel_count = 7;

// If `true`, the top result includes a list of words and
// the start and end time offsets (timestamps) for those words. If
// `false`, no word-level time offset information is returned. The default is
// `false`.
// The number of channels in the input audio data.
// ONLY set this for MULTI-CHANNEL recognition.
// Valid values for LINEAR16 and FLAC are `1`-`8`.
// Valid values for OGG_OPUS are '1'-'254'.
// Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
// If `0` or omitted, defaults to one channel (mono).
// Note: We only recognize the first channel by default.
// To perform independent recognition on each channel set
// `enable_separate_recognition_per_channel` to 'true'.
int32 audio_channel_count = 7;

// If `true`, the top result includes a list of words and the start and end
// time offsets (timestamps), and confidence scores for those words. If
// `false`, no word-level time offset information is returned. The default
// is `false`.
bool enable_word_time_offsets = 8;

// If 'true', adds punctuation to recognition result hypotheses.
// The default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;
// If 'true', adds punctuation to recognition result hypotheses. The
// default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;

// This needs to be set to `true` explicitly and `audio_channel_count` > 1
// to get each channel recognized separately. The recognition result will
// contain a `channel_tag` field to state which channel that result belongs
// to. If this is not true, we will only recognize the first channel. The
// request is billed cumulatively for all channels recognized:
// `audio_channel_count` multiplied by the length of the audio.
bool enable_separate_recognition_per_channel = 12;
// This needs to be set to `true` explicitly and `audio_channel_count` > 1
// to get each channel recognized separately. The recognition result will
// contain a `channel_tag` field to state which channel that result belongs
// to. If this is not true, we will only recognize the first channel. The
// request is billed cumulatively for all channels recognized:
// `audio_channel_count` multiplied by the length of the audio.
bool enable_separate_recognition_per_channel = 12;

// Which model to select for the given request.
// If empty, Riva will select the right model based on the other RecognitionConfig parameters.
// The model should correspond to the name passed to `riva-build` with the `--name` argument
string model = 13;
string model = 13;

// The verbatim_transcripts flag enables or disables inverse text normalization.
// 'true' returns exactly what was said, with no denormalization.
Expand All @@ -186,10 +186,10 @@ message RecognitionConfig {

// Provides information to the recognizer that specifies how to process the request
message StreamingRecognitionConfig {
// Provides information to the recognizer that specifies how to process the request
// Provides information to the recognizer that specifies how to process the request
RecognitionConfig config = 1;

// If `true`, interim results (tentative hypotheses) may be
// If `true`, interim results (tentative hypotheses) may be
// returned as they become available (these interim results are indicated with
// the `is_final=false` flag).
// If `false` or omitted, only `is_final=true` result(s) are returned.
Expand Down Expand Up @@ -234,7 +234,7 @@ message SpeechContext {
// messages.
message RecognizeResponse {
// Sequential list of transcription results corresponding to
// sequential portions of audio. Currently only returns one transcript.
// sequential portions of audio. Currently only returns one transcript.
repeated SpeechRecognitionResult results = 1;
}

Expand All @@ -261,12 +261,14 @@ message SpeechRecognitionAlternative {
// Transcript text representing the words that the user spoke.
string transcript = 1;

// The non-normalized confidence estimate. A higher number indicates an
// estimated greater likelihood that the recognized words are correct. This
// field is set only for a non-streaming result or, for a streaming result
// where is_final=true. This field is not guaranteed to be accurate and users
// should not rely on it to be always provided. The default of 0.0 is a
// sentinel value indicating confidence was not set.
// The confidence estimate. A higher number indicates an estimated greater
// likelihood that the recognized word is correct. This field is set only for
// a non-streaming result or, for a streaming result where is_final=true.
// This field is not guaranteed to be accurate and users should not rely on
// it to be always provided. Although confidence can currently be roughly
// interpreted as a natural-log probability, the estimate computation varies
// with different configurations, and is subject to change. The default of
// 0.0 is a sentinel value indicating confidence was not set.
float confidence = 2;

// A list of word-specific information for each recognized word. Only populated
Expand All @@ -291,12 +293,14 @@ message WordInfo {
// The word corresponding to this set of information.
string word = 3;

// The confidence estimate between 0.0 and 1.0. A higher number indicates an
// estimated greater likelihood that the recognized words are correct. This
// field is set only for a non-streaming result or, for a streaming result
// where is_final=true. This field is not guaranteed to be accurate and users
// should not rely on it to be always provided. The default of 0.0 is a
// sentinel value indicating confidence was not set.
// The confidence estimate. A higher number indicates an estimated greater
// likelihood that the recognized word is correct. This field is set only for
// a non-streaming result or, for a streaming result where is_final=true.
// This field is not guaranteed to be accurate and users should not rely on
// it to be always provided. Although confidence can currently be roughly
// interpreted as a natural-log probability, the estimate computation varies
// with different configurations, and is subject to change. The default of
// 0.0 is a sentinel value indicating confidence was not set.
float confidence = 4;

// Output only. A distinct integer value is assigned for every speaker within
Expand Down