// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1alpha1;

import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/mediatranslation/apiv1alpha1/mediatranslationpb;mediatranslationpb";
option java_package = "com.google.cloud.mediatranslation.v1alpha1";

// Provides translation from/to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {}
}

// Provides information to the speech translation that specifies how to process
// the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless--therefore recognition is not compromised--and
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus encoded audio frames in an Ogg container
  //   ([OggOpus](https://wiki.xiph.org/OggOpus)).
  //   `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. All standard MP3 bitrates (which range from 32-320 kbps) are
  //   supported. When using this encoding, `sample_rate_hertz` must match the
  //   sample rate of the file being used.
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. A list of up to three additional language codes (BCP-47) that
  // are possible alternative languages of the supplied audio. If alternative
  // source languages are listed, the result is translated from the most
  // likely language detected, which may be any of the alternatives or the
  // main `source_language_code`. The translated result includes the language
  // code of the language detected in the audio.
  // Note:
  // 1. If a provided alternative_source_language_code is not supported by the
  //    current API version, that language code is skipped.
  // 2. If only one eligible entry is provided in
  //    alternative_source_language_codes, translation happens between
  //    `source_language_code` and that alternative, and
  //    `target_language_code` is ignored. This is useful in conversation
  //    mode.
  repeated string alternative_source_language_codes = 6
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Sample rate in Hertz of the audio data. Valid values are
  // 8000-48000; 16000 is optimal. For best results, set the sampling rate of
  // the audio source to 16000 Hz. If that's not possible, use the native
  // sample rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}
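
// For illustration only: a minimal `TranslateSpeechConfig` in protobuf text
// format. The encoding and language codes below are arbitrary example values,
// not recommendations or a statement of supported languages:
//
//   audio_encoding: "linear16"
//   source_language_code: "en-US"
//   target_language_code: "es-ES"
//   sample_rate_hertz: 16000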

// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the following audio contents.
  TranslateSpeechConfig audio_config = 1
      [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs
  // continuous translation (continuing to wait for and process audio even if
  // the user pauses speaking) until the client closes the input stream (gRPC
  // API) or until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  // When the client receives an `END_OF_SINGLE_UTTERANCE` event, it should
  // stop sending requests but keep receiving the remaining responses until
  // the stream is terminated. To construct the complete sentence in a
  // streaming way, override the previous response if its `is_final` is
  // `false`, or append to it if its `is_final` is `true`.
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Stability control for the media translation text. The value
  // must be one of "LOW", "MEDIUM", or "HIGH". It applies to the "text" and
  // "text_and_audio" translation modes only. The "audio" translation mode
  // supports only "HIGH" stability; "LOW" or "MEDIUM" results in an argument
  // error. An empty string (the default) is treated as "HIGH" in audio
  // translation mode and as "LOW" in the other translation modes.
  // Note that stability and speed are a trade-off:
  // 1. "LOW": the translation service starts translating as soon as it gets a
  //    recognition response, so it is the fastest.
  // 2. "MEDIUM": the translation service checks whether a recognition
  //    response is stable, and only translates recognition responses that are
  //    unlikely to change later.
  // 3. "HIGH": the translation service waits for even more stable recognition
  //    responses before translating. In addition, subsequent recognition
  //    responses cannot modify previous ones, which may affect quality in
  //    some situations. "HIGH" stability generates "final" responses more
  //    frequently.
  string stability = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Translation mode; the value must be one of "text", "audio", or
  // "text_and_audio". An empty string (the default) is treated as "text".
  // 1. "text": the response contains a text translation. Text translation has
  //    an `is_final` field; the detailed definition can be found in
  //    `TextTranslationResult`.
  // 2. "audio": the response contains an audio translation. Audio translation
  //    has no `is_final` field, which means each audio translation response
  //    is stable and will not be changed by a later response. Translation
  //    mode "audio" can only be used with "HIGH" stability mode.
  // 3. "text_and_audio": the response contains a text translation; when its
  //    `is_final` is `true`, the corresponding audio translation is also
  //    output. When `is_final` is `false`, the audio_translation field is
  //    empty.
  string translation_mode = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `disable_interim_results` is `true`, only "final" responses
  // are returned; otherwise, all responses are returned. Defaults to `false`.
  // `disable_interim_results` can only be set to `true` with "HIGH" stability
  // mode.
  bool disable_interim_results = 5 [(google.api.field_behavior) = OPTIONAL];
}
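
// For illustration only: a `StreamingTranslateSpeechConfig` in protobuf text
// format that requests audio output. Per the field docs above, "audio" mode
// must be paired with "HIGH" stability; all values are example choices:
//
//   audio_config {
//     audio_encoding: "linear16"
//     source_language_code: "en-US"
//     target_language_code: "es-ES"
//     sample_rate_hertz: 16000
//   }
//   translation_mode: "audio"
//   stability: "HIGH"
//   single_utterance: true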

// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process
    // the request. The first `StreamingTranslateSpeechRequest` message must
    // contain a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are
    // sent in sequential `StreamingTranslateSpeechRequest` messages. The first
    // `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, proto buffers use a pure binary representation (not
    // base64).
    bytes audio_content = 2;
  }
}
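
// For illustration only: the shape of a request stream, sketched in protobuf
// text format. The first request carries only the streaming config; every
// subsequent request carries only an audio chunk (placeholder bytes shown):
//
//   # Request 1: config only.
//   streaming_config {
//     audio_config {
//       audio_encoding: "linear16"
//       source_language_code: "en-US"
//       target_language_code: "es-ES"
//     }
//   }
//
//   # Requests 2..N: audio only.
//   audio_content: "<chunk of binary audio>"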

// A streaming speech translation result corresponding to a portion of the
// audio that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult`
    // represents an interim result that may change. If `true`, this is the
    // final time the translation service will return this particular
    // `StreamingTranslateSpeechResult`: the streaming translator will not
    // return any further hypotheses for this portion of the transcript and
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Audio translation result.
  message AudioTranslationResult {
    // Output only. The translated audio.
    bytes audio_translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Text translation result.
  TextTranslationResult text_translation_result = 1;

  // Audio translation result.
  AudioTranslationResult audio_translation_result = 2;

  // Output only. The debug-only recognition result in the original language.
  // This field is for debugging only and is set to an empty string when not
  // available. It is an implementation detail and will not be backward
  // compatible.
  string recognition_result = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The language code of the source language detected in the
  // audio, as described under `alternative_source_language_codes`.
  string detected_source_language_code = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A streaming speech translation response corresponding to a portion of
// the audio currently processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). When the client receives an
    // `END_OF_SINGLE_UTTERANCE` event, it should stop sending requests but
    // keep receiving the remaining responses until the stream is terminated.
    // To construct the complete sentence in a streaming way, override the
    // previous response if its `is_final` is `false`, or append to it if its
    // `is_final` is `true`. This event is only sent if `single_utterance` was
    // set to `true`, and is not used otherwise.
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status]
  // message that specifies the error for the operation.
  google.rpc.Status error = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed.
  // For text translation, `is_final` can be `true` or `false`. For audio
  // translation, there is no `is_final` field, so each audio response is
  // stable and will not change later. For text_and_audio, the text
  // translation still has an `is_final` field, but the corresponding audio is
  // only output when `is_final` is `true`.
  StreamingTranslateSpeechResult result = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];
}
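
// For illustration only: how a client assembles the full sentence from text
// results, shown as a hypothetical response sequence in protobuf text format.
// Each interim result overrides the previous one; a result with
// `is_final: true` commits the segment, and the next result starts a new
// segment that is appended to the committed text:
//
//   result { text_translation_result { translation: "Hello" is_final: false } }
//   # Overrides "Hello":
//   result { text_translation_result { translation: "Hello there" is_final: false } }
//   # Commits the segment "Hello there.":
//   result { text_translation_result { translation: "Hello there." is_final: true } }
//   # Starts a new segment, appended after "Hello there.":
//   result { text_translation_result { translation: "How" is_final: false } }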