// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1alpha1;

import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/mediatranslation/apiv1alpha1/mediatranslationpb;mediatranslationpb";
option java_package = "com.google.cloud.mediatranslation.v1alpha1";

// Provides speech translation from and to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {}
}
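
// An illustrative call flow (a sketch, not part of the API definition): the
// client opens the stream, sends one `StreamingTranslateSpeechRequest`
// carrying `streaming_config`, then streams requests carrying `audio_content`
// chunks while reading `StreamingTranslateSpeechResponse` messages, and
// half-closes the stream when done sending audio.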

// Provides information to the speech translation service that specifies how
// to process the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless--therefore recognition is not compromised--and
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus encoded audio frames in Ogg container
  //   ([OggOpus](https://wiki.xiph.org/OggOpus)).
  //   `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. Supports all standard MP3 bitrates (which range from 32 to
  //   320 kbps). When using this encoding, `sample_rate_hertz` must match the
  //   sample rate of the file being used.
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. A list of up to 3 additional language codes (BCP-47) that
  // specify possible alternative languages of the supplied audio. If
  // alternative source languages are listed, the translation result will be
  // in the most likely language detected, which may be the main
  // `source_language_code` or one of the alternatives. The translated result
  // will include the language code of the language detected in the audio.
  // Note:
  // 1. If a provided alternative source language code is not supported by the
  // current API version, that language code is skipped.
  // 2. If only one eligible alternative source language code is provided, the
  // translation happens between `source_language_code` and that alternative;
  // `target_language_code` is ignored. This is useful in conversation mode.
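  //
  // For example (an illustrative sketch; these language codes are
  // placeholders, not values taken from this file): with
  // `source_language_code` "en-US" and `alternative_source_language_codes`
  // ["es-ES", "fr-FR"], audio detected as Spanish is translated from Spanish,
  // and the result reports "es-ES" as the detected source language.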
  repeated string alternative_source_language_codes = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Sample rate in Hertz of the audio data. Valid values are:
  // 8000-48000. 16000 is optimal. For best results, set the sampling rate of
  // the audio source to 16000 Hz. If that's not possible, use the native
  // sample rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}
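
// An illustrative `TranslateSpeechConfig` in textproto form (a sketch only;
// the encoding, sample rate, and language codes are placeholder values):
//
//   audio_encoding: "linear16"
//   sample_rate_hertz: 16000
//   source_language_code: "en-US"
//   target_language_code: "es-ES"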

// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the audio content that follows.
  TranslateSpeechConfig audio_config = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs
  // continuous translation (continuing to wait for and process audio even if
  // the user pauses speaking) until the client closes the input stream (gRPC
  // API) or until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  // When the client receives an `END_OF_SINGLE_UTTERANCE` event, it should
  // stop sending requests but keep receiving the remaining responses until
  // the stream is terminated. To construct the complete sentence in a
  // streaming way, override the previous result if its `is_final` is `false`,
  // or append to it if its `is_final` is `true`, as in the example below.
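  //
  // For example (an illustrative sequence, not actual service output):
  // interim results "how" (`is_final: false`) and "how are" (`is_final:
  // false`) each override the previous text; a final result "how are you"
  // (`is_final: true`) is fixed, and any following result starts a new
  // segment appended after it.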
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Stability control for the media translation text. The value
  // must be one of "LOW", "MEDIUM", or "HIGH". It applies to "text" and
  // "text_and_audio" translation only. Audio translation mode supports only
  // "HIGH" stability; "LOW" or "MEDIUM" will result in an invalid argument
  // error. An empty string (the default) is treated as "HIGH" in audio
  // translation mode and as "LOW" in the other translation modes.
  // Note that stability and speed are a trade-off:
  // 1. "LOW": the translation service starts translating as soon as it
  // receives a recognition response, so it is the fastest.
  // 2. "MEDIUM": the translation service checks whether a recognition
  // response is stable, and only translates recognition responses that are
  // unlikely to change later.
  // 3. "HIGH": the translation service waits for more stable recognition
  // responses before translating; in addition, later recognition responses
  // cannot modify earlier ones. This may impact quality in some situations,
  // but "HIGH" stability generates "final" responses more frequently.
  string stability = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Translation mode. The value must be one of "text", "audio", or
  // "text_and_audio". An empty string (the default) is treated as "text".
  // 1. "text": the response is a text translation. A text translation has an
  // "is_final" field; its detailed definition can be found in
  // `TextTranslationResult`.
  // 2. "audio": the response is an audio translation. An audio translation
  // has no "is_final" field: each audio translation response is stable and
  // will not be changed by a later response. Translation mode "audio" can
  // only be used with the "HIGH" stability mode.
  // 3. "text_and_audio": the response has a text translation, and when
  // "is_final" is true, the corresponding audio translation is also output.
  // When "is_final" is false, the audio_translation field is empty.
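  //
  // For example (illustrative): in "text_and_audio" mode, a response with
  // `is_final: false` carries only interim text, while a response with
  // `is_final: true` carries the final text together with its corresponding
  // `audio_translation` bytes.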
  string translation_mode = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `disable_interim_results` is `true`, only "final" responses
  // are returned; otherwise, all responses are returned. The default value is
  // `false`. `disable_interim_results` can only be set to `true` with the
  // "HIGH" stability mode.
  bool disable_interim_results = 5 [(google.api.field_behavior) = OPTIONAL];
}

// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingTranslateSpeechRequest` message must contain
    // a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are sent
    // in sequential `StreamingTranslateSpeechRequest` messages. The first
    // `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, protocol buffers use a pure binary representation (not
    // base64).
    bytes audio_content = 2;
  }
}
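
// An illustrative request sequence for `StreamingTranslateSpeech` (a sketch;
// field values are placeholders, and `<...>` marks raw audio bytes):
//
//   streaming_config { audio_config { audio_encoding: "linear16" ... } }
//   audio_content: <first chunk of audio bytes>
//   audio_content: <next chunk of audio bytes>
//   ...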

// A streaming speech translation result corresponding to a portion of the
// audio that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult` represents
    // an interim result that may change. If `true`, this is the final time the
    // translation service will return this particular
    // `StreamingTranslateSpeechResult`; the streaming translator will not
    // return any further hypotheses for this portion of the transcript and
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Audio translation result.
  message AudioTranslationResult {
    // Output only. The translated audio.
    bytes audio_translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Text translation result.
  TextTranslationResult text_translation_result = 1;

  // Audio translation result.
  AudioTranslationResult audio_translation_result = 2;

  // Output only. The debug-only recognition result in the original language.
  // This field is for debugging only and is set to an empty string if not
  // available. It is an implementation detail and will not be backward
  // compatible.
  string recognition_result = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The language code (BCP-47) of the source language detected
  // in the audio.
  string detected_source_language_code = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A streaming speech translation response corresponding to a portion of
// the audio currently processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). When the client receives an
    // `END_OF_SINGLE_UTTERANCE` event, it should stop sending requests but
    // keep receiving the remaining responses until the stream is terminated.
    // To construct the complete sentence in a streaming way, override the
    // previous result if its `is_final` is `false`, or append to it if its
    // `is_final` is `true`. This event is only sent if `single_utterance` was
    // set to `true`, and is not used otherwise.
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status]
  // message that specifies the error for the operation.
  google.rpc.Status error = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed.
  // For text translation, `is_final` can be `true` or `false`. For audio
  // translation, there is no `is_final` field, so each audio response is
  // stable and will not be changed later. For text_and_audio, the text
  // translation still has an `is_final` field, but the corresponding audio is
  // only output when `is_final` is `true`.
  StreamingTranslateSpeechResult result = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}