<html><body>
<style>

body, h1, h2, h3, div, span, p, pre, a {
  margin: 0;
  padding: 0;
  border: 0;
  font-weight: inherit;
  font-style: inherit;
  font-size: 100%;
  font-family: inherit;
  vertical-align: baseline;
}

body {
  font-size: 13px;
  padding: 1em;
}

h1 {
  font-size: 26px;
  margin-bottom: 1em;
}

h2 {
  font-size: 24px;
  margin-bottom: 1em;
}

h3 {
  font-size: 20px;
  margin-bottom: 1em;
  margin-top: 1em;
}

pre, code {
  line-height: 1.5;
  font-family: Monaco, 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', 'Lucida Console', monospace;
}

pre {
  margin-top: 0.5em;
}

h1, h2, h3, p {
  font-family: Arial, sans-serif;
}

h1, h2, h3 {
  border-bottom: solid #CCC 1px;
}

.toc_element {
  margin-top: 0.5em;
}

.firstline {
  margin-left: 2em;
}

.method {
  margin-top: 1em;
  border: solid 1px #CCC;
  padding: 1em;
  background: #EEE;
}

.details {
  font-weight: bold;
  font-size: 14px;
}

</style>

<h1><a href="speech_v1.html">Cloud Speech-to-Text API</a> . <a href="speech_v1.speech.html">speech</a></h1>
<h2>Instance Methods</h2>
<p class="toc_element">
  <code><a href="#longrunningrecognize">longrunningrecognize(body, x__xgafv=None)</a></code></p>
<p class="firstline">Performs asynchronous speech recognition: receive results via the google.longrunning.Operations interface.</p>
<p class="toc_element">
  <code><a href="#recognize">recognize(body, x__xgafv=None)</a></code></p>
<p class="firstline">Performs synchronous speech recognition: receive results after all audio has been sent and processed.</p>
<h3>Method Details</h3>
<div class="method">
    <code class="details" id="longrunningrecognize">longrunningrecognize(body, x__xgafv=None)</code>
  <pre>Performs asynchronous speech recognition: receive results via the
google.longrunning.Operations interface. Returns either an
`Operation.error` or an `Operation.response` which contains
a `LongRunningRecognizeResponse` message.
For more information on asynchronous speech recognition, see the
[how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).

Args:
  body: object, The request body. (required)
    The object takes the form of:

{ # The top-level message sent by the client for the `LongRunningRecognize`
      # method.
    "audio": { # Contains audio data in the encoding specified in the `RecognitionConfig`. # *Required* The audio data to be recognized.
        # Either `content` or `uri` must be supplied. Supplying both or neither
        # returns google.rpc.Code.INVALID_ARGUMENT. See
        # [content limits](/speech-to-text/quotas#content).
      "content": "A String", # The audio data bytes encoded as specified in
          # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
          # pure binary representation, whereas JSON representations use base64.
      "uri": "A String", # URI that points to a file that contains audio data bytes as specified in
          # `RecognitionConfig`. The file must not be compressed (for example, gzip).
          # Currently, only Google Cloud Storage URIs are
          # supported, which must be specified in the following format:
          # `gs://bucket_name/object_name` (other URI formats return
          # google.rpc.Code.INVALID_ARGUMENT). For more information, see
          # [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
    },
114    "config": { # Provides information to the recognizer that specifies how to process the # *Required* Provides information to the recognizer that specifies how to
115        # process the request.
116        # request.
117      "languageCode": "A String", # *Required* The language of the supplied audio as a
118          # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
119          # Example: "en-US".
120          # See [Language Support](/speech-to-text/docs/languages)
121          # for a list of the currently supported language codes.
122      "audioChannelCount": 42, # *Optional* The number of channels in the input audio data.
123          # ONLY set this for MULTI-CHANNEL recognition.
124          # Valid values for LINEAR16 and FLAC are `1`-`8`.
125          # Valid values for OGG_OPUS are '1'-'254'.
126          # Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
127          # If `0` or omitted, defaults to one channel (mono).
128          # Note: We only recognize the first channel by default.
129          # To perform independent recognition on each channel set
130          # `enable_separate_recognition_per_channel` to 'true'.
131      "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages.
132          # This field is optional for `FLAC` and `WAV` audio files and required
133          # for all other audio formats. For details, see AudioEncoding.
134      "enableAutomaticPunctuation": True or False, # *Optional* If 'true', adds punctuation to recognition result hypotheses.
135          # This feature is only available in select languages. Setting this for
136          # requests in other languages has no effect at all.
137          # The default 'false' value does not add punctuation to result hypotheses.
138          # Note: This is currently offered as an experimental service, complimentary
139          # to all users. In the future this may be exclusively available as a
140          # premium feature.
141      "enableSeparateRecognitionPerChannel": True or False, # This needs to be set to `true` explicitly and `audio_channel_count` > 1
142          # to get each channel recognized separately. The recognition result will
143          # contain a `channel_tag` field to state which channel that result belongs
144          # to. If this is not true, we will only recognize the first channel. The
145          # request is billed cumulatively for all channels recognized:
146          # `audio_channel_count` multiplied by the length of the audio.
147      "enableWordTimeOffsets": True or False, # *Optional* If `true`, the top result includes a list of words and
148          # the start and end time offsets (timestamps) for those words. If
149          # `false`, no word-level time offset information is returned. The default is
150          # `false`.
151      "maxAlternatives": 42, # *Optional* Maximum number of recognition hypotheses to be returned.
152          # Specifically, the maximum number of `SpeechRecognitionAlternative` messages
153          # within each `SpeechRecognitionResult`.
154          # The server may return fewer than `max_alternatives`.
155          # Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
156          # one. If omitted, will return a maximum of one.
157      "useEnhanced": True or False, # *Optional* Set to true to use an enhanced model for speech recognition.
158          # If `use_enhanced` is set to true and the `model` field is not set, then
159          # an appropriate enhanced model is chosen if:
160          # 1. project is eligible for requesting enhanced models
161          # 2. an enhanced model exists for the audio
162          #
163          # If `use_enhanced` is true and an enhanced version of the specified model
164          # does not exist, then the speech is recognized using the standard version
165          # of the specified model.
166          #
167          # Enhanced speech models require that you opt-in to data logging using
168          # instructions in the
169          # [documentation](/speech-to-text/docs/enable-data-logging). If you set
170          # `use_enhanced` to true and you have not enabled audio logging, then you
171          # will receive an error.
172      "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all
173          # `RecognitionAudio` messages. Valid values are: 8000-48000.
174          # 16000 is optimal. For best results, set the sampling rate of the audio
175          # source to 16000 Hz. If that's not possible, use the native sample rate of
176          # the audio source (instead of re-sampling).
177          # This field is optional for FLAC and WAV audio files, but is
178          # required for all other audio formats. For details, see AudioEncoding.
179      "profanityFilter": True or False, # *Optional* If set to `true`, the server will attempt to filter out
180          # profanities, replacing all but the initial character in each filtered word
181          # with asterisks, e.g. "f***". If set to `false` or omitted, profanities
182          # won't be filtered out.
183      "model": "A String", # *Optional* Which model to select for the given request. Select the model
184          # best suited to your domain to get best results. If a model is not
185          # explicitly specified, then we auto-select a model based on the parameters
186          # in the RecognitionConfig.
          # <table>
          #   <tr>
          #     <td><b>Model</b></td>
          #     <td><b>Description</b></td>
          #   </tr>
          #   <tr>
          #     <td><code>command_and_search</code></td>
          #     <td>Best for short queries such as voice commands or voice search.</td>
          #   </tr>
          #   <tr>
          #     <td><code>phone_call</code></td>
          #     <td>Best for audio that originated from a phone call (typically
          #     recorded at an 8 kHz sampling rate).</td>
          #   </tr>
          #   <tr>
          #     <td><code>video</code></td>
          #     <td>Best for audio that originated from video or includes multiple
          #         speakers. Ideally the audio is recorded at a 16 kHz or greater
          #         sampling rate. This is a premium model that costs more than the
          #         standard rate.</td>
          #   </tr>
          #   <tr>
          #     <td><code>default</code></td>
          #     <td>Best for audio that is not one of the specific audio models.
          #         For example, long-form audio. Ideally the audio is high-fidelity,
          #         recorded at a 16 kHz or greater sampling rate.</td>
          #   </tr>
          # </table>
215      "speechContexts": [ # *Optional* array of SpeechContext.
216          # A means to provide context to assist the speech recognition. For more
217          # information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
218        { # Provides "hints" to the speech recognizer to favor specific words and phrases
219            # in the results.
220          "phrases": [ # *Optional* A list of strings containing words and phrases "hints" so that
221              # the speech recognition is more likely to recognize them. This can be used
222              # to improve the accuracy for specific words and phrases, for example, if
223              # specific commands are typically spoken by the user. This can also be used
224              # to add additional words to the vocabulary of the recognizer. See
225              # [usage limits](/speech-to-text/quotas#content).
226              #
227              # List items can also be set to classes for groups of words that represent
228              # common concepts that occur in natural language. For example, rather than
229              # providing phrase hints for every month of the year, using the $MONTH class
230              # improves the likelihood of correctly transcribing audio that includes
231              # months.
232            "A String",
233          ],
234        },
235      ],
236      "metadata": { # Description of audio data to be recognized. # *Optional* Metadata regarding this request.
237        "recordingDeviceType": "A String", # The type of device the speech was recorded with.
238        "originalMediaType": "A String", # The original media the speech was recorded on.
239        "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized.
240        "obfuscatedId": "A String", # Obfuscated (privacy-protected) ID of the user, to identify number of
241            # unique users using the service.
242        "originalMimeType": "A String", # Mime type of the original audio file.  For example `audio/m4a`,
243            # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
244            # A list of possible audio mime types is maintained at
245            # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
246        "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most
247            # closely applies. This is most indicative of the topics contained
248            # in the audio.  Use the 6-digit NAICS code to identify the industry
249            # vertical - see https://www.naics.com/search/.
250        "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court
251            # hearings from 2012".
252        "recordingDeviceName": "A String", # The device used to make the recording.  Examples 'Nexus 5X' or
253            # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
254            # 'Cardioid Microphone'.
255        "interactionType": "A String", # The use case most closely describing the audio content to be recognized.
256      },
    },
  }

  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # This resource represents a long-running operation that is the result of a
      # network API call.
    "error": { # The `Status` type defines a logical error model that is suitable for # The error result of the operation in case of failure or cancellation.
        # different programming environments, including REST APIs and RPC APIs. It is
        # used by [gRPC](https://github.com/grpc). Each `Status` message contains
        # three pieces of data: error code, error message, and error details.
        #
        # You can find out more about this error model and how to work with it in the
        # [API Design Guide](https://cloud.google.com/apis/design/errors).
      "message": "A String", # A developer-facing error message, which should be in English. Any
          # user-facing error message should be localized and sent in the
          # google.rpc.Status.details field, or localized by the client.
      "code": 42, # The status code, which should be an enum value of google.rpc.Code.
      "details": [ # A list of messages that carry the error details.  There is a common set of
          # message types for APIs to use.
        {
          "a_key": "", # Properties of the object. Contains field @type with type URL.
        },
      ],
    },
    "done": True or False, # If the value is `false`, it means the operation is still in progress.
        # If `true`, the operation is completed, and either `error` or `response` is
        # available.
    "response": { # The normal response of the operation in case of success.  If the original
        # method returns no data on success, such as `Delete`, the response is
        # `google.protobuf.Empty`.  If the original method is standard
        # `Get`/`Create`/`Update`, the response should be the resource.  For other
        # methods, the response should have the type `XxxResponse`, where `Xxx`
        # is the original method name.  For example, if the original method name
        # is `TakeSnapshot()`, the inferred response type is
        # `TakeSnapshotResponse`.
      "a_key": "", # Properties of the object. Contains field @type with type URL.
    },
    "name": "A String", # The server-assigned name, which is only unique within the same service that
        # originally returns it. If you use the default HTTP mapping, the
        # `name` should be a resource name ending with `operations/{unique_id}`.
    "metadata": { # Service-specific metadata associated with the operation.  It typically
        # contains progress information and common metadata such as create time.
        # Some services might not provide such metadata.  Any method that returns a
        # long-running operation should document the metadata type, if any.
      "a_key": "", # Properties of the object. Contains field @type with type URL.
    },
  }</pre>
</div>
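<p>For illustration, a minimal sketch of calling <code>longrunningrecognize</code> for a file stored in Cloud Storage and polling the returned operation until it completes. The <code>gs://</code> URI and the polling interval are placeholders, <code>speech</code> and <code>service</code> come from the <code>build()</code> sketch above, and the polling step assumes this service's companion <code>operations.get(name)</code> method:</p>
<pre>
# Hypothetical example: transcribe a FLAC file in Cloud Storage asynchronously
# and wait for the long-running operation to finish.
import time

request_body = {
    'config': {
        'encoding': 'FLAC',
        'sampleRateHertz': 16000,
        'languageCode': 'en-US',
        'enableWordTimeOffsets': True,
    },
    'audio': {
        'uri': 'gs://your-bucket/your-audio.flac',  # placeholder URI
    },
}

operation = speech.longrunningrecognize(body=request_body).execute()

# Poll until `done` is True, then read either `response`
# (a LongRunningRecognizeResponse) or `error` (a google.rpc.Status).
while not operation.get('done'):
    time.sleep(30)
    operation = service.operations().get(name=operation['name']).execute()

if 'error' in operation:
    raise RuntimeError(operation['error'].get('message'))

for result in operation['response'].get('results', []):
    print(result['alternatives'][0]['transcript'])
</pre>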

<div class="method">
    <code class="details" id="recognize">recognize(body, x__xgafv=None)</code>
  <pre>Performs synchronous speech recognition: receive results after all audio
has been sent and processed.

Args:
  body: object, The request body. (required)
    The object takes the form of:

{ # The top-level message sent by the client for the `Recognize` method.
    "audio": { # Contains audio data in the encoding specified in the `RecognitionConfig`. # *Required* The audio data to be recognized.
        # Either `content` or `uri` must be supplied. Supplying both or neither
        # returns google.rpc.Code.INVALID_ARGUMENT. See
        # [content limits](/speech-to-text/quotas#content).
      "content": "A String", # The audio data bytes encoded as specified in
          # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
          # pure binary representation, whereas JSON representations use base64.
      "uri": "A String", # URI that points to a file that contains audio data bytes as specified in
          # `RecognitionConfig`. The file must not be compressed (for example, gzip).
          # Currently, only Google Cloud Storage URIs are
          # supported, which must be specified in the following format:
          # `gs://bucket_name/object_name` (other URI formats return
          # google.rpc.Code.INVALID_ARGUMENT). For more information, see
          # [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
    },
338    "config": { # Provides information to the recognizer that specifies how to process the # *Required* Provides information to the recognizer that specifies how to
339        # process the request.
340        # request.
341      "languageCode": "A String", # *Required* The language of the supplied audio as a
342          # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
343          # Example: "en-US".
344          # See [Language Support](/speech-to-text/docs/languages)
345          # for a list of the currently supported language codes.
346      "audioChannelCount": 42, # *Optional* The number of channels in the input audio data.
347          # ONLY set this for MULTI-CHANNEL recognition.
348          # Valid values for LINEAR16 and FLAC are `1`-`8`.
349          # Valid values for OGG_OPUS are '1'-'254'.
350          # Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
351          # If `0` or omitted, defaults to one channel (mono).
352          # Note: We only recognize the first channel by default.
353          # To perform independent recognition on each channel set
354          # `enable_separate_recognition_per_channel` to 'true'.
355      "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages.
356          # This field is optional for `FLAC` and `WAV` audio files and required
357          # for all other audio formats. For details, see AudioEncoding.
358      "enableAutomaticPunctuation": True or False, # *Optional* If 'true', adds punctuation to recognition result hypotheses.
359          # This feature is only available in select languages. Setting this for
360          # requests in other languages has no effect at all.
361          # The default 'false' value does not add punctuation to result hypotheses.
362          # Note: This is currently offered as an experimental service, complimentary
363          # to all users. In the future this may be exclusively available as a
364          # premium feature.
365      "enableSeparateRecognitionPerChannel": True or False, # This needs to be set to `true` explicitly and `audio_channel_count` > 1
366          # to get each channel recognized separately. The recognition result will
367          # contain a `channel_tag` field to state which channel that result belongs
368          # to. If this is not true, we will only recognize the first channel. The
369          # request is billed cumulatively for all channels recognized:
370          # `audio_channel_count` multiplied by the length of the audio.
371      "enableWordTimeOffsets": True or False, # *Optional* If `true`, the top result includes a list of words and
372          # the start and end time offsets (timestamps) for those words. If
373          # `false`, no word-level time offset information is returned. The default is
374          # `false`.
375      "maxAlternatives": 42, # *Optional* Maximum number of recognition hypotheses to be returned.
376          # Specifically, the maximum number of `SpeechRecognitionAlternative` messages
377          # within each `SpeechRecognitionResult`.
378          # The server may return fewer than `max_alternatives`.
379          # Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
380          # one. If omitted, will return a maximum of one.
381      "useEnhanced": True or False, # *Optional* Set to true to use an enhanced model for speech recognition.
382          # If `use_enhanced` is set to true and the `model` field is not set, then
383          # an appropriate enhanced model is chosen if:
384          # 1. project is eligible for requesting enhanced models
385          # 2. an enhanced model exists for the audio
386          #
387          # If `use_enhanced` is true and an enhanced version of the specified model
388          # does not exist, then the speech is recognized using the standard version
389          # of the specified model.
390          #
391          # Enhanced speech models require that you opt-in to data logging using
392          # instructions in the
393          # [documentation](/speech-to-text/docs/enable-data-logging). If you set
394          # `use_enhanced` to true and you have not enabled audio logging, then you
395          # will receive an error.
396      "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all
397          # `RecognitionAudio` messages. Valid values are: 8000-48000.
398          # 16000 is optimal. For best results, set the sampling rate of the audio
399          # source to 16000 Hz. If that's not possible, use the native sample rate of
400          # the audio source (instead of re-sampling).
401          # This field is optional for FLAC and WAV audio files, but is
402          # required for all other audio formats. For details, see AudioEncoding.
403      "profanityFilter": True or False, # *Optional* If set to `true`, the server will attempt to filter out
404          # profanities, replacing all but the initial character in each filtered word
405          # with asterisks, e.g. "f***". If set to `false` or omitted, profanities
406          # won't be filtered out.
407      "model": "A String", # *Optional* Which model to select for the given request. Select the model
408          # best suited to your domain to get best results. If a model is not
409          # explicitly specified, then we auto-select a model based on the parameters
410          # in the RecognitionConfig.
          # <table>
          #   <tr>
          #     <td><b>Model</b></td>
          #     <td><b>Description</b></td>
          #   </tr>
          #   <tr>
          #     <td><code>command_and_search</code></td>
          #     <td>Best for short queries such as voice commands or voice search.</td>
          #   </tr>
          #   <tr>
          #     <td><code>phone_call</code></td>
          #     <td>Best for audio that originated from a phone call (typically
          #     recorded at an 8 kHz sampling rate).</td>
          #   </tr>
          #   <tr>
          #     <td><code>video</code></td>
          #     <td>Best for audio that originated from video or includes multiple
          #         speakers. Ideally the audio is recorded at a 16 kHz or greater
          #         sampling rate. This is a premium model that costs more than the
          #         standard rate.</td>
          #   </tr>
          #   <tr>
          #     <td><code>default</code></td>
          #     <td>Best for audio that is not one of the specific audio models.
          #         For example, long-form audio. Ideally the audio is high-fidelity,
          #         recorded at a 16 kHz or greater sampling rate.</td>
          #   </tr>
          # </table>
439      "speechContexts": [ # *Optional* array of SpeechContext.
440          # A means to provide context to assist the speech recognition. For more
441          # information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
442        { # Provides "hints" to the speech recognizer to favor specific words and phrases
443            # in the results.
444          "phrases": [ # *Optional* A list of strings containing words and phrases "hints" so that
445              # the speech recognition is more likely to recognize them. This can be used
446              # to improve the accuracy for specific words and phrases, for example, if
447              # specific commands are typically spoken by the user. This can also be used
448              # to add additional words to the vocabulary of the recognizer. See
449              # [usage limits](/speech-to-text/quotas#content).
450              #
451              # List items can also be set to classes for groups of words that represent
452              # common concepts that occur in natural language. For example, rather than
453              # providing phrase hints for every month of the year, using the $MONTH class
454              # improves the likelihood of correctly transcribing audio that includes
455              # months.
456            "A String",
457          ],
458        },
459      ],
460      "metadata": { # Description of audio data to be recognized. # *Optional* Metadata regarding this request.
461        "recordingDeviceType": "A String", # The type of device the speech was recorded with.
462        "originalMediaType": "A String", # The original media the speech was recorded on.
463        "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized.
464        "obfuscatedId": "A String", # Obfuscated (privacy-protected) ID of the user, to identify number of
465            # unique users using the service.
466        "originalMimeType": "A String", # Mime type of the original audio file.  For example `audio/m4a`,
467            # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
468            # A list of possible audio mime types is maintained at
469            # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
470        "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most
471            # closely applies. This is most indicative of the topics contained
472            # in the audio.  Use the 6-digit NAICS code to identify the industry
473            # vertical - see https://www.naics.com/search/.
474        "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court
475            # hearings from 2012".
476        "recordingDeviceName": "A String", # The device used to make the recording.  Examples 'Nexus 5X' or
477            # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
478            # 'Cardioid Microphone'.
479        "interactionType": "A String", # The use case most closely describing the audio content to be recognized.
480      },
    },
  }

  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # The only message returned to the client by the `Recognize` method. It
      # contains the result as zero or more sequential `SpeechRecognitionResult`
      # messages.
    "results": [ # Output only. Sequential list of transcription results corresponding to
        # sequential portions of audio.
      { # A speech recognition result corresponding to a portion of the audio.
        "channelTag": 42, # For multi-channel audio, this is the channel number corresponding to the
            # recognized result for the audio from that channel.
            # For audio_channel_count = N, its output values can range from '1' to 'N'.
        "alternatives": [ # Output only. May contain one or more recognition hypotheses (up to the
            # maximum specified in `max_alternatives`).
            # These alternatives are ordered in terms of accuracy, with the top (first)
            # alternative being the most probable, as ranked by the recognizer.
          { # Alternative hypotheses (a.k.a. n-best list).
506            "confidence": 3.14, # Output only. The confidence estimate between 0.0 and 1.0. A higher number
507                # indicates an estimated greater likelihood that the recognized words are
508                # correct. This field is set only for the top alternative of a non-streaming
509                # result or, of a streaming result where `is_final=true`.
510                # This field is not guaranteed to be accurate and users should not rely on it
511                # to be always provided.
512                # The default of 0.0 is a sentinel value indicating `confidence` was not set.
513            "transcript": "A String", # Output only. Transcript text representing the words that the user spoke.
514            "words": [ # Output only. A list of word-specific information for each recognized word.
515                # Note: When `enable_speaker_diarization` is true, you will see all the words
516                # from the beginning of the audio.
517              { # Word-specific information for recognized words.
518                "endTime": "A String", # Output only. Time offset relative to the beginning of the audio,
519                    # and corresponding to the end of the spoken word.
520                    # This field is only set if `enable_word_time_offsets=true` and only
521                    # in the top hypothesis.
522                    # This is an experimental feature and the accuracy of the time offset can
523                    # vary.
524                "word": "A String", # Output only. The word corresponding to this set of information.
525                "startTime": "A String", # Output only. Time offset relative to the beginning of the audio,
526                    # and corresponding to the start of the spoken word.
527                    # This field is only set if `enable_word_time_offsets=true` and only
528                    # in the top hypothesis.
529                    # This is an experimental feature and the accuracy of the time offset can
530                    # vary.
531              },
532            ],
533          },
534        ],
535      },
536    ],
537  }</pre>
538</div>
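<p>For illustration, a minimal sketch of a synchronous <code>recognize</code> call that sends inline (base64-encoded) audio content and prints the transcripts. The file path and audio parameters are placeholders, <code>speech</code> comes from the <code>build()</code> sketch above, and audio longer than roughly a minute should use <code>longrunningrecognize</code> instead:</p>
<pre>
# Hypothetical example: synchronous recognition of a short local audio file.
import base64

with open('audio.raw', 'rb') as f:  # placeholder: raw LINEAR16 mono audio
    content = base64.b64encode(f.read()).decode('utf-8')

response = speech.recognize(body={
    'config': {
        'encoding': 'LINEAR16',
        'sampleRateHertz': 16000,
        'languageCode': 'en-US',
        'maxAlternatives': 1,
    },
    'audio': {'content': content},
}).execute()

# Each result covers a sequential portion of the audio; the first alternative
# is the most probable hypothesis.
for result in response.get('results', []):
    alt = result['alternatives'][0]
    print('{:.2f}  {}'.format(alt.get('confidence', 0.0), alt['transcript']))
</pre>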

</body></html>