• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*---------------------------------------------------------------------------*
2  *  SR_RecognizerImpl.h  *
3  *                                                                           *
4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
5  *                                                                           *
6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7  *  you may not use this file except in compliance with the License.         *
8  *                                                                           *
9  *  You may obtain a copy of the License at                                  *
10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
11  *                                                                           *
12  *  Unless required by applicable law or agreed to in writing, software      *
13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15  *  See the License for the specific language governing permissions and      *
16  *  limitations under the License.                                           *
17  *                                                                           *
18  *---------------------------------------------------------------------------*/
19 
20 #ifndef __SR_RECOGNIZERIMPL_H
21 #define __SR_RECOGNIZERIMPL_H
22 
23 
24 
25 #include "ArrayList.h"
26 #include "CircularBuffer.h"
27 #include "ESR_ReturnCode.h"
28 #include "ESR_SessionType.h"
29 #include "HashMap.h"
30 #include "SR_AcousticState.h"
31 #include "SR_Recognizer.h"
32 #include "SR_EventLog.h"
33 #include "ptimestamp.h"
34 #include "SR_Grammar.h"
35 #include "SR_Nametag.h"
36 
37 
38 #include "frontapi.h"
39 #include "simapi.h"
40 
/**
 * Recognition timing figures that get written to the OSI logs.
 */
typedef struct RecogLogTimings_t {
  /** Beginning of recognition time, in milliseconds. */
  size_t BORT;
  /** Amount of speech processed, in milliseconds. */
  size_t DURS;
  /** End of recognition time, in milliseconds. */
  size_t EORT;
  /** Number of frames of speech before EOSS, in frames. */
  size_t EOSD;
  /** Frame at which the end of the speech signal occurred. */
  size_t EOSS;
  /** Frame at which the start of the speech signal occurred. */
  size_t BOSS;
  /** Instant at which the end of the speech signal occurred, in milliseconds. */
  size_t EOST;
} RecogLogTimings;
56 
57 
/**
 * Internal recognizer states.
 *
 * The last enumerator deliberately has no trailing comma so that the
 * enumerator list is also accepted by strict C90 compilers.
 */
typedef enum
{
  /** Initial state. */
  SR_RECOGNIZER_INTERNAL_BEGIN,
  /** Timeout before beginning of speech. */
  SR_RECOGNIZER_INTERNAL_BOS_TIMEOUT,
  /** Got end of input before beginning of speech. */
  SR_RECOGNIZER_INTERNAL_BOS_NO_MATCH,
  /** Waiting for beginning of speech. */
  SR_RECOGNIZER_INTERNAL_BOS_DETECTION,
  /** Waiting for end of speech or input. */
  SR_RECOGNIZER_INTERNAL_EOS_DETECTION,
  /** Got end of input. */
  SR_RECOGNIZER_INTERNAL_EOI,
  /** Detected end of speech (not due to end of input). */
  SR_RECOGNIZER_INTERNAL_EOS,
  /** Final state. */
  SR_RECOGNIZER_INTERNAL_END
} SR_RecognizerInternalStatus;
93 
94 
95 /**
96  * Waveform Buffering stuff (for Nametags)
97  **/
98 
/* Waveform buffer size limit, in kBytes; the buffer will not grow. */
#define DEFAULT_WAVEFORM_BUFFER_MAX_SIZE 65
/* Windback span, in frames (frames are converted to bytes); will not grow. */
#define DEFAULT_WAVEFORM_WINDBACK_FRAMES 50
/* Default BOS comfort frame count. */
#define DEFAULT_BOS_COMFORT_FRAMES 2
/* Default EOS comfort frame count. */
#define DEFAULT_EOS_COMFORT_FRAMES 2
103 
/**
 * Buffering mode of a waveform buffer.
 *
 * The last enumerator deliberately has no trailing comma so that the
 * enumerator list is also accepted by strict C90 compilers.
 */
typedef enum
{
  /** No buffering. */
  WAVEFORM_BUFFERING_OFF,
  /** Buffer, but do not grow past a certain upper bound: loop and overwrite. */
  WAVEFORM_BUFFERING_ON_CIRCULAR,
  /** Buffer, and report overflow if necessary. */
  WAVEFORM_BUFFERING_ON_LINEAR
} waveform_buffering_state_t;
110 
111 /* audio buffer which supports windback */
112 
113 typedef struct WaveformBuffer_t
114 {
115   void   *windback_buffer;        /* a temp buffer used for windback functionality (malloc only at init)*/
116   size_t windback_buffer_sz;      /* sizeof buffer */
117   waveform_buffering_state_t state; /* state of the buffer (considered only when writing to buffer) */
118   CircularBuffer* cbuffer;        /* the actual buffer */
119   size_t   overflow_count;        /* indicates the total number of bytes the overflowed */
120   size_t read_size;
121   size_t eos_comfort_frames;
122   size_t bos_comfort_frames;
123 }
124 WaveformBuffer;
125 
126 
127 /* create the buffer */
128 ESR_ReturnCode WaveformBuffer_Create(WaveformBuffer** waveformBuffer, size_t frame_size);
129 
130 /* reset the buffer... do not release memeory */
131 ESR_ReturnCode WaveformBuffer_Reset(WaveformBuffer* waveformBuffer);
132 
133 /* get size */
134 ESR_ReturnCode WaveformBuffer_GetSize(WaveformBuffer* waveformBuffer, size_t* size);
135 
136 /* write to buffer. will grow only if buffering state is set to allow it */
137 ESR_ReturnCode WaveformBuffer_Write(WaveformBuffer* waveformBuffer, void *data, size_t num_bytes);
138 
139 /* read the whole buffer (starting from start offset, up to read_size) into a chunk allocated outside */
140 ESR_ReturnCode WaveformBuffer_Read(WaveformBuffer* waveformBuffer, void *data, size_t* num_bytes);
141 
142 /* does the windback after bos detected */
143 ESR_ReturnCode WaveformBuffer_WindBack(WaveformBuffer* waveformBuffer, const size_t num_bytes);
144 
145 /* sets the start offset and read_size at the end of recognition when endpointed transcription is known */
146 ESR_ReturnCode WaveformBuffer_ParseEndPointedResultAndTrim(WaveformBuffer* waveformBuffer, const LCHAR* end_pointed_result, const size_t bytes_per_frame);
147 
148 /* free the memory allocated for blocks and for windback */
149 ESR_ReturnCode WaveformBuffer_Destroy(WaveformBuffer* waveformBuffer);
150 
151 /* sets the state of buffer */
152 ESR_ReturnCode WaveformBuffer_SetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t state);
153 
154 /* gets the state of buffer */
155 ESR_ReturnCode WaveformBuffer_GetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t* state);
156 
157 /* skip the first few bytes (moves read pointer forward */
158 ESR_ReturnCode WaveformBuffer_Skip(WaveformBuffer* waveformBuffer, const size_t bytes);
159 
160 
161 
162 /**
163  * Speech recognizer.
164  */
165 typedef struct SR_RecognizerImpl_t
166 {
167   /**
168    * Interface functions that must be implemented.
169    */
170   SR_Recognizer Interface;
171 
172   /**
173    * Legacy CREC frontend.
174    */
175   CA_Frontend* frontend;
176   /**
177    * Legacy CREC Input waveform object.
178    */
179   CA_Wave* wavein;
180   /**
181    * Legacy CREC Utterance object.
182    */
183   CA_Utterance* utterance;
184   /**
185    * Legacy CREC confidence score calculator.
186    */
187   CA_ConfidenceScorer* confidenceScorer;
188   /**
189    * Legacy CREC recognizer.
190    */
191   CA_Recog* recognizer;
192   /**
193    * AcousticModels associated with Recognizer.
194    */
195   SR_AcousticModels* models;
196   /**
197   * Active Recognizer grammars.
198   */
199   HashMap* grammars;
200   /**
201    * Recognition result.
202    */
203   SR_RecognizerResult* result;
204   /**
205    * Recognizer parameters.
206    */
207   ESR_SessionType* parameters;
208   /**
209    * AcousticState associated with Recognizer.
210    */
211   SR_AcousticState* acousticState;
212   /**
213    * Total number of frames pushed by SR_RecognizerPutAudio().
214    */
215   size_t frames;
216   /**
217    * Number of processed frames.
218    */
219   size_t processed;
220   /**
221    * The number of frames up until the windback point (where -pau- starts).
222    */
223   size_t beginningOfSpeechOffset;
224   /**
225    * Internal recognizer state.
226    */
227   SR_RecognizerInternalStatus internalState;
228   /**
229    * Indicates if SR_RecognizerStart() was called.
230    */
231   ESR_BOOL isStarted;
232   /**
233    * Indicates if PutAudio() was called with the last audio frame.
234    */
235   ESR_BOOL gotLastFrame;
236   /**
237    * Audio buffer used by PutAudio().
238    */
239   CircularBuffer* buffer;
240   /**
241    * Temporary buffer used to transfer audio data (PutAudio).
242    **/
243   asr_int16_t *audioBuffer;
244   /**
245    * Recognizer sample rate.
246    */
247   size_t sampleRate;
248   /**
249    * Whether reconition has begun after begiing of speech detection
250    */
251   ESR_BOOL isRecognizing;
252   /**
253    * Max number of frames to process before BOS timeout
254    */
255   size_t utterance_timeout;
256   /**
257    * Locking function associated.
258    */
259   SR_RecognizerLockFunction lockFunction;
260   /**
261    * Locking function data.
262    */
263   void* lockData;
264 
265   /**
266    * OSI logging level
267    * if bit0 (OSI_LOG_LEVEL_BASIC) is set: do basic logging
268    * if bit1 (OSI_LOG_LEVEL_AUDIO) is set: do audio waveform logging
269    * if bit2 (OSI_LOG_LEVEL_ADDWD) is set: do dynamic grammar addword logging
270    */
271   size_t osi_log_level;
272 
273   /**
274    * EventLog pointer
275    */
276   SR_EventLog* eventLog;
277   /**
278    * Data that should be logged in OSI
279    */
280   RecogLogTimings recogLogTimings;
281   /**
282    * Timestamp reference used for calculating timings
283    */
284   PTimeStamp timestamp;
285 
286   /**
287    * Waveform buffer (for nametags) .
288    */
289   WaveformBuffer* waveformBuffer;
290 
291   /**
292    * Reason for eos detected
293    */
294   LCHAR* eos_reason;
295 
296   /**
297    * Indicates if signal quality variables have been initialized.
298    */
299   ESR_BOOL isSignalQualityInitialized;
300   /**
301    * True if signal is being clipped.
302    */
303   ESR_BOOL isSignalClipping;
304   /**
305    * True if DCOffset is present in signal.
306    */
307   ESR_BOOL isSignalDCOffset;
308   /**
309    * True if signal is noisy.
310    */
311   ESR_BOOL isSignalNoisy;
312   /**
313    * True if signal is too quiet.
314    */
315   ESR_BOOL isSignalTooQuiet;
316   /**
317    * True if signal contains too few samples.
318    */
319   ESR_BOOL isSignalTooFewSamples;
320   /**
321    * True if signal contains too many samples.
322    */
323   ESR_BOOL isSignalTooManySamples;
324 
325   /**
326    * Number of bytes in a frame.
327    **/
328   size_t FRAME_SIZE;
329 
330   /**
331    * If TRUE, beginning of speech detection is enabled.
332    */
333   ESR_BOOL gatedMode;
334 
335   /**
336    * The minimum number of frames to sniff before beginning recognition.
337    */
338   size_t bgsniff;
339   /**
340    * Indicates if we've skipped holdOffPeriod frames at the beginning of the waveform.
341    */
342   ESR_BOOL holdOffPeriodSkipped;
343 }
344 SR_RecognizerImpl;
345 
346 /**
347  * Groups grammar with meta-data.
348  */
349 typedef struct GrammarBag_t
350 {
351   /**
352    * Grammar object.
353    */
354   SR_Grammar* grammar;
355   /**
356    * Grammar weight.
357    */
358   unsigned int weight;
359   /**
360    * Grammar ID.
361    */
362   LCHAR* grammarID;
363 }
364 GrammarBag;
365 
366 
367 /**
368  * Default implementation.
369  */
370 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerStartImpl(SR_Recognizer* self);
371 /**
372  * Default implementation.
373  */
374 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerStopImpl(SR_Recognizer* self);
375 /**
376  * Default implementation.
377  */
378 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDestroyImpl(SR_Recognizer* self);
379 /**
380  * Default implementation.
381  */
382 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetupImpl(SR_Recognizer* self);
383 /**
384  * Default implementation.
385  */
386 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerUnsetupImpl(SR_Recognizer* self);
387 /**
388  * Default implementation.
389  */
390 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSetupImpl(SR_Recognizer* self, ESR_BOOL* isSetup);
391 
392 /**
393  * Default implementation.
394  */
395 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetParameterImpl(SR_Recognizer* self, const LCHAR* key, LCHAR* value, size_t* len);
396 /**
397  * Default implementation.
398  */
399 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, size_t* value);
400 /**
401  * Default implementation.
402  */
403 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL* value);
404 /**
405  * Default implementation.
406  */
407 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetParameterImpl(SR_Recognizer* self, const LCHAR* key, LCHAR* value);
408 /**
409  * Default implementation.
410  */
411 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, size_t value);
412 /**
413  * Default implementation.
414  */
415 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL value);
416 
417 /**
418  * Default implementation.
419  */
420 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerHasSetupRulesImpl(SR_Recognizer* self,
421     ESR_BOOL* hasSetupRules);
422 /**
423  * Default implementation.
424  */
425 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerActivateRuleImpl(SR_Recognizer* self,
426     SR_Grammar* grammar,
427     const LCHAR* ruleName,
428     unsigned int weight);
429 /**
430  * Default implementation.
431  */
432 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDeactivateRuleImpl(SR_Recognizer* self,
433     SR_Grammar* grammar,
434     const LCHAR* ruleName);
435 
436 /**
437  * Default implementation.
438  */
439 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDeactivateAllRulesImpl(SR_Recognizer* self);
440 
441 /**
442  * Default implementation.
443  */
444 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsActiveRuleImpl(SR_Recognizer* self,
445     SR_Grammar* grammar,
446     const LCHAR* ruleName,
447     ESR_BOOL* isActiveRule);
448 /**
449  * Default implementation.
450  */
451 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetWordAdditionCeilingImpl(SR_Recognizer* self,
452     SR_Grammar* grammar);
453 /**
454  * Default implementation.
455  */
456 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerCheckGrammarConsistencyImpl(SR_Recognizer* self,
457     SR_Grammar* grammar,
458     ESR_BOOL* isConsistent);
459 /**
460  * Default implementation.
461  */
462 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetModelsImpl(SR_Recognizer* self,
463 															  SR_AcousticModels** models);
464 /**
465  * Default implementation.
466  */
467 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerPutAudioImpl(SR_Recognizer* self,
468     asr_int16_t* buffer,
469     size_t* bufferSize,
470     ESR_BOOL isLast);
471 /**
472  * Default implementation.
473  */
474 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerAdvanceImpl(SR_Recognizer* self,
475     SR_RecognizerStatus* status,
476     SR_RecognizerResultType* type,
477     SR_RecognizerResult** result);
478 
479 /**
480  * Default implementation.
481  */
482 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerClearAcousticStateImpl(SR_Recognizer* self);
483 /**
484  * Default implementation.
485  */
486 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadAcousticStateImpl(SR_Recognizer* self,
487     const LCHAR* filename);
488 
489 /**
490  * Default implementation.
491  */
492 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadUtteranceImpl(SR_Recognizer* self, const LCHAR* filename);
493 /**
494  * Default implementation.
495  */
496 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadWaveFileImpl(SR_Recognizer* self, const LCHAR* filename);
497 
498 /**
499  * Default implementation.
500  */
501 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogTokenImpl(SR_Recognizer* self, const LCHAR* token, const LCHAR* value);
502 /**
503  * Default implementation.
504  */
505 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogTokenIntImpl(SR_Recognizer* self, const LCHAR* token, int value);
506 /**
507  * Default implementation.
508  */
509 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogEventImpl(SR_Recognizer* self, const LCHAR* event);
510 /**
511  * Default implementation.
512  */
513 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogSessionStartImpl(SR_Recognizer* self, const LCHAR* sessionName);
514 /**
515  * Default implementation.
516  */
517 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogSessionEndImpl(SR_Recognizer* self);
518 /**
519  * Default implementation.
520  */
521 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogWaveformDataImpl(SR_Recognizer* self,
522     const LCHAR* waveformFilename,
523     const LCHAR* transcription,
524     const double bos,
525     const double eos,
526     ESR_BOOL isInvocab);
527 /**
528  * Default implementation.
529  */
530 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetLockFunctionImpl(SR_Recognizer *self, SR_RecognizerLockFunction function, void* data);
531 /**
532  * Default implementation.
533  */
534 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalClippingImpl(SR_Recognizer* self, ESR_BOOL* isClipping);
535 /**
536  * Default implementation.
537  */
538 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalDCOffsetImpl(SR_Recognizer* self, ESR_BOOL* isDCOffset);
539 /**
540  * Default implementation.
541  */
542 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalNoisyImpl(SR_Recognizer* self, ESR_BOOL* isNoisy);
543 /**
544  * Default implementation.
545  */
546 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooQuietImpl(SR_Recognizer* self, ESR_BOOL* isTooQuiet);
547 /**
548  * Default implementation.
549  */
550 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooFewSamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooFewSamples);
551 /**
552  * Default implementation.
553  */
554 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooManySamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooManySamples);
555 
556 SREC_RECOGNIZER_API ESR_ReturnCode SR_Recognizer_Change_Sample_RateImpl ( SR_Recognizer *self, size_t new_sample_rate );
557 
558 #endif /* __SR_RECOGNIZERIMPL_H */
559