• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*---------------------------------------------------------------------------*
2  *  srec.h  *
3  *                                                                           *
4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
5  *                                                                           *
6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7  *  you may not use this file except in compliance with the License.         *
8  *                                                                           *
9  *  You may obtain a copy of the License at                                  *
10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
11  *                                                                           *
12  *  Unless required by applicable law or agreed to in writing, software      *
13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15  *  See the License for the specific language governing permissions and      *
16  *  limitations under the License.                                           *
17  *                                                                           *
18  *---------------------------------------------------------------------------*/
19 
20 /* this file contains defines needed by the srec search component*/
21 
22 #ifndef _h_srec_
23 #define _h_srec_
24 
25 #include "swimodel.h"
26 #include "hmm_desc.h"
27 #include "utteranc.h"
28 #include "hmmlib.h"
29 #include "srec_sizes.h"
30 #include "search_network.h"
31 #include "srec_context.h"
32 #include "srec_eosd.h"
33 #include "astar.h"
34 
/* Upper bound on the number of HMM states in one allophone; sizes the
   per-state arrays of fsmarc_token. */
#define MAX_HMM 3
/* Build switch, enabled (1) here; not referenced elsewhere in this header. */
#define DO_ALLOW_MULTIPLE_MODELS 1
37 
/*in order to keep data sizes as small as possible, most of the structures
  below use indices into one fsmarc_token array and one word_token array.  This
  makes the code a bit confusing (compared to just keeping pointers to these
  structures around), uses a bit more CPU, but saves memory and gives us more
  flexibility in the sizes of these data types*/
43 
44 /**
45  * @todo document
46  */
47 typedef struct altword_token_t
48 {
49   costdata costdelta;        /* cost relative to path being propagated */
50   wordID word;               /* alternative word, diff from path b.p. */
51   wtokenID word_backtrace;   /* alternative backtrace, diff from path b.p.*/
52   struct altword_token_t* next_token; /* todo: change this to indices */
53   asr_int16_t refcount;
54   costdata costbasis;        /* cost of best fsmarc_token host */
55 }
56 altword_token;
57 #define AWTNULL 0
58 /* fsmarc_tokens and fsmnode_tokens point to a batch of altword_tokens
59    to save memory, many fsmarc_tokens can point to the same altword_token
60    and these are propagated by reference */
61 
62 /**
63  * @todo document
64  */
65 typedef struct fsmarc_token_t
66 {
67   frameID num_hmm_states;           /* number of hmm states */
68   costdata cost[MAX_HMM];           /* cost so far*/
69   wtokenID word_backtrace[MAX_HMM]; /* index into word tokens*/
70   wordID word[MAX_HMM];             /* when the path encounters an output
71              symbol, store it here*/
72   frameID duration[MAX_HMM];        /* frames observed for this hmm state, todo: pack into char! */
73   arcID FSMarc_index;               /* index into the FSM arc array */
74 
75   stokenID next_token_index;        /* for maintaining linked lists of these
76              tokens, both in search and in freelist */
77   altword_token* aword_backtrace[MAX_HMM];
78 }
79 fsmarc_token;
80 /* 30 bytes */
81 
82 
83 /**
84  * These are used while maximizing into FSM nodes.
85  */
86 typedef struct fsmnode_token_t
87 {
88   costdata cost;
89   wtokenID word_backtrace;  /* index into word tokens*/
90   wordID word;              /* when the path encounters an output*/
91   nodeID FSMnode_index;
92   ftokenID next_token_index;
93   altword_token* aword_backtrace;
94   frameID silence_duration;
95 }
96 fsmnode_token;
97 /* 10 bytes */
98 
99 /**
100  * @todo document
101  */
102 typedef struct word_token_t
103 {
104   wordID word;                /* the word just observed */
105   frameID end_time;           /* end time of the word just observed, includes trailing silence */
106   nodeID end_node;            /* for backtrace with word graph */
107   wtokenID backtrace;         /* for backtrace */
108   costdata cost;              /* cost for path up to this point*/
109   wtokenID next_token_index;  /* for maintaining linked lists of these tokens
110        (both in the search and in the freelist) */
111   frameID _word_end_time;     /* end time of the word just observed, excl trailing silence */
112   /* since frameID is 16 bit, and 15bits is plenty
113      (ie 32767 frames * 20ms/frame = 655 sec), we use the high-bit to store
114 	 whether this word_token represents a homonym, this is used in confidence
115 	 score fixing! */
116 #define WORD_TOKEN_GET_HOMONYM(wT)     (wT->_word_end_time & 0x8000)  // 10000000
117 #define WORD_TOKEN_SET_HOMONYM(wT,hM)  (wT->_word_end_time = (wT->_word_end_time&0x7fff)|(hM?0x8000:0))
118 #define WORD_TOKEN_GET_WD_ETIME(wT)    (wT->_word_end_time & 0x7fff) // 01111111
119 #define WORD_TOKEN_SET_WD_ETIME(wT,eT) (wT->_word_end_time = (wT->_word_end_time&0x8000)|(eT))
120 }
121 word_token;
122 /* 12 bytes */
123 
124 /**
125  * Contains what we need for later backtrace, nbest, etc.
126  */
127 typedef struct
128 {
129   /* there are various arrays below which frame number long - this is the number allocated */
130   frameID max_frames;
131 
132   /* for each frame, head of a linked list of word tokens for that frame */
133   wtokenID *words_for_frame;
134   asr_int16_t *whether_sorted;
135 
136 }
137 srec_word_lattice;
138 
139 /*This is just implemented as a list so far - use Johan's fancy implementation later*/
140 
141 /**
142  * @todo document
143  */
144 typedef struct priority_q_t
145 {
146   wtokenID word_token_list;  /* index of head token in queue - keep worst at end
147       (so we can pop one off) */
148   costdata max_cost_in_q;
149   miscdata num_in_q;
150   miscdata max_in_q;
151 }
152 priority_q;
153 
154 /*------------------------------------------------------------------*
155  *                                                                  *
156  *------------------------------------------------------------------*/
157 
158 /* notes ... what needs to be acoustic model specific
159 
160    (p)ool it
161    (1) single  .r but reset
162    (x) specific
163 
164    1 context
165    1 word_priority_q
166    x word_lattice
167    1 prune_delta
168    1 current_search_frame
169 
170    1.r best_token_for_arc[]  max_fsm_arcs
171    1.r best_token_for_node[]   max_fsm_nodes
172    1 cost_offset_for_frame MAX_FRAMES
173    1 accumulated_cost_offset_for_frame MAX_FRAMES
174 
175    x active_fsmarc_tokens
176    num_new_states   ... num in active_fsmarc_tokens
177    max_new_states   ... same as fsmarc_token_array_size
178 
179    x active_fsm_node_tokens
180 
181    ? current_model_scores num_model_slots_allocated
182 
183    p fsmarc_token_array _size _freelist
184    p fsmnode_token_array  _size _freelist
185    x word_token_array _size _freelist
186    x word_token_array_flags
187 
188    ... not used! best_fsmarc_token
189    srec_ended
190    astar_stack
191 */
192 
193 struct srec_t
194 {  /*contains everything needed to run the search*/
195   asr_int16_t id;                   /*contains an id for this recognizer*/
196   srec_context *context;      /*contains the recognition context (fst, info about models, etc)*/
197   priority_q *word_priority_q; /*used to keep track of new word in frame*/
198   srec_word_lattice *word_lattice;  /*used to keep track of word lattice in utterance*/
199 
200   costdata prune_delta;        /* controls the amount of score-based pruning - should this go in the context instead?*/
201   costdata current_prune_delta; /* when the above changes in mid-frame */
202   costdata current_best_cost;   /* 0 if single recog */
203 
204   frameID current_search_frame;
205   stokenID *best_token_for_arc;  /* non-owning ptr, see multi_srec below */
206 
207   stokenID active_fsmarc_tokens; /*head of list of state tokens for the next frame.  Used during
208         the search to keep track of new states for new frame.  This
209         is to allow us to efficently do things like prune, free state arrays, etc*/
210 
211 
212   nodeID num_new_states;
213   nodeID max_new_states;  /*the num allocated in the new_states array - if the search is exceeding this,
214          we need to tighten the pruning*/
215 
216   ftokenID *best_token_for_node;   /* non-owning ptr, see multi_srec below */
217 
218   ftokenID active_fsmnode_tokens;  /* linked list of all fsmnode token (same as ones in
219            best_state_for_node, just kept as a list)*/
220 
221   costdata *current_model_scores;  /* temporary array used by the search to contain model scores -
222            size is max number of models*/
223   modelID num_model_slots_allocated;  /*num allocated in above array - search will only
224        work with models with less than this number of models*/
225 
226   /*the following arrays handle all the state and word tokens.  All of them
227     are allocated to a fixed size at startup time, and the search uses elements
228     from the first array in the search.  The pruning of the search is used to
229     make sure that the allocated number is not exceeded*/
230 
231 
232   fsmarc_token *fsmarc_token_array;  /*used for storage of all state tokens
233            - allocated once at startup time and kept
234            around.  It's fixed size and the search
235            pruning must ensure that it is never
236            exceeded*/
237   stokenID fsmarc_token_array_size; /*total number of tokens allocated in this array*/
238   stokenID fsmarc_token_freelist;   /*index to head of state token freelist*/
239 
240   fsmnode_token *fsmnode_token_array;  /*used for storage of all fsmnode tokens
241            - allocated once at startup time and kept
242            around.  It's fixed size and the search
243            pruning must ensure that it is never
244            exceeded*/
245   ftokenID fsmnode_token_array_size; /*total number of tokens allocated in this array*/
246   ftokenID fsmnode_token_freelist;   /*index to head of fsmnode token freelist*/
247 
248   word_token *word_token_array;    /* used for storage of all word tokens -
249             allocated once at startup time and kept
250             around.  It's fixed size and the search
251             pruning must ensure that it is never
252             exceeded*/
253   asr_int16_t* word_token_array_flags;   /* bitarray used for flagging */
254   wtokenID word_token_array_size;  /* total number of tokens allocated in
255             this array*/
256   wtokenID word_token_freelist;    /* index to head of word token freelist*/
257 
258   altword_token* altword_token_array; /* used to store alternative words before a wb */
259   wtokenID altword_token_array_size;
260   altword_token* altword_token_freelist;
261   wtokenID altword_token_freelist_len;
262 
263   frameID max_frames;
264   costdata* best_model_cost_for_frame;
265   costdata* cost_offset_for_frame;        /* see multi_srec, below */
266   bigcostdata* accumulated_cost_offset;   /* see multi_srec, below */
267 
268   stokenID best_fsmarc_token;      /* ?? index of best scoring state token
269            this is used to lookup wtokens on the
270            top choice path, to make sure they're not
271            pruned via reprune_word_tokens() */
272   costdata current_best_ftoken_cost[NODE_INFO_NUMS];
273   ftokenID current_best_ftoken_index[NODE_INFO_NUMS];
274 
275   /*the following elements are to keep track of how big various arrays are*/
276   nodeID max_fsm_nodes;           /* see multi_srec below */
277   arcID max_fsm_arcs;             /* see multi_srec below */
278   asr_int16_t srec_ended;
279   AstarStack *astar_stack;        /* for backwards word search */
280   const featdata* avg_state_durations;  /* average state durations (from AMs) */
281 
282   srec_eos_detector_state eosd_state;
283 };
284 
285 #define MAX_RECOGNIZERS 2          /* generally, 1x for each acoustic model */
286 #define MAX_ACOUSTIC_MODELS 2
287 
288 /**
289  * @todo document
290  */
291 typedef struct
292 {
293   asr_int32_t num_allocated_recs;
294   asr_int32_t num_activated_recs;
295   srec* rec;                       /* size num_allocated_recs, one for
296             each gender */
297 
298   frameID max_frames;
299   costdata* cost_offset_for_frame; /* size max_frames, keeps track of
300             current_best_costs bookkeeping from
301             reset_current_best_costs_to_zero() */
302   bigcostdata *accumulated_cost_offset; /* same as above but cumulative */
303 
304 
305   ftokenID *best_token_for_node;  /* array (size max_fsm_nodes) best path into
306            fsmnode - kept as an fsmnode_token */
307   nodeID max_fsm_nodes;
308   stokenID *best_token_for_arc;   /* array (size max_fsm_arcs) best path into
309            fsmarc - kept as a fsmarc_token */
310   arcID max_fsm_arcs;
311 
312   /* non owning pointer to compact acoustic models */
313   asr_int32_t num_swimodels;
314   const SWIModel    *swimodel[MAX_ACOUSTIC_MODELS];
315   EOSrc eos_status;
316 }
317 multi_srec;
318 
319 #ifdef __cplusplus
320 extern "C"
321 {
322 #endif
323   priority_q* allocate_priority_q(int max_n);
324   void free_priority_q(priority_q* pq);
325   void clear_priority_q(priority_q *pq);
326   wtokenID get_word_token_list(priority_q *pq, word_token *word_token_array);
327   wtokenID add_word_token_to_priority_q(priority_q *pq, wtokenID token_index_to_add, word_token *word_token_array);
328   void remove_non_end_word_from_q(srec *rec, priority_q *pq, word_token *word_token_array, nodeID end_node);
329   costdata get_priority_q_threshold(priority_q *pq, word_token *word_token_array);
330 
331   void free_word_token(srec *rec, wtokenID old_token_index);
332   int srec_begin(srec* rec, int begin_syn_node);
333   void srec_no_more_frames(srec* rec);
334   bigcostdata accumulated_cost_offset(costdata *cost_offsets, frameID frame);
335   void multi_srec_get_speech_bounds(multi_srec* rec, frameID* start_frame, frameID* end_frame);
336   int multi_srec_get_eos_status(multi_srec* rec);
337 #ifdef __cplusplus
338 }
339 #endif
340 
341 /**
342  * For visualization in the debugger
343  */
344 typedef struct
345 {
346   asr_uint16_t data[50];
347 }
348 us50;
349 
350 /**
351  * @todo document
352  */
353 typedef struct
354 {
355   asr_uint16_t data[250];
356 }
357 us250;
358 
359 /**
360  * @todo document
361  */
362 typedef struct
363 {
364   asr_uint16_t data[1000];
365 }
366 us1000;
367 
368 #endif
369