• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*---------------------------------------------------------------------------*
2  *  srec_context.c                                                           *
3  *                                                                           *
4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                         *
5  *                                                                           *
6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7  *  you may not use this file except in compliance with the License.         *
8  *                                                                           *
9  *  You may obtain a copy of the License at                                  *
10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
11  *                                                                           *
12  *  Unless required by applicable law or agreed to in writing, software      *
13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15  *  See the License for the specific language governing permissions and      *
16  *  limitations under the License.                                           *
17  *                                                                           *
18  *---------------------------------------------------------------------------*/
19 
20 
21 #include "stdlib.h"
22 #include "string.h"
23 #include "buildopt.h"
24 #include "setting.h"
25 #include "passert.h"
26 #include "pendian.h"
27 #include "portable.h"
28 #include "pstdio.h"
29 #include "ptypes.h"
30 #include "srec_context.h"
31 #include "srec_sizes.h"
32 #include "search_network.h"
33 #include "srec_arb.h"
34 #include "hmm_desc.h"
35 #if USE_COMP_STATS
36 #include "comp_stats.h"
37 #endif
38 
39 #ifdef SET_RCSID
40 static const char *rcsid = 0 ? (const char *) &rcsid :
41 "$Id: srec_context.c,v 1.84.4.54 2008/05/15 20:06:39 dahan Exp $";
42 #endif
43 
44 #include <stdint.h>
45 
46 /* these are constants for now, we need to figure out how to make
47    this more flexible, possible ideas are:
48    (1) to specify these via API functions such as CR_LoadSyntax(),
49    (2) use defaults that would have been set the main parfile,
50    (3) put values for these fields in an actual grammar
51 */
52 
53 #define AVG_ARCS_PER_WORD    10
54 #define AVG_NODES_PER_WORD    7
55 #define AVG_CHARS_PER_WORD   18
56 #define MAX_PRON_LEN         1024
57 #define MAX_WORD_LEN 128
58 #define DO_WEIGHTED_ADDWORD   1
59 #define SLOTLOOP_OFFSET 64
60 #define SLOTNAME_INDICATOR "__"
61 #define RULE_INDICATOR '@'
62 #define MAX_LINE_LENGTH 512
63 #define DEFAULT_WB_COST 40
64 #define DISABLE_ARC_COST 999
65 #define DO_ARCS_IO_IN_ARC_ORDER 1
66 
67 #define IS_SILENCE_ILABEL(ilabel,context) (ilabel >= context->hmm_ilabel_offset+EPSILON_OFFSET && ilabel<context->hmm_ilabel_offset+EPSILON_OFFSET+NUM_SILENCE_HMMS)
68 // looking to match filename.grxml.Names@__Names__
69 // see also grxmlcompile.cpp where these names are constructed
70 #define IS_SLOT_OLABEL(wd) (strchr(wd,IMPORTED_RULES_DELIM)<strstr(wd,SLOTNAME_INDICATOR) && strlen(wd)>4 && !strcmp(wd+strlen(wd)-2,SLOTNAME_INDICATOR) )
71 
72 /*
73 SYNTAX : DATA_ALIGN(pointer, data type, bytes_filled)
74 EXAMPLE: We need to cast a memory address to a (wordmap*)
75          so we call DATA_ALIGN(memptr, wordmap, filled),
76          where FILLED contains the number of bytes that were used to align the pointer
77 */
78 #if (CPU & CPU_ARM)||(CPU & CPU_STRONGARM)
79 /* under IPAQ it appears we need to align the structures
80    to certain boundaries.  More attention needed here for other
81    platforms ! */
82 #define DATA_ALIGN(x, y, z) if ((int) (x) % sizeof(y)) { \
83     (z)=sizeof(y) - ((int) (x) % sizeof(y)); \
84     (x) += (z); /* force correct data alignment */ \
85   } else z=0;
86 #else
87 #define DATA_ALIGN(x, y, z)
88 #endif
89 
90 static const int do_minimize = 1;
91 #define PTR_TO_IDX(ptr, base) ((asr_uint32_t) (ptr == NULL ? 0xFFFFFFFFu : (asr_uint32_t)(ptr - base)))
92 #define IDX_TO_PTR(idx, base) (idx == 0xFFFFFFFFu ? NULL : base + idx)
93 
94 /* prototypes */
95 
96 /*----------------------------------------------------------------------*
97  *                                                                      *
98  * internal prototypes                                                  *
99  *                                                                      *
100  *----------------------------------------------------------------------*/
101 
102 /* general use functions */
103 char split_line(char* line, char** av);
104 asr_int32_t atoi_with_check(const char* buf, asr_int32_t max);
105 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc);
106 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc);
107 int printf_node1(srec_context* fst, FSMnode* node);
108 
109 /* fst_* functions are internal fst functions */
110 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
111                  wordID olabel, costdata cost,
112                  modelID* model_sequence, int num_models);
113 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc);
114 int fst_push_arc_cost(srec_context* fst, FSMarc* arc);
115 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc);
116 int fst_free_arc(srec_context* fst, FSMarc* arc);
117 int fst_free_node(srec_context* fst, FSMnode* node);
118 int fst_free_arc_net(srec_context* fst, FSMarc* arc);
119 arcID fst_get_free_arc(srec_context* fst);
120 nodeID fst_get_free_node(srec_context* fst);
121 int fst_pack_arc_usage(srec_context* fst);
122 
123 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr atok);
124 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr atok);
125 int num_arcs_leaving(srec_context* fst, FSMnode* node);
126 
127 int num_arcs_arriving(srec_context* fst, FSMnode* node);
128 int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node);
129 int fst_fill_node_info(srec_context* context);
130 int fst_alloc_transit_points(srec_context* context);
131 
132 /* higher-level functions */
133 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add,
134                              PFile* fp);
135 int FST_LoadParams(srec_context* context, PFile* fp);
136 int FST_AssumeParams(srec_context* context);
137 int FST_UnloadReverseWordGraph(srec_context* context);
138 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree);
139 
140 static ESR_ReturnCode wordmap_clean ( wordmap *word_map );
141 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words );
142 
143 /*--------------------------------------------------------------------------*
144  *                                                                          *
145  *                                                                          *
146  *                                                                          *
147  *--------------------------------------------------------------------------*/
148 
FST_IsVoiceEnrollment(srec_context * context)149 int FST_IsVoiceEnrollment(srec_context* context)
150 {
151   if (context->olabels == NULL) return 0;
152   if (context->olabels->num_words < 2) return 0;
153   if (strstr(context->olabels->words[1], "enroll")) return 1;
154   return 0;
155 }
156 
FST_LoadContext(const char * synbase,srec_context ** pcontext,int num_words_to_add)157 int FST_LoadContext(const char* synbase, srec_context** pcontext,
158                     int num_words_to_add)
159 {
160   int rc;
161   PFile* fp;
162   char buffer[MAX_LINE_LENGTH];
163   srec_context* context;
164 
165   context = (srec_context*)CALLOC_CLR(1, sizeof(srec_context), "srec.graph.base");
166   if(!context)
167 	  return FST_FAILED_ON_MEMORY;
168   memset(context, 0, sizeof(srec_context));
169 
170   context->addWordCaching_lastslot_name = 0;
171   context->addWordCaching_lastslot_num = MAXwordID;
172   context->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
173 
174   sprintf(buffer, "%s.map", synbase);
175   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
176   if (!fp) return FST_FAILED_ON_INVALID_ARGS;
177   rc = FST_LoadWordMap(&context->olabels, num_words_to_add, fp);
178   pfclose(fp);
179   if (rc) return rc;
180 
181   sprintf(buffer, "%s.PCLG.txt", synbase);
182   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
183   if (!fp) return FST_FAILED_ON_INVALID_ARGS;
184   rc = FST_LoadGraph(context, context->ilabels, context->olabels, num_words_to_add, fp);
185   pfclose(fp);
186   if (rc != FST_SUCCESS) return rc;
187 
188   sprintf(buffer, "%s.Grev2.det.txt", synbase);
189   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
190   if (!fp) return FST_FAILED_ON_INVALID_ARGS;
191   rc = FST_LoadReverseWordGraph(context, num_words_to_add, fp);
192   pfclose(fp);
193   if (rc != FST_SUCCESS) return rc;
194 
195   sprintf(buffer, "%s.params", synbase);
196   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
197   if (fp) {
198     rc = FST_LoadParams(context, fp);
199     pfclose(fp);
200     if (rc != FST_SUCCESS) return rc;
201   } else {
202     rc = FST_AssumeParams(context);
203   }
204 
205   context->grmtyp = (arcID)FST_GetGrammarType(context);
206 
207   *pcontext = context;
208   rc = fst_fill_node_info(context);
209   return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
210 }
211 
FST_UnloadContext(srec_context * context)212 void FST_UnloadContext(srec_context* context)
213 {
214   if (!context) return;
215 
216   FST_UnloadWordMap(&context->ilabels);
217   FST_UnloadWordMap(&context->olabels);
218   FST_UnloadGraph(context);
219   FST_UnloadReverseWordGraph(context);
220   FREE(context);
221 }
222 
FST_PrepareContext(srec_context * context)223 int FST_PrepareContext(srec_context* context)
224 {
225   nodeID i;
226   int rc = FST_SUCCESS;
227   /* after all word additions, we need to "prepare" the context */
228 
229   /* first check for any changes to optional endnodes etc .. */
230   for (i = 0; i < context->num_nodes; i++)
231     if (context->FSMnode_info_list[i] == NODE_INFO_UNKNOWN)
232       break;
233   if (i != context->num_nodes)
234     rc = fst_fill_node_info(context);
235 
236   context->whether_prepared = 1;
237   return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
238 }
239 
fst_set_wb_costs(srec_context * context,costdata wbcost)240 void fst_set_wb_costs( srec_context* context, costdata wbcost)
241 {
242   unsigned int i;
243   for(i=0; i<(unsigned int)context->FSMarc_list_len; i++) {
244     if(context->FSMarc_list[i].ilabel == WORD_BOUNDARY)
245       context->FSMarc_list[i].cost = wbcost;
246   }
247 }
248 
FST_LoadParams(srec_context * context,PFile * fp)249 int FST_LoadParams(srec_context* context, PFile* fp)
250 {
251   char line[MAX_LINE_LENGTH];
252   costdata wbcost = MAXcostdata;
253   while (pfgets(line, MAX_LINE_LENGTH, fp))
254   {
255     char* key = strtok(line," \n\r\t");
256     char* val = key ? strtok(NULL," \n\r\t") : NULL;
257     if(val && !strcmp(val,"="))
258       val = key ? strtok(NULL," \n\r\t") : NULL;
259     if(!key || !key[0])
260       continue;
261     else if(val && val[0]) {
262       if(!strcmp(key,"word_penalty")) {
263         wbcost = (costdata)atoi_with_check(val, MAXcostdata);
264 	if(wbcost == MAXcostdata) {
265 	  return FST_FAILED_ON_INVALID_ARGS;
266 	}
267       } else {
268 	PLogError(L("error: unknown parameter %s in .params file"), key);
269 	return FST_FAILED_ON_INVALID_ARGS;
270       }
271     }
272   }
273   if(wbcost != MAXcostdata)
274     fst_set_wb_costs( context, wbcost);
275   return FST_SUCCESS;
276 }
277 
FST_AssumeParams(srec_context * context)278 int FST_AssumeParams( srec_context* context)
279 {
280   fst_set_wb_costs( context, (costdata)DEFAULT_WB_COST);
281   return FST_SUCCESS;
282 }
283 
284 
285 /*--------------------------------------------------------------------------*
286  *                                                                          *
287  *                                                                          *
288  *                                                                          *
289  *--------------------------------------------------------------------------*/
290 
hmm_number(const char * hmm_Name,modelID hmm_ilabel_offset)291 modelID hmm_number(const char* hmm_Name, modelID hmm_ilabel_offset)
292 {
293   if (!strcmp(hmm_Name, "eps")) return EPSILON_LABEL;
294   if (!strcmp(hmm_Name, ".wb")) return WORD_BOUNDARY;
295   if (!strcmp(hmm_Name, ".ph")) return PHONE_BOUNDARY;
296   ASSERT(hmm_Name[0] == 'h' && hmm_Name[1] == 'm' && hmm_Name[2] == 'm' && hmm_Name[3]);
297   return (modelID)(hmm_ilabel_offset + (modelID)atoi_with_check(hmm_Name + 3, MAXmodelID));
298 }
299 
hmm_name(modelID ilabel,modelID hmm_ilabel_offset,char * buf)300 char* hmm_name(modelID ilabel, modelID hmm_ilabel_offset, char* buf)
301 {
302   if (ilabel == EPSILON_LABEL)
303     sprintf(buf, "eps");
304   else if (ilabel == WORD_BOUNDARY)
305     sprintf(buf, ".wb");
306   else if (ilabel == PHONE_BOUNDARY)
307     sprintf(buf, ".ph");
308   else
309     sprintf(buf, "hmm%03d", ilabel - hmm_ilabel_offset);
310   return buf;
311 }
312 
313 /*------------------------------------------------------------------*
314  *                                                                  *
315  * words                                                            *
316  *                                                                  *
317  *------------------------------------------------------------------*/
HashCmpWord(const LCHAR * key1,const LCHAR * key2)318 int HashCmpWord(const LCHAR *key1, const LCHAR *key2)
319 {
320 	return LSTRCMP(key1,key2);
321 }
HashGetCode(const void * key)322 unsigned int HashGetCode(const void *key)
323 {
324 	const LCHAR* k = (const LCHAR*)key;
325 	unsigned int i, len, h = 0;
326 	len = LSTRLEN( k);
327 	for ( i = 0; i < len; i++)
328 		h = 31*h + (unsigned int)k[i];
329 	return h;
330 }
331 
wordmap_create(wordmap ** pwmap,int num_chars,int num_words,int num_words_to_add)332 int wordmap_create(wordmap** pwmap, int num_chars, int num_words, int num_words_to_add)
333 {
334   wordmap* Interface;
335   PHashTableArgs hashArgs;
336   ESR_ReturnCode rc;
337 
338   Interface = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.wordmap.base");
339   Interface->max_words = (wordID)(num_words + num_words_to_add);
340   Interface->num_words = (wordID)0;
341   /* *pwmap->words = (ptr32*) CALLOC_CLR(wmap->max_words, sizeof(ptr32), "graph.wordmap.words"); */
342   Interface->words = (char**) CALLOC_CLR(Interface->max_words, sizeof(char*), "srec.graph.wordmap.words");
343   Interface->max_chars = num_chars + num_words_to_add * AVG_CHARS_PER_WORD;
344   Interface->chars = (char*) CALLOC_CLR(Interface->max_chars, sizeof(char), "srec.graph.wordmap.chars");
345   Interface->next_chars = Interface->chars;
346   Interface->wordIDForWord = NULL;
347   *pwmap = Interface;
348 
349   /* use a hashtable to save mapping between wdID and array index */
350   if (num_words_to_add >= 0)
351   {
352     hashArgs.capacity = num_words + num_words_to_add + 10;
353 	if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
354     hashArgs.compFunction = HashCmpWord; // PHASH_TABLE_DEFAULT_COMP_FUNCTION;
355     hashArgs.hashFunction = HashGetCode; // PHASH_TABLE_DEFAULT_HASH_FUNCTION;
356     hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
357     CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.wordmap_create()"), &Interface->wordIDForWord));
358   }
359   else
360   {
361     Interface->wordIDForWord = NULL;
362   }
363 
364   return FST_SUCCESS;
365 
366 CLEANUP:
367   wordmap_destroy(pwmap);
368   return rc;
369 }
370 wordID wordmap_add_word(wordmap* wmap, const char* word);
371 
FST_LoadWordMap(wordmap ** pwmap,int num_words_to_add,PFile * fp)372 int FST_LoadWordMap(wordmap** pwmap, int num_words_to_add, PFile* fp)
373 {
374   wordID my_wID, wm_wID;
375   char *word, *wID_string;
376   int num_words;
377   asr_int32_t num_chars;
378   char buf[MAX_LINE_LENGTH];
379   long fpos;
380   wordmap* wmap;
381   /* figure out the number of words */
382   fpos = pftell(fp);
383   for (num_words = 0, num_chars = 0; pfgets(buf, MAX_LINE_LENGTH, fp); num_words++)
384   {
385     word = strtok(buf, " \n\r\t");
386     num_chars += strlen(word);
387   }
388   num_chars += num_words * 2; /* for null terminators */
389   pfseek(fp, fpos, SEEK_SET);
390 
391   /* Alex added this to reuse this functionality elsewhere
392     destroy function for this create function is actually taken care of by the
393     FST_UnloadWordMap function
394   */
395   wordmap_create(&wmap, num_chars, num_words, num_words_to_add);
396 
397   /* later replace this with wordblocks, linked list of 8 character blocks! */
398   while (pfgets(buf, MAX_LINE_LENGTH, fp))
399   {
400     word = strtok(buf, " \n\r\t");
401     wID_string = strtok(NULL, " \n\r\t");
402     my_wID = (wordID)atoi_with_check(wID_string, MAXwordID);
403 
404     // if(strstr(word,SLOTNAME_INDICATOR))
405     //    word = strstr(word,SLOTNAME_INDICATOR);
406     wm_wID = wordmap_add_word(wmap, word);
407     ASSERT(my_wID == wm_wID);
408   }
409   ASSERT(wmap->num_words == num_words);
410 
411   for (my_wID = 1; my_wID < num_words; my_wID++)
412   {
413     if (!IS_SLOT_OLABEL(wmap->words[my_wID]))
414       break;
415   }
416   wmap->num_slots = my_wID;
417   wordmap_setbase(wmap);
418   *pwmap = wmap;
419   if(wmap->num_slots > MAX_NUM_SLOTS)
420     return FST_FAILED_ON_INVALID_ARGS;
421   else
422     return FST_SUCCESS;
423 }
424 
wordmap_destroy(wordmap ** wmap)425 int wordmap_destroy(wordmap** wmap)
426 {
427   if (wmap && *wmap)
428   {
429     wordmap_clean ( *wmap );
430     if (((*wmap)->wordIDForWord)) PHashTableDestroy((*wmap)->wordIDForWord);
431     if (((*wmap)->chars)) FREE((*wmap)->chars);
432     if (((*wmap)->words)) FREE((*wmap)->words);
433     if ((*wmap)) FREE((*wmap));
434     *wmap = 0;
435   }
436   return FST_SUCCESS;
437 }
438 
FST_UnloadWordMap(wordmap ** wmap)439 int FST_UnloadWordMap(wordmap** wmap)
440 {
441   return wordmap_destroy(wmap);
442 }
443 
444 
FST_DumpWordMap(PFile * fp,wordmap * wmap)445 int FST_DumpWordMap(PFile* fp, wordmap* wmap)
446 {
447   wordID my_wID;
448   for (my_wID = 0; my_wID < wmap->num_words; my_wID++)
449   {
450     pfprintf(fp, "%s %hu\n", wmap->words[my_wID], my_wID);
451   }
452   return FST_SUCCESS;
453 }
454 
wordmap_find_index(wordmap * wmap,const char * word)455 wordID wordmap_find_index(wordmap* wmap, const char* word)
456 {
457   void *wdID_void;
458   ESR_ReturnCode rc;
459   size_t i;
460 
461   if (!word)
462     return MAXwordID;
463 
464   if (wmap->num_words == 0)
465     return MAXwordID;
466 
467   if (wmap->wordIDForWord)
468   {
469     rc = PHashTableGetValue(wmap->wordIDForWord, word, (void**)&wdID_void);
470 
471     if (rc == ESR_SUCCESS)
472       return (wordID)(uintptr_t)wdID_void;
473   }
474   else
475   {
476     for (i = 0; i < wmap->num_words; i++)
477     {
478       if (!strcmp(wmap->words[i], word))
479       {
480         return (wordID)i;
481       }
482     }
483   }
484   return MAXwordID;
485 }
486 
wordmap_find_rule_index(wordmap * wmap,const char * rule)487 wordID wordmap_find_rule_index(wordmap* wmap, const char* rule)
488 {
489   int i;
490   int strlen_rule = strlen(rule);
491   for (i = wmap->num_slots; --i > 0;)
492   { /* > (not >=) because eps is always 0 */
493     char* p = strstr(wmap->words[i], SLOTNAME_INDICATOR);
494     if(p != NULL){
495       if (!strcmp(p, rule)) break;
496       else if(!strncmp(p,rule,strlen_rule) && !strcmp(p+strlen_rule,SLOTNAME_INDICATOR)) break;
497     }
498   }
499   if (i == 0) return MAXwordID;
500   else return(wordID)i;
501 }
502 
wordmap_find_index_in_rule(wordmap * wmap,const char * word,wordID rule)503 wordID wordmap_find_index_in_rule(wordmap* wmap, const char* word, wordID rule)
504 {
505   int len = strlen(word);
506   int rule0 = rule + '0';
507   void *wdID_void;
508   LCHAR word_dot_rule[256];
509   ESR_ReturnCode rc;
510 
511   if (!word)
512     return MAXwordID;
513   LSTRCPY(word_dot_rule, word);
514   word_dot_rule[len++] = IMPORTED_RULES_DELIM;
515   word_dot_rule[len++] = (char)rule0;
516   word_dot_rule[len++] = 0;
517   rc = PHashTableGetValue(wmap->wordIDForWord, word_dot_rule, (void**)&wdID_void);
518   if (rc == ESR_SUCCESS)
519     return (wordID)(uintptr_t)(wdID_void);
520   return MAXwordID;
521 }
522 
strlen_with_null(const char * word)523 int strlen_with_null(const char* word)
524 {
525 #if 1 /* NT  */
526   int len = strlen(word) + 1;
527   if (len % 2 == 1) len++;
528   return len;
529 #else /* C54 */
530 #endif
531 }
532 
wordmap_whether_in_rule(wordmap * wmap,wordID word,wordID rule)533 int wordmap_whether_in_rule(wordmap* wmap, wordID word, wordID rule)
534 {
535   char* word_chars;
536   int len;
537   if (word > wmap->num_words) return 0;
538   word_chars = wmap->words[word];
539   len = strlen(word_chars);
540   if (word_chars[len-1] == rule + '0' && word_chars[len-2] == IMPORTED_RULES_DELIM)
541     return 1;
542   else
543     return 0;
544 }
545 
wordmap_setbase(wordmap * wmap)546 void wordmap_setbase(wordmap* wmap)
547 {
548   wmap->num_base_words = wmap->num_words;
549   wmap->next_base_chars = wmap->next_chars;
550 }
551 
wordmap_ceiling(wordmap * wmap)552 void wordmap_ceiling(wordmap* wmap)
553 {
554   /* this is irreversible */
555   wmap->max_words = wmap->num_words;
556   wmap->max_chars = (wmap->next_chars - wmap->chars);
557 }
558 
559 
wordmap_clean(wordmap * word_map)560 static ESR_ReturnCode wordmap_clean ( wordmap *word_map )
561     {
562   ESR_ReturnCode  clean_status;
563   PHashTableEntry *entry;
564   PHashTableEntry *oldEntry;
565   wordID          *value;
566 
567   clean_status = ESR_SUCCESS;
568 
569   if ( word_map->wordIDForWord == NULL )
570     return clean_status;
571 
572   clean_status = PHashTableEntryGetFirst ( word_map->wordIDForWord, &entry);
573 
574   while ( ( entry != NULL ) && ( clean_status == ESR_SUCCESS ) )
575     {
576       clean_status = PHashTableEntryGetKeyValue ( entry, NULL, (void **)&value );
577 
578       if ( clean_status == ESR_SUCCESS )
579 	{
580 	  oldEntry = entry;
581 	  clean_status = PHashTableEntryAdvance ( &entry );
582 
583 	  if ( clean_status == ESR_SUCCESS )
584 	    clean_status = PHashTableEntryRemove ( oldEntry );
585 	}
586     }
587     return ( clean_status );
588     }
589 
590 
wordmap_populate(wordmap * word_map,wordID num_words)591 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words )
592 {
593   ESR_ReturnCode  populate_status;
594   wordID          wdID;
595 
596   populate_status = ESR_SUCCESS;
597   wdID = 0;
598 
599   if ( word_map->wordIDForWord != NULL )
600     {
601       while ( ( wdID < num_words ) && ( populate_status == ESR_SUCCESS ) )
602 	{
603 	  populate_status = PHashTablePutValue ( word_map->wordIDForWord, word_map->words[wdID],
604 						 (const void *)(uintptr_t)wdID, NULL );
605 	  if ( populate_status == ESR_SUCCESS )
606 	    wdID++;
607 	  else {
608 	    return (populate_status);
609 	  }
610 	}
611     }
612   return ( populate_status );
613 }
614 
wordmap_reset(wordmap * wmap)615 void wordmap_reset(wordmap* wmap)
616 {
617   char** tmp_words;
618   int i;
619   ESR_ReturnCode    reset_status;
620 
621   if (wmap->num_base_words < wmap->num_words)
622   {
623     /*wordID i = (wordID)(wmap->num_base_words);*/
624     char* old_wmap_chars = wmap->chars;
625     int offset = wmap->next_base_chars - wmap->chars;
626     char* tmp_chars = NEW_ARRAY(char, offset, L("srec.g2g.graph.wordmap.chars"));
627     if(!tmp_chars) {
628       passert ( 0 && L("failed to reset the memory for wordmap.chars ") );
629     }
630     memcpy(tmp_chars,wmap->chars, offset * sizeof(char));
631     FREE(wmap->chars);
632 
633     wmap->chars = tmp_chars;
634     wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap_chars);
635     wmap->max_chars = (wordID) offset;
636     wmap->next_chars = wmap->next_base_chars;
637 
638     tmp_words = (char**) CALLOC_CLR(wmap->num_base_words, sizeof(char*), "srec.graph.wordmap.words");
639     if(!tmp_words) {
640       passert ( 0 && L("failed to reset the memory for wordmap.words ") );
641     }
642     memcpy( tmp_words, wmap->words, wmap->num_base_words * sizeof(char*));
643     FREE( wmap->words);
644 
645     wmap->words = tmp_words;
646     wmap->max_words = wmap->num_base_words;
647     wmap->num_words  = wmap->num_base_words;
648 
649     for(i=0; i<wmap->num_words; i++)
650       wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap_chars);
651   }
652 
653   reset_status = wordmap_clean ( wmap );
654 
655   if ( reset_status == ESR_SUCCESS )
656     {
657       reset_status = wordmap_populate ( wmap, wmap->num_base_words );
658 
659       if ( reset_status != ESR_SUCCESS )
660 	{
661 	  wordmap_clean ( wmap );
662 	  passert ( 0 && L("wordmap_reset failed") );
663 	}
664     }
665   else
666     {
667       passert ( 0 && L("wordmap_reset failed") );
668     }
669 }
670 
671 #define FST_GROW_FACTOR 12/10
672 #define FST_GROW_MINCHARS 256
673 #define FST_GROW_MINWORDS  32
wordmap_add_word(wordmap * wmap,const char * word)674 wordID wordmap_add_word(wordmap* wmap, const char* word)
675 {
676   int len = strlen_with_null(word);
677   wordID wdID =0;
678 
679   if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
680   {
681 #if defined(FST_GROW_FACTOR)
682       uintptr_t i,tmp_max_chars= wmap->max_chars * FST_GROW_FACTOR;
683 	 char* old_wmap__chars = wmap->chars;
684       if(tmp_max_chars - wmap->max_chars < FST_GROW_MINCHARS)
685 	tmp_max_chars +=FST_GROW_MINCHARS;
686 
687       char* tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
688       if(!tmp_chars) {
689           PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.chars"));
690           return MAXwordID;
691       }
692       memcpy(tmp_chars,wmap->chars,wmap->max_chars * sizeof(char));
693       FREE(wmap->chars);
694 
695       wmap->chars = tmp_chars;
696       wmap->next_chars = wmap->chars + (wmap->next_chars - old_wmap__chars);
697       wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap__chars);
698       wmap->max_chars = (wordID)tmp_max_chars;
699       //Remove old keys --- Add WORD
700       wordmap_clean (wmap );
701 
702       // adjust word pointers
703       for(i=0; i<wmap->num_words; i++)
704 	{
705 	  wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap__chars);
706 	  // adjust hashtable ---
707 	  if (wmap->wordIDForWord)
708 	    {
709 	      ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
710 						       (const void *)i, NULL );
711 	      if ( rc != ESR_SUCCESS ) {
712 		goto CLEANUP;
713 	      }
714 	    }
715 	}
716 #else // so not defined(FST_GROW_FACTOR)
717       PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
718       return MAXwordID;
719 #endif // defined(FST_GROW_FACTOR)
720   }
721   //Add word
722   if (wmap->num_words == wmap->max_words)
723     {
724 #if defined(FST_GROW_FACTOR)
725       char** tmp_words;
726       wordID tmp_max_words;
727       int itmp_max_words =  wmap->max_words * FST_GROW_FACTOR;
728 
729       if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
730 	    itmp_max_words += FST_GROW_MINWORDS;
731 
732       if( itmp_max_words >= MAXwordID) {
733 	    PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
734 	    return MAXwordID;
735       }
736       tmp_max_words = (wordID)itmp_max_words;
737 	  tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
738       if(!tmp_words) {
739 	PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.words"));
740         return MAXwordID;
741       }
742       memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
743       FREE( wmap->words);
744       wmap->words = tmp_words;
745       wmap->max_words = tmp_max_words;
746 #else // so not defined(FST_GROW_FACTOR)
747       PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
748       return MAXwordID;
749 #endif // defined(FST_GROW_FACTOR)
750     }
751   if(1)
752     {
753     strcpy(wmap->next_chars, word);
754     wmap->words[ wmap->num_words++] = wmap->next_chars;
755     wmap->next_chars += len;
756     wdID = (wordID)(wmap->num_words - (wordID)1);
757     if (wmap->wordIDForWord)
758     {
759        ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
760                       (const void *)(uintptr_t)wdID, NULL );
761     if ( rc != ESR_SUCCESS )
762       goto CLEANUP;
763     }
764     return wdID;
765   }
766 CLEANUP:
767   PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
768   return MAXwordID;
769 }
770 
wordmap_add_word_in_rule(wordmap * wmap,const char * word,wordID rule)771 wordID wordmap_add_word_in_rule(wordmap* wmap, const char* word, wordID rule)
772 {
773   int len = strlen_with_null(word) + 2;
774   wordID wdID = 0;
775   wordID i;
776 //wordmap_add_word_in_rule
777   if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
778   {
779 #if defined(FST_GROW_FACTOR)
780       int tmp_max_chars = wmap->max_chars * FST_GROW_FACTOR;
781       char* tmp_chars;
782 	  char* old_next_chars = wmap->next_chars;
783 	  char* old_chars = wmap->chars;
784 
785       if(tmp_max_chars-wmap->max_chars < FST_GROW_MINCHARS )
786 	    tmp_max_chars += FST_GROW_MINCHARS;
787       if( wmap->chars + tmp_max_chars <= wmap->next_chars + len)
788 	    tmp_max_chars += len;
789 
790       tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
791     if(!tmp_chars) {
792           PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_in_rule.chars"));
793           return MAXwordID;
794       }
795       memcpy(tmp_chars, wmap->chars, wmap->max_chars * sizeof(char));
796       FREE(wmap->chars);
797 
798       wmap->chars = tmp_chars;
799       wmap->next_chars = wmap->chars + (old_next_chars - old_chars) ;
800       wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_chars);
801       wmap->max_chars = (wordID)tmp_max_chars;
802 
803       //Remove old keys
804       wordmap_clean ( wmap );
805 
806       // adjust word pointers wordmap_add_word_in_rule
807       for(i=0; i<wmap->num_words; i++)
808 	{
809  	  wmap->words[i] = wmap->chars +(wmap->words[i] - old_chars) ;
810 	  // adjust hashtable ----- add in rulewordmap_add_word_in_rule ----
811 	  if (wmap->wordIDForWord) {
812 	    ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
813 						     (void*)(uintptr_t)(i), NULL );
814 	    if ( rc != ESR_SUCCESS )
815 	      goto CLEANUP;
816 	  }
817 	}
818 #else
819       PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
820       return MAXwordID;
821 #endif
822   }//wordmap_add_word_in_rule
823   if (wmap->num_words == wmap->max_words)
824     {
825 #if defined(FST_GROW_FACTOR)
826       char** tmp_words;
827       wordID tmp_max_words;
828       int itmp_max_words =  wmap->max_words * FST_GROW_FACTOR;
829       if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
830 	itmp_max_words += FST_GROW_MINWORDS;
831 
832       if( itmp_max_words >= MAXwordID) {
833 	PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
834 	return MAXwordID;
835       }
836 
837       tmp_max_words = (wordID)itmp_max_words;
838       tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
839       if(!tmp_words) {
840         PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_rule.words"));
841         return MAXwordID;
842       }
843       memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
844       FREE( wmap->words);
845       wmap->words = tmp_words;
846       wmap->max_words = tmp_max_words;
847 #else
848       PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
849       return MAXwordID;
850 #endif
851     }
852   if(1)
853   {
854     char *p;
855     const char *q;
856     /* word.1 */
857     for (p = wmap->next_chars, q = word; (*p = *q) != '\0'; p++, q++) ; /* basic strcpy() */
858     *p++ = IMPORTED_RULES_DELIM;
859     *p++ = (char)(rule + ((wordID)'0'));
860     *p   = '\0';
861     wmap->words[ wmap->num_words++] = wmap->next_chars;
862     wmap->next_chars += len;
863     wdID = (wordID)(wmap->num_words - (wordID)1);
864     if (wmap->wordIDForWord)
865       {
866         ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
867 						 (void*)(uintptr_t)(wdID), NULL );
868 	if ( rc != ESR_SUCCESS )
869 	  goto CLEANUP;
870       }
871     return wdID;
872   }
873 CLEANUP:
874   PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
875   return MAXwordID;
876 }
877 
878 /*----------------------------------------------------------------------*
879  *                                                                      *
880  * API                                                                  *
881  *                                                                      *
882  *----------------------------------------------------------------------*/
883 
FST_AttachArbdata(srec_context * fst,srec_arbdata * allophone_tree)884 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree)
885 {
886   int rc = 0;
887   unsigned int allotree__modelid;
888 
889   fst->allotree = allophone_tree;
890   if( !allophone_tree)
891       return 1;
892   fst->hmm_info_for_ilabel = allophone_tree->hmm_infos - fst->hmm_ilabel_offset;
893   allotree__modelid = version_arbdata_models(allophone_tree);
894   if (allotree__modelid != 0 && fst->modelid != 0)
895   {
896     if (allotree__modelid != fst->modelid)
897     {
898       PLogError("Error: modelids disagree, sgcbaseline(%u) arbdata(%u)", fst->modelid, allotree__modelid);
899       rc = FST_FAILED_ON_INVALID_ARGS;
900     }
901   }
902 
903   return rc;
904 }
905 
FST_LoadGraph(srec_context * pfst,wordmap * imap,wordmap * omap,int num_words_to_add,PFile * fp)906 int FST_LoadGraph(srec_context* pfst, wordmap* imap, wordmap* omap,
907                   int num_words_to_add, PFile* fp)
908 {
909   int i, rc = 0;
910   char line[MAX_LINE_LENGTH], *args[32], nargs;
911   arcID max_num_FSMarcs;
912   arcID max_num_FSMnodes;
913   nodeID from_node, last_from_node, into_node, num_nodes, max_node_number = 0;
914   FSMarc_ptr atok =  (FSMarc_ptr)0;
915   FSMarc *atoken = NULL;
916   FSMnode *fr_node, *to_node;
917   costdata cost = FREEcostdata;
918   arcID num_arcs;
919 
920   srec_context* fst = pfst;
921   char *ilabel_str = 0, *olabel_str = 0;
922   long fpos;
923   arcID new_arc_id;
924   asr_int32_t temp;
925 
926 
927   /* determine number of arcs and nodes to allocate, add 50% for dynamic */
928   fpos = pftell(fp);
929   max_num_FSMnodes = 0;
930   pfst->modelid = 0;
931   for (max_num_FSMarcs = 0; pfgets(line, sizeof(line) / sizeof(char), fp); max_num_FSMarcs++)
932   {
933     if (strstr(line, "modelid:") == line)
934     {
935       char *p;
936       pfst->modelid = strtoul(line + strlen("modelid:"), &p, 10);
937     }
938     from_node = (nodeID)atoi_with_check(line, MAXnodeID);
939     if (from_node > max_num_FSMnodes)
940       max_num_FSMnodes = from_node;
941   }
942   pfseek(fp, fpos, SEEK_SET);
943   temp = max_num_FSMnodes + 1 /*why+1?*/ + num_words_to_add * AVG_NODES_PER_WORD;
944   if (temp >= MAXnodeID)
945   {
946     max_num_FSMnodes = MAXnodeID - 1;
947     PLogMessage("Warning: using max nodes instead\n");
948   }
949   else
950   {
951     max_num_FSMnodes = (nodeID)temp;
952   }
953   temp = max_num_FSMarcs + num_words_to_add * AVG_ARCS_PER_WORD;
954   if (temp >= MAXarcID)
955   {
956     max_num_FSMarcs = MAXarcID - 1;
957     PLogMessage("Warning: using max arcs instead\n");
958   }
959   else
960   {
961     max_num_FSMarcs = (arcID)temp;
962   }
963   fst->olabels = omap;
964   if (imap)
965   {
966     /* generally no imap is specified */
967     fst->ilabels = imap;
968     fst->hmm_ilabel_offset = wordmap_find_index(fst->ilabels, "hmm0");
969     ASSERT(fst->hmm_ilabel_offset >= 0);
970   }
971   else
972   {
973     fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
974     fst->ilabels->num_words = fst->ilabels->max_words = 0;
975     fst->ilabels->words = 0;
976     /* this is bad hard code, we can get this from the swiarb file, it is
977        equal to the number of phonemes (53 for enu, 39 for jpn) plus the special
978        models (eps,.wb,.ph,h#,#h,iwt,not) minus 1 ('cuz base number is 0) */
979     fst->hmm_ilabel_offset = 128; /* should match MAX_PHONEMES */
980   }
981   fst->FSMarc_list = (FSMarc*)CALLOC_CLR(max_num_FSMarcs, sizeof(FSMarc), "srec.graph.arcs");
982   fst->FSMnode_list = (FSMnode*)CALLOC_CLR(max_num_FSMnodes, sizeof(FSMnode), "srec.graph.nodes");
983   fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(max_num_FSMnodes, sizeof(char), "srec.graph.nodeinfos");
984 
985   /* setup the arc freelist */
986   fst->FSMarc_freelist = 0;
987   fst->num_arcs = 0;
988   for (i = 0; i < max_num_FSMarcs - 1; i++)
989   {
990     fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
991     fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
992   }
993   fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
994   fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
995 
996   /* initialize the nodes, 'cuz reading is random order */
997   fst->num_nodes = 0;
998   for (i = 0; i < max_num_FSMnodes; i++)
999   {
1000     fr_node = &fst->FSMnode_list[i];
1001     fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
1002   }
1003 
1004   /* 1. first load up all the information */
1005   IF_DEBUG_WDADD(printf("load graph ... 1\n"));
1006   num_arcs = 0;
1007   last_from_node = MAXnodeID;
1008   while (pfgets(line, sizeof(line) / sizeof(char), fp))
1009   {
1010     if (strstr(line, "modelid:") == line) continue;
1011     IF_DEBUG_WDADD(printf("read arc %s", line));
1012     nargs = split_line(line, args);
1013     if (nargs >= 4)
1014     {
1015       from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1016       into_node = (nodeID)atoi_with_check(args[1], MAXnodeID);
1017       ilabel_str = args[2];
1018       olabel_str = args[3];
1019       cost = FREEcostdata;
1020       if (nargs == 5)
1021         PLogError(L("Warning: too many arguments on line %s"), line);
1022     }
1023     else if (nargs == 1)
1024     {
1025       from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1026       into_node = MAXnodeID;
1027       ilabel_str = 0;
1028       olabel_str = 0;
1029       cost = FREEcostdata;
1030     }
1031     else
1032     {
1033       from_node = into_node = 0;
1034       PLogError("can't parse line %s\n", line);
1035       ASSERT(0);
1036     }
1037 
1038     if (into_node == MAXnodeID)
1039     {
1040       fst->end_node = from_node;
1041     }
1042     else
1043     {
1044       new_arc_id = fst_get_free_arc(fst);
1045       if (new_arc_id == MAXarcID)
1046         return FST_FAILED_ON_MEMORY;
1047       atok = ARC_ItoX(new_arc_id);
1048       atoken = ARC_XtoP(atok);
1049       num_arcs++;
1050       fr_node = &fst->FSMnode_list[from_node];
1051       to_node = &fst->FSMnode_list[into_node];
1052       if (fst->ilabels->num_words == 0)
1053       {
1054         atoken->ilabel = hmm_number(ilabel_str, fst->hmm_ilabel_offset);
1055 	/* Xufang: if olabel_str is a slotname, change the input label to a no-silence and no-eps HMM. It is true that the weight of slot arcs is 999 and hmm will not take effect in the runtime, but it was found that when the slot is a loop, eps and silence will cause the recursive function fst_node_has_speech_to_come into a no-step-out status.*/
1056 	if(strstr(olabel_str, SLOTNAME_INDICATOR)!=NULL)
1057 	  atoken->ilabel = fst->hmm_ilabel_offset + SLOTLOOP_OFFSET;
1058       }
1059       else
1060       {
1061         atoken->ilabel = wordmap_find_index(fst->ilabels, ilabel_str);
1062       }
1063       atoken->olabel = wordmap_find_index(fst->olabels, olabel_str);
1064       /* if(g_fst_options.wtw_cost_override>0) {
1065       if(atoken->ilabel == WORD_BOUNDARY)
1066       atoken->cost = g_fst_options.wtw_cost_override;
1067       else
1068       atoken->cost = g_fst_options.arc_cost_override;
1069       } else  */
1070       atoken->cost = cost;
1071 #if DEBUG_WDADD
1072       atoken->ilabel_str = (atoken->ilabel < fst->ilabels->num_words ? fst->ilabels->words[atoken->ilabel] : 0);
1073       atoken->olabel_str = (atoken->olabel < fst->olabels->num_words ? fst->olabels->words[atoken->olabel] : 0);
1074 #endif
1075       append_arc_leaving_node(fst, fr_node, atok);
1076       if (into_node > max_node_number)
1077         max_node_number = into_node;
1078       append_arc_arriving_node(fst, to_node, atok);
1079       atoken->fr_node = NODE_ItoX(from_node);
1080       atoken->to_node = NODE_ItoX(into_node);
1081     }
1082     last_from_node = from_node;
1083   }
1084   ASSERT(fst->num_arcs == num_arcs);
1085 
1086   /* setup the node freelist */
1087   IF_DEBUG_WDADD(printf("load graph ... 6\n"));
1088   num_nodes = (nodeID)(max_node_number + 1);
1089   if( max_num_FSMnodes > num_nodes) {
1090   fst->FSMnode_freelist = num_nodes;
1091   for (i = num_nodes; i < (max_num_FSMnodes - 1); i++)
1092   {
1093     fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
1094     fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1095   }
1096 	if (i == (max_num_FSMnodes - 1)) {
1097      fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
1098      fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1099     }
1100   } else
1101 	fst->FSMnode_freelist  = FSMNODE_NULL;
1102 
1103   /* some book-keeping stuff */
1104   IF_DEBUG_WDADD(printf("load graph ... 7\n"));
1105   fst->num_base_arcs = fst->num_arcs = num_arcs;
1106   fst->FSMarc_list_len = max_num_FSMarcs;
1107   fst->FSMnode_list_len = max_num_FSMnodes;
1108   fst->num_base_nodes = fst->num_nodes = num_nodes;
1109   fst->start_node = 0;
1110   /* fst->end_node = 0; this is set up above */
1111 
1112   /* beg and end word labels */
1113   fst->beg_silence_word = wordmap_find_index(fst->olabels, "-pau-");
1114   fst->end_silence_word = wordmap_find_index(fst->olabels, "-pau2-");
1115   fst->hack_silence_word = wordmap_find_index(fst->olabels, "silence");
1116 
1117   /* set the search limitations, real ones will be registered later */
1118   fst->max_searchable_nodes = 0;
1119   fst->max_searchable_arcs = 0;
1120 
1121   rc = fst_alloc_transit_points(fst);
1122   return (rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS);
1123 }
1124 
FST_UnloadGraph(srec_context * pfst)1125 int FST_UnloadGraph(srec_context* pfst)
1126 {
1127   if (pfst->ilabels)
1128     FREE(pfst->ilabels);
1129   FREE(pfst->FSMarc_list);
1130   FREE(pfst->FSMnode_list);
1131   FREE(pfst->FSMnode_info_list);
1132   pfst->FSMarc_list = 0;
1133   pfst->FSMnode_list = 0;
1134   pfst->FSMnode_info_list = 0;
1135   return FST_SUCCESS;
1136 }
1137 
FST_DumpGraph(srec_context * fst,PFile * fp)1138 int FST_DumpGraph(srec_context* fst, PFile* fp)
1139 {
1140   int rc = 0;
1141   nodeID i;
1142   FSMarc_ptr atok;
1143   FSMarc* atoken;
1144   nodeID from_node, into_node;
1145   FSMnode* ntoken;
1146   char *ilabel, *olabel;
1147 
1148   for (i = 0; i < fst->num_nodes; i++)
1149   {
1150     from_node = i;
1151     ntoken = &fst->FSMnode_list[i];
1152     if (ntoken->first_prev_arc == FSMARC_FREE)
1153       continue;
1154     if (ntoken->un_ptr.first_next_arc != FSMARC_NULL)
1155     {
1156       for (atok = ntoken->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
1157       {
1158         char buf[32];
1159         atoken = ARC_XtoP(atok);
1160         into_node = NODE_XtoI(atoken->to_node);
1161         ilabel = fst->ilabels->num_words == 0 ?
1162                  hmm_name(atoken->ilabel, fst->hmm_ilabel_offset, buf) :
1163                  fst->ilabels->words[atoken->ilabel] ;
1164         olabel = fst->olabels->words[atoken->olabel];
1165 
1166         if (atoken->cost != FREEcostdata)
1167         {
1168           /* regular arc */
1169           pfprintf(fp, "%hu\t%hu\t%s\t%s\t%hu\n",
1170                   from_node, into_node, ilabel, olabel, atoken->cost);
1171         }
1172         else
1173         {
1174           /* regular zero cost arc */
1175           pfprintf(fp, "%hu\t%hu\t%s\t%s\n",
1176                   from_node, into_node, ilabel, olabel);
1177         }
1178       }
1179     }
1180     else
1181     {
1182       pfprintf(fp, "%hu\n", from_node);
1183     }
1184   }
1185   return rc;
1186 }
1187 
FST_AddWordToGrammar(srec_context * fst,const char * _slot,const char * word,const char * pron,const int cost)1188 int FST_AddWordToGrammar(srec_context* fst, const char* _slot,
1189                          const char* word, const char* pron,
1190                          const int cost)
1191 {
1192   int num_pron_ampersands = 0, num_prons_added = 0, num_words_added = 0;
1193   int i, pron_len, model_sequence_len;
1194   modelID model_sequence[MAX_PRON_LEN];
1195   char phoneme_sequence[MAX_PRON_LEN];
1196   wordID olabel = MAXwordID;
1197   int irc, rc = FST_SUCCESS;
1198   nodeID start_node = MAXnodeID, end_node = MAXnodeID;
1199   char veslot[MAX_WORD_LEN];
1200 #if USE_HMM_BASED_ENROLLMENT
1201   const char* Tpron;
1202 #endif
1203 
1204   /* The addword from voice enroll still use @, and to test voice enroll, @ sign was removed and __ was added at the begining and ending of a word. Xufang */
1205   if(_slot[0] == '@') {
1206     strcpy(veslot,SLOTNAME_INDICATOR);
1207     strcat(veslot,_slot+1);
1208     strcat(veslot,SLOTNAME_INDICATOR);
1209   } else
1210     strcpy(veslot, _slot);
1211 
1212 #if USE_COMP_STATS
1213   if (!comp_stats)
1214     comp_stats = init_comp_stats1();
1215   start_cs_clock1(&comp_stats->word_addition);
1216 #endif
1217 
1218   /* we expect developers to call AddWord with the same slot many times,
1219   so we cache the slot start and end nodes and re-use on subseq calls */
1220   if( fst->addWordCaching_lastslot_num == MAXwordID )
1221     fst->addWordCaching_lastslot_name = NULL;
1222   else {
1223     fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1224     fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1225     ASSERT( fst->addWordCaching_lastslot_name);
1226   }
1227   if( fst->addWordCaching_lastslot_name==NULL || strcmp( fst->addWordCaching_lastslot_name, veslot)) {
1228 
1229     fst->addWordCaching_lastslot_num = wordmap_find_rule_index(fst->olabels, veslot);
1230     /* olabel not found is a user error */
1231     if (fst->addWordCaching_lastslot_num == MAXwordID)
1232     {
1233       size_t i;
1234 
1235       pfprintf(PSTDOUT, L("error: slot '%s' not found among ["), veslot);
1236       for (i = 1; i < (size_t) fst->olabels->num_slots; ++i)
1237         pfprintf(PSTDOUT, "%s, ", fst->olabels->words[i]);
1238       pfprintf(PSTDOUT, L("] possible\n"));
1239       rc = FST_FAILED_ON_INVALID_ARGS;
1240       goto RETRC;
1241     }
1242 
1243     fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1244     fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1245     ASSERT(fst->addWordCaching_lastslot_name);
1246 
1247     /* now find where in the graph this slot is referenced, note that
1248        there might be more than one place, but here we ignore multiples */
1249     for (i = fst->num_fsm_exit_points; --i >= 0;)
1250       {
1251 	arcID arcid = fst->fsm_exit_points[i].arc_index;
1252 	if (fst->FSMarc_list[arcid].olabel == fst->addWordCaching_lastslot_num)
1253 	  {
1254 	    FSMarc* arc;
1255 	    FSMnode* node;
1256 	    start_node = fst->fsm_exit_points[i].from_node_index;
1257 	    end_node = fst->fsm_exit_points[i].wbto_node_index;
1258 	    fst->addWordCaching_lastslot_needs_post_silence = ESR_TRUE;
1259 	    fst->addWordCaching_lastslot_ifsm_exit_point = i;
1260 	    node = &fst->FSMnode_list[ end_node];
1261 	    arc = &fst->FSMarc_list[node->un_ptr.first_next_arc];
1262 	    if (arc->olabel == fst->end_silence_word && arc->linkl_next_arc == MAXarcID)
1263 	      fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
1264 	    break;
1265 	  }
1266       }
1267 
1268     /* not found in the graph is an internal error, coding error */
1269     if (i < 0 || start_node>=fst->num_nodes || end_node>=fst->num_nodes)
1270       {
1271 	PLogError("error: (internal) finding olabel %d %d %d\n", fst->addWordCaching_lastslot_num,
1272 		  start_node, end_node);
1273 	goto RETRC;
1274       }
1275   } /* cached or not */
1276 
1277   i = fst->addWordCaching_lastslot_ifsm_exit_point;
1278   start_node = fst->fsm_exit_points[i].from_node_index;
1279   end_node = fst->fsm_exit_points[i].wbto_node_index;
1280 
1281   /* now start_node and end_node are known */
1282   if (!word || !*word || !pron || !*pron)
1283     {
1284       rc = FST_FAILED_ON_INVALID_ARGS;
1285       PLogError("error: null word/pron on input to FST_AddWordToGrammar()\n");
1286       goto RETRC;
1287     }
1288 
1289   /* from here */
1290   IF_DEBUG_WDADD(printf("Adding %s %s\n", word, (const char*)pron));
1291 
1292   /* loop over all prons, we break when we hit the double-null (hence the +1) */
1293   for( ; (*pron)!='\0'; pron+=(pron_len+1)) {
1294     pron_len = strlen((char*)pron);
1295     if (pron_len >= MAX_PRON_LEN)
1296       {
1297 	PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1298 	rc = FST_FAILED_ON_INVALID_ARGS;
1299 	goto RETRC;
1300       }
1301 
1302     /* check for searchability after adding, estimating at most pron_len
1303        arcs and nodes will be added */
1304     if (fst->num_arcs + pron_len > fst->max_searchable_arcs)
1305       {
1306 	PLogError("error: wordadd failed on word %s due to %d arc search limit\n",
1307 		  word, fst->max_searchable_arcs);
1308 	rc = FST_FAILED_ON_MEMORY;
1309 	goto RETRC;
1310       }
1311 
1312     if (fst->num_nodes + pron_len > fst->max_searchable_nodes)
1313       {
1314 	PLogError("error: wordadd failed on word %s due to %d node search limit\n",
1315 		  word, fst->max_searchable_nodes);
1316 	rc = FST_FAILED_ON_MEMORY;
1317 	goto RETRC;
1318       }
1319 
1320     /* add the word if necessary, !FST_SUCCESS is allowed if we're just adding
1321        an alternate pronunciation */
1322     olabel = wordmap_find_index_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1323     if (olabel == MAXwordID)
1324       {
1325 	olabel = wordmap_add_word_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1326 	if (olabel != MAXwordID)
1327 	  num_words_added++;
1328       }
1329     if (olabel == MAXwordID)
1330       {
1331 	PLogError("error: wordmap_add_word failed\n");
1332 	rc = FST_FAILED_ON_MEMORY;
1333 	goto RETRC;
1334       }
1335 
1336 #if USE_HMM_BASED_ENROLLMENT
1337     // pron should be converted to model_sequence, model_sequence_len
1338 #define VETRANS_PREFIX "wd_hmm"
1339 #define VETRANS_PREFIX_LEN 6
1340     if( LSTRSTR(pron, VETRANS_PREFIX)!=NULL) {
1341       // printf("Debug-pron: %d, %s\n",pron_len, pron);
1342       model_sequence_len=0;
1343       for(Tpron=pron; (Tpron=strstr(Tpron, VETRANS_PREFIX))!=NULL; ){
1344 	// Tpron = strstr(Tpron, "wd_hmm");
1345 	Tpron += VETRANS_PREFIX_LEN; // skip over "wd_hmm"
1346 	// Copy hmm number (56,132,...)
1347 	model_sequence[ model_sequence_len] = atoi_with_check(Tpron, MAXmodelID);
1348 	model_sequence_len++;
1349       }
1350 
1351 
1352       /* we need to append silence at the end since we might be dealing with a slot
1353 	 that has ensuing speech, formerly we only dealt with ROOT which defacto
1354 	 was followed by the pau2 silence, so ROOT did not need its own */
1355       if (fst->addWordCaching_lastslot_needs_post_silence)
1356 	model_sequence[model_sequence_len++] = 3; // <<< hmm3_sil !!! ugly hard-code here
1357 
1358       /* append the word boundary */
1359       model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1360       /* translate to input label ids */
1361       for (i = 0; i < model_sequence_len; i++)
1362 	{
1363 	  if (model_sequence[i] >= EPSILON_OFFSET)
1364 	    model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1365 	}
1366     }
1367     else
1368 #endif
1369       {
1370 	pron_len = strlen((char*)pron);
1371 	if (pron_len >= MAX_PRON_LEN)
1372 	  {
1373 	    PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1374 	    rc = FST_FAILED_ON_INVALID_ARGS;
1375 	    goto RETRC;
1376 	  }
1377 
1378 	for (i = 0; i < pron_len; i++)
1379 	  {
1380 	    if (pron[i] == OPTSILENCE_CODE)
1381 	      {
1382 		phoneme_sequence[i] = SILENCE_CODE;
1383 		num_pron_ampersands++;
1384 	      }
1385 	    else
1386 	      phoneme_sequence[i] = pron[i];
1387 	  }
1388 	/* we need to append silence at the end since we might be dealing with a slot
1389 	   that has ensuing speech, formerly we only dealt with ROOT which defacto
1390 	   was followed by the pau2 silence, so ROOT did not need its own */
1391 	if (fst->addWordCaching_lastslot_needs_post_silence)
1392 	  phoneme_sequence[i++] = SILENCE_CODE;
1393 
1394 	model_sequence_len = i;
1395 	irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1396 	/* check for bad sequence of phonemes */
1397 	if (irc)
1398 	  {
1399 	    PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, irc);
1400 	    rc = FST_FAILED_ON_INVALID_ARGS;
1401 	    goto RETRC;
1402 	  }
1403 	IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1404 
1405 	/* append the word boundary */
1406 	model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1407 	/* translate to input label ids */
1408 	for (i = 0; i < model_sequence_len; i++)
1409 	  {
1410 	    if (model_sequence[i] >= EPSILON_OFFSET)
1411 	      model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1412 	  }
1413       } /*  end of ph_t ph_r .. type decoding of the model  sequence */
1414 
1415     /* now do the actual addition */
1416     rc = fst_add_arcs(fst, start_node, end_node, olabel, (costdata)cost, model_sequence, model_sequence_len);
1417     if (rc == FST_SUCCESS)
1418       {
1419 	num_prons_added++;
1420       }
1421     else if (rc == FST_FAILED_ON_HOMONYM)
1422       {
1423 	/* maybe the another pron will work? */
1424       }
1425     else
1426       {
1427 	PLogMessage("error: fst_add_arcs() failed adding word %s pron %s ('&' as 'iwt')\n", word, (char*)pron);
1428 	goto RETRC;
1429       }
1430 
1431     /* second add the pron with no silences,
1432        this only applies to true prons, not wd_hmm333 wd_hmm222 type prons */
1433 
1434     if (num_pron_ampersands > 0)
1435       {
1436 	for (i = 0, model_sequence_len = 0; i < pron_len; i++)
1437 	  {
1438 	    if (pron[i] != OPTSILENCE_CODE)
1439 	      phoneme_sequence[model_sequence_len++] = pron[i];
1440 	  }
1441 
1442 	irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1443 	/* check for bad sequence of phonemes */
1444 	if (irc)
1445 	  {
1446 	    PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, rc);
1447 	    rc = FST_FAILED_ON_INVALID_ARGS;
1448 	    goto RETRC;
1449 	  }
1450 	else
1451 	  {
1452 	    IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1453 
1454 	    /* append the word boundary */
1455 	    model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1456 	    /* translate to input label ids */
1457 	    for (i = 0; i < model_sequence_len; i++)
1458 	      {
1459 		if (model_sequence[i] >= EPSILON_OFFSET)
1460 		  model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1461 	      }
1462 	    /* now do the actual addition */
1463 	    rc = fst_add_arcs(fst, start_node, end_node,
1464 			      olabel, (costdata)cost, model_sequence, model_sequence_len);
1465 
1466 	    if (rc == FST_SUCCESS)
1467 	      {
1468 		num_prons_added++;
1469 	      }
1470 	    else if (rc == FST_FAILED_ON_HOMONYM)
1471 	      {
1472 		/* maybe another pron worked? */
1473 	      }
1474 	    else
1475 	      {
1476 		PLogMessage("Warning: fst_add_arcs() failed while adding "
1477 			    "word %s pron %s (skip '&')\n", word, (char*)pron);
1478 		goto RETRC;
1479 	      }
1480 	  }
1481       }
1482   }
1483 
1484 
1485  RETRC:
1486   /* set this to make sure that FST_Prepare gets called after add word */
1487   fst->whether_prepared = 0;
1488 #if USE_COMP_STATS
1489   end_cs_clock1(&comp_stats->word_addition, 1);
1490 #endif
1491 
1492   if (rc < 0 && rc != FST_FAILED_ON_HOMONYM)
1493     return rc;
1494 
1495   if (num_prons_added == 0 && num_words_added > 0)
1496     {
1497       return FST_FAILED_ON_HOMONYM;
1498     }
1499   else if (num_prons_added == 0 && num_words_added == 0)
1500     {
1501       return FST_FAILED_ON_HOMOGRAPH;
1502     }
1503   else if (num_prons_added > 0 && num_words_added == 0)
1504     {
1505       return FST_SUCCESS_ON_OLD_WORD;
1506     }
1507   else
1508     {
1509       /* if(num_prons_added>0 && num_words_added>0) */
1510       return FST_SUCCESS;
1511     }
1512 }
1513 
1514 /* remove arcs leaving a node, arcs added via word add, are just discarded
1515 to be cleaned up for re-use by other functions */
remove_added_arcs_leaving(srec_context * fst,nodeID ni)1516 void remove_added_arcs_leaving(srec_context* fst, nodeID ni)
1517 {
1518   FSMnode* node = &fst->FSMnode_list[ ni];
1519   FSMarc *arc = NULL, *arc2;
1520   arcID *pai, ai, ai2;
1521   for (pai = &node->un_ptr.first_next_arc, ai = (*pai); ai != MAXarcID; pai = &arc->linkl_next_arc, ai = (*pai))
1522   {
1523     if (ai < fst->num_base_arcs)
1524     {
1525       arc = &fst->FSMarc_list[ai];
1526     }
1527     else
1528     {
1529       arc2 = &fst->FSMarc_list[ai];
1530       for (ai2 = arc2->linkl_next_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1531            ai2 = arc2->linkl_next_arc)
1532       {
1533         arc2 = &fst->FSMarc_list[ai2];
1534       }
1535       *pai = ai2;
1536     }
1537   }
1538 }
1539 
1540 /* remove arcs arriving at a node, arcs added via word add, are just discarded
1541    to be cleaned up for re-use by other functions */
remove_added_arcs_arriving(srec_context * fst,nodeID ni)1542 void remove_added_arcs_arriving(srec_context* fst, nodeID ni)
1543 {
1544   FSMnode* node = &fst->FSMnode_list[ni];
1545   FSMarc *arc = NULL, *arc2;
1546   arcID *pai, ai, ai2;
1547   for (pai = &node->first_prev_arc, ai = (*pai); ai != MAXarcID;
1548        pai = &arc->linkl_prev_arc, ai = (*pai))
1549   {
1550     if (ai < fst->num_base_arcs)
1551     {
1552       arc = &fst->FSMarc_list[ai];
1553     }
1554     else
1555     {
1556       arc2 = &fst->FSMarc_list[ai];
1557       for (ai2 = arc2->linkl_prev_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1558            ai2 = arc2->linkl_prev_arc)
1559       {
1560         arc2 = &fst->FSMarc_list[ai2];
1561       }
1562       *pai = ai2;
1563     }
1564   }
1565 }
1566 
1567 /* reset greammar, but resetting all the arcs, nodes and words added
1568    via the addword functions */
1569 
FST_ResetGrammar(srec_context * fst)1570 int FST_ResetGrammar(srec_context* fst)
1571 {
1572   int i, rc = 0;
1573   nodeID fst_slot_start_node, fst_slot_end_node;
1574   wordID fst_slot_slotnum;
1575   arcID ai;
1576   nodeID ni2, ni3, ni4, ni3a;
1577   FSMnode_t *node, *node2, *node3;
1578   FSMarc_t *arc, *arc2, *arc3;
1579 
1580   /*fst_slot_slotnum = wordmap_find_rule_index(fst->olabels, slot);*/
1581   for (fst_slot_slotnum = 1; fst_slot_slotnum < fst->olabels->num_slots;
1582        fst_slot_slotnum++)
1583   {
1584 
1585     if (fst_slot_slotnum == MAXwordID)
1586     {
1587       char *slot = "";
1588       PLogError("error: slot '%s' not found among [%d,%d] possible\n", slot, 1, fst->olabels->num_slots - 1);
1589       return (rc = FST_FAILED_ON_INVALID_ARGS);
1590     }
1591 
1592     /* now find where in the graph this slot is referenced, note that
1593        there might be more than one place, but here we ignore multiples */
1594     fst_slot_start_node = MAXnodeID;
1595     fst_slot_end_node = MAXnodeID;
1596     for (i = fst->num_fsm_exit_points; --i >= 0;)
1597     {
1598       ai = fst->fsm_exit_points[i].arc_index;
1599       if (fst->FSMarc_list[ai].olabel == fst_slot_slotnum)
1600       {
1601         fst_slot_start_node = fst->fsm_exit_points[i].from_node_index;
1602         fst_slot_end_node = fst->fsm_exit_points[i].wbto_node_index;
1603       }
1604     }
1605 
1606     /* this 'slot' could be the root rule, which can't be removed */
1607     if (fst_slot_start_node == MAXnodeID || fst_slot_end_node == MAXnodeID)
1608       continue;
1609 
1610     /*
1611       start                     end
1612       node   node2    node3   node4
1613        o--@N-->o--sil-->o--wb-->o
1614          arc     arc2   | arc3  ^
1615                  |       |wb
1616                  \__sil->o
1617     arc4   ni3a
1618     */
1619 
1620     remove_added_arcs_leaving(fst, fst_slot_start_node);
1621     node = &fst->FSMnode_list[ fst_slot_start_node];
1622     for (ai = node->un_ptr.first_next_arc; ai != MAXarcID; ai = arc->linkl_next_arc)
1623     {
1624       arc = &fst->FSMarc_list[ai];
1625       if (arc->olabel != fst_slot_slotnum)
1626         continue;
1627 
1628       ni2 = arc->to_node;
1629       remove_added_arcs_arriving(fst, ni2);
1630       if (ni2 == fst_slot_end_node)
1631         continue;
1632       node2 = &fst->FSMnode_list[ni2];
1633       arc2 = &fst->FSMarc_list[ node2->un_ptr.first_next_arc];
1634 
1635 
1636       ni3 = arc2->to_node;
1637       remove_added_arcs_arriving(fst, ni3);
1638       if (ni3 == fst_slot_end_node)
1639         continue;
1640       node3 = &fst->FSMnode_list[ni3];
1641       arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1642       while (arc3->linkl_next_arc != MAXarcID)
1643       {
1644         arc3 = &fst->FSMarc_list[arc3->linkl_next_arc];
1645         ni3a = arc3->to_node;
1646         remove_added_arcs_arriving(fst, ni3a);
1647       }
1648       arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1649 
1650       ni4 = arc3->to_node;
1651       remove_added_arcs_arriving(fst, ni4);
1652       ASSERT(ni4 == fst_slot_end_node);
1653       if (ni4 == fst_slot_end_node)
1654         continue;
1655     }
1656   }
1657 
1658   /* reset the freelist for nodes */
1659   if( fst->num_nodes == fst->num_base_nodes )
1660     {}
1661   else{
1662     FSMnode *ntoken;
1663     FSMnode* tmp_FSMnode_list;
1664     FSMnode_info* tmp_FSMnode_info_list;
1665 
1666 	fst->FSMnode_freelist = MAXnodeID;
1667     fst->num_nodes = fst->FSMnode_list_len = fst->num_base_nodes;
1668 
1669 	tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
1670     if(!tmp_FSMnode_list){
1671      PLogError("ERROR: Could NOT reset the memory for srec.graph.nodes");
1672      return FST_FAILED_ON_MEMORY;
1673      }
1674     memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
1675 
1676     /* scan to the end of the free list (to be re-used) */
1677     nodeID* last_free_node = (&fst->FSMnode_freelist);
1678 	ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
1679 
1680 	for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
1681 		 ntoken = &tmp_FSMnode_list[ *last_free_node];
1682 
1683 	FREE( fst->FSMnode_list);
1684     tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
1685     if(!tmp_FSMnode_info_list){
1686      PLogError("ERROR: Could NOT reset the memory for srec.graph.nodeinfos");
1687      return FST_FAILED_ON_MEMORY;
1688      }
1689 	// copy in node info
1690 	memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
1691 
1692 
1693 	FREE( fst->FSMnode_info_list);
1694 	fst->FSMnode_info_list = tmp_FSMnode_info_list;
1695     fst->FSMnode_list = tmp_FSMnode_list;
1696   }
1697 
1698   /*ni = fst->FSMnode_freelist = fst->num_base_nodes;
1699     node = &fst->FSMnode_list[ni];
1700     for (; ni < fst->FSMnode_list_len - 1; ni++, node++)
1701        node->un_ptr.next_node = (nodeID)(ni + 1);
1702     node->un_ptr.next_node = MAXnodeID;
1703     fst->num_nodes = fst->num_base_nodes;*/
1704 
1705   /* reset the freelist for arcs */
1706   if( fst->num_arcs == fst->num_base_arcs )
1707     {}
1708   else {
1709     FSMarc* atoken = NULL;
1710     FSMarc* tmp_FSMarc_list;
1711 
1712     fst->num_arcs = fst->num_base_arcs;
1713     fst->FSMarc_list_len = fst->num_base_arcs;
1714     fst->FSMarc_freelist = MAXarcID;
1715     tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
1716     if(!tmp_FSMarc_list){
1717       PLogError("ERROR: Could NOT reset the memory for srec.graph.arcs");
1718      return FST_FAILED_ON_MEMORY;
1719     }
1720     memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
1721 
1722     /* scan to the end of the free list (to be re-used) */
1723     arcID* last_free_arc = &fst->FSMarc_freelist;
1724     atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
1725 
1726     for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
1727       atoken = &tmp_FSMarc_list[ *last_free_arc];
1728 
1729     FREE( fst->FSMarc_list);
1730     fst->FSMarc_list = tmp_FSMarc_list;
1731   }
1732 
1733   /*ai = fst->FSMarc_freelist = fst->num_base_arcs;
1734     arc = &fst->FSMarc_list[ai];
1735     for (; ai < fst->FSMarc_list_len - 1; ai++, arc++)
1736     arc->linkl_next_arc = (arcID)(ai + 1);
1737     arc->linkl_next_arc = MAXarcID;
1738     fst->num_arcs = fst->num_base_arcs;
1739     fst->FSMarc_list_len = fst->num_base_arcs;*/
1740 
1741   /* now remove all the added words */
1742   wordmap_reset(fst->olabels);
1743   return FST_SUCCESS;
1744 }
1745 
1746 /* See the description of arc_tokens in the header file. These
1747    arcs are nodeless, which make loading from a file that
1748    references nodes a little harder.  We need to do it in stages.
1749    Later, when we load from binary image it should be much simpler.
1750 */
1751 
get_first_arc_leaving_node(arc_token * arc_token_list,arcID num_arcs,nodeID node)1752 arc_token_lnk get_first_arc_leaving_node(arc_token* arc_token_list,
1753     arcID num_arcs,
1754     nodeID node)
1755 {
1756   arcID i;
1757   for (i = 0; i < num_arcs; i++)
1758   {
1759     if ((nodeID)(int)arc_token_list[i].next_token_index == node)
1760       return ARC_TOKEN_LNK(arc_token_list, i);
1761   }
1762   return ARC_TOKEN_NULL;
1763 }
1764 
FST_LoadReverseWordGraph(srec_context * context,int num_words_to_add,PFile * fp)1765 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add, PFile* fp)
1766 {
1767   arcID i;
1768   char line[MAX_LINE_LENGTH];
1769   char word_label_as_str[128];
1770   arcID num_alloc_arc_tokens;
1771   nodeID from_node, into_node;
1772   labelID word_label = MAXwordID;
1773   arc_token *atoken, *last_atoken;
1774   costdata cost;
1775   arcID num_arcs;
1776   arc_token *arc_token_list, *tmp;
1777   long fpos;
1778 
1779   /* determine number of word arcs to allocate */
1780   fpos = pftell(fp);
1781   for (num_arcs = 0; pfgets(line, MAX_LINE_LENGTH, fp);  num_arcs++) ;
1782   num_alloc_arc_tokens = num_arcs;
1783   num_alloc_arc_tokens = (arcID)(num_arcs + num_words_to_add);
1784   pfseek(fp, fpos, SEEK_SET);
1785 
1786   context->arc_token_list = (arc_token*)CALLOC_CLR(num_alloc_arc_tokens, sizeof(arc_token), "srec.graph.wordgraph");
1787   arc_token_list = context->arc_token_list;
1788 
1789   /* 1. first load up all the information */
1790   i = 0;
1791   while (pfgets(line, MAX_LINE_LENGTH, fp))
1792   {
1793     if (sscanf(line, "%hu\t%hu\t%s", &from_node, &into_node, word_label_as_str) == 3)
1794     {
1795       word_label = wordmap_find_index(context->olabels, word_label_as_str);
1796       // ASSERT(word_label >= 0);
1797       cost = FREEcostdata;
1798     }
1799     else if (sscanf(line, "%hu", &from_node) == 1)
1800     {
1801       into_node = MAXnodeID;
1802       word_label = MAXwordID;
1803       cost = FREEcostdata;
1804     }
1805     else
1806     {
1807       PLogError("FST_LoadReverseWordGraph() .. can't parse line %s\n", line);
1808       ASSERT(0);
1809     }
1810     atoken = &arc_token_list[i];
1811     i++;
1812     atoken->ilabel = word_label;
1813 	/* atoken->olabel = WORD_EPSILON_LABEL; */
1814     /*atoken->cost = cost; cost is not used for now */
1815 #if DEBUG_ASTAR
1816     atoken->label = (word_label == MAXwordID ? 0 : wmap->words[word_label]);
1817 #endif
1818     atoken->first_next_arc = (arc_token_lnk)into_node;
1819     atoken->next_token_index = (arc_token_lnk)from_node;
1820   }
1821   num_arcs = i;
1822 
1823   /* 2. now do the internal cross references */
1824   for (i = 0; i < num_arcs; i++)
1825   {
1826     atoken = &arc_token_list[i];
1827     into_node = (nodeID)atoken->first_next_arc;
1828     if (into_node == MAXnodeID)
1829       atoken->first_next_arc = ARC_TOKEN_NULL;
1830     else
1831       atoken->first_next_arc = get_first_arc_leaving_node(arc_token_list, num_arcs, (nodeID)atoken->first_next_arc);
1832   }
1833 
1834   /* 3. now do more internal cross refs */
1835   last_atoken = &arc_token_list[0];
1836   for (i = 1; i < num_arcs; i++)
1837   {
1838     atoken = &arc_token_list[i];
1839     if (atoken->next_token_index != last_atoken->next_token_index)
1840     {
1841       last_atoken->next_token_index = ARC_TOKEN_NULL;
1842     }
1843     else
1844     {
1845       last_atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i);
1846     }
1847     last_atoken = atoken;
1848   }
1849   last_atoken->next_token_index = ARC_TOKEN_NULL;
1850 
1851 #if DEBUG_ASTAR
1852   /* under debug, it's nice to be able to see the words leaving the
1853      destination node, they are stored sequentially in the debug ary */
1854   for (i = 0; i < num_arcs; i++)
1855   {
1856     char * p;
1857     atoken = &arc_token_list[i];
1858     atoken->debug[0] = 0;
1859     for (tmp = ARC_TOKEN_LNK(arc_token_list, atoken->first_next_arc); tmp != NULL;
1860          tmp = ARC_TOKEN_PTR(arc_token_list, tmp->next_token_index))
1861     {
1862       if (tmp->first_next_arc == ARC_TOKEN_NULL)
1863       {
1864         p = "END";
1865       }
1866       else if (!tmp->label)
1867       {
1868         p = "NULL";
1869       }
1870       else
1871       {
1872         p = tmp->label;
1873       }
1874       if (strlen(atoken->debug) + strlen(p) + 6 < 64)
1875       {
1876         strcat(atoken->debug, p);
1877         strcat(atoken->debug, " ");
1878       }
1879       else
1880       {
1881         strcat(atoken->debug, "...");
1882         break;
1883       }
1884     }
1885   }
1886 #endif
1887   context->arc_token_list_len = num_alloc_arc_tokens;
1888   if (num_alloc_arc_tokens > num_arcs)
1889   {
1890     atoken = &context->arc_token_list[num_arcs];
1891     for (i = num_arcs; i < num_alloc_arc_tokens; i++, atoken++)
1892     {
1893       atoken->first_next_arc = ARC_TOKEN_NULL;
1894       atoken->ilabel         = MAXwordID;
1895       atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i + 1);/*atoken+1;*/
1896     }
1897     atoken--;
1898     atoken->next_token_index = ARC_TOKEN_NULL;
1899     context->arc_token_freelist = &context->arc_token_list[num_arcs];
1900   }
1901   else
1902   {
1903     context->arc_token_freelist = NULL;
1904   }
1905   /* the insert start is the arc that goes to the end */
1906   atoken = context->arc_token_list;
1907   atoken = ARC_TOKEN_PTR(arc_token_list, atoken->first_next_arc);
1908   for (; atoken; atoken = ARC_TOKEN_PTR(arc_token_list, atoken->next_token_index))
1909   {
1910     if (atoken->first_next_arc == ARC_TOKEN_NULL)
1911       break;
1912     tmp = ARC_TOKEN_PTR(context->arc_token_list, atoken->first_next_arc);
1913     if (tmp->ilabel == MAXwordID && tmp->first_next_arc == ARC_TOKEN_NULL)
1914       break;
1915   }
1916   context->arc_token_insert_start = atoken;
1917 
1918   return FST_SUCCESS;
1919 }
1920 
FST_UnloadReverseWordGraph(srec_context * context)1921 int FST_UnloadReverseWordGraph(srec_context* context)
1922 {
1923   FREE(context->arc_token_list);
1924   context->arc_token_list = 0;
1925   return FST_SUCCESS;
1926 }
1927 
1928 /*----------------------------------------------------------------------*
1929  *                                                                      *
1930  * general                                                              *
1931  *                                                                      *
1932  *----------------------------------------------------------------------*/
1933 
split_line(char * line,char ** av)1934 char split_line(char* line, char** av)
1935 {
1936   int i = 0;
1937   av[i] = strtok(line, "\n\r\t ");
1938   for (; av[i];)
1939   {
1940     av[++i] = strtok(NULL, "\n\r\t ");
1941   }
1942   return i;
1943 }
1944 
atoi_with_check(const char * buf,asr_int32_t mymax)1945 asr_int32_t atoi_with_check(const char* buf, asr_int32_t mymax)
1946 {
1947   asr_int32_t ans = atol(buf);
1948   ASSERT(ans < mymax);
1949   return ans;
1950 }
1951 
1952 /*----------------------------------------------------------------------*
1953  *                                                                      *
1954  * fst                                                                  *
1955  *                                                                      *
1956  *----------------------------------------------------------------------*/
1957 
fst_get_free_arc(srec_context * fst)1958 arcID fst_get_free_arc(srec_context* fst)
1959 {
1960   FSMarc* atoken;
1961   arcID atokid = fst->FSMarc_freelist;
1962   if (atokid == MAXarcID)
1963   {
1964     PLogError("error: ran out of arcs\n");
1965     return atokid;
1966   }
1967   atoken = &fst->FSMarc_list[atokid];
1968   fst->FSMarc_freelist = ARC_XtoI(atoken->linkl_next_arc);
1969   memset(atoken, 0, sizeof(FSMarc));
1970   atoken->to_node = FSMNODE_NULL;
1971   atoken->fr_node = FSMNODE_NULL;
1972   atoken->linkl_next_arc = FSMARC_NULL;
1973   atoken->linkl_prev_arc = FSMARC_NULL;
1974   atoken->ilabel = 0;
1975   atoken->olabel = 0;
1976   atoken->cost   = 0;
1977   fst->num_arcs++;
1978   return atokid;
1979 }
fst_get_free_node(srec_context * fst)1980 nodeID fst_get_free_node(srec_context* fst)
1981 {
1982   FSMnode* ntoken;
1983   nodeID ntokid = fst->FSMnode_freelist;
1984   if (ntokid == MAXnodeID)
1985   {
1986     PLogError("error: ran out of nodes\n");
1987     return ntokid;
1988   }
1989   ntoken = &fst->FSMnode_list[ntokid];
1990   fst->FSMnode_freelist = NODE_XtoI(ntoken->un_ptr.next_node);
1991   ntoken->un_ptr.first_next_arc = FSMARC_NULL;
1992   ntoken->first_prev_arc = FSMARC_NULL;
1993   /* ASSERT( (int)(ntok - fst->FSMnode_list) == fst->num_nodes); */
1994   fst->num_nodes++;
1995   return ntokid;
1996 }
1997 
fst_free_node(srec_context * fst,FSMnode * node)1998 int fst_free_node(srec_context* fst, FSMnode* node)
1999 {
2000   IF_DEBUG_WDADD(printf_node(fst, "freeing: ", node));
2001   node->first_prev_arc = FSMARC_NULL;
2002   node->un_ptr.next_node = fst->FSMnode_freelist;
2003   node->first_prev_arc = FSMARC_FREE;
2004   fst->FSMnode_freelist = (nodeID)(node - fst->FSMnode_list);
2005   /* fst->num_nodes--; not allowed unless we compress! */
2006   return 0;
2007 }
2008 
fst_free_arc(srec_context * fst,FSMarc * arc)2009 int fst_free_arc(srec_context* fst, FSMarc* arc)
2010 {
2011   IF_DEBUG_WDADD(printf_arc(fst, "freeing: ", arc));
2012   arc->linkl_prev_arc = FSMARC_FREE;
2013   arc->linkl_next_arc = ARC_ItoX(fst->FSMarc_freelist);
2014   arc->cost = 2;
2015   arc->ilabel = 0;
2016   arc->olabel = 0;
2017   IF_DEBUG_WDADD(arc->ilabel_str = 0);
2018   IF_DEBUG_WDADD(arc->olabel_str = 0);
2019   fst->FSMarc_freelist = (arcID)(arc - fst->FSMarc_list);
2020   fst->num_arcs--;
2021   return 0;
2022 }
2023 
fst_free_arc_net(srec_context * fst,FSMarc * arc)2024 int fst_free_arc_net(srec_context* fst, FSMarc* arc)
2025 {
2026   if (!arc)
2027     return 0;
2028   /* need to traverse the arc, get the 'from' nodes,
2029      then traverse the from nodes and kill arcs */
2030   /* we'll need this after an error condition */
2031   return 0;
2032 }
2033 
fst_push_arc_olabel(srec_context * fst,FSMarc * arc)2034 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc)
2035 {
2036   FSMarc_ptr atok;
2037   FSMarc* atoken;
2038   FSMnode_ptr ntok = arc->to_node;
2039 
2040   /* look for all arcs leaving this arc, and mark them
2041      normally there is only one, except for multipron words */
2042   for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2043   {
2044     atoken = ARC_XtoP(atok);
2045     /* cannot have multiple olabels before a boundary */
2046     if (atoken->olabel != WORD_EPSILON_LABEL)
2047       return FST_FAILED_INTERNAL;
2048     atoken->olabel = arc->olabel;
2049 #if DO_WEIGHTED_ADDWORD
2050     atoken->cost = (costdata)(atoken->cost + arc->cost);
2051 #endif
2052     IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2053   }
2054   /* unlabel this arc */
2055   arc->olabel = WORD_EPSILON_LABEL;
2056 #if DO_WEIGHTED_ADDWORD
2057   arc->cost = FREEcostdata;
2058 #endif
2059   IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2060   return FST_SUCCESS;
2061 }
2062 
2063 #if DO_WEIGHTED_ADDWORD
fst_push_arc_cost(srec_context * fst,FSMarc * arc)2064 int fst_push_arc_cost(srec_context* fst, FSMarc* arc)
2065 {
2066   FSMarc_ptr atok;
2067   FSMarc* atoken;
2068   FSMnode_ptr ntok = arc->to_node;
2069   /* look for all arcs leaving this arc, and push the cost thereto */
2070   for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2071   {
2072     atoken = ARC_XtoP(atok);
2073     atoken->cost = (costdata)(atoken->cost + arc->cost);
2074   }
2075   arc->cost = FREEcostdata;
2076   return FST_SUCCESS;
2077 }
2078 #endif
2079 
2080 
fst_pull_arc_olabel(srec_context * fst,FSMarc * arc)2081 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc)
2082 {
2083   FSMarc_ptr atok;
2084   FSMarc* atoken;
2085   FSMnode_ptr fr_node = arc->fr_node;
2086   FSMarc_ptr tmp_atok;
2087   FSMarc *tmp_atoken;
2088   FSMnode *anode;
2089 
2090   if (arc->olabel == WORD_EPSILON_LABEL)
2091     return FST_SUCCESS;
2092 
2093   /* can the olabel be pulled earlier? */
2094   for (atok = FIRST_PREV(fr_node); atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2095   {
2096     atoken = ARC_XtoP(atok);
2097     anode = NODE_XtoP(atoken->to_node);
2098 
2099     tmp_atok = anode->un_ptr.first_next_arc;
2100     if (tmp_atok != FSMARC_NULL)
2101     {
2102       tmp_atoken = ARC_XtoP(tmp_atok);
2103       if (tmp_atoken->linkl_next_arc != FSMARC_NULL)
2104       {
2105         return FST_CONTINUE;
2106       }
2107     }
2108   }
2109 
2110   /* look for all arcs arriving at this arc, and mark them */
2111   for (atok = FIRST_PREV(fr_node);atok != FSMARC_NULL;atok = atoken->linkl_prev_arc)
2112   {
2113     atoken = ARC_XtoP(atok);
2114     /* cannot have multiple olabels! serious internal error!? */
2115     if (atoken->olabel != WORD_EPSILON_LABEL)
2116     {
2117       PLogError("error: internal error, in fst_pull_arc_olabel()\n");
2118       return FST_FAILED_INTERNAL;
2119     }
2120     atoken->olabel = arc->olabel;
2121 #if DO_WEIGHTED_ADDWORD
2122     atoken->cost = arc->cost;
2123 #endif
2124     IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2125   }
2126   /* unlabel this arc */
2127   arc->olabel = WORD_EPSILON_LABEL;
2128 #if DO_WEIGHTED_ADDWORD
2129   arc->cost   = FREEcostdata;
2130 #endif
2131   IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2132   return FST_SUCCESS;
2133 }
2134 
find_next_arc_with_ilabel(srec_context * fst,FSMnode * node,labelID ilabel,FSMarc ** last)2135 static FSMarc* find_next_arc_with_ilabel(srec_context* fst, FSMnode* node, labelID ilabel, FSMarc** last)
2136 {
2137   FSMarc_ptr atok;
2138   FSMarc* atoken = NULL;
2139   for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2140   {
2141     atoken = ARC_XtoP(atok);
2142     if (atoken->ilabel == ilabel)
2143       return atoken;
2144   }
2145   *last = atoken; /* to this we may want to append a new arc */
2146   return NULL;
2147 }
2148 
find_prev_arc_with_iolabels(srec_context * fst,FSMnode * node,labelID ilabel,labelID olabel,FSMarc ** last)2149 FSMarc* find_prev_arc_with_iolabels(srec_context* fst, FSMnode* node, labelID ilabel, labelID olabel, FSMarc** last)
2150 {
2151   FSMarc_ptr atok;
2152   FSMarc* atoken = NULL;
2153   FSMarc_ptr tmp_atok;
2154   FSMarc* tmp_atoken;
2155 
2156   for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2157   {
2158     atoken = ARC_XtoP(atok);
2159     if (atoken->olabel == olabel && atoken->ilabel == ilabel)
2160     {
2161       tmp_atok = NODE_XtoP(atoken->fr_node)->un_ptr.first_next_arc;
2162       if (tmp_atok != FSMARC_NULL)
2163       {
2164         tmp_atoken = ARC_XtoP(tmp_atok);
2165 
2166         if (tmp_atoken->linkl_next_arc == FSMARC_NULL)
2167           return atoken;
2168       }
2169       else
2170         return atoken;
2171     }
2172 
2173   }
2174   if (last) *last = atoken;
2175   return NULL;
2176 }
2177 
num_arcs_leaving(srec_context * fst,FSMnode * node)2178 int num_arcs_leaving(srec_context* fst, FSMnode* node)
2179 {
2180   int num_arcs = 0;
2181   FSMarc_ptr atok;
2182   FSMarc* atoken;
2183   for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2184   {
2185     atoken = ARC_XtoP(atok);
2186     num_arcs++;
2187   }
2188   return num_arcs;
2189 }
2190 
num_arcs_arriving(srec_context * fst,FSMnode * node)2191 int num_arcs_arriving(srec_context* fst, FSMnode* node)
2192 {
2193   int num_arcs = 0;
2194   FSMarc_ptr atok;
2195   FSMarc* atoken;
2196   for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2197   {
2198     atoken = ARC_XtoP(atok);
2199     num_arcs++;
2200   }
2201   return num_arcs;
2202 }
2203 
num_arcs_arriving_gt_1(srec_context * fst,FSMnode * node)2204 PINLINE int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node)
2205 {
2206   FSMarc_ptr atok = node->first_prev_arc;
2207   FSMarc* atoken;
2208   if (atok == FSMARC_NULL)
2209     return 0;
2210   atoken = ARC_XtoP(atok);
2211   return atoken->linkl_prev_arc == FSMARC_NULL ? 0 : 1;
2212 }
2213 
remove_arc_arriving(srec_context * fst,FSMnode * to_node,FSMarc * arc)2214 static void remove_arc_arriving(srec_context* fst, FSMnode* to_node, FSMarc* arc)
2215 {
2216   FSMarc* atoken;
2217   FSMarc_ptr* atok;
2218   for (atok = &to_node->first_prev_arc;(*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2219   {
2220     atoken = ARC_XtoP(*atok);
2221     if (atoken == arc)
2222     {
2223       (*atok) = arc->linkl_prev_arc;
2224       return;
2225     }
2226   }
2227   ASSERT(0);
2228 }
2229 
split_node_for_arc(srec_context * fst,FSMarc * arc)2230 int split_node_for_arc(srec_context* fst, FSMarc* arc)
2231 {
2232   arcID new_arc_id;
2233   nodeID new_node_id;
2234   FSMnode* new_node;
2235   FSMnode* old_node;
2236   FSMarc_ptr atok, new_next, last_new_next;
2237   FSMarc *atoken, *new_next_p;
2238 
2239   IF_DEBUG_WDADD(printf_arc(fst, "spliting ... ", arc));
2240 
2241   new_node_id = fst_get_free_node(fst);
2242   if (new_node_id == MAXnodeID)
2243     return FST_FAILED_ON_MEMORY;
2244   new_node = &fst->FSMnode_list[ new_node_id];
2245   old_node = NODE_XtoP(arc->to_node);
2246   arc->to_node = NODE_ItoX(new_node_id);
2247   new_node->first_prev_arc = ARC_PtoX(arc);
2248   remove_arc_arriving(fst, old_node, arc);
2249   arc->linkl_prev_arc = FSMARC_NULL;
2250 
2251   last_new_next = FSMARC_NULL;
2252   for (atok = old_node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2253   {
2254     atoken = ARC_XtoP(atok);
2255     new_arc_id = fst_get_free_arc(fst);
2256     if (new_arc_id == MAXarcID)
2257       return FST_FAILED_ON_MEMORY;
2258     new_next = ARC_ItoX(new_arc_id);
2259     if (last_new_next != FSMARC_NULL)
2260       LINKL_NEXT(last_new_next) = new_next;
2261     else
2262       new_node->un_ptr.first_next_arc = new_next;
2263     new_next_p = ARC_XtoP(new_next);
2264     new_next_p->ilabel = atoken->ilabel;
2265     new_next_p->to_node = atoken->to_node;
2266     new_next_p->fr_node = arc->to_node;
2267     new_next_p->olabel = atoken->olabel;
2268     IF_DEBUG_WDADD(new_next_p->ilabel_str = atoken->ilabel_str);
2269     IF_DEBUG_WDADD(new_next_p->olabel_str = atoken->olabel_str);
2270     new_next_p->linkl_next_arc = FSMARC_NULL; /* set at next loop */
2271     append_arc_arriving_node(fst, NODE_XtoP(atoken->to_node), new_next);
2272     last_new_next = new_next;
2273   }
2274   return FST_SUCCESS;
2275 }
2276 
fst_add_arcs(srec_context * fst,nodeID start_node,nodeID end_node,wordID add_tree_olabel,costdata add_tree_cost,modelID * add_tree_start,int add_tree_len)2277 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
2278                  wordID add_tree_olabel, costdata add_tree_cost,
2279                  modelID* add_tree_start, int add_tree_len)
2280 {
2281   int rc = FST_SUCCESS;
2282   FSMarc_ptr atok = (FSMarc_ptr)0, last_atok = (FSMarc_ptr)0;
2283   FSMarc* atoken = NULL;
2284   FSMarc *next_atoken, *prev_atoken;
2285   FSMarc *append_to;
2286   FSMnode *late_start_node, *early_end_node;
2287   FSMnode *ntoken, *last_ntoken;
2288   FSMnode_ptr ntok, last_ntok;
2289   modelID *add_tree_end = add_tree_start + add_tree_len - 1;
2290   modelID *add_tree_late_start, *add_tree_early_end;
2291   modelID *add_tree;
2292   arcID new_arc_id, atokid;
2293   nodeID new_node_id, atokid_node;
2294   wordID add_tree_olabel_use = add_tree_olabel;
2295   wordID add_tree_cost_use   = add_tree_cost;
2296 
2297   append_to = NULL;
2298   add_tree = add_tree_start;
2299 #define FST_GROW_MINARCS  100
2300 #define FST_GROW_MINNODES 100
2301 #if defined(FST_GROW_FACTOR)
2302 
2303   /* make sure we have enough arcs available */
2304   if(fst->num_arcs + add_tree_len >= fst->FSMarc_list_len)
2305    {
2306 	FSMarc* tmp_FSMarc_list;
2307 	arcID tmp_FSMarc_list_len;
2308 	int itmp_FSMarc_list_len = fst->FSMarc_list_len*FST_GROW_FACTOR ;
2309 
2310 	if( itmp_FSMarc_list_len - fst->FSMarc_list_len < FST_GROW_MINARCS)
2311 		itmp_FSMarc_list_len += FST_GROW_MINARCS;
2312 
2313 	if( itmp_FSMarc_list_len - fst->FSMarc_list_len < add_tree_len)
2314 		itmp_FSMarc_list_len += add_tree_len;
2315 
2316     if(itmp_FSMarc_list_len >= (int)MAXarcID)
2317 	     return FST_FAILED_ON_MEMORY;
2318 
2319 	tmp_FSMarc_list_len = (arcID)itmp_FSMarc_list_len;
2320 
2321     tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(tmp_FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
2322       if(!tmp_FSMarc_list) {
2323 	PLogError("error: Failed to extend the memory for arcs\n");
2324         return FST_FAILED_INTERNAL;
2325       }
2326     memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
2327 
2328 	/* initialize the new free list.
2329     head of this new free list is tmp_FSMarc_list[ fst->FSMarc_list_len] */
2330 	for(atokid=fst->FSMarc_list_len; atokid<tmp_FSMarc_list_len-1; atokid++) {
2331 	   tmp_FSMarc_list[atokid].linkl_next_arc = ARC_ItoX(atokid+1);
2332        tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2333     }
2334     tmp_FSMarc_list[atokid].linkl_next_arc = FSMARC_NULL;
2335     tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2336 
2337     /* scan to the end of the old free list (to be re-used) */
2338     arcID* last_free_arc = &fst->FSMarc_freelist;
2339     atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
2340 
2341     for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
2342 	    atoken = &tmp_FSMarc_list[ *last_free_arc];
2343 
2344 	/* append the new free list to the current free list */
2345 	*last_free_arc = fst->FSMarc_list_len;
2346 
2347 	FREE( fst->FSMarc_list);
2348     fst->FSMarc_list = tmp_FSMarc_list;
2349     fst->FSMarc_list_len = tmp_FSMarc_list_len;
2350    }
2351 
2352    /* make sure we have enough nodes available */
2353    if(fst->num_nodes + add_tree_len >= fst->FSMnode_list_len)
2354    {
2355      FSMnode* tmp_FSMnode_list;
2356 	 nodeID tmp_FSMnode_list_len;
2357 	 FSMnode_info* tmp_FSMnode_info_list;
2358 	 int itmp_FSMnode_list_len = fst->FSMnode_list_len * FST_GROW_FACTOR ;
2359 
2360 	 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < FST_GROW_MINNODES)
2361 		itmp_FSMnode_list_len += FST_GROW_MINNODES;
2362 
2363 	 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < add_tree_len)
2364 		itmp_FSMnode_list_len += add_tree_len;
2365 
2366 	 if(itmp_FSMnode_list_len >= (int)MAXnodeID)
2367 	     return FST_FAILED_ON_MEMORY;
2368 
2369 	 tmp_FSMnode_list_len = (nodeID)itmp_FSMnode_list_len;
2370 
2371      tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(tmp_FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
2372      if(!tmp_FSMnode_list) {
2373        PLogError("ERROR: Failed to extend the memory for nodes\n");
2374        return FST_FAILED_INTERNAL;
2375      }
2376      memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
2377 
2378 	/* initialize the new free node list.
2379     head of this new free list is tmp_FSMnode_list[ fst->FSMnode_list_len] */
2380     for(atokid_node=fst->FSMnode_list_len; atokid_node<tmp_FSMnode_list_len-1; atokid_node++)
2381 	{
2382 	   tmp_FSMnode_list[atokid_node].un_ptr.next_node = NODE_ItoX(atokid_node + 1);
2383        tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2384     }
2385        tmp_FSMnode_list[atokid_node].un_ptr.next_node = FSMNODE_NULL;
2386        tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2387 
2388     /* scan to the end of the old free list (to be re-used) */
2389     nodeID* last_free_node = (&fst->FSMnode_freelist);
2390 	ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
2391 
2392 	for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
2393 		 ntoken = &tmp_FSMnode_list[ *last_free_node];
2394 
2395 	/* append the new free list to the current free list */
2396 	*last_free_node = fst->FSMnode_list_len;
2397 
2398 	FREE( fst->FSMnode_list);
2399 
2400 	tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR( tmp_FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
2401      if(!tmp_FSMnode_info_list) {
2402        PLogError("ERROR: Failed to extend the memory for node infos\n");
2403        return FST_FAILED_INTERNAL;
2404      }
2405 	// copy in old node info
2406 	memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
2407 
2408 	// initialize the new node info
2409 	for (atokid_node=fst->FSMnode_list_len; atokid_node < tmp_FSMnode_list_len; atokid_node++)
2410 		tmp_FSMnode_info_list[atokid_node] = NODE_INFO_UNKNOWN;
2411 
2412 	FREE( fst->FSMnode_info_list);
2413 	fst->FSMnode_info_list = tmp_FSMnode_info_list;
2414     fst->FSMnode_list = tmp_FSMnode_list;
2415     fst->FSMnode_list_len = tmp_FSMnode_list_len;
2416 }
2417 #endif
2418    late_start_node = &fst->FSMnode_list[start_node];
2419 
2420   while (1)
2421   {
2422 
2423     next_atoken = find_next_arc_with_ilabel(fst, late_start_node,
2424                                             *add_tree /*->ilabel*/,
2425                                             &append_to);
2426     if (next_atoken != NULL)
2427     {
2428       /* so next_atok->ilabel == add_tree->ilabel */
2429 
2430       if (next_atoken->ilabel == WORD_BOUNDARY && *add_tree/*->ilabel*/ == WORD_BOUNDARY)
2431       {
2432 	 next_atoken = NULL;
2433 	 break; /* break as if nothing more can be shared! */
2434 	// return FST_FAILED_ON_HOMONYM;
2435       }
2436 
2437       if (num_arcs_arriving_gt_1(fst, NODE_XtoP(next_atoken->to_node)))
2438       {
2439         split_node_for_arc(fst, next_atoken);
2440         /* unfortunate side effect here, that if we later find out this
2441            was a homonym addition, then this expansion was useless and
2442            for now we don't undo it! */
2443       }
2444 
2445       /* we shouldn't really push the olabel if it's the same for both! */
2446       if (next_atoken->olabel != add_tree_olabel_use)
2447       {
2448         if (next_atoken->olabel != WORD_EPSILON_LABEL)
2449         {
2450           if (fst_push_arc_olabel(fst, next_atoken) != FST_SUCCESS)
2451           {
2452             PLogError("error: internal error fst_push_arc_olabel()\n");
2453             return FST_FAILED_INTERNAL;
2454           }
2455         }
2456 #if DO_WEIGHTED_ADDWORD
2457         else
2458           fst_push_arc_cost(fst, next_atoken);
2459 #endif
2460       }
2461       else if (add_tree_olabel_use != WORD_EPSILON_LABEL)
2462       {
2463         /* the graph already has this word, so we just re-use the olabel
2464            and disable the setting of olabel down below */
2465         add_tree_olabel_use = WORD_EPSILON_LABEL;
2466 #if DO_WEIGHTED_ADDWORD
2467         add_tree_cost_use   = FREEcostdata;
2468 #endif
2469       }
2470       add_tree++;
2471       late_start_node = NODE_XtoP(next_atoken->to_node);
2472     }
2473     else
2474     {
2475       break;
2476     }
2477   }
2478 
2479   add_tree_late_start = add_tree;
2480   early_end_node = &fst->FSMnode_list[end_node];
2481 
2482   if (!do_minimize)
2483   {
2484     last_ntoken = NULL;
2485     last_ntok = NODE_PtoX(late_start_node);
2486     for (; add_tree <= add_tree_end; add_tree++)
2487     {
2488       new_arc_id = fst_get_free_arc(fst);
2489       if (new_arc_id == MAXarcID)
2490       {
2491         PLogError("fst_get_free_arc() failed\n");
2492         return FST_FAILED_ON_MEMORY;
2493       }
2494       atok = ARC_ItoX(new_arc_id);
2495       atoken = ARC_XtoP(atok);
2496       if (add_tree != add_tree_end)
2497       {
2498         new_node_id = fst_get_free_node(fst);
2499         if (new_node_id == MAXnodeID)
2500         {
2501           PLogError("fst_get_free_node() failed\n");
2502           return FST_FAILED_ON_MEMORY;
2503         }
2504         ntok = NODE_ItoX(new_node_id);
2505         ntoken = NODE_XtoP(ntok);
2506         ntoken->first_prev_arc = atok;
2507       }
2508       else
2509       {
2510         ntok = FSMNODE_NULL;
2511         ntoken = NULL;
2512       }
2513       atoken->fr_node = last_ntok; /* was NODE_PtoX(late_start_node) */
2514       atoken->to_node = ntok;
2515       atoken->ilabel = *add_tree;
2516       atoken->linkl_next_arc = FSMARC_NULL;
2517       atoken->linkl_prev_arc = FSMARC_NULL;
2518       IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2519       if (!last_ntoken)
2520       {
2521         append_to->linkl_next_arc = atok;
2522         atoken->olabel = add_tree_olabel_use;
2523 #if DO_WEIGHTED_ADDWORD
2524         atoken->cost  =  add_tree_cost_use;
2525 #endif
2526         IF_DEBUG_WDADD(atok->olabel_str = fst->olabels->words[ atoken->olabel]);
2527       }
2528       else
2529       {
2530         last_ntoken->un_ptr.first_next_arc = atok;
2531       }
2532       last_ntoken = ntoken;
2533       last_ntok   = ntok;
2534     }
2535     /*  add_tree_end->to_node = early_end_node;
2536     add_tree_end->linkl_next_arc = FSMARC_NULL;
2537     atok = last_arc_arriving( early_end_node);
2538     atok->linkl_prev_arc = add_tree_end;
2539     add_tree_end->linkl_prev_arc = FSMARC_NULL; */
2540     atoken->to_node = NODE_ItoX(end_node);
2541     append_arc_arriving_node(fst, &fst->FSMnode_list[end_node], atok);
2542   }
2543 
2544   else /* ie. do_minimize from the rear */
2545   {
2546 
2547     add_tree = add_tree_end;
2548     new_arc_id = fst_get_free_arc(fst);
2549     if (new_arc_id == MAXarcID)
2550       return FST_FAILED_ON_MEMORY;
2551     atok = ARC_ItoX(new_arc_id);
2552     atoken = ARC_XtoP(atok);
2553     atoken->olabel = add_tree_olabel_use;
2554 #if DO_WEIGHTED_ADDWORD
2555     atoken->cost   = add_tree_cost_use;
2556 #endif
2557     atoken->ilabel = *add_tree_late_start;
2558     atoken->fr_node = NODE_PtoX(late_start_node);
2559     if (atoken->ilabel == WORD_BOUNDARY)
2560     {
2561       atoken->cost = (costdata)(atoken->cost + fst->wtw_average);
2562     }
2563     else
2564     {
2565       atoken->cost   = add_tree_cost_use + fst->wtw_average;
2566       add_tree_cost_use = FREEcostdata;
2567     }
2568     IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[ atoken->olabel]);
2569     IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2570     append_arc_leaving_node(fst, late_start_node, atok);
2571 
2572     last_atok = atok;
2573     while (1)
2574     {
2575 
2576       if (add_tree == add_tree_late_start)
2577         break;
2578       /* if there are other arcs leaving this node then joining
2579       earlier than here will result in over-generation, so don't
2580       if( atok!=end_arc && atok->linkl_next_arc != FSMARC_NULL)
2581       break;
2582       */
2583       /*
2584       also need boundary conditions checker here
2585       */
2586 
2587       /* */
2588       IF_DEBUG_WDADD(printf_node(fst, early_end_node));
2589 
2590       /* if( num_arcs_leaving( add_tree_end->fr_node->first_prev_arc->to_node) >1) {
2591       printf("merge stopped due to num_arcs leaving ..\n");
2592       printf_arc(fst, add_tree_end->fr_node->first_prev_arc->to_node->first_next_arc);
2593       break;
2594       } */
2595 
2596       for (atok = early_end_node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2597       {
2598         atoken = ARC_XtoP(atok);
2599 	/* never pull before the slot marker */
2600 	if(atoken->cost == DISABLE_ARC_COST) break;
2601 	if(atoken->olabel>0 && atoken->olabel<fst->olabels->num_slots) break;
2602         fst_pull_arc_olabel(fst, atoken); /* fails are ok */
2603       }
2604 
2605       prev_atoken = find_prev_arc_with_iolabels(fst, early_end_node,
2606                     *add_tree /*->ilabel*/,
2607                     WORD_EPSILON_LABEL,
2608                     NULL /*&append_to*/);
2609       if (!prev_atoken)
2610         break;
2611       else
2612       {
2613         /* this check is now inside find_prev_arc_with_iolabels */
2614         /* if( num_arcs_leaving(prev_atok->fr_node->first_prev_arc->to_node->first_next_arc)>1)*/
2615 
2616         /* now look to merge earlier still */
2617         early_end_node = NODE_XtoP(prev_atoken->fr_node);
2618         add_tree--;
2619         if (add_tree == add_tree_late_start)
2620           break;
2621       }
2622     }
2623     add_tree_early_end = add_tree;
2624     for (add_tree = add_tree_late_start + 1; add_tree <= add_tree_early_end; add_tree++)
2625     {
2626       new_node_id = fst_get_free_node(fst);
2627       if (new_node_id == MAXnodeID)
2628         return FST_FAILED_ON_MEMORY;
2629       ntok = NODE_ItoX(new_node_id);
2630       ntoken = NODE_XtoP(ntok);
2631       new_arc_id = fst_get_free_arc(fst);
2632       if (new_arc_id == MAXarcID)
2633         return FST_FAILED_ON_MEMORY;
2634       atok = ARC_ItoX(new_arc_id);
2635       atoken = ARC_XtoP(atok);
2636       atoken->ilabel = *add_tree;
2637 #if DO_WEIGHTED_ADDWORD
2638       atoken->cost   = FREEcostdata; /* idea: distribute add_tree_cost here;*/
2639 #endif
2640       atoken->olabel = WORD_EPSILON_LABEL;
2641       atoken->fr_node = ntok;
2642       atoken->to_node = FSMNODE_NULL; /* filled in next loop */
2643       IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[atoken->olabel]);
2644       IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[atoken->ilabel]);
2645 
2646       ntoken->un_ptr.first_next_arc = atok;
2647       ntoken->first_prev_arc = last_atok;
2648       TO_NODE(last_atok) = ntok;
2649       last_atok = atok;
2650     }
2651     TO_NODE(last_atok) = NODE_PtoX(early_end_node);
2652     append_arc_arriving_node(fst, early_end_node, last_atok);
2653   }
2654   return rc;
2655 }
2656 
2657 
append_arc_leaving_node(srec_context * fst,FSMnode * fr_node,FSMarc_ptr arc)2658 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr arc)
2659 {
2660   FSMarc_ptr* atok = &fr_node->un_ptr.first_next_arc;
2661   FSMarc* atoken = ARC_XtoP(*atok);
2662   for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_next_arc)
2663   {
2664     atoken = ARC_XtoP(*atok);
2665   }
2666   *atok = arc;
2667   LINKL_NEXT(arc) = FSMARC_NULL;
2668 }
append_arc_arriving_node(srec_context * fst,FSMnode * to_node,FSMarc_ptr arc)2669 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr arc)
2670 {
2671   FSMarc_ptr* atok = &to_node->first_prev_arc;
2672   FSMarc* atoken = ARC_XtoP(*atok);
2673   for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2674   {
2675     atoken = ARC_XtoP(*atok);
2676   }
2677   *atok = arc;
2678   LINKL_PREV(arc) = FSMARC_NULL;
2679 }
2680 
2681 
printf_node1(srec_context * fst,FSMnode * node)2682 int printf_node1(srec_context* fst, FSMnode* node)
2683 {
2684   return 0;
2685 }
printf_arc1(srec_context * fst,char * msg,FSMarc * arc)2686 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc)
2687 {
2688   char buffer[MAX_LINE_LENGTH];
2689   sprintf_arc(buffer, fst, arc);
2690   printf("%s%s\n", msg, buffer);
2691   return 0;
2692 }
sprintf_arc(char * buf,srec_context * fst,FSMarc * arc)2693 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc)
2694 {
2695   int rc;
2696   FSMnode* to_node = NODE_XtoP(arc->to_node);
2697   arcID arc_index = (arcID)(arc - fst->FSMarc_list);
2698   if (to_node->un_ptr.first_next_arc == FSMARC_NULL)
2699     rc = sprintf(buf, "arc%hu\n", arc_index);
2700   else
2701   {
2702     rc = sprintf(buf, "arc%hu\t%hu,%hu\t%s\t%s\t%hu\n",
2703                  arc_index,
2704                  ARC_XtoI(to_node->un_ptr.first_next_arc),
2705                  arc->linkl_next_arc != FSMARC_NULL ? ARC_XtoI(arc->linkl_next_arc) : -1,
2706                  fst->ilabels->words[arc->ilabel],
2707                  fst->olabels->words[arc->olabel],
2708                  arc->cost);
2709   }
2710   return rc;
2711 }
2712 
2713 /* dumps the recognition context as a binary file,
2714    it will also store any empty space for growing */
2715 
2716 #define ENCODE(SIGN,TYP) { SIGN+=sizeof(TYP); SIGN=SIGN<<3; ASSERT((shifted+=3)<32); }
2717 
FST_sizes_signature()2718 asr_int32_t FST_sizes_signature()
2719 {
2720 #ifndef NDEBUG
2721   int shifted = 0;
2722 #endif
2723   asr_int32_t signature = 0;
2724   ENCODE(signature, arcID);
2725   ENCODE(signature, nodeID);
2726   ENCODE(signature, wordID);
2727   ENCODE(signature, labelID);
2728   ENCODE(signature, costdata);
2729   return signature;
2730 }
2731 
FST_ContextImageSize(srec_context * context)2732 int FST_ContextImageSize(srec_context* context)
2733 {
2734   int size = 0;
2735   size += sizeof(srec_context);
2736   size += sizeof(wordmap);
2737   size += sizeof(char*) * context->olabels->max_words;
2738   size += sizeof(char) * context->olabels->max_chars;
2739   size += sizeof(wordmap);
2740   if (context->ilabels->words != NULL)
2741     size += sizeof(char*) * context->olabels->max_words;
2742   if (context->ilabels->chars != NULL)
2743     size += sizeof(char) * context->olabels->max_chars;
2744   size += sizeof(FSMarc) * context->FSMarc_list_len;
2745   size += sizeof(FSMnode) * context->FSMnode_list_len;
2746   size += sizeof(FSMnode_info) * context->FSMnode_list_len;
2747   size += sizeof(arc_token) * context->arc_token_list_len;
2748   return size;
2749 }
2750 
serializeWordMapV2(wordmap * wordmap,PFile * fp)2751 ESR_ReturnCode serializeWordMapV2(wordmap *wordmap, PFile* fp)
2752 {
2753   unsigned int i = 0;
2754   unsigned int nfields;
2755   unsigned int tmp2[32];
2756 
2757   i = 0;
2758   tmp2[i++] = wordmap->num_words;
2759   tmp2[i++] = wordmap->num_slots;
2760   tmp2[i++] = wordmap->max_words;
2761   tmp2[i++] = wordmap->num_base_words;
2762   tmp2[i++] = wordmap->max_chars;
2763   tmp2[i++] = (wordmap->next_chars - wordmap->chars);
2764   tmp2[i++] = (wordmap->next_base_chars - wordmap->chars);
2765   nfields = i;
2766   if (pfwrite(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2767     return ESR_WRITE_ERROR;
2768 
2769   if (pfwrite(wordmap->chars, sizeof(char), wordmap->max_chars, fp) != (size_t)wordmap->max_chars)
2770     return ESR_WRITE_ERROR;
2771 
2772   return ESR_SUCCESS;
2773 }
2774 
deserializeWordMapV2(wordmap ** pwordmap,PFile * fp)2775 ESR_ReturnCode deserializeWordMapV2(wordmap **pwordmap, PFile* fp)
2776 {
2777   unsigned int i = 0;
2778   unsigned int nfields;
2779   unsigned int tmp2[32];
2780   wordmap *awordmap;
2781   char *p;
2782   ESR_ReturnCode rc = ESR_SUCCESS;
2783   unsigned int next_chars_idx, next_base_chars_idx;
2784 
2785   awordmap = NEW(wordmap, L("srec.g2g.graph.wordmap.base"));
2786   if (awordmap == NULL)
2787   {
2788     PLogError("NEW failed on srec.g2g.graph.wordmap.base\n");
2789     rc = ESR_OUT_OF_MEMORY;
2790     goto CLEANUP;
2791   }
2792   awordmap->wordIDForWord = NULL; // early break to cleanup needs this
2793 
2794   nfields = 7;
2795   if (pfread(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2796   {
2797     PLogError("pfread failed when reading nfields\n");
2798     rc = ESR_READ_ERROR;
2799     goto CLEANUP;
2800   }
2801 
2802   i = 0;
2803   awordmap->num_words      = (wordID)tmp2[i++];
2804   awordmap->num_slots      = (wordID)tmp2[i++];
2805   awordmap->max_words      = (wordID)tmp2[i++];
2806   awordmap->num_base_words = (wordID)tmp2[i++];
2807   awordmap->max_chars      = tmp2[i++];
2808   next_chars_idx           = tmp2[i++];
2809   next_base_chars_idx      = tmp2[i++];
2810   ASSERT(nfields == i);
2811 
2812   awordmap->words = NEW_ARRAY(char*, awordmap->max_words, L("srec.g2g.graph.wordmap.words"));
2813   if (awordmap->words == NULL)
2814   {
2815     PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.words %d\n", awordmap->max_words);
2816     rc = ESR_OUT_OF_MEMORY;
2817     goto CLEANUP;
2818   }
2819   awordmap->chars = NEW_ARRAY(char, awordmap->max_chars, L("srec.g2g.graph.wordmap.chars"));
2820   if (awordmap->chars == NULL)
2821   {
2822     PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.chars %d\n", awordmap->max_chars);
2823     rc = ESR_OUT_OF_MEMORY;
2824     goto CLEANUP;
2825   }
2826   awordmap->next_chars = awordmap->chars + next_chars_idx;
2827   awordmap->next_base_chars = awordmap->chars + next_base_chars_idx;
2828 
2829   if ((i=pfread(awordmap->chars, sizeof(char), awordmap->max_chars, fp)) != (size_t)awordmap->max_chars)
2830   {
2831     PLogError("pfread failed while reading %d chars\n", awordmap->max_chars);
2832     rc = ESR_READ_ERROR;
2833     goto CLEANUP;
2834   }
2835 
2836 
2837   p = awordmap->chars;
2838   ASSERT((uintptr_t)p % 2 == 0);
2839   nfields = 0;
2840   if (nfields < awordmap->num_words)
2841     awordmap->words[nfields++] = p;
2842   for (; p < awordmap->next_chars; p++) // was next_base_chars
2843   {
2844     if ( ( *p ) == '\0' )
2845     {
2846       if (nfields == awordmap->num_words) // was num_base_words
2847         break;
2848       if (((uintptr_t)p) % 2 == 0) p++; /* so that words begin on even byte bound */
2849       awordmap->words[nfields++] = p + 1;
2850     }
2851   }
2852   ASSERT(nfields == awordmap->num_words); // was num_base_words
2853 
2854   if (awordmap->max_words >= awordmap->num_base_words)
2855     {
2856       PHashTableArgs hashArgs;
2857       hashArgs.capacity = awordmap->max_words;
2858       if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
2859       hashArgs.compFunction = HashCmpWord; //PHASH_TABLE_DEFAULT_COMP_FUNCTION;
2860       hashArgs.hashFunction = HashGetCode; //PHASH_TABLE_DEFAULT_HASH_FUNCTION;
2861       hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
2862       CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.deserializeWordMap()"), &awordmap->wordIDForWord));
2863 
2864       rc = wordmap_populate ( awordmap, awordmap->num_words );
2865 
2866       if (rc != ESR_SUCCESS)
2867 	{
2868         wordmap_clean ( awordmap );
2869         goto CLEANUP;
2870       }
2871     }
2872   else
2873     {
2874       awordmap->wordIDForWord = NULL;
2875     }
2876 
2877   /* success */
2878   *pwordmap = awordmap;
2879   return ESR_SUCCESS;
2880 
2881  CLEANUP:
2882   if (awordmap != NULL)
2883   {
2884     if (awordmap->wordIDForWord != NULL)
2885       PHashTableDestroy(awordmap->wordIDForWord);
2886     if (awordmap->words != NULL) FREE(awordmap->words);
2887     if (awordmap->chars != NULL) FREE(awordmap->chars);
2888     FREE(awordmap);
2889   }
2890   return rc;
2891 }
2892 
serializeArcToken(arc_token * token_base,int i,PFile * fp)2893 static ESR_ReturnCode serializeArcToken(arc_token *token_base,
2894                                         int i,
2895                                         PFile* fp)
2896 {
2897   arc_token *token = token_base + i;
2898   asr_uint32_t idx;
2899 
2900   if (pfwrite(&token->ilabel, 2, 1, fp) != 1)
2901     return ESR_WRITE_ERROR;
2902 
2903   if (pfwrite(&token->olabel, 2, 1, fp) != 1)
2904     return ESR_WRITE_ERROR;
2905 
2906   /* if (pfwrite(&token->cost, 2, 1, fp) != 1)
2907      return ESR_WRITE_ERROR; */
2908   idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->first_next_arc), token_base);
2909 
2910   if (pfwrite(&idx, 4, 1, fp) != 1)
2911     return ESR_WRITE_ERROR;
2912 
2913   idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->next_token_index), token_base);
2914 
2915   if (pfwrite(&idx, 4, 1, fp) != 1)
2916     return ESR_WRITE_ERROR;
2917 
2918   return ESR_SUCCESS;
2919 }
2920 
serializeArcTokenInfo(srec_context * context,PFile * fp)2921 static ESR_ReturnCode serializeArcTokenInfo(srec_context *context,
2922     PFile* fp)
2923 {
2924   int i;
2925   asr_uint32_t idx;
2926   ESR_ReturnCode rc;
2927 
2928   if (pfwrite(&context->arc_token_list_len, 2, 1, fp) != 1)
2929     return ESR_WRITE_ERROR;
2930 
2931   idx = PTR_TO_IDX(context->arc_token_freelist, context->arc_token_list);
2932 
2933   if (pfwrite(&idx, 4, 1, fp) != 1)
2934     return ESR_WRITE_ERROR;
2935 
2936   idx = PTR_TO_IDX(context->arc_token_insert_start, context->arc_token_list);
2937 
2938   if (pfwrite(&idx, 4, 1, fp) != 1)
2939     return ESR_WRITE_ERROR;
2940 
2941   for (i = 0; i < context->arc_token_list_len; ++i)
2942   {
2943     rc = serializeArcToken(context->arc_token_list, i, fp);
2944     if (rc != ESR_SUCCESS) return rc;
2945   }
2946   return ESR_SUCCESS;
2947 }
2948 
deserializeArcToken(arc_token * token_base,int i,PFile * fp)2949 static ESR_ReturnCode deserializeArcToken(arc_token *token_base,
2950     int i,
2951     PFile* fp)
2952 {
2953   arc_token *token = token_base + i;
2954   asr_uint32_t idx[2];
2955   asr_uint16_t labels[2];
2956 
2957   if (pfread(labels, 2, 2, fp) != 2)
2958     return ESR_READ_ERROR;
2959 
2960   token->ilabel = labels[0];
2961   token->olabel = labels[1];
2962 
2963   /* if (pfread(&token->cost, 2, 1, fp) != 1)
2964      return ESR_READ_ERROR; */
2965 
2966   if (pfread(idx, 4, 2, fp) != 2)
2967     return ESR_READ_ERROR;
2968 
2969   token->first_next_arc = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[0], token_base));
2970   token->next_token_index = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[1], token_base));
2971 
2972   return ESR_SUCCESS;
2973 }
2974 
deserializeArcTokenInfo(srec_context * context,PFile * fp)2975 static ESR_ReturnCode deserializeArcTokenInfo(srec_context *context,
2976     PFile* fp)
2977 {
2978   ESR_ReturnCode rc;
2979   int i;
2980   asr_uint32_t idx;
2981 
2982   if (pfread(&context->arc_token_list_len, 2, 1, fp) != 1) {
2983     PLogError("pfread failed in deserializeArcTokenInfo()\n");
2984     return ESR_READ_ERROR;
2985   }
2986 
2987   context->arc_token_list = NEW_ARRAY(arc_token, context->arc_token_list_len,
2988                                       L("srec.graph.wordgraph"));
2989 
2990   if (context->arc_token_list == NULL)
2991     return ESR_OUT_OF_MEMORY;
2992 
2993   if (pfread(&idx, 4, 1, fp) != 1)
2994   {
2995     rc = ESR_READ_ERROR;
2996     goto CLEANUP;
2997   }
2998 
2999   context->arc_token_freelist =
3000     IDX_TO_PTR(idx, context->arc_token_list);
3001 
3002   if (pfread(&idx, 4, 1, fp) != 1)
3003   {
3004     rc = ESR_READ_ERROR;
3005     goto CLEANUP;
3006   }
3007 
3008   context->arc_token_insert_start =
3009     IDX_TO_PTR(idx, context->arc_token_list);
3010 
3011   for (i = 0; i < context->arc_token_list_len; ++i)
3012   {
3013     rc = deserializeArcToken(context->arc_token_list, i, fp);
3014     if (rc != ESR_SUCCESS) goto CLEANUP;
3015   }
3016   return ESR_SUCCESS;
3017 
3018 CLEANUP:
3019   FREE(context->arc_token_list);
3020   context->arc_token_list =
3021     context->arc_token_freelist =
3022       context->arc_token_insert_start = NULL;
3023   return rc;
3024 }
3025 
FST_GetGrammarType(srec_context * context)3026 int FST_GetGrammarType(srec_context* context)
3027 {
3028   arc_token *t, *u;
3029   wordID expected_wdid;
3030   /* 0 1 0
3031      1 2 4
3032      ...
3033      1 2 1316
3034      2
3035   */
3036   t = context->arc_token_list;
3037   if (t->ilabel != WORD_EPSILON_LABEL)
3038     return GrammarTypeBNF;
3039   if (t->next_token_index)
3040     return GrammarTypeBNF;
3041   t = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3042   expected_wdid = NUM_ITEMLIST_HDRWDS;
3043   for (; t; t = ARC_TOKEN_PTR(context->arc_token_list, t->next_token_index))
3044   {
3045     if (t->ilabel != expected_wdid)
3046       return GrammarTypeBNF;
3047     u = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3048     if (u != NULL && u->ilabel != MAXwordID)
3049       return GrammarTypeBNF;
3050     expected_wdid++;
3051   }
3052   if (expected_wdid != context->olabels->num_words)
3053     return GrammarTypeBNF;
3054   return GrammarTypeItemList;
3055 }
3056 
FST_DumpContextAsImageV2(srec_context * context,PFile * fp)3057 int FST_DumpContextAsImageV2(srec_context* context, PFile* fp)
3058 {
3059   asr_uint32_t header[4];
3060   arcID tmp[32], i, j, nfields;
3061   FSMarc* arc;
3062   ESR_ReturnCode rc;
3063 #if !defined(DO_ARCS_IO_IN_ARC_ORDER)
3064   FSMnode* node;
3065   arcID arcid, num_arcs, num_arcs_written;
3066 #endif
3067 
3068   /* Write header information. */
3069   header[0] = 0;
3070   header[1] = IMAGE_FORMAT_V2;
3071   header[2] = context->modelid;
3072   header[3] = context->grmtyp;
3073 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3074   PLogMessage("FST_DumpContextAsImageV2() fstgrmtyp %d", context->grmtyp);
3075 #endif
3076 
3077   if (pfwrite(header, sizeof(header[0]), 4, fp) != 4)
3078   {
3079     PLogError("FST_DumpContextAsImage: Could not write header.\n");
3080     return FST_FAILED_INTERNAL;
3081   }
3082 
3083   /* arcs and nodes */
3084   i = 0;
3085   tmp[i++] = context->num_arcs;
3086   tmp[i++] = context->FSMarc_list_len;
3087   tmp[i++] = context->num_base_arcs;
3088   tmp[i++] = context->FSMarc_freelist;
3089   tmp[i++] = context->max_searchable_arcs;
3090 
3091   tmp[i++] = context->num_nodes;
3092   tmp[i++] = context->FSMnode_list_len;
3093   tmp[i++] = context->num_base_nodes;
3094   tmp[i++] = context->FSMnode_freelist;
3095   tmp[i++] = context->start_node;
3096   tmp[i++] = context->end_node;
3097   tmp[i++] = context->max_searchable_nodes;
3098 
3099   tmp[i++] = context->beg_silence_word;
3100   tmp[i++] = context->end_silence_word;
3101   tmp[i++] = context->hack_silence_word;
3102   tmp[i++] = context->hmm_ilabel_offset;
3103 
3104   nfields = i;
3105 
3106   if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3107     return ESR_WRITE_ERROR;
3108 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3109   PLogMessage("G2G done WR hdrs %d", pftell(fp));
3110 #endif
3111 
3112 #if defined(DO_ARCS_IO_IN_ARC_ORDER)
3113   if( 1) {
3114 	 i = 0;
3115      tmp[i++] = context->end_node;
3116      tmp[i++] = MAXnodeID;
3117      tmp[i++] = MAXmodelID;
3118      tmp[i++] = MAXwordID;
3119      tmp[i++] = FREEcostdata;
3120      nfields = i;
3121      if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3122         return ESR_WRITE_ERROR;
3123   }
3124   for(j=0; j<context->num_arcs; j++) {
3125         arc = &context->FSMarc_list[j];
3126         i = 0;
3127         tmp[i++] = arc->fr_node;
3128         tmp[i++] = arc->to_node;
3129 	ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3130         tmp[i++] = arc->ilabel;
3131         tmp[i++] = arc->olabel;
3132         tmp[i++] = arc->cost;
3133         nfields = i;
3134         if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3135           return ESR_WRITE_ERROR;
3136   }
3137 #else
3138   num_arcs_written = 0;
3139   for (j = 0; j < context->num_nodes; j++) {
3140     node = &context->FSMnode_list[j];
3141     num_arcs = (arcID)num_arcs_leaving(context, node);
3142     arcid = node->un_ptr.first_next_arc;
3143     if (arcid == MAXarcID) {
3144       i = 0;
3145       tmp[i++] = (arcID)j;
3146       tmp[i++] = MAXarcID;
3147       tmp[i++] = MAXarcID;
3148       tmp[i++] = MAXarcID;
3149       tmp[i++] = MAXarcID;
3150       nfields = i;
3151       if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3152         return ESR_WRITE_ERROR;
3153       /* num_arcs_written++; end node don't have a to-arc */
3154     }
3155     else {
3156       for (; arcid != MAXarcID; arcid = arc->linkl_next_arc) {
3157         arc = &context->FSMarc_list[arcid];
3158         i = 0;
3159         tmp[i++] = (arcID)j;
3160         tmp[i++] = arc->to_node;
3161 	ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3162         tmp[i++] = arc->ilabel;
3163         tmp[i++] = arc->olabel;
3164         tmp[i++] = arc->cost;
3165         nfields = i;
3166         if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3167           return ESR_WRITE_ERROR;
3168         num_arcs_written++;
3169       }
3170     }
3171   }
3172   ASSERT(num_arcs_written == context->num_arcs);
3173 #endif
3174 
3175 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3176   PLogMessage("G2G done WR arcs %d", pftell(fp));
3177 #endif
3178 
3179   /* node info   .. will now be calculated on line */
3180   /* wrapup_cost .. will now be calculated on line */
3181 
3182   /* exit points for slots */
3183   if (1)
3184   {
3185     srec_fsm_exit_point *p, *q;
3186     if (pfwrite(&context->num_fsm_exit_points, 2, 1, fp) != 1)
3187       return ESR_WRITE_ERROR;
3188     p = context->fsm_exit_points;
3189     q = p + MAX_NUM_SLOTS;
3190     while (p < q)
3191     {
3192       if (pfwrite(&p->from_node_index, 2, 1, fp) != 1)
3193         return ESR_WRITE_ERROR;
3194       if (pfwrite(&p->arc_index, 2, 1, fp) != 1)
3195         return ESR_WRITE_ERROR;
3196       if (pfwrite(&p->wbto_node_index, 2, 1, fp) != 1)
3197         return ESR_WRITE_ERROR;
3198       ++p;
3199     }
3200   }
3201 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3202   PLogMessage("G2G done WR exits %d", pftell(fp));
3203 #endif
3204   /* no entry points stuff */
3205 
3206   /* no saving ilabels */
3207 
3208   /* now load the ilabels */
3209   rc = serializeWordMapV2(context->olabels, fp);
3210   if (rc != ESR_SUCCESS)
3211     return rc;
3212 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3213   PLogMessage("G2G done WR wmap %d", pftell(fp));
3214 #endif
3215 
3216   if (context->grmtyp != GrammarTypeItemList)
3217   {
3218     if (serializeArcTokenInfo(context, fp) != ESR_SUCCESS)
3219     {
3220       PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3221       return FST_FAILED_INTERNAL;
3222     }
3223   }
3224   else
3225   {
3226     /* nothing, at decoding time, we'll just construct the graph
3227        from the word list, ie just this ..
3228        0 1 0
3229        1 2 4
3230        ...
3231        1 2 1316
3232        2
3233        ie
3234        0 1 0
3235        for(i=4;i<nwords;i++) { "1 2 i" }
3236        2
3237        because words 0-3 are eps,-pau-,-pau2-,slot@grammarname.grxml
3238     */
3239   }
3240 
3241   /* Write header information. */
3242   header[0] = pftell(fp);
3243   header[1] = IMAGE_FORMAT_V2;
3244 
3245   /* goto beginning of file. */
3246   if (pfseek(fp, 0, SEEK_SET))
3247   {
3248     PLogError("FST_DumpContextAsImage: could not reposition for header.\n");
3249     return FST_FAILED_INTERNAL;
3250   }
3251 
3252   if (pfwrite(header, 4, 2, fp) != 2)
3253   {
3254     PLogError("FST_DumpContextAsImage: Could not write header.\n");
3255     return FST_FAILED_INTERNAL;
3256   }
3257 
3258   /* reset pointer at end of file. */
3259   if (pfseek(fp, 0, SEEK_END))
3260   {
3261     PLogError("FST_DumpContextAsImage: could not reposition file pointer at end.\n");
3262     return FST_FAILED_INTERNAL;
3263   }
3264 
3265   return FST_SUCCESS;
3266 }
3267 
3268 
FST_LoadContextFromImageV2(srec_context * fst,PFile * fp)3269 int FST_LoadContextFromImageV2(srec_context* fst, PFile* fp)
3270 {
3271   asr_int32_t header[6];
3272   arcID tmp[32], num_arcs, new_arc_id;
3273   unsigned int i, nfields;
3274   srec_fsm_exit_point *p, *q;
3275   arcID max_num_FSMarcs;
3276   nodeID max_num_FSMnodes;
3277   FSMarc_ptr atok =  (FSMarc_ptr)0;
3278   FSMarc *atoken = NULL;
3279   FSMnode *fr_node, *to_node;
3280   int rc = FST_SUCCESS;
3281   ESR_ReturnCode esr_rc;
3282   ESR_BOOL seenFinalNode = ESR_FALSE;
3283 
3284   /* read header information. */
3285   if (pfread(&header[2], sizeof(header[0]), 2, fp) != 2)
3286   {
3287     PLogError("FST_DumpContextAsImage: Could not read header.\n");
3288     return FST_FAILED_INTERNAL;
3289   }
3290   fst->modelid = header[2];
3291   fst->grmtyp  = header[3];
3292 
3293   nfields = 16;
3294   if (pfread(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3295     return ESR_READ_ERROR;
3296 
3297   /* arcs and nodes */
3298   i = 0;
3299   fst->num_arcs             = max_num_FSMarcs = tmp[i++];
3300   fst->FSMarc_list_len      = tmp[i++];
3301   fst->num_base_arcs        = tmp[i++];
3302   fst->FSMarc_freelist      = tmp[i++];
3303   fst->max_searchable_arcs  = tmp[i++] * 0; /*5*/
3304   fst->num_nodes            = max_num_FSMnodes = tmp[i++];
3305   fst->FSMnode_list_len     = tmp[i++];
3306   fst->num_base_nodes       = tmp[i++];
3307   fst->FSMnode_freelist     = tmp[i++];
3308   fst->start_node           = tmp[i++];/*10*/
3309   fst->end_node             = tmp[i++];
3310   fst->max_searchable_nodes = tmp[i++] * 0;
3311   fst->beg_silence_word     = tmp[i++];
3312   fst->end_silence_word     = tmp[i++];
3313   fst->hack_silence_word    = tmp[i++]; /*15*/
3314   fst->hmm_ilabel_offset    = tmp[i++]; /* 16 */
3315 
3316   fst->addWordCaching_lastslot_name = 0;
3317   fst->addWordCaching_lastslot_num = MAXwordID;
3318   fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
3319 
3320   ASSERT(i == nfields);
3321 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3322   PLogMessage("G2G done RD hdrs %d", pftell(fp));
3323 #endif
3324 
3325   ASSERT(fst->num_arcs >= fst->num_base_arcs); // was ==
3326   ASSERT(fst->num_nodes >= fst->num_base_nodes); // was ==
3327 
3328   fst->FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
3329   if (!fst->FSMarc_list)
3330   {
3331     rc = FST_FAILED_ON_MEMORY;
3332     PLogError("CALLOC_CLR fst->FSMarc_list \n");
3333     goto CLEANUP;
3334   }
3335   fst->FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
3336   if (!fst->FSMnode_list)
3337   {
3338     rc = FST_FAILED_ON_MEMORY;
3339     PLogError("CALLOC_CLR fst->FSMnode_list  failed\n");
3340     goto CLEANUP;
3341   }
3342   fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
3343 
3344   if (!fst->FSMnode_info_list)
3345   {
3346     rc = FST_FAILED_ON_MEMORY;
3347     PLogError("CALLOC_CLR fst->FSMnode_info_list failed\n");
3348     goto CLEANUP;
3349   }
3350 
3351   /* setup the arc freelist */
3352   fst->FSMarc_freelist = 0;
3353   fst->num_arcs = 0;
3354   for (i = 0; i < (unsigned)fst->FSMarc_list_len - 1; i++)
3355   {
3356     fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
3357     fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3358   }
3359   fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
3360   fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3361 
3362   /* initialize the nodes, 'cuz reading is random order */
3363   for (i = 0; i < max_num_FSMnodes; i++)
3364   {
3365     fr_node = &fst->FSMnode_list[i];
3366     fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
3367   }
3368 
3369   /* 1. first load up all the information */
3370   num_arcs = 0;
3371   for (i = 0; i < max_num_FSMarcs || !seenFinalNode; i++)
3372   {
3373     if (i > max_num_FSMarcs && !seenFinalNode)
3374     {
3375       PLogError("Final node never encountered");
3376       rc = ESR_INVALID_STATE;
3377       goto CLEANUP;
3378     }
3379 	nfields = 5;
3380     if (pfread(tmp,sizeof(tmp[0]),nfields,fp) != nfields)
3381     {
3382       PLogError("reading arc");
3383       rc = ESR_INVALID_STATE;
3384       goto CLEANUP;
3385     }
3386 
3387     if (tmp[1] == MAXnodeID)
3388     {
3389       seenFinalNode = ESR_TRUE;
3390       if(fst->end_node != tmp[0]) {
3391 	PLogError("error with arc %d->%d ilabel %d olabel %d cost %d\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4]);
3392       ASSERT(fst->end_node == tmp[0]);
3393       }
3394       i--;
3395     }
3396     else
3397     {
3398       new_arc_id = fst_get_free_arc(fst);
3399       if (new_arc_id == MAXarcID)
3400         return FST_FAILED_ON_MEMORY;
3401       atok = ARC_ItoX(new_arc_id);
3402       atoken = ARC_XtoP(atok);
3403       num_arcs++;
3404 
3405       ASSERT(tmp[0] < fst->num_nodes);
3406       ASSERT(tmp[1] < fst->num_nodes);
3407       fr_node = &fst->FSMnode_list[ tmp[0]];
3408       to_node = &fst->FSMnode_list[ tmp[1]];
3409       atoken->ilabel = tmp[2];
3410       atoken->olabel = tmp[3];
3411       atoken->cost   = tmp[4];
3412       append_arc_leaving_node(fst, fr_node, atok);
3413       append_arc_arriving_node(fst, to_node, atok);
3414       atoken->fr_node = NODE_ItoX(tmp[0]);
3415       atoken->to_node = NODE_ItoX(tmp[1]);
3416     }
3417   }
3418   ASSERT(fst->num_arcs == num_arcs);
3419 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3420   PLogMessage("G2G done RD arcs %d", pftell(fp));
3421 #endif
3422 
3423   /* setup the node freelist */
3424   if (fst->num_nodes < fst->FSMnode_list_len)
3425   {
3426     fst->FSMnode_freelist = fst->num_nodes;
3427     for (i = fst->num_nodes; i < (unsigned)fst->FSMnode_list_len - 1; i++)
3428     {
3429       fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
3430       fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3431     }
3432     fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
3433     fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3434   }
3435   else
3436   {
3437     fst->FSMnode_freelist = MAXnodeID;
3438   }
3439 
3440   /* node info   .. will now be calculated on line */
3441   /* wrapup_cost .. will now be calculated on line */
3442   fst->whether_prepared = 0;
3443   fst_fill_node_info(fst);
3444   for (i = 0; i < fst->num_nodes; i++)
3445     fst->FSMnode_info_list[i] = NODE_INFO_UNKNOWN;
3446 
3447   /* exit points for slots */
3448   if (pfread(&fst->num_fsm_exit_points, 2, 1, fp) != 1)
3449     return ESR_READ_ERROR;
3450   p = fst->fsm_exit_points;
3451   q = p + MAX_NUM_SLOTS;
3452   while (p < q)
3453   {
3454     if (pfread(&p->from_node_index, 2, 1, fp) != 1)
3455       return ESR_WRITE_ERROR;
3456     if (pfread(&p->arc_index, 2, 1, fp) != 1)
3457       return ESR_WRITE_ERROR;
3458     if (pfread(&p->wbto_node_index, 2, 1, fp) != 1)
3459       return ESR_WRITE_ERROR;
3460     ++p;
3461   }
3462 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3463   PLogMessage("G2G done RD exits %d", pftell(fp));
3464 #endif
3465 
3466   /* no entry points stuff */
3467 
3468   /* ilabels were not saved, create them as empty */
3469   fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
3470   fst->ilabels->num_words = fst->ilabels->max_words = 0;
3471   fst->ilabels->words = 0;
3472 
3473   /* now save the olabels */
3474   esr_rc = deserializeWordMapV2(&fst->olabels, fp);
3475   if (esr_rc != ESR_SUCCESS)
3476   {
3477     PLogError("deserializeWordMapV2() failed\n");
3478     rc = FST_FAILED_INTERNAL;
3479     goto CLEANUP;
3480   }
3481 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3482   PLogMessage("G2G done RD wmap %d", pftell(fp));
3483 #endif
3484 
3485   /* arctokeninfo */
3486   if (fst->grmtyp != GrammarTypeItemList)
3487   {
3488     if (deserializeArcTokenInfo(fst, fp) != ESR_SUCCESS)
3489     {
3490       PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3491       rc = FST_FAILED_INTERNAL;
3492       goto CLEANUP;
3493     }
3494   }
3495   else
3496   {
3497     /* here we just construct an isolated list, but we should scrap it later */
3498     wordID wdid;
3499     arc_token *atl, *final, *atli;
3500     fst->arc_token_list_len = fst->olabels->num_words + 2 - NUM_ITEMLIST_HDRWDS;
3501     atl = NEW_ARRAY(arc_token, fst->arc_token_list_len, L("srec.graph.wordgraph"));
3502     if(!atl) {
3503       PLogError("fst->arc_token_list_len failed\n");
3504       goto CLEANUP;
3505     }
3506     atl->first_next_arc = ARC_TOKEN_LNK(atl, 1);
3507     atl->next_token_index = ARC_TOKEN_NULL;
3508     final = atl + fst->arc_token_list_len - 1;
3509     for (i = 1, wdid = NUM_ITEMLIST_HDRWDS; wdid < fst->olabels->num_words; i++, wdid++)
3510     {
3511       atli = &atl[i];
3512       atli->ilabel = wdid;
3513       atli->olabel = wdid;/*not used*/
3514       atli->next_token_index = ARC_TOKEN_LNK(atl, (i + 1));
3515       atli->first_next_arc = ARC_TOKEN_LNK(atl, (fst->arc_token_list_len - 1)); /*final*/
3516     }
3517     ASSERT(atl + i == final);
3518     final->next_token_index = ARC_TOKEN_NULL;
3519     final->first_next_arc = ARC_TOKEN_NULL;
3520     final->ilabel = final->olabel = 0;
3521     fst->arc_token_list = atl;
3522   }
3523   return FST_SUCCESS;
3524 CLEANUP:
3525   return FST_FAILED_INTERNAL;
3526 }
3527 
3528 
3529 int FST_LoadContextFromImageV2(srec_context* context, PFile* fp);
3530 
FST_LoadContextFromImage(srec_context ** pcontext,PFile * fp)3531 int FST_LoadContextFromImage(srec_context** pcontext, PFile* fp)
3532 {
3533   srec_context* context = NULL;
3534   /* used to force correct data alignment */
3535   asr_int32_t header[2];
3536   int rc = FST_SUCCESS;
3537 
3538   /*printf("FST_LoadContexFromImage\n");*/
3539   if (!fp)
3540   {
3541     PLogError("FST_LoadContextImage() failed; bad file pointer\n");
3542     return FST_FAILED_ON_INVALID_ARGS;
3543   }
3544 
3545   context = NEW(srec_context, L("srec.graph.binary"));
3546   if (context == NULL)
3547   {
3548     PLogError("FST_LoadContextFromImage: out of memory while allocating context.\n");
3549     return FST_FAILED_ON_MEMORY;
3550   }
3551   memset(context, 0, sizeof(srec_context));
3552 
3553   /* header */
3554   if (pfread(header, 4, 2, fp) != 2)
3555   {
3556     PLogError("FST_LoadContextFromImage: failed reading header.\n");
3557     rc = FST_FAILED_ON_INVALID_ARGS;
3558     goto CLEANUP;
3559   }
3560 
3561   if (header[1] == IMAGE_FORMAT_V2)
3562   {
3563     rc = FST_LoadContextFromImageV2(context, fp);
3564     if (rc != FST_SUCCESS)
3565       goto CLEANUP;
3566   }
3567   else
3568   {
3569     PLogError("FST_LoadContextFromImage() failed on image_format\n");
3570     goto CLEANUP;
3571   }
3572   *pcontext = context;
3573   return FST_SUCCESS;
3574 CLEANUP:
3575   if (context) FREE(context);
3576   *pcontext = 0;
3577   return rc;
3578 }
3579 
3580 
FST_DumpReverseWordGraph(srec_context * context,PFile * fp)3581 int FST_DumpReverseWordGraph(srec_context* context, PFile* fp)
3582 {
3583   /* not implemented, use FST_DumpSyntaxAsImage() for now */
3584   return 1;
3585 }
3586 
3587 /* this belongs part of the srec_context */
3588 typedef struct
3589 {
3590   asr_int32_t image_format;
3591   asr_int32_t image_size;
3592   asr_int32_t sizes_signature;
3593 }
3594 context_image_header;
3595 
3596 
FST_LoadContextFromStreamImage(srec_context ** pcontext,char * buffer,asr_int32_t buffer_len)3597 int FST_LoadContextFromStreamImage(srec_context** pcontext, char* buffer, asr_int32_t buffer_len)
3598 {
3599   return FST_FAILED_INTERNAL;
3600 }
3601 
3602 typedef struct nodeID_list_t
3603 {
3604   nodeID *nodes, num_nodes, num_alloc;
3605 }
3606 nodeID_list;
3607 
fst_node_has_speech_to_come(srec_context * context,nodeID node_index)3608 int fst_node_has_speech_to_come(srec_context* context, nodeID node_index)
3609 {
3610   /* recursive func ... try to keep stack use low! */
3611   /* later we should cache this information:
3612      need 2bits:  endn opte regl dunno */
3613   FSMarc* arc;
3614   arcID arc_index = context->FSMnode_list[ node_index].un_ptr.first_next_arc;
3615   for (; arc_index != MAXarcID; arc_index = arc->linkl_next_arc)
3616   {
3617     arc = &context->FSMarc_list[arc_index];
3618     if (arc->ilabel >= context->hmm_ilabel_offset + EPSILON_OFFSET + NUM_SILENCE_HMMS)
3619     {
3620       return 1;
3621     }
3622     else if (arc->ilabel < EPSILON_OFFSET)
3623     {
3624       if (fst_node_has_speech_to_come(context, arc->to_node))
3625       {
3626         return 1;
3627       }
3628     }
3629     else if (IS_SILENCE_ILABEL(arc->ilabel, context))
3630     {
3631       /* another silence! */
3632       if (fst_node_has_speech_to_come(context, arc->to_node))
3633         return 1;
3634     }
3635   }
3636   return 0;
3637 }
3638 
fst_fill_node_info(srec_context * context)3639 int fst_fill_node_info(srec_context* context)
3640 {
3641   char* infos = context->FSMnode_info_list;
3642   nodeID_list optends_obj;
3643   nodeID_list *optends = &optends_obj;
3644 
3645   FSMnode* node;
3646   FSMarc* arc;
3647   nodeID i, j, node_index;
3648   arcID arc_index;
3649   costdata wrapup_cost;
3650   int wrapup_cost_inconsistencies;
3651   /* int num_near_end_nodes; nodes within EPS or silence to the end, these
3652      are called NODE_INFO_ENDNODES, so that they can be excluded from the
3653      comparison with end_node score in the eos detection */
3654 
3655   optends->num_nodes    = 1;
3656   optends->num_alloc    = 8192; /* scaled up from 512 to support dynamic grammar add word of 5000 */
3657   optends->nodes        = (nodeID*)CALLOC(optends->num_alloc, sizeof(nodeID), "srec.tmp.optendnodes");
3658   optends->nodes[0]     = context->end_node;
3659   /* num_near_end_nodes = 0; */
3660 
3661   for (i = 0; i < optends->num_nodes; i++)
3662   {
3663     node_index = optends->nodes[i];
3664     node = &context->FSMnode_list[ node_index];
3665     for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3666          arc_index = arc->linkl_prev_arc)
3667     {
3668       arc = &context->FSMarc_list[arc_index];
3669       if (arc->fr_node != node_index)
3670       {
3671 
3672         if (IS_SILENCE_ILABEL(arc->ilabel, context) || arc->ilabel < EPSILON_OFFSET)
3673         {
3674           /* ok, fr_node goes to the end via silence, check for dups */
3675           for (j = 0; j < optends->num_nodes; j++)
3676             if (optends->nodes[j] == arc->fr_node) break;
3677           /* append it to the list */
3678           if (j == optends->num_nodes)
3679           {
3680             if (optends->num_nodes < optends->num_alloc)
3681               optends->nodes[ optends->num_nodes++] = arc->fr_node;
3682             else
3683             {
3684               ASSERT(0 && "allocated too few optend nodes");
3685               return 0;
3686             }
3687           }
3688         }
3689       }
3690     }
3691   }
3692 
3693   /* now set the info array, default is regular */
3694   for (i = 0; i < context->num_nodes; i++)
3695     infos[i] = NODE_INFO_REGULAR;
3696   /* also set the other nodes, we'll check whether nodes were added
3697      via these flags */
3698   for (; i < context->FSMnode_list_len; i++)
3699     infos[i] = NODE_INFO_UNKNOWN;
3700   infos[ context->end_node] = NODE_INFO_ENDNODE;
3701 
3702   /* get rid of direct to end nodes, etc */
3703   for (i = 0, j = 0; i < optends->num_nodes; i++)
3704   {
3705     optends->nodes[j] = optends->nodes[i];
3706     if (fst_node_has_speech_to_come(context, optends->nodes[i]))
3707     {
3708       j++;
3709       infos[ optends->nodes[i]] = NODE_INFO_OPTENDN;
3710     }
3711     else
3712     {
3713       infos[ optends->nodes[i]] = NODE_INFO_ENDNODE;
3714       /* num_near_end_nodes++; */
3715     }
3716   }
3717   optends->num_nodes = j;
3718 
3719   /* printf("get_oppend_nodes (%d)", optends->num_nodes);
3720      for(i=0; i<optends->num_nodes; i++) printf(" %d", optends->nodes[i]);
3721      printf("\n");
3722   */
3723 
3724   FREE(optends->nodes);
3725 
3726   /* find the wrapup cost */
3727   node = &context->FSMnode_list[ context->end_node];
3728   wrapup_cost = MAXcostdata;
3729   wrapup_cost_inconsistencies = 0;
3730   for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3731        arc_index = arc->linkl_prev_arc)
3732   {
3733     arc = &context->FSMarc_list[ arc_index];
3734     if (IS_SILENCE_ILABEL(arc->ilabel, context) &&
3735         arc->olabel == context->end_silence_word)
3736     {
3737       if (wrapup_cost == MAXcostdata)
3738         wrapup_cost = arc->cost;
3739       else if (context->wrapup_cost != arc->cost)
3740       {
3741         wrapup_cost = arc->cost;
3742         wrapup_cost_inconsistencies++;
3743       }
3744     }
3745   }
3746 #define MIN_WRAPUP_COST 100
3747   context->wtw_average = wrapup_cost;
3748   if (context->wtw_average > 200)
3749     context->wtw_average = 200;
3750   if (context->wrapup_cost < MIN_WRAPUP_COST)
3751     context->wrapup_cost = MIN_WRAPUP_COST;
3752   /*log_report("context->wrapup_cost %d (%d)\n", context->wrapup_cost,
3753     wrapup_cost_inconsistencies);*/
3754   return 0;
3755 }
3756 
fst_alloc_transit_points(srec_context * context)3757 int fst_alloc_transit_points(srec_context* context)
3758 {
3759   arcID i;
3760   wordID num_slots = context->olabels->num_slots;
3761   wordID olabel;
3762   asr_int16_t nfxps = 0;
3763   FSMarc* arc;
3764   FSMnode* node;
3765 
3766   context->num_fsm_exit_points = 0;
3767   /* slot 0 is invalid, it is the "eps", so num_slots==1 means no slots! */
3768   if (num_slots == 1)
3769     return 0;
3770 
3771   /* scan through to finds arc that carry rule references */
3772   for (i = 0; i < context->num_arcs; i++)
3773   {
3774     olabel =  context->FSMarc_list[i].olabel;
3775     /* (wordID)(olabel-1) < (num_slots-1) ?? might work */
3776     if (olabel > 0 && olabel < num_slots)
3777     {
3778       context->FSMarc_list[i].cost = DISABLE_ARC_COST;
3779       if (nfxps >= MAX_NUM_SLOTS)
3780       {
3781         PLogError("error: too many fsm exit points in fsm, too many public rules referenced from here\n");
3782         return 0;
3783       }
3784       context->fsm_exit_points[nfxps].arc_index = i;
3785       context->fsm_exit_points[nfxps].from_node_index = context->FSMarc_list[i].fr_node;
3786       /* skip over the trailing silence and .wb labels */
3787       for (arc = &context->FSMarc_list[i]; arc->ilabel != WORD_BOUNDARY;)
3788       {
3789         node = &context->FSMnode_list[arc->to_node];
3790         arc =  &context->FSMarc_list[node->un_ptr.first_next_arc];
3791       }
3792       context->fsm_exit_points[nfxps].wbto_node_index = arc->to_node;
3793       nfxps++;
3794 
3795     } /* olabel<num_slots */
3796   } /* i<num_arcs */
3797   context->num_fsm_exit_points  = nfxps;
3798   return 0;
3799 }
3800 
3801 
3802 /*
3803   cross-word modeling:
3804   for now we'll assume silence context on either side, later we'll create
3805   a graph with the right fan out on the edges of each slot.  At that point we
3806   may have routines like ... if(is_dead_end(arc)) continue;
3807 
3808   word removal:
3809   we'll use refcounters on arcs, we can search the whole graph for the
3810   olabel, then remove forward and remove backwards
3811 
3812   word addition:
3813   glenville ^ hmm183 hmm222 hmm162 hmm246 hmm346 hmm191 hmm219
3814 
3815   set imap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/model.map
3816   set omap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/namesnnumsSC.map
3817   wintel/debug/addword.exe -min
3818   $SWISDK/bin/fsmcompileD -t -i $imap -o $omap -F out.PCLG out.PCLG.txt
3819   $SWISDK/bin/fsmdrawD -i $imap -o $omap out.PCLG > out.PCLG.dot
3820   dotty out.PCLG.dot
3821 
3822   need
3823   - a sanity check
3824   - remove word
3825   - multiple pronunciations handling
3826   - homonyms handling, other boundary conditions checking
3827   - measure the speed
3828 
3829   alan_adams al~#ad}z
3830   alan_adams al~ad}z
3831   paul_adams p{l#ad}z
3832 
3833 
3834 
3835 */
3836