• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*---------------------------------------------------------------------------*
2  *  srec_context.c                                                           *
3  *                                                                           *
4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                         *
5  *                                                                           *
6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7  *  you may not use this file except in compliance with the License.         *
8  *                                                                           *
9  *  You may obtain a copy of the License at                                  *
10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
11  *                                                                           *
12  *  Unless required by applicable law or agreed to in writing, software      *
13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15  *  See the License for the specific language governing permissions and      *
16  *  limitations under the License.                                           *
17  *                                                                           *
18  *---------------------------------------------------------------------------*/
19 
20 
21 #include "stdlib.h"
22 #include "string.h"
23 #include "buildopt.h"
24 #include "setting.h"
25 #include "passert.h"
26 #include "pendian.h"
27 #include "portable.h"
28 #include "pstdio.h"
29 #include "ptypes.h"
30 #include "srec_context.h"
31 #include "srec_sizes.h"
32 #include "search_network.h"
33 #include "srec_arb.h"
34 #include "hmm_desc.h"
35 #if USE_COMP_STATS
36 #include "comp_stats.h"
37 #endif
38 
39 #ifdef SET_RCSID
40 static const char *rcsid = 0 ? (const char *) &rcsid :
41 "$Id: srec_context.c,v 1.84.4.54 2008/05/15 20:06:39 dahan Exp $";
42 #endif
43 
44 
45 /* these are constants for now, we need to figure out how to make
46    this more flexible, possible ideas are:
47    (1) to specify these via API functions such as CR_LoadSyntax(),
48    (2) use defaults that would have been set the main parfile,
49    (3) put values for these fields in an actual grammar
50 */
51 
52 #define AVG_ARCS_PER_WORD    10
53 #define AVG_NODES_PER_WORD    7
54 #define AVG_CHARS_PER_WORD   18
55 #define MAX_PRON_LEN         1024
56 #define MAX_WORD_LEN 128
57 #define DO_WEIGHTED_ADDWORD   1
58 #define SLOTLOOP_OFFSET 64
59 #define SLOTNAME_INDICATOR "__"
60 #define RULE_INDICATOR '@'
61 #define MAX_LINE_LENGTH 512
62 #define DEFAULT_WB_COST 40
63 #define DISABLE_ARC_COST 999
64 #define DO_ARCS_IO_IN_ARC_ORDER 1
65 
66 #define IS_SILENCE_ILABEL(ilabel,context) (ilabel >= context->hmm_ilabel_offset+EPSILON_OFFSET && ilabel<context->hmm_ilabel_offset+EPSILON_OFFSET+NUM_SILENCE_HMMS)
67 // looking to match filename.grxml.Names@__Names__
68 // see also grxmlcompile.cpp where these names are constructed
69 #define IS_SLOT_OLABEL(wd) (strchr(wd,IMPORTED_RULES_DELIM)<strstr(wd,SLOTNAME_INDICATOR) && strlen(wd)>4 && !strcmp(wd+strlen(wd)-2,SLOTNAME_INDICATOR) )
70 
71 /*
72 SYNTAX : DATA_ALIGN(pointer, data type, bytes_filled)
73 EXAMPLE: We need to cast a memory address to a (wordmap*)
74          so we call DATA_ALIGN(memptr, wordmap, filled),
75          where FILLED contains the number of bytes that were used to align the pointer
76 */
77 #if (CPU & CPU_ARM)||(CPU & CPU_STRONGARM)
78 /* under IPAQ it appears we need to align the structures
79    to certain boundaries.  More attention needed here for other
80    platforms ! */
81 #define DATA_ALIGN(x, y, z) if ((int) (x) % sizeof(y)) { \
82     (z)=sizeof(y) - ((int) (x) % sizeof(y)); \
83     (x) += (z); /* force correct data alignment */ \
84   } else z=0;
85 #else
86 #define DATA_ALIGN(x, y, z)
87 #endif
88 
89 static const int do_minimize = 1;
90 #define PTR_TO_IDX(ptr, base) ((asr_uint32_t) (ptr == NULL ? 0xFFFFFFFFu : (asr_uint32_t)(ptr - base)))
91 #define IDX_TO_PTR(idx, base) (idx == 0xFFFFFFFFu ? NULL : base + idx)
92 
93 /* prototypes */
94 
95 /*----------------------------------------------------------------------*
96  *                                                                      *
97  * internal prototypes                                                  *
98  *                                                                      *
99  *----------------------------------------------------------------------*/
100 
101 /* general use functions */
102 char split_line(char* line, char** av);
103 asr_int32_t atoi_with_check(const char* buf, asr_int32_t max);
104 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc);
105 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc);
106 int printf_node1(srec_context* fst, FSMnode* node);
107 
108 /* fst_* functions are internal fst functions */
109 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
110                  wordID olabel, costdata cost,
111                  modelID* model_sequence, int num_models);
112 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc);
113 int fst_push_arc_cost(srec_context* fst, FSMarc* arc);
114 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc);
115 int fst_free_arc(srec_context* fst, FSMarc* arc);
116 int fst_free_node(srec_context* fst, FSMnode* node);
117 int fst_free_arc_net(srec_context* fst, FSMarc* arc);
118 arcID fst_get_free_arc(srec_context* fst);
119 nodeID fst_get_free_node(srec_context* fst);
120 int fst_pack_arc_usage(srec_context* fst);
121 
122 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr atok);
123 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr atok);
124 int num_arcs_leaving(srec_context* fst, FSMnode* node);
125 
126 int num_arcs_arriving(srec_context* fst, FSMnode* node);
127 int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node);
128 int fst_fill_node_info(srec_context* context);
129 int fst_alloc_transit_points(srec_context* context);
130 
131 /* higher-level functions */
132 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add,
133                              PFile* fp);
134 int FST_LoadParams(srec_context* context, PFile* fp);
135 int FST_AssumeParams(srec_context* context);
136 int FST_UnloadReverseWordGraph(srec_context* context);
137 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree);
138 
139 static ESR_ReturnCode wordmap_clean ( wordmap *word_map );
140 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words );
141 
142 /*--------------------------------------------------------------------------*
143  *                                                                          *
144  *                                                                          *
145  *                                                                          *
146  *--------------------------------------------------------------------------*/
147 
FST_IsVoiceEnrollment(srec_context * context)148 int FST_IsVoiceEnrollment(srec_context* context)
149 {
150   if (context->olabels == NULL) return 0;
151   if (context->olabels->num_words < 2) return 0;
152   if (strstr(context->olabels->words[1], "enroll")) return 1;
153   return 0;
154 }
155 
FST_LoadContext(const char * synbase,srec_context ** pcontext,int num_words_to_add)156 int FST_LoadContext(const char* synbase, srec_context** pcontext,
157                     int num_words_to_add)
158 {
159   int rc;
160   PFile* fp;
161   char buffer[MAX_LINE_LENGTH];
162   srec_context* context;
163 
164   context = (srec_context*)CALLOC_CLR(1, sizeof(srec_context), "srec.graph.base");
165   if(!context)
166 	  return FST_FAILED_ON_MEMORY;
167   memset(context, 0, sizeof(srec_context));
168 
169   context->addWordCaching_lastslot_name = 0;
170   context->addWordCaching_lastslot_num = MAXwordID;
171   context->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
172 
173   sprintf(buffer, "%s.map", synbase);
174   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
175   if (!fp) return FST_FAILED_ON_INVALID_ARGS;
176   rc = FST_LoadWordMap(&context->olabels, num_words_to_add, fp);
177   pfclose(fp);
178   if (rc) return rc;
179 
180   sprintf(buffer, "%s.PCLG.txt", synbase);
181   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
182   if (!fp) return FST_FAILED_ON_INVALID_ARGS;
183   rc = FST_LoadGraph(context, context->ilabels, context->olabels, num_words_to_add, fp);
184   pfclose(fp);
185   if (rc != FST_SUCCESS) return rc;
186 
187   sprintf(buffer, "%s.Grev2.det.txt", synbase);
188   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
189   if (!fp) return FST_FAILED_ON_INVALID_ARGS;
190   rc = FST_LoadReverseWordGraph(context, num_words_to_add, fp);
191   pfclose(fp);
192   if (rc != FST_SUCCESS) return rc;
193 
194   sprintf(buffer, "%s.params", synbase);
195   fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
196   if (fp) {
197     rc = FST_LoadParams(context, fp);
198     pfclose(fp);
199     if (rc != FST_SUCCESS) return rc;
200   } else {
201     rc = FST_AssumeParams(context);
202   }
203 
204   context->grmtyp = (arcID)FST_GetGrammarType(context);
205 
206   *pcontext = context;
207   rc = fst_fill_node_info(context);
208   return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
209 }
210 
FST_UnloadContext(srec_context * context)211 void FST_UnloadContext(srec_context* context)
212 {
213   if (!context) return;
214 
215   FST_UnloadWordMap(&context->ilabels);
216   FST_UnloadWordMap(&context->olabels);
217   FST_UnloadGraph(context);
218   FST_UnloadReverseWordGraph(context);
219   FREE(context);
220 }
221 
FST_PrepareContext(srec_context * context)222 int FST_PrepareContext(srec_context* context)
223 {
224   nodeID i;
225   int rc = FST_SUCCESS;
226   /* after all word additions, we need to "prepare" the context */
227 
228   /* first check for any changes to optional endnodes etc .. */
229   for (i = 0; i < context->num_nodes; i++)
230     if (context->FSMnode_info_list[i] == NODE_INFO_UNKNOWN)
231       break;
232   if (i != context->num_nodes)
233     rc = fst_fill_node_info(context);
234 
235   context->whether_prepared = 1;
236   return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
237 }
238 
fst_set_wb_costs(srec_context * context,costdata wbcost)239 void fst_set_wb_costs( srec_context* context, costdata wbcost)
240 {
241   unsigned int i;
242   for(i=0; i<(unsigned int)context->FSMarc_list_len; i++) {
243     if(context->FSMarc_list[i].ilabel == WORD_BOUNDARY)
244       context->FSMarc_list[i].cost = wbcost;
245   }
246 }
247 
FST_LoadParams(srec_context * context,PFile * fp)248 int FST_LoadParams(srec_context* context, PFile* fp)
249 {
250   char line[MAX_LINE_LENGTH];
251   costdata wbcost = MAXcostdata;
252   while (pfgets(line, MAX_LINE_LENGTH, fp))
253   {
254     char* key = strtok(line," \n\r\t");
255     char* val = key ? strtok(NULL," \n\r\t") : NULL;
256     if(val && !strcmp(val,"="))
257       val = key ? strtok(NULL," \n\r\t") : NULL;
258     if(!key || !key[0])
259       continue;
260     else if(val && val[0]) {
261       if(!strcmp(key,"word_penalty")) {
262         wbcost = (costdata)atoi_with_check(val, MAXcostdata);
263 	if(wbcost == MAXcostdata) {
264 	  return FST_FAILED_ON_INVALID_ARGS;
265 	}
266       } else {
267 	PLogError(L("error: unknown parameter %s in .params file"), key);
268 	return FST_FAILED_ON_INVALID_ARGS;
269       }
270     }
271   }
272   if(wbcost != MAXcostdata)
273     fst_set_wb_costs( context, wbcost);
274   return FST_SUCCESS;
275 }
276 
FST_AssumeParams(srec_context * context)277 int FST_AssumeParams( srec_context* context)
278 {
279   fst_set_wb_costs( context, (costdata)DEFAULT_WB_COST);
280   return FST_SUCCESS;
281 }
282 
283 
284 /*--------------------------------------------------------------------------*
285  *                                                                          *
286  *                                                                          *
287  *                                                                          *
288  *--------------------------------------------------------------------------*/
289 
hmm_number(const char * hmm_Name,modelID hmm_ilabel_offset)290 modelID hmm_number(const char* hmm_Name, modelID hmm_ilabel_offset)
291 {
292   if (!strcmp(hmm_Name, "eps")) return EPSILON_LABEL;
293   if (!strcmp(hmm_Name, ".wb")) return WORD_BOUNDARY;
294   if (!strcmp(hmm_Name, ".ph")) return PHONE_BOUNDARY;
295   ASSERT(hmm_Name[0] == 'h' && hmm_Name[1] == 'm' && hmm_Name[2] == 'm' && hmm_Name[3]);
296   return (modelID)(hmm_ilabel_offset + (modelID)atoi_with_check(hmm_Name + 3, MAXmodelID));
297 }
298 
hmm_name(modelID ilabel,modelID hmm_ilabel_offset,char * buf)299 char* hmm_name(modelID ilabel, modelID hmm_ilabel_offset, char* buf)
300 {
301   if (ilabel == EPSILON_LABEL)
302     sprintf(buf, "eps");
303   else if (ilabel == WORD_BOUNDARY)
304     sprintf(buf, ".wb");
305   else if (ilabel == PHONE_BOUNDARY)
306     sprintf(buf, ".ph");
307   else
308     sprintf(buf, "hmm%03d", ilabel - hmm_ilabel_offset);
309   return buf;
310 }
311 
312 /*------------------------------------------------------------------*
313  *                                                                  *
314  * words                                                            *
315  *                                                                  *
316  *------------------------------------------------------------------*/
HashCmpWord(const LCHAR * key1,const LCHAR * key2)317 int HashCmpWord(const LCHAR *key1, const LCHAR *key2)
318 {
319 	return LSTRCMP(key1,key2);
320 }
HashGetCode(const void * key)321 unsigned int HashGetCode(const void *key)
322 {
323 	const LCHAR* k = (const LCHAR*)key;
324 	unsigned int i, len, h = 0;
325 	len = LSTRLEN( k);
326 	for ( i = 0; i < len; i++)
327 		h = 31*h + (unsigned int)k[i];
328 	return h;
329 }
330 
wordmap_create(wordmap ** pwmap,int num_chars,int num_words,int num_words_to_add)331 int wordmap_create(wordmap** pwmap, int num_chars, int num_words, int num_words_to_add)
332 {
333   wordmap* Interface;
334   PHashTableArgs hashArgs;
335   ESR_ReturnCode rc;
336 
337   Interface = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.wordmap.base");
338   Interface->max_words = (wordID)(num_words + num_words_to_add);
339   Interface->num_words = (wordID)0;
340   /* *pwmap->words = (ptr32*) CALLOC_CLR(wmap->max_words, sizeof(ptr32), "graph.wordmap.words"); */
341   Interface->words = (char**) CALLOC_CLR(Interface->max_words, sizeof(char*), "srec.graph.wordmap.words");
342   Interface->max_chars = num_chars + num_words_to_add * AVG_CHARS_PER_WORD;
343   Interface->chars = (char*) CALLOC_CLR(Interface->max_chars, sizeof(char), "srec.graph.wordmap.chars");
344   Interface->next_chars = Interface->chars;
345   Interface->wordIDForWord = NULL;
346   *pwmap = Interface;
347 
348   /* use a hashtable to save mapping between wdID and array index */
349   if (num_words_to_add >= 0)
350   {
351     hashArgs.capacity = num_words + num_words_to_add + 10;
352 	if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
353     hashArgs.compFunction = HashCmpWord; // PHASH_TABLE_DEFAULT_COMP_FUNCTION;
354     hashArgs.hashFunction = HashGetCode; // PHASH_TABLE_DEFAULT_HASH_FUNCTION;
355     hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
356     CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.wordmap_create()"), &Interface->wordIDForWord));
357   }
358   else
359   {
360     Interface->wordIDForWord = NULL;
361   }
362 
363   return FST_SUCCESS;
364 
365 CLEANUP:
366   wordmap_destroy(pwmap);
367   return rc;
368 }
369 wordID wordmap_add_word(wordmap* wmap, const char* word);
370 
FST_LoadWordMap(wordmap ** pwmap,int num_words_to_add,PFile * fp)371 int FST_LoadWordMap(wordmap** pwmap, int num_words_to_add, PFile* fp)
372 {
373   wordID my_wID, wm_wID;
374   char *word, *wID_string;
375   int num_words;
376   asr_int32_t num_chars;
377   char buf[MAX_LINE_LENGTH];
378   long fpos;
379   wordmap* wmap;
380   /* figure out the number of words */
381   fpos = pftell(fp);
382   for (num_words = 0, num_chars = 0; pfgets(buf, MAX_LINE_LENGTH, fp); num_words++)
383   {
384     word = strtok(buf, " \n\r\t");
385     num_chars += strlen(word);
386   }
387   num_chars += num_words * 2; /* for null terminators */
388   pfseek(fp, fpos, SEEK_SET);
389 
390   /* Alex added this to reuse this functionality elsewhere
391     destroy function for this create function is actually taken care of by the
392     FST_UnloadWordMap function
393   */
394   wordmap_create(&wmap, num_chars, num_words, num_words_to_add);
395 
396   /* later replace this with wordblocks, linked list of 8 character blocks! */
397   while (pfgets(buf, MAX_LINE_LENGTH, fp))
398   {
399     word = strtok(buf, " \n\r\t");
400     wID_string = strtok(NULL, " \n\r\t");
401     my_wID = (wordID)atoi_with_check(wID_string, MAXwordID);
402 
403     // if(strstr(word,SLOTNAME_INDICATOR))
404     //    word = strstr(word,SLOTNAME_INDICATOR);
405     wm_wID = wordmap_add_word(wmap, word);
406     ASSERT(my_wID == wm_wID);
407   }
408   ASSERT(wmap->num_words == num_words);
409 
410   for (my_wID = 1; my_wID < num_words; my_wID++)
411   {
412     if (!IS_SLOT_OLABEL(wmap->words[my_wID]))
413       break;
414   }
415   wmap->num_slots = my_wID;
416   wordmap_setbase(wmap);
417   *pwmap = wmap;
418   if(wmap->num_slots > MAX_NUM_SLOTS)
419     return FST_FAILED_ON_INVALID_ARGS;
420   else
421     return FST_SUCCESS;
422 }
423 
wordmap_destroy(wordmap ** wmap)424 int wordmap_destroy(wordmap** wmap)
425 {
426   if (wmap && *wmap)
427   {
428     wordmap_clean ( *wmap );
429     if (((*wmap)->wordIDForWord)) PHashTableDestroy((*wmap)->wordIDForWord);
430     if (((*wmap)->chars)) FREE((*wmap)->chars);
431     if (((*wmap)->words)) FREE((*wmap)->words);
432     if ((*wmap)) FREE((*wmap));
433     *wmap = 0;
434   }
435   return FST_SUCCESS;
436 }
437 
FST_UnloadWordMap(wordmap ** wmap)438 int FST_UnloadWordMap(wordmap** wmap)
439 {
440   return wordmap_destroy(wmap);
441 }
442 
443 
FST_DumpWordMap(PFile * fp,wordmap * wmap)444 int FST_DumpWordMap(PFile* fp, wordmap* wmap)
445 {
446   wordID my_wID;
447   for (my_wID = 0; my_wID < wmap->num_words; my_wID++)
448   {
449     pfprintf(fp, "%s %hu\n", wmap->words[my_wID], my_wID);
450   }
451   return FST_SUCCESS;
452 }
453 
wordmap_find_index(wordmap * wmap,const char * word)454 wordID wordmap_find_index(wordmap* wmap, const char* word)
455 {
456   void *wdID_void;
457   ESR_ReturnCode rc;
458   size_t i;
459 
460   if (!word)
461     return MAXwordID;
462 
463   if (wmap->num_words == 0)
464     return MAXwordID;
465 
466   if (wmap->wordIDForWord)
467   {
468     rc = PHashTableGetValue(wmap->wordIDForWord, word, (void**)&wdID_void);
469 
470     if (rc == ESR_SUCCESS)
471       return (wordID)(int)wdID_void;
472   }
473   else
474   {
475     for (i = 0; i < wmap->num_words; i++)
476     {
477       if (!strcmp(wmap->words[i], word))
478       {
479         return (wordID)i;
480       }
481     }
482   }
483   return MAXwordID;
484 }
485 
wordmap_find_rule_index(wordmap * wmap,const char * rule)486 wordID wordmap_find_rule_index(wordmap* wmap, const char* rule)
487 {
488   int i;
489   int strlen_rule = strlen(rule);
490   for (i = wmap->num_slots; --i > 0;)
491   { /* > (not >=) because eps is always 0 */
492     char* p = strstr(wmap->words[i], SLOTNAME_INDICATOR);
493     if(p != NULL){
494       if (!strcmp(p, rule)) break;
495       else if(!strncmp(p,rule,strlen_rule) && !strcmp(p+strlen_rule,SLOTNAME_INDICATOR)) break;
496     }
497   }
498   if (i == 0) return MAXwordID;
499   else return(wordID)i;
500 }
501 
wordmap_find_index_in_rule(wordmap * wmap,const char * word,wordID rule)502 wordID wordmap_find_index_in_rule(wordmap* wmap, const char* word, wordID rule)
503 {
504   int len = strlen(word);
505   int rule0 = rule + '0';
506   void *wdID_void;
507   LCHAR word_dot_rule[256];
508   ESR_ReturnCode rc;
509 
510   if (!word)
511     return MAXwordID;
512   LSTRCPY(word_dot_rule, word);
513   word_dot_rule[len++] = IMPORTED_RULES_DELIM;
514   word_dot_rule[len++] = (char)rule0;
515   word_dot_rule[len++] = 0;
516   rc = PHashTableGetValue(wmap->wordIDForWord, word_dot_rule, (void**)&wdID_void);
517   if (rc == ESR_SUCCESS)
518     return (wordID)(int)(wdID_void);
519   return MAXwordID;
520 }
521 
strlen_with_null(const char * word)522 int strlen_with_null(const char* word)
523 {
524 #if 1 /* NT  */
525   int len = strlen(word) + 1;
526   if (len % 2 == 1) len++;
527   return len;
528 #else /* C54 */
529 #endif
530 }
531 
wordmap_whether_in_rule(wordmap * wmap,wordID word,wordID rule)532 int wordmap_whether_in_rule(wordmap* wmap, wordID word, wordID rule)
533 {
534   char* word_chars;
535   int len;
536   if (word > wmap->num_words) return 0;
537   word_chars = wmap->words[word];
538   len = strlen(word_chars);
539   if (word_chars[len-1] == rule + '0' && word_chars[len-2] == IMPORTED_RULES_DELIM)
540     return 1;
541   else
542     return 0;
543 }
544 
wordmap_setbase(wordmap * wmap)545 void wordmap_setbase(wordmap* wmap)
546 {
547   wmap->num_base_words = wmap->num_words;
548   wmap->next_base_chars = wmap->next_chars;
549 }
550 
wordmap_ceiling(wordmap * wmap)551 void wordmap_ceiling(wordmap* wmap)
552 {
553   /* this is irreversible */
554   wmap->max_words = wmap->num_words;
555   wmap->max_chars = (wmap->next_chars - wmap->chars);
556 }
557 
558 
wordmap_clean(wordmap * word_map)559 static ESR_ReturnCode wordmap_clean ( wordmap *word_map )
560     {
561   ESR_ReturnCode  clean_status;
562   PHashTableEntry *entry;
563   PHashTableEntry *oldEntry;
564   wordID          *value;
565 
566   clean_status = ESR_SUCCESS;
567 
568   if ( word_map->wordIDForWord == NULL )
569     return clean_status;
570 
571   clean_status = PHashTableEntryGetFirst ( word_map->wordIDForWord, &entry);
572 
573   while ( ( entry != NULL ) && ( clean_status == ESR_SUCCESS ) )
574     {
575       clean_status = PHashTableEntryGetKeyValue ( entry, NULL, (void **)&value );
576 
577       if ( clean_status == ESR_SUCCESS )
578 	{
579 	  oldEntry = entry;
580 	  clean_status = PHashTableEntryAdvance ( &entry );
581 
582 	  if ( clean_status == ESR_SUCCESS )
583 	    clean_status = PHashTableEntryRemove ( oldEntry );
584 	}
585     }
586     return ( clean_status );
587     }
588 
589 
wordmap_populate(wordmap * word_map,wordID num_words)590 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words )
591 {
592   ESR_ReturnCode  populate_status;
593   wordID          wdID;
594 
595   populate_status = ESR_SUCCESS;
596   wdID = 0;
597 
598   if ( word_map->wordIDForWord != NULL )
599     {
600       while ( ( wdID < num_words ) && ( populate_status == ESR_SUCCESS ) )
601 	{
602 	  populate_status = PHashTablePutValue ( word_map->wordIDForWord, word_map->words[wdID],
603 						 (const void *)(int)wdID, NULL );
604 	  if ( populate_status == ESR_SUCCESS )
605 	    wdID++;
606 	  else {
607 	    return (populate_status);
608 	  }
609 	}
610     }
611   return ( populate_status );
612 }
613 
wordmap_reset(wordmap * wmap)614 void wordmap_reset(wordmap* wmap)
615 {
616   char** tmp_words;
617   int i;
618   ESR_ReturnCode    reset_status;
619 
620   if (wmap->num_base_words < wmap->num_words)
621   {
622     /*wordID i = (wordID)(wmap->num_base_words);*/
623     char* old_wmap_chars = wmap->chars;
624     int offset = wmap->next_base_chars - wmap->chars;
625     char* tmp_chars = NEW_ARRAY(char, offset, L("srec.g2g.graph.wordmap.chars"));
626     if(!tmp_chars) {
627       passert ( 0 && L("failed to reset the memory for wordmap.chars ") );
628     }
629     memcpy(tmp_chars,wmap->chars, offset * sizeof(char));
630     FREE(wmap->chars);
631 
632     wmap->chars = tmp_chars;
633     wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap_chars);
634     wmap->max_chars = (wordID) offset;
635     wmap->next_chars = wmap->next_base_chars;
636 
637     tmp_words = (char**) CALLOC_CLR(wmap->num_base_words, sizeof(char*), "srec.graph.wordmap.words");
638     if(!tmp_words) {
639       passert ( 0 && L("failed to reset the memory for wordmap.words ") );
640     }
641     memcpy( tmp_words, wmap->words, wmap->num_base_words * sizeof(char*));
642     FREE( wmap->words);
643 
644     wmap->words = tmp_words;
645     wmap->max_words = wmap->num_base_words;
646     wmap->num_words  = wmap->num_base_words;
647 
648     for(i=0; i<wmap->num_words; i++)
649       wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap_chars);
650   }
651 
652   reset_status = wordmap_clean ( wmap );
653 
654   if ( reset_status == ESR_SUCCESS )
655     {
656       reset_status = wordmap_populate ( wmap, wmap->num_base_words );
657 
658       if ( reset_status != ESR_SUCCESS )
659 	{
660 	  wordmap_clean ( wmap );
661 	  passert ( 0 && L("wordmap_reset failed") );
662 	}
663     }
664   else
665     {
666       passert ( 0 && L("wordmap_reset failed") );
667     }
668 }
669 
670 #define FST_GROW_FACTOR 12/10
671 #define FST_GROW_MINCHARS 256
672 #define FST_GROW_MINWORDS  32
wordmap_add_word(wordmap * wmap,const char * word)673 wordID wordmap_add_word(wordmap* wmap, const char* word)
674 {
675   int len = strlen_with_null(word);
676   wordID wdID =0;
677 
678   if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
679   {
680 #if defined(FST_GROW_FACTOR)
681      int i,tmp_max_chars= wmap->max_chars * FST_GROW_FACTOR;
682 	 char* old_wmap__chars = wmap->chars;
683       if(tmp_max_chars - wmap->max_chars < FST_GROW_MINCHARS)
684 	tmp_max_chars +=FST_GROW_MINCHARS;
685 
686       char* tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
687       if(!tmp_chars) {
688           PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.chars"));
689           return MAXwordID;
690       }
691       memcpy(tmp_chars,wmap->chars,wmap->max_chars * sizeof(char));
692       FREE(wmap->chars);
693 
694       wmap->chars = tmp_chars;
695       wmap->next_chars = wmap->chars + (wmap->next_chars - old_wmap__chars);
696       wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap__chars);
697       wmap->max_chars = (wordID)tmp_max_chars;
698       //Remove old keys --- Add WORD
699       wordmap_clean (wmap );
700 
701       // adjust word pointers
702       for(i=0; i<wmap->num_words; i++)
703 	{
704 	  wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap__chars);
705 	  // adjust hashtable ---
706 	  if (wmap->wordIDForWord)
707 	    {
708 	      ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
709 						       (const void *)i, NULL );
710 	      if ( rc != ESR_SUCCESS ) {
711 		goto CLEANUP;
712 	      }
713 	    }
714 	}
715 #else // so not defined(FST_GROW_FACTOR)
716       PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
717       return MAXwordID;
718 #endif // defined(FST_GROW_FACTOR)
719   }
720   //Add word
721   if (wmap->num_words == wmap->max_words)
722     {
723 #if defined(FST_GROW_FACTOR)
724       char** tmp_words;
725       wordID tmp_max_words;
726       int itmp_max_words =  wmap->max_words * FST_GROW_FACTOR;
727 
728       if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
729 	    itmp_max_words += FST_GROW_MINWORDS;
730 
731       if( itmp_max_words >= MAXwordID) {
732 	    PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
733 	    return MAXwordID;
734       }
735       tmp_max_words = (wordID)itmp_max_words;
736 	  tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
737       if(!tmp_words) {
738 	PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.words"));
739         return MAXwordID;
740       }
741       memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
742       FREE( wmap->words);
743       wmap->words = tmp_words;
744       wmap->max_words = tmp_max_words;
745 #else // so not defined(FST_GROW_FACTOR)
746       PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
747       return MAXwordID;
748 #endif // defined(FST_GROW_FACTOR)
749     }
750   if(1)
751     {
752     strcpy(wmap->next_chars, word);
753     wmap->words[ wmap->num_words++] = wmap->next_chars;
754     wmap->next_chars += len;
755     wdID = (wordID)(wmap->num_words - (wordID)1);
756     if (wmap->wordIDForWord)
757     {
758        ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
759                       (const void *)(int)wdID, NULL );
760     if ( rc != ESR_SUCCESS )
761       goto CLEANUP;
762     }
763     return wdID;
764   }
765 CLEANUP:
766   PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
767   return MAXwordID;
768 }
769 
wordmap_add_word_in_rule(wordmap * wmap,const char * word,wordID rule)770 wordID wordmap_add_word_in_rule(wordmap* wmap, const char* word, wordID rule)
771 {
772   int len = strlen_with_null(word) + 2;
773   wordID wdID = 0;
774   wordID i;
775 //wordmap_add_word_in_rule
776   if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
777   {
778 #if defined(FST_GROW_FACTOR)
779       int tmp_max_chars = wmap->max_chars * FST_GROW_FACTOR;
780       char* tmp_chars;
781 	  char* old_next_chars = wmap->next_chars;
782 	  char* old_chars = wmap->chars;
783 
784       if(tmp_max_chars-wmap->max_chars < FST_GROW_MINCHARS )
785 	    tmp_max_chars += FST_GROW_MINCHARS;
786       if( wmap->chars + tmp_max_chars <= wmap->next_chars + len)
787 	    tmp_max_chars += len;
788 
789       tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
790     if(!tmp_chars) {
791           PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_in_rule.chars"));
792           return MAXwordID;
793       }
794       memcpy(tmp_chars, wmap->chars, wmap->max_chars * sizeof(char));
795       FREE(wmap->chars);
796 
797       wmap->chars = tmp_chars;
798       wmap->next_chars = wmap->chars + (old_next_chars - old_chars) ;
799       wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_chars);
800       wmap->max_chars = (wordID)tmp_max_chars;
801 
802       //Remove old keys
803       wordmap_clean ( wmap );
804 
805       // adjust word pointers wordmap_add_word_in_rule
806       for(i=0; i<wmap->num_words; i++)
807 	{
808  	  wmap->words[i] = wmap->chars +(wmap->words[i] - old_chars) ;
809 	  // adjust hashtable ----- add in rulewordmap_add_word_in_rule ----
810 	  if (wmap->wordIDForWord) {
811 	    ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
812 						     (void*)(int)(i), NULL );
813 	    if ( rc != ESR_SUCCESS )
814 	      goto CLEANUP;
815 	  }
816 	}
817 #else
818       PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
819       return MAXwordID;
820 #endif
821   }//wordmap_add_word_in_rule
822   if (wmap->num_words == wmap->max_words)
823     {
824 #if defined(FST_GROW_FACTOR)
825       char** tmp_words;
826       wordID tmp_max_words;
827       int itmp_max_words =  wmap->max_words * FST_GROW_FACTOR;
828       if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
829 	itmp_max_words += FST_GROW_MINWORDS;
830 
831       if( itmp_max_words >= MAXwordID) {
832 	PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
833 	return MAXwordID;
834       }
835 
836       tmp_max_words = (wordID)itmp_max_words;
837       tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
838       if(!tmp_words) {
839         PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_rule.words"));
840         return MAXwordID;
841       }
842       memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
843       FREE( wmap->words);
844       wmap->words = tmp_words;
845       wmap->max_words = tmp_max_words;
846 #else
847       PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
848       return MAXwordID;
849 #endif
850     }
851   if(1)
852   {
853     char *p;
854     const char *q;
855     /* word.1 */
856     for (p = wmap->next_chars, q = word; (*p = *q) != '\0'; p++, q++) ; /* basic strcpy() */
857     *p++ = IMPORTED_RULES_DELIM;
858     *p++ = (char)(rule + ((wordID)'0'));
859     *p   = '\0';
860     wmap->words[ wmap->num_words++] = wmap->next_chars;
861     wmap->next_chars += len;
862     wdID = (wordID)(wmap->num_words - (wordID)1);
863     if (wmap->wordIDForWord)
864       {
865         ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
866 						 (void*)(int)(wdID), NULL );
867 	if ( rc != ESR_SUCCESS )
868 	  goto CLEANUP;
869       }
870     return wdID;
871   }
872 CLEANUP:
873   PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
874   return MAXwordID;
875 }
876 
877 /*----------------------------------------------------------------------*
878  *                                                                      *
879  * API                                                                  *
880  *                                                                      *
881  *----------------------------------------------------------------------*/
882 
FST_AttachArbdata(srec_context * fst,srec_arbdata * allophone_tree)883 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree)
884 {
885   int rc = 0;
886   unsigned int allotree__modelid;
887 
888   fst->allotree = allophone_tree;
889   if( !allophone_tree)
890       return 1;
891   fst->hmm_info_for_ilabel = allophone_tree->hmm_infos - fst->hmm_ilabel_offset;
892   allotree__modelid = version_arbdata_models(allophone_tree);
893   if (allotree__modelid != 0 && fst->modelid != 0)
894   {
895     if (allotree__modelid != fst->modelid)
896     {
897       PLogError("Error: modelids disagree, sgcbaseline(%u) arbdata(%u)", fst->modelid, allotree__modelid);
898       rc = FST_FAILED_ON_INVALID_ARGS;
899     }
900   }
901 
902   return rc;
903 }
904 
FST_LoadGraph(srec_context * pfst,wordmap * imap,wordmap * omap,int num_words_to_add,PFile * fp)905 int FST_LoadGraph(srec_context* pfst, wordmap* imap, wordmap* omap,
906                   int num_words_to_add, PFile* fp)
907 {
908   int i, rc = 0;
909   char line[MAX_LINE_LENGTH], *args[32], nargs;
910   arcID max_num_FSMarcs;
911   arcID max_num_FSMnodes;
912   nodeID from_node, last_from_node, into_node, num_nodes, max_node_number = 0;
913   FSMarc_ptr atok =  (FSMarc_ptr)0;
914   FSMarc *atoken = NULL;
915   FSMnode *fr_node, *to_node;
916   costdata cost = FREEcostdata;
917   arcID num_arcs;
918 
919   srec_context* fst = pfst;
920   char *ilabel_str = 0, *olabel_str = 0;
921   long fpos;
922   arcID new_arc_id;
923   asr_int32_t temp;
924 
925 
926   /* determine number of arcs and nodes to allocate, add 50% for dynamic */
927   fpos = pftell(fp);
928   max_num_FSMnodes = 0;
929   pfst->modelid = 0;
930   for (max_num_FSMarcs = 0; pfgets(line, sizeof(line) / sizeof(char), fp); max_num_FSMarcs++)
931   {
932     if (strstr(line, "modelid:") == line)
933     {
934       char *p;
935       pfst->modelid = strtoul(line + strlen("modelid:"), &p, 10);
936     }
937     from_node = (nodeID)atoi_with_check(line, MAXnodeID);
938     if (from_node > max_num_FSMnodes)
939       max_num_FSMnodes = from_node;
940   }
941   pfseek(fp, fpos, SEEK_SET);
942   temp = max_num_FSMnodes + 1 /*why+1?*/ + num_words_to_add * AVG_NODES_PER_WORD;
943   if (temp >= MAXnodeID)
944   {
945     max_num_FSMnodes = MAXnodeID - 1;
946     PLogMessage("Warning: using max nodes instead\n");
947   }
948   else
949   {
950     max_num_FSMnodes = (nodeID)temp;
951   }
952   temp = max_num_FSMarcs + num_words_to_add * AVG_ARCS_PER_WORD;
953   if (temp >= MAXarcID)
954   {
955     max_num_FSMarcs = MAXarcID - 1;
956     PLogMessage("Warning: using max arcs instead\n");
957   }
958   else
959   {
960     max_num_FSMarcs = (arcID)temp;
961   }
962   fst->olabels = omap;
963   if (imap)
964   {
965     /* generally no imap is specified */
966     fst->ilabels = imap;
967     fst->hmm_ilabel_offset = wordmap_find_index(fst->ilabels, "hmm0");
968     ASSERT(fst->hmm_ilabel_offset >= 0);
969   }
970   else
971   {
972     fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
973     fst->ilabels->num_words = fst->ilabels->max_words = 0;
974     fst->ilabels->words = 0;
975     /* this is bad hard code, we can get this from the swiarb file, it is
976        equal to the number of phonemes (53 for enu, 39 for jpn) plus the special
977        models (eps,.wb,.ph,h#,#h,iwt,not) minus 1 ('cuz base number is 0) */
978     fst->hmm_ilabel_offset = 128; /* should match MAX_PHONEMES */
979   }
980   fst->FSMarc_list = (FSMarc*)CALLOC_CLR(max_num_FSMarcs, sizeof(FSMarc), "srec.graph.arcs");
981   fst->FSMnode_list = (FSMnode*)CALLOC_CLR(max_num_FSMnodes, sizeof(FSMnode), "srec.graph.nodes");
982   fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(max_num_FSMnodes, sizeof(char), "srec.graph.nodeinfos");
983 
984   /* setup the arc freelist */
985   fst->FSMarc_freelist = 0;
986   fst->num_arcs = 0;
987   for (i = 0; i < max_num_FSMarcs - 1; i++)
988   {
989     fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
990     fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
991   }
992   fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
993   fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
994 
995   /* initialize the nodes, 'cuz reading is random order */
996   fst->num_nodes = 0;
997   for (i = 0; i < max_num_FSMnodes; i++)
998   {
999     fr_node = &fst->FSMnode_list[i];
1000     fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
1001   }
1002 
1003   /* 1. first load up all the information */
1004   IF_DEBUG_WDADD(printf("load graph ... 1\n"));
1005   num_arcs = 0;
1006   last_from_node = MAXnodeID;
1007   while (pfgets(line, sizeof(line) / sizeof(char), fp))
1008   {
1009     if (strstr(line, "modelid:") == line) continue;
1010     IF_DEBUG_WDADD(printf("read arc %s", line));
1011     nargs = split_line(line, args);
1012     if (nargs >= 4)
1013     {
1014       from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1015       into_node = (nodeID)atoi_with_check(args[1], MAXnodeID);
1016       ilabel_str = args[2];
1017       olabel_str = args[3];
1018       cost = FREEcostdata;
1019       if (nargs == 5)
1020         PLogError(L("Warning: too many arguments on line %s"), line);
1021     }
1022     else if (nargs == 1)
1023     {
1024       from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1025       into_node = MAXnodeID;
1026       ilabel_str = 0;
1027       olabel_str = 0;
1028       cost = FREEcostdata;
1029     }
1030     else
1031     {
1032       from_node = into_node = 0;
1033       PLogError("can't parse line %s\n", line);
1034       ASSERT(0);
1035     }
1036 
1037     if (into_node == MAXnodeID)
1038     {
1039       fst->end_node = from_node;
1040     }
1041     else
1042     {
1043       new_arc_id = fst_get_free_arc(fst);
1044       if (new_arc_id == MAXarcID)
1045         return FST_FAILED_ON_MEMORY;
1046       atok = ARC_ItoX(new_arc_id);
1047       atoken = ARC_XtoP(atok);
1048       num_arcs++;
1049       fr_node = &fst->FSMnode_list[from_node];
1050       to_node = &fst->FSMnode_list[into_node];
1051       if (fst->ilabels->num_words == 0)
1052       {
1053         atoken->ilabel = hmm_number(ilabel_str, fst->hmm_ilabel_offset);
1054 	/* Xufang: if olabel_str is a slotname, change the input label to a no-silence and no-eps HMM. It is true that the weight of slot arcs is 999 and hmm will not take effect in the runtime, but it was found that when the slot is a loop, eps and silence will cause the recursive function fst_node_has_speech_to_come into a no-step-out status.*/
1055 	if(strstr(olabel_str, SLOTNAME_INDICATOR)!=NULL)
1056 	  atoken->ilabel = fst->hmm_ilabel_offset + SLOTLOOP_OFFSET;
1057       }
1058       else
1059       {
1060         atoken->ilabel = wordmap_find_index(fst->ilabels, ilabel_str);
1061       }
1062       atoken->olabel = wordmap_find_index(fst->olabels, olabel_str);
1063       /* if(g_fst_options.wtw_cost_override>0) {
1064       if(atoken->ilabel == WORD_BOUNDARY)
1065       atoken->cost = g_fst_options.wtw_cost_override;
1066       else
1067       atoken->cost = g_fst_options.arc_cost_override;
1068       } else  */
1069       atoken->cost = cost;
1070 #if DEBUG_WDADD
1071       atoken->ilabel_str = (atoken->ilabel < fst->ilabels->num_words ? fst->ilabels->words[atoken->ilabel] : 0);
1072       atoken->olabel_str = (atoken->olabel < fst->olabels->num_words ? fst->olabels->words[atoken->olabel] : 0);
1073 #endif
1074       append_arc_leaving_node(fst, fr_node, atok);
1075       if (into_node > max_node_number)
1076         max_node_number = into_node;
1077       append_arc_arriving_node(fst, to_node, atok);
1078       atoken->fr_node = NODE_ItoX(from_node);
1079       atoken->to_node = NODE_ItoX(into_node);
1080     }
1081     last_from_node = from_node;
1082   }
1083   ASSERT(fst->num_arcs == num_arcs);
1084 
1085   /* setup the node freelist */
1086   IF_DEBUG_WDADD(printf("load graph ... 6\n"));
1087   num_nodes = (nodeID)(max_node_number + 1);
1088   if( max_num_FSMnodes > num_nodes) {
1089   fst->FSMnode_freelist = num_nodes;
1090   for (i = num_nodes; i < (max_num_FSMnodes - 1); i++)
1091   {
1092     fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
1093     fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1094   }
1095 	if (i == (max_num_FSMnodes - 1)) {
1096      fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
1097      fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1098     }
1099   } else
1100 	fst->FSMnode_freelist  = FSMNODE_NULL;
1101 
1102   /* some book-keeping stuff */
1103   IF_DEBUG_WDADD(printf("load graph ... 7\n"));
1104   fst->num_base_arcs = fst->num_arcs = num_arcs;
1105   fst->FSMarc_list_len = max_num_FSMarcs;
1106   fst->FSMnode_list_len = max_num_FSMnodes;
1107   fst->num_base_nodes = fst->num_nodes = num_nodes;
1108   fst->start_node = 0;
1109   /* fst->end_node = 0; this is set up above */
1110 
1111   /* beg and end word labels */
1112   fst->beg_silence_word = wordmap_find_index(fst->olabels, "-pau-");
1113   fst->end_silence_word = wordmap_find_index(fst->olabels, "-pau2-");
1114   fst->hack_silence_word = wordmap_find_index(fst->olabels, "silence");
1115 
1116   /* set the search limitations, real ones will be registered later */
1117   fst->max_searchable_nodes = 0;
1118   fst->max_searchable_arcs = 0;
1119 
1120   rc = fst_alloc_transit_points(fst);
1121   return (rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS);
1122 }
1123 
FST_UnloadGraph(srec_context * pfst)1124 int FST_UnloadGraph(srec_context* pfst)
1125 {
1126   if (pfst->ilabels)
1127     FREE(pfst->ilabels);
1128   FREE(pfst->FSMarc_list);
1129   FREE(pfst->FSMnode_list);
1130   FREE(pfst->FSMnode_info_list);
1131   pfst->FSMarc_list = 0;
1132   pfst->FSMnode_list = 0;
1133   pfst->FSMnode_info_list = 0;
1134   return FST_SUCCESS;
1135 }
1136 
FST_DumpGraph(srec_context * fst,PFile * fp)1137 int FST_DumpGraph(srec_context* fst, PFile* fp)
1138 {
1139   int rc = 0;
1140   nodeID i;
1141   FSMarc_ptr atok;
1142   FSMarc* atoken;
1143   nodeID from_node, into_node;
1144   FSMnode* ntoken;
1145   char *ilabel, *olabel;
1146 
1147   for (i = 0; i < fst->num_nodes; i++)
1148   {
1149     from_node = i;
1150     ntoken = &fst->FSMnode_list[i];
1151     if (ntoken->first_prev_arc == FSMARC_FREE)
1152       continue;
1153     if (ntoken->un_ptr.first_next_arc != FSMARC_NULL)
1154     {
1155       for (atok = ntoken->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
1156       {
1157         char buf[32];
1158         atoken = ARC_XtoP(atok);
1159         into_node = NODE_XtoI(atoken->to_node);
1160         ilabel = fst->ilabels->num_words == 0 ?
1161                  hmm_name(atoken->ilabel, fst->hmm_ilabel_offset, buf) :
1162                  fst->ilabels->words[atoken->ilabel] ;
1163         olabel = fst->olabels->words[atoken->olabel];
1164 
1165         if (atoken->cost != FREEcostdata)
1166         {
1167           /* regular arc */
1168           pfprintf(fp, "%hu\t%hu\t%s\t%s\t%hu\n",
1169                   from_node, into_node, ilabel, olabel, atoken->cost);
1170         }
1171         else
1172         {
1173           /* regular zero cost arc */
1174           pfprintf(fp, "%hu\t%hu\t%s\t%s\n",
1175                   from_node, into_node, ilabel, olabel);
1176         }
1177       }
1178     }
1179     else
1180     {
1181       pfprintf(fp, "%hu\n", from_node);
1182     }
1183   }
1184   return rc;
1185 }
1186 
FST_AddWordToGrammar(srec_context * fst,const char * _slot,const char * word,const char * pron,const int cost)1187 int FST_AddWordToGrammar(srec_context* fst, const char* _slot,
1188                          const char* word, const char* pron,
1189                          const int cost)
1190 {
1191   int num_pron_ampersands = 0, num_prons_added = 0, num_words_added = 0;
1192   int i, pron_len, model_sequence_len;
1193   modelID model_sequence[MAX_PRON_LEN];
1194   char phoneme_sequence[MAX_PRON_LEN];
1195   wordID olabel = MAXwordID;
1196   int irc, rc = FST_SUCCESS;
1197   nodeID start_node = MAXnodeID, end_node = MAXnodeID;
1198   char veslot[MAX_WORD_LEN];
1199 #if USE_HMM_BASED_ENROLLMENT
1200   const char* Tpron;
1201 #endif
1202 
1203   /* The addword from voice enroll still use @, and to test voice enroll, @ sign was removed and __ was added at the begining and ending of a word. Xufang */
1204   if(_slot[0] == '@') {
1205     strcpy(veslot,SLOTNAME_INDICATOR);
1206     strcat(veslot,_slot+1);
1207     strcat(veslot,SLOTNAME_INDICATOR);
1208   } else
1209     strcpy(veslot, _slot);
1210 
1211 #if USE_COMP_STATS
1212   if (!comp_stats)
1213     comp_stats = init_comp_stats1();
1214   start_cs_clock1(&comp_stats->word_addition);
1215 #endif
1216 
1217   /* we expect developers to call AddWord with the same slot many times,
1218   so we cache the slot start and end nodes and re-use on subseq calls */
1219   if( fst->addWordCaching_lastslot_num == MAXwordID )
1220     fst->addWordCaching_lastslot_name = NULL;
1221   else {
1222     fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1223     fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1224     ASSERT( fst->addWordCaching_lastslot_name);
1225   }
1226   if( fst->addWordCaching_lastslot_name==NULL || strcmp( fst->addWordCaching_lastslot_name, veslot)) {
1227 
1228     fst->addWordCaching_lastslot_num = wordmap_find_rule_index(fst->olabels, veslot);
1229     /* olabel not found is a user error */
1230     if (fst->addWordCaching_lastslot_num == MAXwordID)
1231     {
1232       size_t i;
1233 
1234       pfprintf(PSTDOUT, L("error: slot '%s' not found among ["), veslot);
1235       for (i = 1; i < (size_t) fst->olabels->num_slots; ++i)
1236         pfprintf(PSTDOUT, "%s, ", fst->olabels->words[i]);
1237       pfprintf(PSTDOUT, L("] possible\n"));
1238       rc = FST_FAILED_ON_INVALID_ARGS;
1239       goto RETRC;
1240     }
1241 
1242     fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1243     fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1244     ASSERT(fst->addWordCaching_lastslot_name);
1245 
1246     /* now find where in the graph this slot is referenced, note that
1247        there might be more than one place, but here we ignore multiples */
1248     for (i = fst->num_fsm_exit_points; --i >= 0;)
1249       {
1250 	arcID arcid = fst->fsm_exit_points[i].arc_index;
1251 	if (fst->FSMarc_list[arcid].olabel == fst->addWordCaching_lastslot_num)
1252 	  {
1253 	    FSMarc* arc;
1254 	    FSMnode* node;
1255 	    start_node = fst->fsm_exit_points[i].from_node_index;
1256 	    end_node = fst->fsm_exit_points[i].wbto_node_index;
1257 	    fst->addWordCaching_lastslot_needs_post_silence = ESR_TRUE;
1258 	    fst->addWordCaching_lastslot_ifsm_exit_point = i;
1259 	    node = &fst->FSMnode_list[ end_node];
1260 	    arc = &fst->FSMarc_list[node->un_ptr.first_next_arc];
1261 	    if (arc->olabel == fst->end_silence_word && arc->linkl_next_arc == MAXarcID)
1262 	      fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
1263 	    break;
1264 	  }
1265       }
1266 
1267     /* not found in the graph is an internal error, coding error */
1268     if (i < 0 || start_node>=fst->num_nodes || end_node>=fst->num_nodes)
1269       {
1270 	PLogError("error: (internal) finding olabel %d %d %d\n", fst->addWordCaching_lastslot_num,
1271 		  start_node, end_node);
1272 	goto RETRC;
1273       }
1274   } /* cached or not */
1275 
1276   i = fst->addWordCaching_lastslot_ifsm_exit_point;
1277   start_node = fst->fsm_exit_points[i].from_node_index;
1278   end_node = fst->fsm_exit_points[i].wbto_node_index;
1279 
1280   /* now start_node and end_node are known */
1281   if (!word || !*word || !pron || !*pron)
1282     {
1283       rc = FST_FAILED_ON_INVALID_ARGS;
1284       PLogError("error: null word/pron on input to FST_AddWordToGrammar()\n");
1285       goto RETRC;
1286     }
1287 
1288   /* from here */
1289   IF_DEBUG_WDADD(printf("Adding %s %s\n", word, (const char*)pron));
1290 
1291   /* loop over all prons, we break when we hit the double-null (hence the +1) */
1292   for( ; (*pron)!='\0'; pron+=(pron_len+1)) {
1293     pron_len = strlen((char*)pron);
1294     if (pron_len >= MAX_PRON_LEN)
1295       {
1296 	PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1297 	rc = FST_FAILED_ON_INVALID_ARGS;
1298 	goto RETRC;
1299       }
1300 
1301     /* check for searchability after adding, estimating at most pron_len
1302        arcs and nodes will be added */
1303     if (fst->num_arcs + pron_len > fst->max_searchable_arcs)
1304       {
1305 	PLogError("error: wordadd failed on word %s due to %d arc search limit\n",
1306 		  word, fst->max_searchable_arcs);
1307 	rc = FST_FAILED_ON_MEMORY;
1308 	goto RETRC;
1309       }
1310 
1311     if (fst->num_nodes + pron_len > fst->max_searchable_nodes)
1312       {
1313 	PLogError("error: wordadd failed on word %s due to %d node search limit\n",
1314 		  word, fst->max_searchable_nodes);
1315 	rc = FST_FAILED_ON_MEMORY;
1316 	goto RETRC;
1317       }
1318 
1319     /* add the word if necessary, !FST_SUCCESS is allowed if we're just adding
1320        an alternate pronunciation */
1321     olabel = wordmap_find_index_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1322     if (olabel == MAXwordID)
1323       {
1324 	olabel = wordmap_add_word_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1325 	if (olabel != MAXwordID)
1326 	  num_words_added++;
1327       }
1328     if (olabel == MAXwordID)
1329       {
1330 	PLogError("error: wordmap_add_word failed\n");
1331 	rc = FST_FAILED_ON_MEMORY;
1332 	goto RETRC;
1333       }
1334 
1335 #if USE_HMM_BASED_ENROLLMENT
1336     // pron should be converted to model_sequence, model_sequence_len
1337 #define VETRANS_PREFIX "wd_hmm"
1338 #define VETRANS_PREFIX_LEN 6
1339     if( LSTRSTR(pron, VETRANS_PREFIX)!=NULL) {
1340       // printf("Debug-pron: %d, %s\n",pron_len, pron);
1341       model_sequence_len=0;
1342       for(Tpron=pron; (Tpron=strstr(Tpron, VETRANS_PREFIX))!=NULL; ){
1343 	// Tpron = strstr(Tpron, "wd_hmm");
1344 	Tpron += VETRANS_PREFIX_LEN; // skip over "wd_hmm"
1345 	// Copy hmm number (56,132,...)
1346 	model_sequence[ model_sequence_len] = atoi_with_check(Tpron, MAXmodelID);
1347 	model_sequence_len++;
1348       }
1349 
1350 
1351       /* we need to append silence at the end since we might be dealing with a slot
1352 	 that has ensuing speech, formerly we only dealt with ROOT which defacto
1353 	 was followed by the pau2 silence, so ROOT did not need its own */
1354       if (fst->addWordCaching_lastslot_needs_post_silence)
1355 	model_sequence[model_sequence_len++] = 3; // <<< hmm3_sil !!! ugly hard-code here
1356 
1357       /* append the word boundary */
1358       model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1359       /* translate to input label ids */
1360       for (i = 0; i < model_sequence_len; i++)
1361 	{
1362 	  if (model_sequence[i] >= EPSILON_OFFSET)
1363 	    model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1364 	}
1365     }
1366     else
1367 #endif
1368       {
1369 	pron_len = strlen((char*)pron);
1370 	if (pron_len >= MAX_PRON_LEN)
1371 	  {
1372 	    PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1373 	    rc = FST_FAILED_ON_INVALID_ARGS;
1374 	    goto RETRC;
1375 	  }
1376 
1377 	for (i = 0; i < pron_len; i++)
1378 	  {
1379 	    if (pron[i] == OPTSILENCE_CODE)
1380 	      {
1381 		phoneme_sequence[i] = SILENCE_CODE;
1382 		num_pron_ampersands++;
1383 	      }
1384 	    else
1385 	      phoneme_sequence[i] = pron[i];
1386 	  }
1387 	/* we need to append silence at the end since we might be dealing with a slot
1388 	   that has ensuing speech, formerly we only dealt with ROOT which defacto
1389 	   was followed by the pau2 silence, so ROOT did not need its own */
1390 	if (fst->addWordCaching_lastslot_needs_post_silence)
1391 	  phoneme_sequence[i++] = SILENCE_CODE;
1392 
1393 	model_sequence_len = i;
1394 	irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1395 	/* check for bad sequence of phonemes */
1396 	if (irc)
1397 	  {
1398 	    PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, irc);
1399 	    rc = FST_FAILED_ON_INVALID_ARGS;
1400 	    goto RETRC;
1401 	  }
1402 	IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1403 
1404 	/* append the word boundary */
1405 	model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1406 	/* translate to input label ids */
1407 	for (i = 0; i < model_sequence_len; i++)
1408 	  {
1409 	    if (model_sequence[i] >= EPSILON_OFFSET)
1410 	      model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1411 	  }
1412       } /*  end of ph_t ph_r .. type decoding of the model  sequence */
1413 
1414     /* now do the actual addition */
1415     rc = fst_add_arcs(fst, start_node, end_node, olabel, (costdata)cost, model_sequence, model_sequence_len);
1416     if (rc == FST_SUCCESS)
1417       {
1418 	num_prons_added++;
1419       }
1420     else if (rc == FST_FAILED_ON_HOMONYM)
1421       {
1422 	/* maybe the another pron will work? */
1423       }
1424     else
1425       {
1426 	PLogMessage("error: fst_add_arcs() failed adding word %s pron %s ('&' as 'iwt')\n", word, (char*)pron);
1427 	goto RETRC;
1428       }
1429 
1430     /* second add the pron with no silences,
1431        this only applies to true prons, not wd_hmm333 wd_hmm222 type prons */
1432 
1433     if (num_pron_ampersands > 0)
1434       {
1435 	for (i = 0, model_sequence_len = 0; i < pron_len; i++)
1436 	  {
1437 	    if (pron[i] != OPTSILENCE_CODE)
1438 	      phoneme_sequence[model_sequence_len++] = pron[i];
1439 	  }
1440 
1441 	irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1442 	/* check for bad sequence of phonemes */
1443 	if (irc)
1444 	  {
1445 	    PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, rc);
1446 	    rc = FST_FAILED_ON_INVALID_ARGS;
1447 	    goto RETRC;
1448 	  }
1449 	else
1450 	  {
1451 	    IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1452 
1453 	    /* append the word boundary */
1454 	    model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1455 	    /* translate to input label ids */
1456 	    for (i = 0; i < model_sequence_len; i++)
1457 	      {
1458 		if (model_sequence[i] >= EPSILON_OFFSET)
1459 		  model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1460 	      }
1461 	    /* now do the actual addition */
1462 	    rc = fst_add_arcs(fst, start_node, end_node,
1463 			      olabel, (costdata)cost, model_sequence, model_sequence_len);
1464 
1465 	    if (rc == FST_SUCCESS)
1466 	      {
1467 		num_prons_added++;
1468 	      }
1469 	    else if (rc == FST_FAILED_ON_HOMONYM)
1470 	      {
1471 		/* maybe another pron worked? */
1472 	      }
1473 	    else
1474 	      {
1475 		PLogMessage("Warning: fst_add_arcs() failed while adding "
1476 			    "word %s pron %s (skip '&')\n", word, (char*)pron);
1477 		goto RETRC;
1478 	      }
1479 	  }
1480       }
1481   }
1482 
1483 
1484  RETRC:
1485   /* set this to make sure that FST_Prepare gets called after add word */
1486   fst->whether_prepared = 0;
1487 #if USE_COMP_STATS
1488   end_cs_clock1(&comp_stats->word_addition, 1);
1489 #endif
1490 
1491   if (rc < 0 && rc != FST_FAILED_ON_HOMONYM)
1492     return rc;
1493 
1494   if (num_prons_added == 0 && num_words_added > 0)
1495     {
1496       return FST_FAILED_ON_HOMONYM;
1497     }
1498   else if (num_prons_added == 0 && num_words_added == 0)
1499     {
1500       return FST_FAILED_ON_HOMOGRAPH;
1501     }
1502   else if (num_prons_added > 0 && num_words_added == 0)
1503     {
1504       return FST_SUCCESS_ON_OLD_WORD;
1505     }
1506   else
1507     {
1508       /* if(num_prons_added>0 && num_words_added>0) */
1509       return FST_SUCCESS;
1510     }
1511 }
1512 
1513 /* remove arcs leaving a node, arcs added via word add, are just discarded
1514 to be cleaned up for re-use by other functions */
remove_added_arcs_leaving(srec_context * fst,nodeID ni)1515 void remove_added_arcs_leaving(srec_context* fst, nodeID ni)
1516 {
1517   FSMnode* node = &fst->FSMnode_list[ ni];
1518   FSMarc *arc = NULL, *arc2;
1519   arcID *pai, ai, ai2;
1520   for (pai = &node->un_ptr.first_next_arc, ai = (*pai); ai != MAXarcID; pai = &arc->linkl_next_arc, ai = (*pai))
1521   {
1522     if (ai < fst->num_base_arcs)
1523     {
1524       arc = &fst->FSMarc_list[ai];
1525     }
1526     else
1527     {
1528       arc2 = &fst->FSMarc_list[ai];
1529       for (ai2 = arc2->linkl_next_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1530            ai2 = arc2->linkl_next_arc)
1531       {
1532         arc2 = &fst->FSMarc_list[ai2];
1533       }
1534       *pai = ai2;
1535     }
1536   }
1537 }
1538 
1539 /* remove arcs arriving at a node, arcs added via word add, are just discarded
1540    to be cleaned up for re-use by other functions */
remove_added_arcs_arriving(srec_context * fst,nodeID ni)1541 void remove_added_arcs_arriving(srec_context* fst, nodeID ni)
1542 {
1543   FSMnode* node = &fst->FSMnode_list[ni];
1544   FSMarc *arc = NULL, *arc2;
1545   arcID *pai, ai, ai2;
1546   for (pai = &node->first_prev_arc, ai = (*pai); ai != MAXarcID;
1547        pai = &arc->linkl_prev_arc, ai = (*pai))
1548   {
1549     if (ai < fst->num_base_arcs)
1550     {
1551       arc = &fst->FSMarc_list[ai];
1552     }
1553     else
1554     {
1555       arc2 = &fst->FSMarc_list[ai];
1556       for (ai2 = arc2->linkl_prev_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1557            ai2 = arc2->linkl_prev_arc)
1558       {
1559         arc2 = &fst->FSMarc_list[ai2];
1560       }
1561       *pai = ai2;
1562     }
1563   }
1564 }
1565 
1566 /* reset greammar, but resetting all the arcs, nodes and words added
1567    via the addword functions */
1568 
FST_ResetGrammar(srec_context * fst)1569 int FST_ResetGrammar(srec_context* fst)
1570 {
1571   int i, rc = 0;
1572   nodeID fst_slot_start_node, fst_slot_end_node;
1573   wordID fst_slot_slotnum;
1574   arcID ai;
1575   nodeID ni2, ni3, ni4, ni3a;
1576   FSMnode_t *node, *node2, *node3;
1577   FSMarc_t *arc, *arc2, *arc3;
1578 
1579   /*fst_slot_slotnum = wordmap_find_rule_index(fst->olabels, slot);*/
1580   for (fst_slot_slotnum = 1; fst_slot_slotnum < fst->olabels->num_slots;
1581        fst_slot_slotnum++)
1582   {
1583 
1584     if (fst_slot_slotnum == MAXwordID)
1585     {
1586       char *slot = "";
1587       PLogError("error: slot '%s' not found among [%d,%d] possible\n", slot, 1, fst->olabels->num_slots - 1);
1588       return (rc = FST_FAILED_ON_INVALID_ARGS);
1589     }
1590 
1591     /* now find where in the graph this slot is referenced, note that
1592        there might be more than one place, but here we ignore multiples */
1593     fst_slot_start_node = MAXnodeID;
1594     fst_slot_end_node = MAXnodeID;
1595     for (i = fst->num_fsm_exit_points; --i >= 0;)
1596     {
1597       ai = fst->fsm_exit_points[i].arc_index;
1598       if (fst->FSMarc_list[ai].olabel == fst_slot_slotnum)
1599       {
1600         fst_slot_start_node = fst->fsm_exit_points[i].from_node_index;
1601         fst_slot_end_node = fst->fsm_exit_points[i].wbto_node_index;
1602       }
1603     }
1604 
1605     /* this 'slot' could be the root rule, which can't be removed */
1606     if (fst_slot_start_node == MAXnodeID || fst_slot_end_node == MAXnodeID)
1607       continue;
1608 
1609     /*
1610       start                     end
1611       node   node2    node3   node4
1612        o--@N-->o--sil-->o--wb-->o
1613          arc     arc2   | arc3  ^
1614                  |       |wb
1615                  \__sil->o
1616     arc4   ni3a
1617     */
1618 
1619     remove_added_arcs_leaving(fst, fst_slot_start_node);
1620     node = &fst->FSMnode_list[ fst_slot_start_node];
1621     for (ai = node->un_ptr.first_next_arc; ai != MAXarcID; ai = arc->linkl_next_arc)
1622     {
1623       arc = &fst->FSMarc_list[ai];
1624       if (arc->olabel != fst_slot_slotnum)
1625         continue;
1626 
1627       ni2 = arc->to_node;
1628       remove_added_arcs_arriving(fst, ni2);
1629       if (ni2 == fst_slot_end_node)
1630         continue;
1631       node2 = &fst->FSMnode_list[ni2];
1632       arc2 = &fst->FSMarc_list[ node2->un_ptr.first_next_arc];
1633 
1634 
1635       ni3 = arc2->to_node;
1636       remove_added_arcs_arriving(fst, ni3);
1637       if (ni3 == fst_slot_end_node)
1638         continue;
1639       node3 = &fst->FSMnode_list[ni3];
1640       arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1641       while (arc3->linkl_next_arc != MAXarcID)
1642       {
1643         arc3 = &fst->FSMarc_list[arc3->linkl_next_arc];
1644         ni3a = arc3->to_node;
1645         remove_added_arcs_arriving(fst, ni3a);
1646       }
1647       arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1648 
1649       ni4 = arc3->to_node;
1650       remove_added_arcs_arriving(fst, ni4);
1651       ASSERT(ni4 == fst_slot_end_node);
1652       if (ni4 == fst_slot_end_node)
1653         continue;
1654     }
1655   }
1656 
1657   /* reset the freelist for nodes */
1658   if( fst->num_nodes == fst->num_base_nodes )
1659     {}
1660   else{
1661     FSMnode *ntoken;
1662     FSMnode* tmp_FSMnode_list;
1663     FSMnode_info* tmp_FSMnode_info_list;
1664 
1665 	fst->FSMnode_freelist = MAXnodeID;
1666     fst->num_nodes = fst->FSMnode_list_len = fst->num_base_nodes;
1667 
1668 	tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
1669     if(!tmp_FSMnode_list){
1670      PLogError("ERROR: Could NOT reset the memory for srec.graph.nodes");
1671      return FST_FAILED_ON_MEMORY;
1672      }
1673     memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
1674 
1675     /* scan to the end of the free list (to be re-used) */
1676     nodeID* last_free_node = (&fst->FSMnode_freelist);
1677 	ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
1678 
1679 	for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
1680 		 ntoken = &tmp_FSMnode_list[ *last_free_node];
1681 
1682 	FREE( fst->FSMnode_list);
1683     tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
1684     if(!tmp_FSMnode_info_list){
1685      PLogError("ERROR: Could NOT reset the memory for srec.graph.nodeinfos");
1686      return FST_FAILED_ON_MEMORY;
1687      }
1688 	// copy in node info
1689 	memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
1690 
1691 
1692 	FREE( fst->FSMnode_info_list);
1693 	fst->FSMnode_info_list = tmp_FSMnode_info_list;
1694     fst->FSMnode_list = tmp_FSMnode_list;
1695   }
1696 
1697   /*ni = fst->FSMnode_freelist = fst->num_base_nodes;
1698     node = &fst->FSMnode_list[ni];
1699     for (; ni < fst->FSMnode_list_len - 1; ni++, node++)
1700        node->un_ptr.next_node = (nodeID)(ni + 1);
1701     node->un_ptr.next_node = MAXnodeID;
1702     fst->num_nodes = fst->num_base_nodes;*/
1703 
1704   /* reset the freelist for arcs */
1705   if( fst->num_arcs == fst->num_base_arcs )
1706     {}
1707   else {
1708     FSMarc* atoken = NULL;
1709     FSMarc* tmp_FSMarc_list;
1710 
1711     fst->num_arcs = fst->num_base_arcs;
1712     fst->FSMarc_list_len = fst->num_base_arcs;
1713     fst->FSMarc_freelist = MAXarcID;
1714     tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
1715     if(!tmp_FSMarc_list){
1716       PLogError("ERROR: Could NOT reset the memory for srec.graph.arcs");
1717      return FST_FAILED_ON_MEMORY;
1718     }
1719     memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
1720 
1721     /* scan to the end of the free list (to be re-used) */
1722     arcID* last_free_arc = &fst->FSMarc_freelist;
1723     atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
1724 
1725     for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
1726       atoken = &tmp_FSMarc_list[ *last_free_arc];
1727 
1728     FREE( fst->FSMarc_list);
1729     fst->FSMarc_list = tmp_FSMarc_list;
1730   }
1731 
1732   /*ai = fst->FSMarc_freelist = fst->num_base_arcs;
1733     arc = &fst->FSMarc_list[ai];
1734     for (; ai < fst->FSMarc_list_len - 1; ai++, arc++)
1735     arc->linkl_next_arc = (arcID)(ai + 1);
1736     arc->linkl_next_arc = MAXarcID;
1737     fst->num_arcs = fst->num_base_arcs;
1738     fst->FSMarc_list_len = fst->num_base_arcs;*/
1739 
1740   /* now remove all the added words */
1741   wordmap_reset(fst->olabels);
1742   return FST_SUCCESS;
1743 }
1744 
1745 /* See the description of arc_tokens in the header file. These
1746    arcs are nodeless, which make loading from a file that
1747    references nodes a little harder.  We need to do it in stages.
1748    Later, when we load from binary image it should be much simpler.
1749 */
1750 
get_first_arc_leaving_node(arc_token * arc_token_list,arcID num_arcs,nodeID node)1751 arc_token_lnk get_first_arc_leaving_node(arc_token* arc_token_list,
1752     arcID num_arcs,
1753     nodeID node)
1754 {
1755   arcID i;
1756   for (i = 0; i < num_arcs; i++)
1757   {
1758     if ((nodeID)(int)arc_token_list[i].next_token_index == node)
1759       return ARC_TOKEN_LNK(arc_token_list, i);
1760   }
1761   return ARC_TOKEN_NULL;
1762 }
1763 
FST_LoadReverseWordGraph(srec_context * context,int num_words_to_add,PFile * fp)1764 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add, PFile* fp)
1765 {
1766   arcID i;
1767   char line[MAX_LINE_LENGTH];
1768   char word_label_as_str[128];
1769   arcID num_alloc_arc_tokens;
1770   nodeID from_node, into_node;
1771   labelID word_label = MAXwordID;
1772   arc_token *atoken, *last_atoken;
1773   costdata cost;
1774   arcID num_arcs;
1775   arc_token *arc_token_list, *tmp;
1776   long fpos;
1777 
1778   /* determine number of word arcs to allocate */
1779   fpos = pftell(fp);
1780   for (num_arcs = 0; pfgets(line, MAX_LINE_LENGTH, fp);  num_arcs++) ;
1781   num_alloc_arc_tokens = num_arcs;
1782   num_alloc_arc_tokens = (arcID)(num_arcs + num_words_to_add);
1783   pfseek(fp, fpos, SEEK_SET);
1784 
1785   context->arc_token_list = (arc_token*)CALLOC_CLR(num_alloc_arc_tokens, sizeof(arc_token), "srec.graph.wordgraph");
1786   arc_token_list = context->arc_token_list;
1787 
1788   /* 1. first load up all the information */
1789   i = 0;
1790   while (pfgets(line, MAX_LINE_LENGTH, fp))
1791   {
1792     if (sscanf(line, "%hu\t%hu\t%s", &from_node, &into_node, word_label_as_str) == 3)
1793     {
1794       word_label = wordmap_find_index(context->olabels, word_label_as_str);
1795       // ASSERT(word_label >= 0);
1796       cost = FREEcostdata;
1797     }
1798     else if (sscanf(line, "%hu", &from_node) == 1)
1799     {
1800       into_node = MAXnodeID;
1801       word_label = MAXwordID;
1802       cost = FREEcostdata;
1803     }
1804     else
1805     {
1806       PLogError("FST_LoadReverseWordGraph() .. can't parse line %s\n", line);
1807       ASSERT(0);
1808     }
1809     atoken = &arc_token_list[i];
1810     i++;
1811     atoken->ilabel = word_label;
1812 	/* atoken->olabel = WORD_EPSILON_LABEL; */
1813     /*atoken->cost = cost; cost is not used for now */
1814 #if DEBUG_ASTAR
1815     atoken->label = (word_label == MAXwordID ? 0 : wmap->words[word_label]);
1816 #endif
1817     atoken->first_next_arc = (arc_token_lnk)into_node;
1818     atoken->next_token_index = (arc_token_lnk)from_node;
1819   }
1820   num_arcs = i;
1821 
1822   /* 2. now do the internal cross references */
1823   for (i = 0; i < num_arcs; i++)
1824   {
1825     atoken = &arc_token_list[i];
1826     into_node = (nodeID)atoken->first_next_arc;
1827     if (into_node == MAXnodeID)
1828       atoken->first_next_arc = ARC_TOKEN_NULL;
1829     else
1830       atoken->first_next_arc = get_first_arc_leaving_node(arc_token_list, num_arcs, (nodeID)atoken->first_next_arc);
1831   }
1832 
1833   /* 3. now do more internal cross refs */
1834   last_atoken = &arc_token_list[0];
1835   for (i = 1; i < num_arcs; i++)
1836   {
1837     atoken = &arc_token_list[i];
1838     if (atoken->next_token_index != last_atoken->next_token_index)
1839     {
1840       last_atoken->next_token_index = ARC_TOKEN_NULL;
1841     }
1842     else
1843     {
1844       last_atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i);
1845     }
1846     last_atoken = atoken;
1847   }
1848   last_atoken->next_token_index = ARC_TOKEN_NULL;
1849 
1850 #if DEBUG_ASTAR
1851   /* under debug, it's nice to be able to see the words leaving the
1852      destination node, they are stored sequentially in the debug ary */
1853   for (i = 0; i < num_arcs; i++)
1854   {
1855     char * p;
1856     atoken = &arc_token_list[i];
1857     atoken->debug[0] = 0;
1858     for (tmp = ARC_TOKEN_LNK(arc_token_list, atoken->first_next_arc); tmp != NULL;
1859          tmp = ARC_TOKEN_PTR(arc_token_list, tmp->next_token_index))
1860     {
1861       if (tmp->first_next_arc == ARC_TOKEN_NULL)
1862       {
1863         p = "END";
1864       }
1865       else if (!tmp->label)
1866       {
1867         p = "NULL";
1868       }
1869       else
1870       {
1871         p = tmp->label;
1872       }
1873       if (strlen(atoken->debug) + strlen(p) + 6 < 64)
1874       {
1875         strcat(atoken->debug, p);
1876         strcat(atoken->debug, " ");
1877       }
1878       else
1879       {
1880         strcat(atoken->debug, "...");
1881         break;
1882       }
1883     }
1884   }
1885 #endif
1886   context->arc_token_list_len = num_alloc_arc_tokens;
1887   if (num_alloc_arc_tokens > num_arcs)
1888   {
1889     atoken = &context->arc_token_list[num_arcs];
1890     for (i = num_arcs; i < num_alloc_arc_tokens; i++, atoken++)
1891     {
1892       atoken->first_next_arc = ARC_TOKEN_NULL;
1893       atoken->ilabel         = MAXwordID;
1894       atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i + 1);/*atoken+1;*/
1895     }
1896     atoken--;
1897     atoken->next_token_index = ARC_TOKEN_NULL;
1898     context->arc_token_freelist = &context->arc_token_list[num_arcs];
1899   }
1900   else
1901   {
1902     context->arc_token_freelist = NULL;
1903   }
1904   /* the insert start is the arc that goes to the end */
1905   atoken = context->arc_token_list;
1906   atoken = ARC_TOKEN_PTR(arc_token_list, atoken->first_next_arc);
1907   for (; atoken; atoken = ARC_TOKEN_PTR(arc_token_list, atoken->next_token_index))
1908   {
1909     if (atoken->first_next_arc == ARC_TOKEN_NULL)
1910       break;
1911     tmp = ARC_TOKEN_PTR(context->arc_token_list, atoken->first_next_arc);
1912     if (tmp->ilabel == MAXwordID && tmp->first_next_arc == ARC_TOKEN_NULL)
1913       break;
1914   }
1915   context->arc_token_insert_start = atoken;
1916 
1917   return FST_SUCCESS;
1918 }
1919 
FST_UnloadReverseWordGraph(srec_context * context)1920 int FST_UnloadReverseWordGraph(srec_context* context)
1921 {
1922   FREE(context->arc_token_list);
1923   context->arc_token_list = 0;
1924   return FST_SUCCESS;
1925 }
1926 
1927 /*----------------------------------------------------------------------*
1928  *                                                                      *
1929  * general                                                              *
1930  *                                                                      *
1931  *----------------------------------------------------------------------*/
1932 
split_line(char * line,char ** av)1933 char split_line(char* line, char** av)
1934 {
1935   int i = 0;
1936   av[i] = strtok(line, "\n\r\t ");
1937   for (; av[i];)
1938   {
1939     av[++i] = strtok(NULL, "\n\r\t ");
1940   }
1941   return i;
1942 }
1943 
atoi_with_check(const char * buf,asr_int32_t mymax)1944 asr_int32_t atoi_with_check(const char* buf, asr_int32_t mymax)
1945 {
1946   asr_int32_t ans = atol(buf);
1947   ASSERT(ans < mymax);
1948   return ans;
1949 }
1950 
1951 /*----------------------------------------------------------------------*
1952  *                                                                      *
1953  * fst                                                                  *
1954  *                                                                      *
1955  *----------------------------------------------------------------------*/
1956 
fst_get_free_arc(srec_context * fst)1957 arcID fst_get_free_arc(srec_context* fst)
1958 {
1959   FSMarc* atoken;
1960   arcID atokid = fst->FSMarc_freelist;
1961   if (atokid == MAXarcID)
1962   {
1963     PLogError("error: ran out of arcs\n");
1964     return atokid;
1965   }
1966   atoken = &fst->FSMarc_list[atokid];
1967   fst->FSMarc_freelist = ARC_XtoI(atoken->linkl_next_arc);
1968   memset(atoken, 0, sizeof(FSMarc));
1969   atoken->to_node = FSMNODE_NULL;
1970   atoken->fr_node = FSMNODE_NULL;
1971   atoken->linkl_next_arc = FSMARC_NULL;
1972   atoken->linkl_prev_arc = FSMARC_NULL;
1973   atoken->ilabel = 0;
1974   atoken->olabel = 0;
1975   atoken->cost   = 0;
1976   fst->num_arcs++;
1977   return atokid;
1978 }
fst_get_free_node(srec_context * fst)1979 nodeID fst_get_free_node(srec_context* fst)
1980 {
1981   FSMnode* ntoken;
1982   nodeID ntokid = fst->FSMnode_freelist;
1983   if (ntokid == MAXnodeID)
1984   {
1985     PLogError("error: ran out of nodes\n");
1986     return ntokid;
1987   }
1988   ntoken = &fst->FSMnode_list[ntokid];
1989   fst->FSMnode_freelist = NODE_XtoI(ntoken->un_ptr.next_node);
1990   ntoken->un_ptr.first_next_arc = FSMARC_NULL;
1991   ntoken->first_prev_arc = FSMARC_NULL;
1992   /* ASSERT( (int)(ntok - fst->FSMnode_list) == fst->num_nodes); */
1993   fst->num_nodes++;
1994   return ntokid;
1995 }
1996 
fst_free_node(srec_context * fst,FSMnode * node)1997 int fst_free_node(srec_context* fst, FSMnode* node)
1998 {
1999   IF_DEBUG_WDADD(printf_node(fst, "freeing: ", node));
2000   node->first_prev_arc = FSMARC_NULL;
2001   node->un_ptr.next_node = fst->FSMnode_freelist;
2002   node->first_prev_arc = FSMARC_FREE;
2003   fst->FSMnode_freelist = (nodeID)(node - fst->FSMnode_list);
2004   /* fst->num_nodes--; not allowed unless we compress! */
2005   return 0;
2006 }
2007 
fst_free_arc(srec_context * fst,FSMarc * arc)2008 int fst_free_arc(srec_context* fst, FSMarc* arc)
2009 {
2010   IF_DEBUG_WDADD(printf_arc(fst, "freeing: ", arc));
2011   arc->linkl_prev_arc = FSMARC_FREE;
2012   arc->linkl_next_arc = ARC_ItoX(fst->FSMarc_freelist);
2013   arc->cost = 2;
2014   arc->ilabel = 0;
2015   arc->olabel = 0;
2016   IF_DEBUG_WDADD(arc->ilabel_str = 0);
2017   IF_DEBUG_WDADD(arc->olabel_str = 0);
2018   fst->FSMarc_freelist = (arcID)(arc - fst->FSMarc_list);
2019   fst->num_arcs--;
2020   return 0;
2021 }
2022 
fst_free_arc_net(srec_context * fst,FSMarc * arc)2023 int fst_free_arc_net(srec_context* fst, FSMarc* arc)
2024 {
2025   if (!arc)
2026     return 0;
2027   /* need to traverse the arc, get the 'from' nodes,
2028      then traverse the from nodes and kill arcs */
2029   /* we'll need this after an error condition */
2030   return 0;
2031 }
2032 
fst_push_arc_olabel(srec_context * fst,FSMarc * arc)2033 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc)
2034 {
2035   FSMarc_ptr atok;
2036   FSMarc* atoken;
2037   FSMnode_ptr ntok = arc->to_node;
2038 
2039   /* look for all arcs leaving this arc, and mark them
2040      normally there is only one, except for multipron words */
2041   for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2042   {
2043     atoken = ARC_XtoP(atok);
2044     /* cannot have multiple olabels before a boundary */
2045     if (atoken->olabel != WORD_EPSILON_LABEL)
2046       return FST_FAILED_INTERNAL;
2047     atoken->olabel = arc->olabel;
2048 #if DO_WEIGHTED_ADDWORD
2049     atoken->cost = (costdata)(atoken->cost + arc->cost);
2050 #endif
2051     IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2052   }
2053   /* unlabel this arc */
2054   arc->olabel = WORD_EPSILON_LABEL;
2055 #if DO_WEIGHTED_ADDWORD
2056   arc->cost = FREEcostdata;
2057 #endif
2058   IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2059   return FST_SUCCESS;
2060 }
2061 
2062 #if DO_WEIGHTED_ADDWORD
fst_push_arc_cost(srec_context * fst,FSMarc * arc)2063 int fst_push_arc_cost(srec_context* fst, FSMarc* arc)
2064 {
2065   FSMarc_ptr atok;
2066   FSMarc* atoken;
2067   FSMnode_ptr ntok = arc->to_node;
2068   /* look for all arcs leaving this arc, and push the cost thereto */
2069   for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2070   {
2071     atoken = ARC_XtoP(atok);
2072     atoken->cost = (costdata)(atoken->cost + arc->cost);
2073   }
2074   arc->cost = FREEcostdata;
2075   return FST_SUCCESS;
2076 }
2077 #endif
2078 
2079 
fst_pull_arc_olabel(srec_context * fst,FSMarc * arc)2080 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc)
2081 {
2082   FSMarc_ptr atok;
2083   FSMarc* atoken;
2084   FSMnode_ptr fr_node = arc->fr_node;
2085   FSMarc_ptr tmp_atok;
2086   FSMarc *tmp_atoken;
2087   FSMnode *anode;
2088 
2089   if (arc->olabel == WORD_EPSILON_LABEL)
2090     return FST_SUCCESS;
2091 
2092   /* can the olabel be pulled earlier? */
2093   for (atok = FIRST_PREV(fr_node); atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2094   {
2095     atoken = ARC_XtoP(atok);
2096     anode = NODE_XtoP(atoken->to_node);
2097 
2098     tmp_atok = anode->un_ptr.first_next_arc;
2099     if (tmp_atok != FSMARC_NULL)
2100     {
2101       tmp_atoken = ARC_XtoP(tmp_atok);
2102       if (tmp_atoken->linkl_next_arc != FSMARC_NULL)
2103       {
2104         return FST_CONTINUE;
2105       }
2106     }
2107   }
2108 
2109   /* look for all arcs arriving at this arc, and mark them */
2110   for (atok = FIRST_PREV(fr_node);atok != FSMARC_NULL;atok = atoken->linkl_prev_arc)
2111   {
2112     atoken = ARC_XtoP(atok);
2113     /* cannot have multiple olabels! serious internal error!? */
2114     if (atoken->olabel != WORD_EPSILON_LABEL)
2115     {
2116       PLogError("error: internal error, in fst_pull_arc_olabel()\n");
2117       return FST_FAILED_INTERNAL;
2118     }
2119     atoken->olabel = arc->olabel;
2120 #if DO_WEIGHTED_ADDWORD
2121     atoken->cost = arc->cost;
2122 #endif
2123     IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2124   }
2125   /* unlabel this arc */
2126   arc->olabel = WORD_EPSILON_LABEL;
2127 #if DO_WEIGHTED_ADDWORD
2128   arc->cost   = FREEcostdata;
2129 #endif
2130   IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2131   return FST_SUCCESS;
2132 }
2133 
find_next_arc_with_ilabel(srec_context * fst,FSMnode * node,labelID ilabel,FSMarc ** last)2134 static FSMarc* find_next_arc_with_ilabel(srec_context* fst, FSMnode* node, labelID ilabel, FSMarc** last)
2135 {
2136   FSMarc_ptr atok;
2137   FSMarc* atoken = NULL;
2138   for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2139   {
2140     atoken = ARC_XtoP(atok);
2141     if (atoken->ilabel == ilabel)
2142       return atoken;
2143   }
2144   *last = atoken; /* to this we may want to append a new arc */
2145   return NULL;
2146 }
2147 
find_prev_arc_with_iolabels(srec_context * fst,FSMnode * node,labelID ilabel,labelID olabel,FSMarc ** last)2148 FSMarc* find_prev_arc_with_iolabels(srec_context* fst, FSMnode* node, labelID ilabel, labelID olabel, FSMarc** last)
2149 {
2150   FSMarc_ptr atok;
2151   FSMarc* atoken = NULL;
2152   FSMarc_ptr tmp_atok;
2153   FSMarc* tmp_atoken;
2154 
2155   for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2156   {
2157     atoken = ARC_XtoP(atok);
2158     if (atoken->olabel == olabel && atoken->ilabel == ilabel)
2159     {
2160       tmp_atok = NODE_XtoP(atoken->fr_node)->un_ptr.first_next_arc;
2161       if (tmp_atok != FSMARC_NULL)
2162       {
2163         tmp_atoken = ARC_XtoP(tmp_atok);
2164 
2165         if (tmp_atoken->linkl_next_arc == FSMARC_NULL)
2166           return atoken;
2167       }
2168       else
2169         return atoken;
2170     }
2171 
2172   }
2173   if (last) *last = atoken;
2174   return NULL;
2175 }
2176 
num_arcs_leaving(srec_context * fst,FSMnode * node)2177 int num_arcs_leaving(srec_context* fst, FSMnode* node)
2178 {
2179   int num_arcs = 0;
2180   FSMarc_ptr atok;
2181   FSMarc* atoken;
2182   for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2183   {
2184     atoken = ARC_XtoP(atok);
2185     num_arcs++;
2186   }
2187   return num_arcs;
2188 }
2189 
num_arcs_arriving(srec_context * fst,FSMnode * node)2190 int num_arcs_arriving(srec_context* fst, FSMnode* node)
2191 {
2192   int num_arcs = 0;
2193   FSMarc_ptr atok;
2194   FSMarc* atoken;
2195   for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2196   {
2197     atoken = ARC_XtoP(atok);
2198     num_arcs++;
2199   }
2200   return num_arcs;
2201 }
2202 
num_arcs_arriving_gt_1(srec_context * fst,FSMnode * node)2203 PINLINE int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node)
2204 {
2205   FSMarc_ptr atok = node->first_prev_arc;
2206   FSMarc* atoken;
2207   if (atok == FSMARC_NULL)
2208     return 0;
2209   atoken = ARC_XtoP(atok);
2210   return atoken->linkl_prev_arc == FSMARC_NULL ? 0 : 1;
2211 }
2212 
remove_arc_arriving(srec_context * fst,FSMnode * to_node,FSMarc * arc)2213 static void remove_arc_arriving(srec_context* fst, FSMnode* to_node, FSMarc* arc)
2214 {
2215   FSMarc* atoken;
2216   FSMarc_ptr* atok;
2217   for (atok = &to_node->first_prev_arc;(*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2218   {
2219     atoken = ARC_XtoP(*atok);
2220     if (atoken == arc)
2221     {
2222       (*atok) = arc->linkl_prev_arc;
2223       return;
2224     }
2225   }
2226   ASSERT(0);
2227 }
2228 
split_node_for_arc(srec_context * fst,FSMarc * arc)2229 int split_node_for_arc(srec_context* fst, FSMarc* arc)
2230 {
2231   arcID new_arc_id;
2232   nodeID new_node_id;
2233   FSMnode* new_node;
2234   FSMnode* old_node;
2235   FSMarc_ptr atok, new_next, last_new_next;
2236   FSMarc *atoken, *new_next_p;
2237 
2238   IF_DEBUG_WDADD(printf_arc(fst, "spliting ... ", arc));
2239 
2240   new_node_id = fst_get_free_node(fst);
2241   if (new_node_id == MAXnodeID)
2242     return FST_FAILED_ON_MEMORY;
2243   new_node = &fst->FSMnode_list[ new_node_id];
2244   old_node = NODE_XtoP(arc->to_node);
2245   arc->to_node = NODE_ItoX(new_node_id);
2246   new_node->first_prev_arc = ARC_PtoX(arc);
2247   remove_arc_arriving(fst, old_node, arc);
2248   arc->linkl_prev_arc = FSMARC_NULL;
2249 
2250   last_new_next = FSMARC_NULL;
2251   for (atok = old_node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2252   {
2253     atoken = ARC_XtoP(atok);
2254     new_arc_id = fst_get_free_arc(fst);
2255     if (new_arc_id == MAXarcID)
2256       return FST_FAILED_ON_MEMORY;
2257     new_next = ARC_ItoX(new_arc_id);
2258     if (last_new_next != FSMARC_NULL)
2259       LINKL_NEXT(last_new_next) = new_next;
2260     else
2261       new_node->un_ptr.first_next_arc = new_next;
2262     new_next_p = ARC_XtoP(new_next);
2263     new_next_p->ilabel = atoken->ilabel;
2264     new_next_p->to_node = atoken->to_node;
2265     new_next_p->fr_node = arc->to_node;
2266     new_next_p->olabel = atoken->olabel;
2267     IF_DEBUG_WDADD(new_next_p->ilabel_str = atoken->ilabel_str);
2268     IF_DEBUG_WDADD(new_next_p->olabel_str = atoken->olabel_str);
2269     new_next_p->linkl_next_arc = FSMARC_NULL; /* set at next loop */
2270     append_arc_arriving_node(fst, NODE_XtoP(atoken->to_node), new_next);
2271     last_new_next = new_next;
2272   }
2273   return FST_SUCCESS;
2274 }
2275 
fst_add_arcs(srec_context * fst,nodeID start_node,nodeID end_node,wordID add_tree_olabel,costdata add_tree_cost,modelID * add_tree_start,int add_tree_len)2276 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
2277                  wordID add_tree_olabel, costdata add_tree_cost,
2278                  modelID* add_tree_start, int add_tree_len)
2279 {
2280   int rc = FST_SUCCESS;
2281   FSMarc_ptr atok = (FSMarc_ptr)0, last_atok = (FSMarc_ptr)0;
2282   FSMarc* atoken = NULL;
2283   FSMarc *next_atoken, *prev_atoken;
2284   FSMarc *append_to;
2285   FSMnode *late_start_node, *early_end_node;
2286   FSMnode *ntoken, *last_ntoken;
2287   FSMnode_ptr ntok, last_ntok;
2288   modelID *add_tree_end = add_tree_start + add_tree_len - 1;
2289   modelID *add_tree_late_start, *add_tree_early_end;
2290   modelID *add_tree;
2291   arcID new_arc_id, atokid;
2292   nodeID new_node_id, atokid_node;
2293   wordID add_tree_olabel_use = add_tree_olabel;
2294   wordID add_tree_cost_use   = add_tree_cost;
2295 
2296   append_to = NULL;
2297   add_tree = add_tree_start;
2298 #define FST_GROW_MINARCS  100
2299 #define FST_GROW_MINNODES 100
2300 #if defined(FST_GROW_FACTOR)
2301 
2302   /* make sure we have enough arcs available */
2303   if(fst->num_arcs + add_tree_len >= fst->FSMarc_list_len)
2304    {
2305 	FSMarc* tmp_FSMarc_list;
2306 	arcID tmp_FSMarc_list_len;
2307 	int itmp_FSMarc_list_len = fst->FSMarc_list_len*FST_GROW_FACTOR ;
2308 
2309 	if( itmp_FSMarc_list_len - fst->FSMarc_list_len < FST_GROW_MINARCS)
2310 		itmp_FSMarc_list_len += FST_GROW_MINARCS;
2311 
2312 	if( itmp_FSMarc_list_len - fst->FSMarc_list_len < add_tree_len)
2313 		itmp_FSMarc_list_len += add_tree_len;
2314 
2315     if(itmp_FSMarc_list_len >= (int)MAXarcID)
2316 	     return FST_FAILED_ON_MEMORY;
2317 
2318 	tmp_FSMarc_list_len = (arcID)itmp_FSMarc_list_len;
2319 
2320     tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(tmp_FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
2321       if(!tmp_FSMarc_list) {
2322 	PLogError("error: Failed to extend the memory for arcs\n");
2323         return FST_FAILED_INTERNAL;
2324       }
2325     memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
2326 
2327 	/* initialize the new free list.
2328     head of this new free list is tmp_FSMarc_list[ fst->FSMarc_list_len] */
2329 	for(atokid=fst->FSMarc_list_len; atokid<tmp_FSMarc_list_len-1; atokid++) {
2330 	   tmp_FSMarc_list[atokid].linkl_next_arc = ARC_ItoX(atokid+1);
2331        tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2332     }
2333     tmp_FSMarc_list[atokid].linkl_next_arc = FSMARC_NULL;
2334     tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2335 
2336     /* scan to the end of the old free list (to be re-used) */
2337     arcID* last_free_arc = &fst->FSMarc_freelist;
2338     atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
2339 
2340     for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
2341 	    atoken = &tmp_FSMarc_list[ *last_free_arc];
2342 
2343 	/* append the new free list to the current free list */
2344 	*last_free_arc = fst->FSMarc_list_len;
2345 
2346 	FREE( fst->FSMarc_list);
2347     fst->FSMarc_list = tmp_FSMarc_list;
2348     fst->FSMarc_list_len = tmp_FSMarc_list_len;
2349    }
2350 
2351    /* make sure we have enough nodes available */
2352    if(fst->num_nodes + add_tree_len >= fst->FSMnode_list_len)
2353    {
2354      FSMnode* tmp_FSMnode_list;
2355 	 nodeID tmp_FSMnode_list_len;
2356 	 FSMnode_info* tmp_FSMnode_info_list;
2357 	 int itmp_FSMnode_list_len = fst->FSMnode_list_len * FST_GROW_FACTOR ;
2358 
2359 	 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < FST_GROW_MINNODES)
2360 		itmp_FSMnode_list_len += FST_GROW_MINNODES;
2361 
2362 	 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < add_tree_len)
2363 		itmp_FSMnode_list_len += add_tree_len;
2364 
2365 	 if(itmp_FSMnode_list_len >= (int)MAXnodeID)
2366 	     return FST_FAILED_ON_MEMORY;
2367 
2368 	 tmp_FSMnode_list_len = (nodeID)itmp_FSMnode_list_len;
2369 
2370      tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(tmp_FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
2371      if(!tmp_FSMnode_list) {
2372        PLogError("ERROR: Failed to extend the memory for nodes\n");
2373        return FST_FAILED_INTERNAL;
2374      }
2375      memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
2376 
2377 	/* initialize the new free node list.
2378     head of this new free list is tmp_FSMnode_list[ fst->FSMnode_list_len] */
2379     for(atokid_node=fst->FSMnode_list_len; atokid_node<tmp_FSMnode_list_len-1; atokid_node++)
2380 	{
2381 	   tmp_FSMnode_list[atokid_node].un_ptr.next_node = NODE_ItoX(atokid_node + 1);
2382        tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2383     }
2384        tmp_FSMnode_list[atokid_node].un_ptr.next_node = FSMNODE_NULL;
2385        tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2386 
2387     /* scan to the end of the old free list (to be re-used) */
2388     nodeID* last_free_node = (&fst->FSMnode_freelist);
2389 	ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
2390 
2391 	for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
2392 		 ntoken = &tmp_FSMnode_list[ *last_free_node];
2393 
2394 	/* append the new free list to the current free list */
2395 	*last_free_node = fst->FSMnode_list_len;
2396 
2397 	FREE( fst->FSMnode_list);
2398 
2399 	tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR( tmp_FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
2400      if(!tmp_FSMnode_info_list) {
2401        PLogError("ERROR: Failed to extend the memory for node infos\n");
2402        return FST_FAILED_INTERNAL;
2403      }
2404 	// copy in old node info
2405 	memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
2406 
2407 	// initialize the new node info
2408 	for (atokid_node=fst->FSMnode_list_len; atokid_node < tmp_FSMnode_list_len; atokid_node++)
2409 		tmp_FSMnode_info_list[atokid_node] = NODE_INFO_UNKNOWN;
2410 
2411 	FREE( fst->FSMnode_info_list);
2412 	fst->FSMnode_info_list = tmp_FSMnode_info_list;
2413     fst->FSMnode_list = tmp_FSMnode_list;
2414     fst->FSMnode_list_len = tmp_FSMnode_list_len;
2415 }
2416 #endif
2417    late_start_node = &fst->FSMnode_list[start_node];
2418 
2419   while (1)
2420   {
2421 
2422     next_atoken = find_next_arc_with_ilabel(fst, late_start_node,
2423                                             *add_tree /*->ilabel*/,
2424                                             &append_to);
2425     if (next_atoken != NULL)
2426     {
2427       /* so next_atok->ilabel == add_tree->ilabel */
2428 
2429       if (next_atoken->ilabel == WORD_BOUNDARY && *add_tree/*->ilabel*/ == WORD_BOUNDARY)
2430       {
2431 	 next_atoken = NULL;
2432 	 break; /* break as if nothing more can be shared! */
2433 	// return FST_FAILED_ON_HOMONYM;
2434       }
2435 
2436       if (num_arcs_arriving_gt_1(fst, NODE_XtoP(next_atoken->to_node)))
2437       {
2438         split_node_for_arc(fst, next_atoken);
2439         /* unfortunate side effect here, that if we later find out this
2440            was a homonym addition, then this expansion was useless and
2441            for now we don't undo it! */
2442       }
2443 
2444       /* we shouldn't really push the olabel if it's the same for both! */
2445       if (next_atoken->olabel != add_tree_olabel_use)
2446       {
2447         if (next_atoken->olabel != WORD_EPSILON_LABEL)
2448         {
2449           if (fst_push_arc_olabel(fst, next_atoken) != FST_SUCCESS)
2450           {
2451             PLogError("error: internal error fst_push_arc_olabel()\n");
2452             return FST_FAILED_INTERNAL;
2453           }
2454         }
2455 #if DO_WEIGHTED_ADDWORD
2456         else
2457           fst_push_arc_cost(fst, next_atoken);
2458 #endif
2459       }
2460       else if (add_tree_olabel_use != WORD_EPSILON_LABEL)
2461       {
2462         /* the graph already has this word, so we just re-use the olabel
2463            and disable the setting of olabel down below */
2464         add_tree_olabel_use = WORD_EPSILON_LABEL;
2465 #if DO_WEIGHTED_ADDWORD
2466         add_tree_cost_use   = FREEcostdata;
2467 #endif
2468       }
2469       add_tree++;
2470       late_start_node = NODE_XtoP(next_atoken->to_node);
2471     }
2472     else
2473     {
2474       break;
2475     }
2476   }
2477 
2478   add_tree_late_start = add_tree;
2479   early_end_node = &fst->FSMnode_list[end_node];
2480 
2481   if (!do_minimize)
2482   {
2483     last_ntoken = NULL;
2484     last_ntok = NODE_PtoX(late_start_node);
2485     for (; add_tree <= add_tree_end; add_tree++)
2486     {
2487       new_arc_id = fst_get_free_arc(fst);
2488       if (new_arc_id == MAXarcID)
2489       {
2490         PLogError("fst_get_free_arc() failed\n");
2491         return FST_FAILED_ON_MEMORY;
2492       }
2493       atok = ARC_ItoX(new_arc_id);
2494       atoken = ARC_XtoP(atok);
2495       if (add_tree != add_tree_end)
2496       {
2497         new_node_id = fst_get_free_node(fst);
2498         if (new_node_id == MAXnodeID)
2499         {
2500           PLogError("fst_get_free_node() failed\n");
2501           return FST_FAILED_ON_MEMORY;
2502         }
2503         ntok = NODE_ItoX(new_node_id);
2504         ntoken = NODE_XtoP(ntok);
2505         ntoken->first_prev_arc = atok;
2506       }
2507       else
2508       {
2509         ntok = FSMNODE_NULL;
2510         ntoken = NULL;
2511       }
2512       atoken->fr_node = last_ntok; /* was NODE_PtoX(late_start_node) */
2513       atoken->to_node = ntok;
2514       atoken->ilabel = *add_tree;
2515       atoken->linkl_next_arc = FSMARC_NULL;
2516       atoken->linkl_prev_arc = FSMARC_NULL;
2517       IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2518       if (!last_ntoken)
2519       {
2520         append_to->linkl_next_arc = atok;
2521         atoken->olabel = add_tree_olabel_use;
2522 #if DO_WEIGHTED_ADDWORD
2523         atoken->cost  =  add_tree_cost_use;
2524 #endif
2525         IF_DEBUG_WDADD(atok->olabel_str = fst->olabels->words[ atoken->olabel]);
2526       }
2527       else
2528       {
2529         last_ntoken->un_ptr.first_next_arc = atok;
2530       }
2531       last_ntoken = ntoken;
2532       last_ntok   = ntok;
2533     }
2534     /*  add_tree_end->to_node = early_end_node;
2535     add_tree_end->linkl_next_arc = FSMARC_NULL;
2536     atok = last_arc_arriving( early_end_node);
2537     atok->linkl_prev_arc = add_tree_end;
2538     add_tree_end->linkl_prev_arc = FSMARC_NULL; */
2539     atoken->to_node = NODE_ItoX(end_node);
2540     append_arc_arriving_node(fst, &fst->FSMnode_list[end_node], atok);
2541   }
2542 
2543   else /* ie. do_minimize from the rear */
2544   {
2545 
2546     add_tree = add_tree_end;
2547     new_arc_id = fst_get_free_arc(fst);
2548     if (new_arc_id == MAXarcID)
2549       return FST_FAILED_ON_MEMORY;
2550     atok = ARC_ItoX(new_arc_id);
2551     atoken = ARC_XtoP(atok);
2552     atoken->olabel = add_tree_olabel_use;
2553 #if DO_WEIGHTED_ADDWORD
2554     atoken->cost   = add_tree_cost_use;
2555 #endif
2556     atoken->ilabel = *add_tree_late_start;
2557     atoken->fr_node = NODE_PtoX(late_start_node);
2558     if (atoken->ilabel == WORD_BOUNDARY)
2559     {
2560       atoken->cost = (costdata)(atoken->cost + fst->wtw_average);
2561     }
2562     else
2563     {
2564       atoken->cost   = add_tree_cost_use + fst->wtw_average;
2565       add_tree_cost_use = FREEcostdata;
2566     }
2567     IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[ atoken->olabel]);
2568     IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2569     append_arc_leaving_node(fst, late_start_node, atok);
2570 
2571     last_atok = atok;
2572     while (1)
2573     {
2574 
2575       if (add_tree == add_tree_late_start)
2576         break;
2577       /* if there are other arcs leaving this node then joining
2578       earlier than here will result in over-generation, so don't
2579       if( atok!=end_arc && atok->linkl_next_arc != FSMARC_NULL)
2580       break;
2581       */
2582       /*
2583       also need boundary conditions checker here
2584       */
2585 
2586       /* */
2587       IF_DEBUG_WDADD(printf_node(fst, early_end_node));
2588 
2589       /* if( num_arcs_leaving( add_tree_end->fr_node->first_prev_arc->to_node) >1) {
2590       printf("merge stopped due to num_arcs leaving ..\n");
2591       printf_arc(fst, add_tree_end->fr_node->first_prev_arc->to_node->first_next_arc);
2592       break;
2593       } */
2594 
2595       for (atok = early_end_node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2596       {
2597         atoken = ARC_XtoP(atok);
2598 	/* never pull before the slot marker */
2599 	if(atoken->cost == DISABLE_ARC_COST) break;
2600 	if(atoken->olabel>0 && atoken->olabel<fst->olabels->num_slots) break;
2601         fst_pull_arc_olabel(fst, atoken); /* fails are ok */
2602       }
2603 
2604       prev_atoken = find_prev_arc_with_iolabels(fst, early_end_node,
2605                     *add_tree /*->ilabel*/,
2606                     WORD_EPSILON_LABEL,
2607                     NULL /*&append_to*/);
2608       if (!prev_atoken)
2609         break;
2610       else
2611       {
2612         /* this check is now inside find_prev_arc_with_iolabels */
2613         /* if( num_arcs_leaving(prev_atok->fr_node->first_prev_arc->to_node->first_next_arc)>1)*/
2614 
2615         /* now look to merge earlier still */
2616         early_end_node = NODE_XtoP(prev_atoken->fr_node);
2617         add_tree--;
2618         if (add_tree == add_tree_late_start)
2619           break;
2620       }
2621     }
2622     add_tree_early_end = add_tree;
2623     for (add_tree = add_tree_late_start + 1; add_tree <= add_tree_early_end; add_tree++)
2624     {
2625       new_node_id = fst_get_free_node(fst);
2626       if (new_node_id == MAXnodeID)
2627         return FST_FAILED_ON_MEMORY;
2628       ntok = NODE_ItoX(new_node_id);
2629       ntoken = NODE_XtoP(ntok);
2630       new_arc_id = fst_get_free_arc(fst);
2631       if (new_arc_id == MAXarcID)
2632         return FST_FAILED_ON_MEMORY;
2633       atok = ARC_ItoX(new_arc_id);
2634       atoken = ARC_XtoP(atok);
2635       atoken->ilabel = *add_tree;
2636 #if DO_WEIGHTED_ADDWORD
2637       atoken->cost   = FREEcostdata; /* idea: distribute add_tree_cost here;*/
2638 #endif
2639       atoken->olabel = WORD_EPSILON_LABEL;
2640       atoken->fr_node = ntok;
2641       atoken->to_node = FSMNODE_NULL; /* filled in next loop */
2642       IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[atoken->olabel]);
2643       IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[atoken->ilabel]);
2644 
2645       ntoken->un_ptr.first_next_arc = atok;
2646       ntoken->first_prev_arc = last_atok;
2647       TO_NODE(last_atok) = ntok;
2648       last_atok = atok;
2649     }
2650     TO_NODE(last_atok) = NODE_PtoX(early_end_node);
2651     append_arc_arriving_node(fst, early_end_node, last_atok);
2652   }
2653   return rc;
2654 }
2655 
2656 
append_arc_leaving_node(srec_context * fst,FSMnode * fr_node,FSMarc_ptr arc)2657 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr arc)
2658 {
2659   FSMarc_ptr* atok = &fr_node->un_ptr.first_next_arc;
2660   FSMarc* atoken = ARC_XtoP(*atok);
2661   for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_next_arc)
2662   {
2663     atoken = ARC_XtoP(*atok);
2664   }
2665   *atok = arc;
2666   LINKL_NEXT(arc) = FSMARC_NULL;
2667 }
append_arc_arriving_node(srec_context * fst,FSMnode * to_node,FSMarc_ptr arc)2668 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr arc)
2669 {
2670   FSMarc_ptr* atok = &to_node->first_prev_arc;
2671   FSMarc* atoken = ARC_XtoP(*atok);
2672   for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2673   {
2674     atoken = ARC_XtoP(*atok);
2675   }
2676   *atok = arc;
2677   LINKL_PREV(arc) = FSMARC_NULL;
2678 }
2679 
2680 
printf_node1(srec_context * fst,FSMnode * node)2681 int printf_node1(srec_context* fst, FSMnode* node)
2682 {
2683   return 0;
2684 }
printf_arc1(srec_context * fst,char * msg,FSMarc * arc)2685 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc)
2686 {
2687   char buffer[MAX_LINE_LENGTH];
2688   sprintf_arc(buffer, fst, arc);
2689   printf("%s%s\n", msg, buffer);
2690   return 0;
2691 }
sprintf_arc(char * buf,srec_context * fst,FSMarc * arc)2692 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc)
2693 {
2694   int rc;
2695   FSMnode* to_node = NODE_XtoP(arc->to_node);
2696   arcID arc_index = (arcID)(arc - fst->FSMarc_list);
2697   if (to_node->un_ptr.first_next_arc == FSMARC_NULL)
2698     rc = sprintf(buf, "arc%hu\n", arc_index);
2699   else
2700   {
2701     rc = sprintf(buf, "arc%hu\t%hu,%hu\t%s\t%s\t%hu\n",
2702                  arc_index,
2703                  ARC_XtoI(to_node->un_ptr.first_next_arc),
2704                  arc->linkl_next_arc != FSMARC_NULL ? ARC_XtoI(arc->linkl_next_arc) : -1,
2705                  fst->ilabels->words[arc->ilabel],
2706                  fst->olabels->words[arc->olabel],
2707                  arc->cost);
2708   }
2709   return rc;
2710 }
2711 
2712 /* dumps the recognition context as a binary file,
2713    it will also store any empty space for growing */
2714 
2715 #define ENCODE(SIGN,TYP) { SIGN+=sizeof(TYP); SIGN=SIGN<<3; ASSERT((shifted+=3)<32); }
2716 
FST_sizes_signature()2717 asr_int32_t FST_sizes_signature()
2718 {
2719 #ifndef NDEBUG
2720   int shifted = 0;
2721 #endif
2722   asr_int32_t signature = 0;
2723   ENCODE(signature, arcID);
2724   ENCODE(signature, nodeID);
2725   ENCODE(signature, wordID);
2726   ENCODE(signature, labelID);
2727   ENCODE(signature, costdata);
2728   return signature;
2729 }
2730 
FST_ContextImageSize(srec_context * context)2731 int FST_ContextImageSize(srec_context* context)
2732 {
2733   int size = 0;
2734   size += sizeof(srec_context);
2735   size += sizeof(wordmap);
2736   size += sizeof(char*) * context->olabels->max_words;
2737   size += sizeof(char) * context->olabels->max_chars;
2738   size += sizeof(wordmap);
2739   if (context->ilabels->words != NULL)
2740     size += sizeof(char*) * context->olabels->max_words;
2741   if (context->ilabels->chars != NULL)
2742     size += sizeof(char) * context->olabels->max_chars;
2743   size += sizeof(FSMarc) * context->FSMarc_list_len;
2744   size += sizeof(FSMnode) * context->FSMnode_list_len;
2745   size += sizeof(FSMnode_info) * context->FSMnode_list_len;
2746   size += sizeof(arc_token) * context->arc_token_list_len;
2747   return size;
2748 }
2749 
serializeWordMapV2(wordmap * wordmap,PFile * fp)2750 ESR_ReturnCode serializeWordMapV2(wordmap *wordmap, PFile* fp)
2751 {
2752   unsigned int i = 0;
2753   unsigned int nfields;
2754   unsigned int tmp2[32];
2755 
2756   i = 0;
2757   tmp2[i++] = wordmap->num_words;
2758   tmp2[i++] = wordmap->num_slots;
2759   tmp2[i++] = wordmap->max_words;
2760   tmp2[i++] = wordmap->num_base_words;
2761   tmp2[i++] = wordmap->max_chars;
2762   tmp2[i++] = (wordmap->next_chars - wordmap->chars);
2763   tmp2[i++] = (wordmap->next_base_chars - wordmap->chars);
2764   nfields = i;
2765   if (pfwrite(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2766     return ESR_WRITE_ERROR;
2767 
2768   if (pfwrite(wordmap->chars, sizeof(char), wordmap->max_chars, fp) != (size_t)wordmap->max_chars)
2769     return ESR_WRITE_ERROR;
2770 
2771   return ESR_SUCCESS;
2772 }
2773 
deserializeWordMapV2(wordmap ** pwordmap,PFile * fp)2774 ESR_ReturnCode deserializeWordMapV2(wordmap **pwordmap, PFile* fp)
2775 {
2776   unsigned int i = 0;
2777   unsigned int nfields;
2778   unsigned int tmp2[32];
2779   wordmap *awordmap;
2780   char *p;
2781   ESR_ReturnCode rc = ESR_SUCCESS;
2782   unsigned int next_chars_idx, next_base_chars_idx;
2783 
2784   awordmap = NEW(wordmap, L("srec.g2g.graph.wordmap.base"));
2785   if (awordmap == NULL)
2786   {
2787     PLogError("NEW failed on srec.g2g.graph.wordmap.base\n");
2788     rc = ESR_OUT_OF_MEMORY;
2789     goto CLEANUP;
2790   }
2791   awordmap->wordIDForWord = NULL; // early break to cleanup needs this
2792 
2793   nfields = 7;
2794   if (pfread(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2795   {
2796     PLogError("pfread failed when reading nfields\n");
2797     rc = ESR_READ_ERROR;
2798     goto CLEANUP;
2799   }
2800 
2801   i = 0;
2802   awordmap->num_words      = (wordID)tmp2[i++];
2803   awordmap->num_slots      = (wordID)tmp2[i++];
2804   awordmap->max_words      = (wordID)tmp2[i++];
2805   awordmap->num_base_words = (wordID)tmp2[i++];
2806   awordmap->max_chars      = tmp2[i++];
2807   next_chars_idx           = tmp2[i++];
2808   next_base_chars_idx      = tmp2[i++];
2809   ASSERT(nfields == i);
2810 
2811   awordmap->words = NEW_ARRAY(char*, awordmap->max_words, L("srec.g2g.graph.wordmap.words"));
2812   if (awordmap->words == NULL)
2813   {
2814     PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.words %d\n", awordmap->max_words);
2815     rc = ESR_OUT_OF_MEMORY;
2816     goto CLEANUP;
2817   }
2818   awordmap->chars = NEW_ARRAY(char, awordmap->max_chars, L("srec.g2g.graph.wordmap.chars"));
2819   if (awordmap->chars == NULL)
2820   {
2821     PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.chars %d\n", awordmap->max_chars);
2822     rc = ESR_OUT_OF_MEMORY;
2823     goto CLEANUP;
2824   }
2825   awordmap->next_chars = awordmap->chars + next_chars_idx;
2826   awordmap->next_base_chars = awordmap->chars + next_base_chars_idx;
2827 
2828   if ((i=pfread(awordmap->chars, sizeof(char), awordmap->max_chars, fp)) != (size_t)awordmap->max_chars)
2829   {
2830     PLogError("pfread failed while reading %d chars\n", awordmap->max_chars);
2831     rc = ESR_READ_ERROR;
2832     goto CLEANUP;
2833   }
2834 
2835 
2836   p = awordmap->chars;
2837   ASSERT((int)p % 2 == 0);
2838   nfields = 0;
2839   if (nfields < awordmap->num_words)
2840     awordmap->words[nfields++] = p;
2841   for (; p < awordmap->next_chars; p++) // was next_base_chars
2842   {
2843     if ( ( *p ) == '\0' )
2844     {
2845       if (nfields == awordmap->num_words) // was num_base_words
2846         break;
2847       if (((int)p) % 2 == 0) p++; /* so that words begin on even byte bound */
2848       awordmap->words[nfields++] = p + 1;
2849     }
2850   }
2851   ASSERT(nfields == awordmap->num_words); // was num_base_words
2852 
2853   if (awordmap->max_words >= awordmap->num_base_words)
2854     {
2855       PHashTableArgs hashArgs;
2856       hashArgs.capacity = awordmap->max_words;
2857       if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
2858       hashArgs.compFunction = HashCmpWord; //PHASH_TABLE_DEFAULT_COMP_FUNCTION;
2859       hashArgs.hashFunction = HashGetCode; //PHASH_TABLE_DEFAULT_HASH_FUNCTION;
2860       hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
2861       CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.deserializeWordMap()"), &awordmap->wordIDForWord));
2862 
2863       rc = wordmap_populate ( awordmap, awordmap->num_words );
2864 
2865       if (rc != ESR_SUCCESS)
2866 	{
2867         wordmap_clean ( awordmap );
2868         goto CLEANUP;
2869       }
2870     }
2871   else
2872     {
2873       awordmap->wordIDForWord = NULL;
2874     }
2875 
2876   /* success */
2877   *pwordmap = awordmap;
2878   return ESR_SUCCESS;
2879 
2880  CLEANUP:
2881   if (awordmap != NULL)
2882   {
2883     if (awordmap->wordIDForWord != NULL)
2884       PHashTableDestroy(awordmap->wordIDForWord);
2885     if (awordmap->words != NULL) FREE(awordmap->words);
2886     if (awordmap->chars != NULL) FREE(awordmap->chars);
2887     FREE(awordmap);
2888   }
2889   return rc;
2890 }
2891 
serializeArcToken(arc_token * token_base,int i,PFile * fp)2892 static ESR_ReturnCode serializeArcToken(arc_token *token_base,
2893                                         int i,
2894                                         PFile* fp)
2895 {
2896   arc_token *token = token_base + i;
2897   asr_uint32_t idx;
2898 
2899   if (pfwrite(&token->ilabel, 2, 1, fp) != 1)
2900     return ESR_WRITE_ERROR;
2901 
2902   if (pfwrite(&token->olabel, 2, 1, fp) != 1)
2903     return ESR_WRITE_ERROR;
2904 
2905   /* if (pfwrite(&token->cost, 2, 1, fp) != 1)
2906      return ESR_WRITE_ERROR; */
2907   idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->first_next_arc), token_base);
2908 
2909   if (pfwrite(&idx, 4, 1, fp) != 1)
2910     return ESR_WRITE_ERROR;
2911 
2912   idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->next_token_index), token_base);
2913 
2914   if (pfwrite(&idx, 4, 1, fp) != 1)
2915     return ESR_WRITE_ERROR;
2916 
2917   return ESR_SUCCESS;
2918 }
2919 
serializeArcTokenInfo(srec_context * context,PFile * fp)2920 static ESR_ReturnCode serializeArcTokenInfo(srec_context *context,
2921     PFile* fp)
2922 {
2923   int i;
2924   asr_uint32_t idx;
2925   ESR_ReturnCode rc;
2926 
2927   if (pfwrite(&context->arc_token_list_len, 2, 1, fp) != 1)
2928     return ESR_WRITE_ERROR;
2929 
2930   idx = PTR_TO_IDX(context->arc_token_freelist, context->arc_token_list);
2931 
2932   if (pfwrite(&idx, 4, 1, fp) != 1)
2933     return ESR_WRITE_ERROR;
2934 
2935   idx = PTR_TO_IDX(context->arc_token_insert_start, context->arc_token_list);
2936 
2937   if (pfwrite(&idx, 4, 1, fp) != 1)
2938     return ESR_WRITE_ERROR;
2939 
2940   for (i = 0; i < context->arc_token_list_len; ++i)
2941   {
2942     rc = serializeArcToken(context->arc_token_list, i, fp);
2943     if (rc != ESR_SUCCESS) return rc;
2944   }
2945   return ESR_SUCCESS;
2946 }
2947 
deserializeArcToken(arc_token * token_base,int i,PFile * fp)2948 static ESR_ReturnCode deserializeArcToken(arc_token *token_base,
2949     int i,
2950     PFile* fp)
2951 {
2952   arc_token *token = token_base + i;
2953   asr_uint32_t idx[2];
2954   asr_uint16_t labels[2];
2955 
2956   if (pfread(labels, 2, 2, fp) != 2)
2957     return ESR_READ_ERROR;
2958 
2959   token->ilabel = labels[0];
2960   token->olabel = labels[1];
2961 
2962   /* if (pfread(&token->cost, 2, 1, fp) != 1)
2963      return ESR_READ_ERROR; */
2964 
2965   if (pfread(idx, 4, 2, fp) != 2)
2966     return ESR_READ_ERROR;
2967 
2968   token->first_next_arc = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[0], token_base));
2969   token->next_token_index = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[1], token_base));
2970 
2971   return ESR_SUCCESS;
2972 }
2973 
deserializeArcTokenInfo(srec_context * context,PFile * fp)2974 static ESR_ReturnCode deserializeArcTokenInfo(srec_context *context,
2975     PFile* fp)
2976 {
2977   ESR_ReturnCode rc;
2978   int i;
2979   asr_uint32_t idx;
2980 
2981   if (pfread(&context->arc_token_list_len, 2, 1, fp) != 1) {
2982     PLogError("pfread failed in deserializeArcTokenInfo()\n");
2983     return ESR_READ_ERROR;
2984   }
2985 
2986   context->arc_token_list = NEW_ARRAY(arc_token, context->arc_token_list_len,
2987                                       L("srec.graph.wordgraph"));
2988 
2989   if (context->arc_token_list == NULL)
2990     return ESR_OUT_OF_MEMORY;
2991 
2992   if (pfread(&idx, 4, 1, fp) != 1)
2993   {
2994     rc = ESR_READ_ERROR;
2995     goto CLEANUP;
2996   }
2997 
2998   context->arc_token_freelist =
2999     IDX_TO_PTR(idx, context->arc_token_list);
3000 
3001   if (pfread(&idx, 4, 1, fp) != 1)
3002   {
3003     rc = ESR_READ_ERROR;
3004     goto CLEANUP;
3005   }
3006 
3007   context->arc_token_insert_start =
3008     IDX_TO_PTR(idx, context->arc_token_list);
3009 
3010   for (i = 0; i < context->arc_token_list_len; ++i)
3011   {
3012     rc = deserializeArcToken(context->arc_token_list, i, fp);
3013     if (rc != ESR_SUCCESS) goto CLEANUP;
3014   }
3015   return ESR_SUCCESS;
3016 
3017 CLEANUP:
3018   FREE(context->arc_token_list);
3019   context->arc_token_list =
3020     context->arc_token_freelist =
3021       context->arc_token_insert_start = NULL;
3022   return rc;
3023 }
3024 
FST_GetGrammarType(srec_context * context)3025 int FST_GetGrammarType(srec_context* context)
3026 {
3027   arc_token *t, *u;
3028   wordID expected_wdid;
3029   /* 0 1 0
3030      1 2 4
3031      ...
3032      1 2 1316
3033      2
3034   */
3035   t = context->arc_token_list;
3036   if (t->ilabel != WORD_EPSILON_LABEL)
3037     return GrammarTypeBNF;
3038   if (t->next_token_index)
3039     return GrammarTypeBNF;
3040   t = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3041   expected_wdid = NUM_ITEMLIST_HDRWDS;
3042   for (; t; t = ARC_TOKEN_PTR(context->arc_token_list, t->next_token_index))
3043   {
3044     if (t->ilabel != expected_wdid)
3045       return GrammarTypeBNF;
3046     u = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3047     if (u != NULL && u->ilabel != MAXwordID)
3048       return GrammarTypeBNF;
3049     expected_wdid++;
3050   }
3051   if (expected_wdid != context->olabels->num_words)
3052     return GrammarTypeBNF;
3053   return GrammarTypeItemList;
3054 }
3055 
FST_DumpContextAsImageV2(srec_context * context,PFile * fp)3056 int FST_DumpContextAsImageV2(srec_context* context, PFile* fp)
3057 {
3058   asr_uint32_t header[4];
3059   arcID tmp[32], i, j, nfields;
3060   FSMarc* arc;
3061   ESR_ReturnCode rc;
3062 #if !defined(DO_ARCS_IO_IN_ARC_ORDER)
3063   FSMnode* node;
3064   arcID arcid, num_arcs, num_arcs_written;
3065 #endif
3066 
3067   /* Write header information. */
3068   header[0] = 0;
3069   header[1] = IMAGE_FORMAT_V2;
3070   header[2] = context->modelid;
3071   header[3] = context->grmtyp;
3072 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3073   PLogMessage("FST_DumpContextAsImageV2() fstgrmtyp %d", context->grmtyp);
3074 #endif
3075 
3076   if (pfwrite(header, sizeof(header[0]), 4, fp) != 4)
3077   {
3078     PLogError("FST_DumpContextAsImage: Could not write header.\n");
3079     return FST_FAILED_INTERNAL;
3080   }
3081 
3082   /* arcs and nodes */
3083   i = 0;
3084   tmp[i++] = context->num_arcs;
3085   tmp[i++] = context->FSMarc_list_len;
3086   tmp[i++] = context->num_base_arcs;
3087   tmp[i++] = context->FSMarc_freelist;
3088   tmp[i++] = context->max_searchable_arcs;
3089 
3090   tmp[i++] = context->num_nodes;
3091   tmp[i++] = context->FSMnode_list_len;
3092   tmp[i++] = context->num_base_nodes;
3093   tmp[i++] = context->FSMnode_freelist;
3094   tmp[i++] = context->start_node;
3095   tmp[i++] = context->end_node;
3096   tmp[i++] = context->max_searchable_nodes;
3097 
3098   tmp[i++] = context->beg_silence_word;
3099   tmp[i++] = context->end_silence_word;
3100   tmp[i++] = context->hack_silence_word;
3101   tmp[i++] = context->hmm_ilabel_offset;
3102 
3103   nfields = i;
3104 
3105   if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3106     return ESR_WRITE_ERROR;
3107 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3108   PLogMessage("G2G done WR hdrs %d", pftell(fp));
3109 #endif
3110 
3111 #if defined(DO_ARCS_IO_IN_ARC_ORDER)
3112   if( 1) {
3113 	 i = 0;
3114      tmp[i++] = context->end_node;
3115      tmp[i++] = MAXnodeID;
3116      tmp[i++] = MAXmodelID;
3117      tmp[i++] = MAXwordID;
3118      tmp[i++] = FREEcostdata;
3119      nfields = i;
3120      if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3121         return ESR_WRITE_ERROR;
3122   }
3123   for(j=0; j<context->num_arcs; j++) {
3124         arc = &context->FSMarc_list[j];
3125         i = 0;
3126         tmp[i++] = arc->fr_node;
3127         tmp[i++] = arc->to_node;
3128 	ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3129         tmp[i++] = arc->ilabel;
3130         tmp[i++] = arc->olabel;
3131         tmp[i++] = arc->cost;
3132         nfields = i;
3133         if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3134           return ESR_WRITE_ERROR;
3135   }
3136 #else
3137   num_arcs_written = 0;
3138   for (j = 0; j < context->num_nodes; j++) {
3139     node = &context->FSMnode_list[j];
3140     num_arcs = (arcID)num_arcs_leaving(context, node);
3141     arcid = node->un_ptr.first_next_arc;
3142     if (arcid == MAXarcID) {
3143       i = 0;
3144       tmp[i++] = (arcID)j;
3145       tmp[i++] = MAXarcID;
3146       tmp[i++] = MAXarcID;
3147       tmp[i++] = MAXarcID;
3148       tmp[i++] = MAXarcID;
3149       nfields = i;
3150       if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3151         return ESR_WRITE_ERROR;
3152       /* num_arcs_written++; end node don't have a to-arc */
3153     }
3154     else {
3155       for (; arcid != MAXarcID; arcid = arc->linkl_next_arc) {
3156         arc = &context->FSMarc_list[arcid];
3157         i = 0;
3158         tmp[i++] = (arcID)j;
3159         tmp[i++] = arc->to_node;
3160 	ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3161         tmp[i++] = arc->ilabel;
3162         tmp[i++] = arc->olabel;
3163         tmp[i++] = arc->cost;
3164         nfields = i;
3165         if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3166           return ESR_WRITE_ERROR;
3167         num_arcs_written++;
3168       }
3169     }
3170   }
3171   ASSERT(num_arcs_written == context->num_arcs);
3172 #endif
3173 
3174 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3175   PLogMessage("G2G done WR arcs %d", pftell(fp));
3176 #endif
3177 
3178   /* node info   .. will now be calculated on line */
3179   /* wrapup_cost .. will now be calculated on line */
3180 
3181   /* exit points for slots */
3182   if (1)
3183   {
3184     srec_fsm_exit_point *p, *q;
3185     if (pfwrite(&context->num_fsm_exit_points, 2, 1, fp) != 1)
3186       return ESR_WRITE_ERROR;
3187     p = context->fsm_exit_points;
3188     q = p + MAX_NUM_SLOTS;
3189     while (p < q)
3190     {
3191       if (pfwrite(&p->from_node_index, 2, 1, fp) != 1)
3192         return ESR_WRITE_ERROR;
3193       if (pfwrite(&p->arc_index, 2, 1, fp) != 1)
3194         return ESR_WRITE_ERROR;
3195       if (pfwrite(&p->wbto_node_index, 2, 1, fp) != 1)
3196         return ESR_WRITE_ERROR;
3197       ++p;
3198     }
3199   }
3200 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3201   PLogMessage("G2G done WR exits %d", pftell(fp));
3202 #endif
3203   /* no entry points stuff */
3204 
3205   /* no saving ilabels */
3206 
3207   /* now load the ilabels */
3208   rc = serializeWordMapV2(context->olabels, fp);
3209   if (rc != ESR_SUCCESS)
3210     return rc;
3211 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3212   PLogMessage("G2G done WR wmap %d", pftell(fp));
3213 #endif
3214 
3215   if (context->grmtyp != GrammarTypeItemList)
3216   {
3217     if (serializeArcTokenInfo(context, fp) != ESR_SUCCESS)
3218     {
3219       PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3220       return FST_FAILED_INTERNAL;
3221     }
3222   }
3223   else
3224   {
3225     /* nothing, at decoding time, we'll just construct the graph
3226        from the word list, ie just this ..
3227        0 1 0
3228        1 2 4
3229        ...
3230        1 2 1316
3231        2
3232        ie
3233        0 1 0
3234        for(i=4;i<nwords;i++) { "1 2 i" }
3235        2
3236        because words 0-3 are eps,-pau-,-pau2-,slot@grammarname.grxml
3237     */
3238   }
3239 
3240   /* Write header information. */
3241   header[0] = pftell(fp);
3242   header[1] = IMAGE_FORMAT_V2;
3243 
3244   /* goto beginning of file. */
3245   if (pfseek(fp, 0, SEEK_SET))
3246   {
3247     PLogError("FST_DumpContextAsImage: could not reposition for header.\n");
3248     return FST_FAILED_INTERNAL;
3249   }
3250 
3251   if (pfwrite(header, 4, 2, fp) != 2)
3252   {
3253     PLogError("FST_DumpContextAsImage: Could not write header.\n");
3254     return FST_FAILED_INTERNAL;
3255   }
3256 
3257   /* reset pointer at end of file. */
3258   if (pfseek(fp, 0, SEEK_END))
3259   {
3260     PLogError("FST_DumpContextAsImage: could not reposition file pointer at end.\n");
3261     return FST_FAILED_INTERNAL;
3262   }
3263 
3264   return FST_SUCCESS;
3265 }
3266 
3267 
FST_LoadContextFromImageV2(srec_context * fst,PFile * fp)3268 int FST_LoadContextFromImageV2(srec_context* fst, PFile* fp)
3269 {
3270   asr_int32_t header[6];
3271   arcID tmp[32], num_arcs, new_arc_id;
3272   unsigned int i, nfields;
3273   srec_fsm_exit_point *p, *q;
3274   arcID max_num_FSMarcs;
3275   nodeID max_num_FSMnodes;
3276   FSMarc_ptr atok =  (FSMarc_ptr)0;
3277   FSMarc *atoken = NULL;
3278   FSMnode *fr_node, *to_node;
3279   int rc = FST_SUCCESS;
3280   ESR_ReturnCode esr_rc;
3281   ESR_BOOL seenFinalNode = ESR_FALSE;
3282 
3283   /* read header information. */
3284   if (pfread(&header[2], sizeof(header[0]), 2, fp) != 2)
3285   {
3286     PLogError("FST_DumpContextAsImage: Could not read header.\n");
3287     return FST_FAILED_INTERNAL;
3288   }
3289   fst->modelid = header[2];
3290   fst->grmtyp  = header[3];
3291 
3292   nfields = 16;
3293   if (pfread(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3294     return ESR_READ_ERROR;
3295 
3296   /* arcs and nodes */
3297   i = 0;
3298   fst->num_arcs             = max_num_FSMarcs = tmp[i++];
3299   fst->FSMarc_list_len      = tmp[i++];
3300   fst->num_base_arcs        = tmp[i++];
3301   fst->FSMarc_freelist      = tmp[i++];
3302   fst->max_searchable_arcs  = tmp[i++] * 0; /*5*/
3303   fst->num_nodes            = max_num_FSMnodes = tmp[i++];
3304   fst->FSMnode_list_len     = tmp[i++];
3305   fst->num_base_nodes       = tmp[i++];
3306   fst->FSMnode_freelist     = tmp[i++];
3307   fst->start_node           = tmp[i++];/*10*/
3308   fst->end_node             = tmp[i++];
3309   fst->max_searchable_nodes = tmp[i++] * 0;
3310   fst->beg_silence_word     = tmp[i++];
3311   fst->end_silence_word     = tmp[i++];
3312   fst->hack_silence_word    = tmp[i++]; /*15*/
3313   fst->hmm_ilabel_offset    = tmp[i++]; /* 16 */
3314 
3315   fst->addWordCaching_lastslot_name = 0;
3316   fst->addWordCaching_lastslot_num = MAXwordID;
3317   fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
3318 
3319   ASSERT(i == nfields);
3320 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3321   PLogMessage("G2G done RD hdrs %d", pftell(fp));
3322 #endif
3323 
3324   ASSERT(fst->num_arcs >= fst->num_base_arcs); // was ==
3325   ASSERT(fst->num_nodes >= fst->num_base_nodes); // was ==
3326 
3327   fst->FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
3328   if (!fst->FSMarc_list)
3329   {
3330     rc = FST_FAILED_ON_MEMORY;
3331     PLogError("CALLOC_CLR fst->FSMarc_list \n");
3332     goto CLEANUP;
3333   }
3334   fst->FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
3335   if (!fst->FSMnode_list)
3336   {
3337     rc = FST_FAILED_ON_MEMORY;
3338     PLogError("CALLOC_CLR fst->FSMnode_list  failed\n");
3339     goto CLEANUP;
3340   }
3341   fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
3342 
3343   if (!fst->FSMnode_info_list)
3344   {
3345     rc = FST_FAILED_ON_MEMORY;
3346     PLogError("CALLOC_CLR fst->FSMnode_info_list failed\n");
3347     goto CLEANUP;
3348   }
3349 
3350   /* setup the arc freelist */
3351   fst->FSMarc_freelist = 0;
3352   fst->num_arcs = 0;
3353   for (i = 0; i < (unsigned)fst->FSMarc_list_len - 1; i++)
3354   {
3355     fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
3356     fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3357   }
3358   fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
3359   fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3360 
3361   /* initialize the nodes, 'cuz reading is random order */
3362   for (i = 0; i < max_num_FSMnodes; i++)
3363   {
3364     fr_node = &fst->FSMnode_list[i];
3365     fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
3366   }
3367 
3368   /* 1. first load up all the information */
3369   num_arcs = 0;
3370   for (i = 0; i < max_num_FSMarcs || !seenFinalNode; i++)
3371   {
3372     if (i > max_num_FSMarcs && !seenFinalNode)
3373     {
3374       PLogError("Final node never encountered");
3375       rc = ESR_INVALID_STATE;
3376       goto CLEANUP;
3377     }
3378 	nfields = 5;
3379     if (pfread(tmp,sizeof(tmp[0]),nfields,fp) != nfields)
3380     {
3381       PLogError("reading arc");
3382       rc = ESR_INVALID_STATE;
3383       goto CLEANUP;
3384     }
3385 
3386     if (tmp[1] == MAXnodeID)
3387     {
3388       seenFinalNode = ESR_TRUE;
3389       if(fst->end_node != tmp[0]) {
3390 	PLogError("error with arc %d->%d ilabel %d olabel %d cost %d\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4]);
3391       ASSERT(fst->end_node == tmp[0]);
3392       }
3393       i--;
3394     }
3395     else
3396     {
3397       new_arc_id = fst_get_free_arc(fst);
3398       if (new_arc_id == MAXarcID)
3399         return FST_FAILED_ON_MEMORY;
3400       atok = ARC_ItoX(new_arc_id);
3401       atoken = ARC_XtoP(atok);
3402       num_arcs++;
3403 
3404       ASSERT(tmp[0] < fst->num_nodes);
3405       ASSERT(tmp[1] < fst->num_nodes);
3406       fr_node = &fst->FSMnode_list[ tmp[0]];
3407       to_node = &fst->FSMnode_list[ tmp[1]];
3408       atoken->ilabel = tmp[2];
3409       atoken->olabel = tmp[3];
3410       atoken->cost   = tmp[4];
3411       append_arc_leaving_node(fst, fr_node, atok);
3412       append_arc_arriving_node(fst, to_node, atok);
3413       atoken->fr_node = NODE_ItoX(tmp[0]);
3414       atoken->to_node = NODE_ItoX(tmp[1]);
3415     }
3416   }
3417   ASSERT(fst->num_arcs == num_arcs);
3418 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3419   PLogMessage("G2G done RD arcs %d", pftell(fp));
3420 #endif
3421 
3422   /* setup the node freelist */
3423   if (fst->num_nodes < fst->FSMnode_list_len)
3424   {
3425     fst->FSMnode_freelist = fst->num_nodes;
3426     for (i = fst->num_nodes; i < (unsigned)fst->FSMnode_list_len - 1; i++)
3427     {
3428       fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
3429       fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3430     }
3431     fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
3432     fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3433   }
3434   else
3435   {
3436     fst->FSMnode_freelist = MAXnodeID;
3437   }
3438 
3439   /* node info   .. will now be calculated on line */
3440   /* wrapup_cost .. will now be calculated on line */
3441   fst->whether_prepared = 0;
3442   fst_fill_node_info(fst);
3443   for (i = 0; i < fst->num_nodes; i++)
3444     fst->FSMnode_info_list[i] = NODE_INFO_UNKNOWN;
3445 
3446   /* exit points for slots */
3447   if (pfread(&fst->num_fsm_exit_points, 2, 1, fp) != 1)
3448     return ESR_READ_ERROR;
3449   p = fst->fsm_exit_points;
3450   q = p + MAX_NUM_SLOTS;
3451   while (p < q)
3452   {
3453     if (pfread(&p->from_node_index, 2, 1, fp) != 1)
3454       return ESR_WRITE_ERROR;
3455     if (pfread(&p->arc_index, 2, 1, fp) != 1)
3456       return ESR_WRITE_ERROR;
3457     if (pfread(&p->wbto_node_index, 2, 1, fp) != 1)
3458       return ESR_WRITE_ERROR;
3459     ++p;
3460   }
3461 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3462   PLogMessage("G2G done RD exits %d", pftell(fp));
3463 #endif
3464 
3465   /* no entry points stuff */
3466 
3467   /* ilabels were not saved, create them as empty */
3468   fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
3469   fst->ilabels->num_words = fst->ilabels->max_words = 0;
3470   fst->ilabels->words = 0;
3471 
3472   /* now save the olabels */
3473   esr_rc = deserializeWordMapV2(&fst->olabels, fp);
3474   if (esr_rc != ESR_SUCCESS)
3475   {
3476     PLogError("deserializeWordMapV2() failed\n");
3477     rc = FST_FAILED_INTERNAL;
3478     goto CLEANUP;
3479   }
3480 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3481   PLogMessage("G2G done RD wmap %d", pftell(fp));
3482 #endif
3483 
3484   /* arctokeninfo */
3485   if (fst->grmtyp != GrammarTypeItemList)
3486   {
3487     if (deserializeArcTokenInfo(fst, fp) != ESR_SUCCESS)
3488     {
3489       PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3490       rc = FST_FAILED_INTERNAL;
3491       goto CLEANUP;
3492     }
3493   }
3494   else
3495   {
3496     /* here we just construct an isolated list, but we should scrap it later */
3497     wordID wdid;
3498     arc_token *atl, *final, *atli;
3499     fst->arc_token_list_len = fst->olabels->num_words + 2 - NUM_ITEMLIST_HDRWDS;
3500     atl = NEW_ARRAY(arc_token, fst->arc_token_list_len, L("srec.graph.wordgraph"));
3501     if(!atl) {
3502       PLogError("fst->arc_token_list_len failed\n");
3503       goto CLEANUP;
3504     }
3505     atl->first_next_arc = ARC_TOKEN_LNK(atl, 1);
3506     atl->next_token_index = ARC_TOKEN_NULL;
3507     final = atl + fst->arc_token_list_len - 1;
3508     for (i = 1, wdid = NUM_ITEMLIST_HDRWDS; wdid < fst->olabels->num_words; i++, wdid++)
3509     {
3510       atli = &atl[i];
3511       atli->ilabel = wdid;
3512       atli->olabel = wdid;/*not used*/
3513       atli->next_token_index = ARC_TOKEN_LNK(atl, (i + 1));
3514       atli->first_next_arc = ARC_TOKEN_LNK(atl, (fst->arc_token_list_len - 1)); /*final*/
3515     }
3516     ASSERT(atl + i == final);
3517     final->next_token_index = ARC_TOKEN_NULL;
3518     final->first_next_arc = ARC_TOKEN_NULL;
3519     final->ilabel = final->olabel = 0;
3520     fst->arc_token_list = atl;
3521   }
3522   return FST_SUCCESS;
3523 CLEANUP:
3524   return FST_FAILED_INTERNAL;
3525 }
3526 
3527 
3528 int FST_LoadContextFromImageV2(srec_context* context, PFile* fp);
3529 
FST_LoadContextFromImage(srec_context ** pcontext,PFile * fp)3530 int FST_LoadContextFromImage(srec_context** pcontext, PFile* fp)
3531 {
3532   srec_context* context = NULL;
3533   /* used to force correct data alignment */
3534   asr_int32_t header[2];
3535   int rc = FST_SUCCESS;
3536 
3537   /*printf("FST_LoadContexFromImage\n");*/
3538   if (!fp)
3539   {
3540     PLogError("FST_LoadContextImage() failed; bad file pointer\n");
3541     return FST_FAILED_ON_INVALID_ARGS;
3542   }
3543 
3544   context = NEW(srec_context, L("srec.graph.binary"));
3545   if (context == NULL)
3546   {
3547     PLogError("FST_LoadContextFromImage: out of memory while allocating context.\n");
3548     return FST_FAILED_ON_MEMORY;
3549   }
3550   memset(context, 0, sizeof(srec_context));
3551 
3552   /* header */
3553   if (pfread(header, 4, 2, fp) != 2)
3554   {
3555     PLogError("FST_LoadContextFromImage: failed reading header.\n");
3556     rc = FST_FAILED_ON_INVALID_ARGS;
3557     goto CLEANUP;
3558   }
3559 
3560   if (header[1] == IMAGE_FORMAT_V2)
3561   {
3562     rc = FST_LoadContextFromImageV2(context, fp);
3563     if (rc != FST_SUCCESS)
3564       goto CLEANUP;
3565   }
3566   else
3567   {
3568     PLogError("FST_LoadContextFromImage() failed on image_format\n");
3569     goto CLEANUP;
3570   }
3571   *pcontext = context;
3572   return FST_SUCCESS;
3573 CLEANUP:
3574   if (context) FREE(context);
3575   *pcontext = 0;
3576   return rc;
3577 }
3578 
3579 
FST_DumpReverseWordGraph(srec_context * context,PFile * fp)3580 int FST_DumpReverseWordGraph(srec_context* context, PFile* fp)
3581 {
3582   /* not implemented, use FST_DumpSyntaxAsImage() for now */
3583   return 1;
3584 }
3585 
3586 /* this belongs part of the srec_context */
3587 typedef struct
3588 {
3589   asr_int32_t image_format;
3590   asr_int32_t image_size;
3591   asr_int32_t sizes_signature;
3592 }
3593 context_image_header;
3594 
3595 
FST_LoadContextFromStreamImage(srec_context ** pcontext,char * buffer,asr_int32_t buffer_len)3596 int FST_LoadContextFromStreamImage(srec_context** pcontext, char* buffer, asr_int32_t buffer_len)
3597 {
3598   return FST_FAILED_INTERNAL;
3599 }
3600 
3601 typedef struct nodeID_list_t
3602 {
3603   nodeID *nodes, num_nodes, num_alloc;
3604 }
3605 nodeID_list;
3606 
fst_node_has_speech_to_come(srec_context * context,nodeID node_index)3607 int fst_node_has_speech_to_come(srec_context* context, nodeID node_index)
3608 {
3609   /* recursive func ... try to keep stack use low! */
3610   /* later we should cache this information:
3611      need 2bits:  endn opte regl dunno */
3612   FSMarc* arc;
3613   arcID arc_index = context->FSMnode_list[ node_index].un_ptr.first_next_arc;
3614   for (; arc_index != MAXarcID; arc_index = arc->linkl_next_arc)
3615   {
3616     arc = &context->FSMarc_list[arc_index];
3617     if (arc->ilabel >= context->hmm_ilabel_offset + EPSILON_OFFSET + NUM_SILENCE_HMMS)
3618     {
3619       return 1;
3620     }
3621     else if (arc->ilabel < EPSILON_OFFSET)
3622     {
3623       if (fst_node_has_speech_to_come(context, arc->to_node))
3624       {
3625         return 1;
3626       }
3627     }
3628     else if (IS_SILENCE_ILABEL(arc->ilabel, context))
3629     {
3630       /* another silence! */
3631       if (fst_node_has_speech_to_come(context, arc->to_node))
3632         return 1;
3633     }
3634   }
3635   return 0;
3636 }
3637 
fst_fill_node_info(srec_context * context)3638 int fst_fill_node_info(srec_context* context)
3639 {
3640   char* infos = context->FSMnode_info_list;
3641   nodeID_list optends_obj;
3642   nodeID_list *optends = &optends_obj;
3643 
3644   FSMnode* node;
3645   FSMarc* arc;
3646   nodeID i, j, node_index;
3647   arcID arc_index;
3648   costdata wrapup_cost;
3649   int wrapup_cost_inconsistencies;
3650   /* int num_near_end_nodes; nodes within EPS or silence to the end, these
3651      are called NODE_INFO_ENDNODES, so that they can be excluded from the
3652      comparison with end_node score in the eos detection */
3653 
3654   optends->num_nodes    = 1;
3655   optends->num_alloc    = 8192; /* scaled up from 512 to support dynamic grammar add word of 5000 */
3656   optends->nodes        = (nodeID*)CALLOC(optends->num_alloc, sizeof(nodeID), "srec.tmp.optendnodes");
3657   optends->nodes[0]     = context->end_node;
3658   /* num_near_end_nodes = 0; */
3659 
3660   for (i = 0; i < optends->num_nodes; i++)
3661   {
3662     node_index = optends->nodes[i];
3663     node = &context->FSMnode_list[ node_index];
3664     for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3665          arc_index = arc->linkl_prev_arc)
3666     {
3667       arc = &context->FSMarc_list[arc_index];
3668       if (arc->fr_node != node_index)
3669       {
3670 
3671         if (IS_SILENCE_ILABEL(arc->ilabel, context) || arc->ilabel < EPSILON_OFFSET)
3672         {
3673           /* ok, fr_node goes to the end via silence, check for dups */
3674           for (j = 0; j < optends->num_nodes; j++)
3675             if (optends->nodes[j] == arc->fr_node) break;
3676           /* append it to the list */
3677           if (j == optends->num_nodes)
3678           {
3679             if (optends->num_nodes < optends->num_alloc)
3680               optends->nodes[ optends->num_nodes++] = arc->fr_node;
3681             else
3682             {
3683               ASSERT(0 && "allocated too few optend nodes");
3684               return 0;
3685             }
3686           }
3687         }
3688       }
3689     }
3690   }
3691 
3692   /* now set the info array, default is regular */
3693   for (i = 0; i < context->num_nodes; i++)
3694     infos[i] = NODE_INFO_REGULAR;
3695   /* also set the other nodes, we'll check whether nodes were added
3696      via these flags */
3697   for (; i < context->FSMnode_list_len; i++)
3698     infos[i] = NODE_INFO_UNKNOWN;
3699   infos[ context->end_node] = NODE_INFO_ENDNODE;
3700 
3701   /* get rid of direct to end nodes, etc */
3702   for (i = 0, j = 0; i < optends->num_nodes; i++)
3703   {
3704     optends->nodes[j] = optends->nodes[i];
3705     if (fst_node_has_speech_to_come(context, optends->nodes[i]))
3706     {
3707       j++;
3708       infos[ optends->nodes[i]] = NODE_INFO_OPTENDN;
3709     }
3710     else
3711     {
3712       infos[ optends->nodes[i]] = NODE_INFO_ENDNODE;
3713       /* num_near_end_nodes++; */
3714     }
3715   }
3716   optends->num_nodes = j;
3717 
3718   /* printf("get_oppend_nodes (%d)", optends->num_nodes);
3719      for(i=0; i<optends->num_nodes; i++) printf(" %d", optends->nodes[i]);
3720      printf("\n");
3721   */
3722 
3723   FREE(optends->nodes);
3724 
3725   /* find the wrapup cost */
3726   node = &context->FSMnode_list[ context->end_node];
3727   wrapup_cost = MAXcostdata;
3728   wrapup_cost_inconsistencies = 0;
3729   for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3730        arc_index = arc->linkl_prev_arc)
3731   {
3732     arc = &context->FSMarc_list[ arc_index];
3733     if (IS_SILENCE_ILABEL(arc->ilabel, context) &&
3734         arc->olabel == context->end_silence_word)
3735     {
3736       if (wrapup_cost == MAXcostdata)
3737         wrapup_cost = arc->cost;
3738       else if (context->wrapup_cost != arc->cost)
3739       {
3740         wrapup_cost = arc->cost;
3741         wrapup_cost_inconsistencies++;
3742       }
3743     }
3744   }
3745 #define MIN_WRAPUP_COST 100
3746   context->wtw_average = wrapup_cost;
3747   if (context->wtw_average > 200)
3748     context->wtw_average = 200;
3749   if (context->wrapup_cost < MIN_WRAPUP_COST)
3750     context->wrapup_cost = MIN_WRAPUP_COST;
3751   /*log_report("context->wrapup_cost %d (%d)\n", context->wrapup_cost,
3752     wrapup_cost_inconsistencies);*/
3753   return 0;
3754 }
3755 
fst_alloc_transit_points(srec_context * context)3756 int fst_alloc_transit_points(srec_context* context)
3757 {
3758   arcID i;
3759   wordID num_slots = context->olabels->num_slots;
3760   wordID olabel;
3761   asr_int16_t nfxps = 0;
3762   FSMarc* arc;
3763   FSMnode* node;
3764 
3765   context->num_fsm_exit_points = 0;
3766   /* slot 0 is invalid, it is the "eps", so num_slots==1 means no slots! */
3767   if (num_slots == 1)
3768     return 0;
3769 
3770   /* scan through to finds arc that carry rule references */
3771   for (i = 0; i < context->num_arcs; i++)
3772   {
3773     olabel =  context->FSMarc_list[i].olabel;
3774     /* (wordID)(olabel-1) < (num_slots-1) ?? might work */
3775     if (olabel > 0 && olabel < num_slots)
3776     {
3777       context->FSMarc_list[i].cost = DISABLE_ARC_COST;
3778       if (nfxps >= MAX_NUM_SLOTS)
3779       {
3780         PLogError("error: too many fsm exit points in fsm, too many public rules referenced from here\n");
3781         return 0;
3782       }
3783       context->fsm_exit_points[nfxps].arc_index = i;
3784       context->fsm_exit_points[nfxps].from_node_index = context->FSMarc_list[i].fr_node;
3785       /* skip over the trailing silence and .wb labels */
3786       for (arc = &context->FSMarc_list[i]; arc->ilabel != WORD_BOUNDARY;)
3787       {
3788         node = &context->FSMnode_list[arc->to_node];
3789         arc =  &context->FSMarc_list[node->un_ptr.first_next_arc];
3790       }
3791       context->fsm_exit_points[nfxps].wbto_node_index = arc->to_node;
3792       nfxps++;
3793 
3794     } /* olabel<num_slots */
3795   } /* i<num_arcs */
3796   context->num_fsm_exit_points  = nfxps;
3797   return 0;
3798 }
3799 
3800 
3801 /*
3802   cross-word modeling:
3803   for now we'll assume silence context on either side, later we'll create
3804   a graph with the right fan out on the edges of each slot.  At that point we
3805   may have routines like ... if(is_dead_end(arc)) continue;
3806 
3807   word removal:
3808   we'll use refcounters on arcs, we can search the whole graph for the
3809   olabel, then remove forward and remove backwards
3810 
3811   word addition:
3812   glenville ^ hmm183 hmm222 hmm162 hmm246 hmm346 hmm191 hmm219
3813 
3814   set imap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/model.map
3815   set omap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/namesnnumsSC.map
3816   wintel/debug/addword.exe -min
3817   $SWISDK/bin/fsmcompileD -t -i $imap -o $omap -F out.PCLG out.PCLG.txt
3818   $SWISDK/bin/fsmdrawD -i $imap -o $omap out.PCLG > out.PCLG.dot
3819   dotty out.PCLG.dot
3820 
3821   need
3822   - a sanity check
3823   - remove word
3824   - multiple pronunciations handling
3825   - homonyms handling, other boundary conditions checking
3826   - measure the speed
3827 
3828   alan_adams al~#ad}z
3829   alan_adams al~ad}z
3830   paul_adams p{l#ad}z
3831 
3832 
3833 
3834 */
3835