1 /*---------------------------------------------------------------------------*
2 * srec_context.c *
3 * *
4 * Copyright 2007, 2008 Nuance Communciations, Inc. *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20
21 #include "stdlib.h"
22 #include "string.h"
23 #include "buildopt.h"
24 #include "setting.h"
25 #include "passert.h"
26 #include "pendian.h"
27 #include "portable.h"
28 #include "pstdio.h"
29 #include "ptypes.h"
30 #include "srec_context.h"
31 #include "srec_sizes.h"
32 #include "search_network.h"
33 #include "srec_arb.h"
34 #include "hmm_desc.h"
35 #if USE_COMP_STATS
36 #include "comp_stats.h"
37 #endif
38
39 #ifdef SET_RCSID
40 static const char *rcsid = 0 ? (const char *) &rcsid :
41 "$Id: srec_context.c,v 1.84.4.54 2008/05/15 20:06:39 dahan Exp $";
42 #endif
43
44
45 /* these are constants for now, we need to figure out how to make
46 this more flexible, possible ideas are:
47 (1) to specify these via API functions such as CR_LoadSyntax(),
48 (2) use defaults that would have been set the main parfile,
49 (3) put values for these fields in an actual grammar
50 */
51
52 #define AVG_ARCS_PER_WORD 10
53 #define AVG_NODES_PER_WORD 7
54 #define AVG_CHARS_PER_WORD 18
55 #define MAX_PRON_LEN 1024
56 #define MAX_WORD_LEN 128
57 #define DO_WEIGHTED_ADDWORD 1
58 #define SLOTLOOP_OFFSET 64
59 #define SLOTNAME_INDICATOR "__"
60 #define RULE_INDICATOR '@'
61 #define MAX_LINE_LENGTH 512
62 #define DEFAULT_WB_COST 40
63 #define DISABLE_ARC_COST 999
64 #define DO_ARCS_IO_IN_ARC_ORDER 1
65
66 #define IS_SILENCE_ILABEL(ilabel,context) (ilabel >= context->hmm_ilabel_offset+EPSILON_OFFSET && ilabel<context->hmm_ilabel_offset+EPSILON_OFFSET+NUM_SILENCE_HMMS)
67 // looking to match filename.grxml.Names@__Names__
68 // see also grxmlcompile.cpp where these names are constructed
69 #define IS_SLOT_OLABEL(wd) (strchr(wd,IMPORTED_RULES_DELIM)<strstr(wd,SLOTNAME_INDICATOR) && strlen(wd)>4 && !strcmp(wd+strlen(wd)-2,SLOTNAME_INDICATOR) )
70
71 /*
72 SYNTAX : DATA_ALIGN(pointer, data type, bytes_filled)
73 EXAMPLE: We need to cast a memory address to a (wordmap*)
74 so we call DATA_ALIGN(memptr, wordmap, filled),
75 where FILLED contains the number of bytes that were used to align the pointer
76 */
77 #if (CPU & CPU_ARM)||(CPU & CPU_STRONGARM)
78 /* under IPAQ it appears we need to align the structures
79 to certain boundaries. More attention needed here for other
80 platforms ! */
81 #define DATA_ALIGN(x, y, z) if ((int) (x) % sizeof(y)) { \
82 (z)=sizeof(y) - ((int) (x) % sizeof(y)); \
83 (x) += (z); /* force correct data alignment */ \
84 } else z=0;
85 #else
86 #define DATA_ALIGN(x, y, z)
87 #endif
88
89 static const int do_minimize = 1;
90 #define PTR_TO_IDX(ptr, base) ((asr_uint32_t) (ptr == NULL ? 0xFFFFFFFFu : (asr_uint32_t)(ptr - base)))
91 #define IDX_TO_PTR(idx, base) (idx == 0xFFFFFFFFu ? NULL : base + idx)
92
93 /* prototypes */
94
95 /*----------------------------------------------------------------------*
96 * *
97 * internal prototypes *
98 * *
99 *----------------------------------------------------------------------*/
100
101 /* general use functions */
102 char split_line(char* line, char** av);
103 asr_int32_t atoi_with_check(const char* buf, asr_int32_t max);
104 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc);
105 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc);
106 int printf_node1(srec_context* fst, FSMnode* node);
107
108 /* fst_* functions are internal fst functions */
109 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
110 wordID olabel, costdata cost,
111 modelID* model_sequence, int num_models);
112 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc);
113 int fst_push_arc_cost(srec_context* fst, FSMarc* arc);
114 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc);
115 int fst_free_arc(srec_context* fst, FSMarc* arc);
116 int fst_free_node(srec_context* fst, FSMnode* node);
117 int fst_free_arc_net(srec_context* fst, FSMarc* arc);
118 arcID fst_get_free_arc(srec_context* fst);
119 nodeID fst_get_free_node(srec_context* fst);
120 int fst_pack_arc_usage(srec_context* fst);
121
122 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr atok);
123 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr atok);
124 int num_arcs_leaving(srec_context* fst, FSMnode* node);
125
126 int num_arcs_arriving(srec_context* fst, FSMnode* node);
127 int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node);
128 int fst_fill_node_info(srec_context* context);
129 int fst_alloc_transit_points(srec_context* context);
130
131 /* higher-level functions */
132 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add,
133 PFile* fp);
134 int FST_LoadParams(srec_context* context, PFile* fp);
135 int FST_AssumeParams(srec_context* context);
136 int FST_UnloadReverseWordGraph(srec_context* context);
137 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree);
138
139 static ESR_ReturnCode wordmap_clean ( wordmap *word_map );
140 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words );
141
142 /*--------------------------------------------------------------------------*
143 * *
144 * *
145 * *
146 *--------------------------------------------------------------------------*/
147
FST_IsVoiceEnrollment(srec_context * context)148 int FST_IsVoiceEnrollment(srec_context* context)
149 {
150 if (context->olabels == NULL) return 0;
151 if (context->olabels->num_words < 2) return 0;
152 if (strstr(context->olabels->words[1], "enroll")) return 1;
153 return 0;
154 }
155
FST_LoadContext(const char * synbase,srec_context ** pcontext,int num_words_to_add)156 int FST_LoadContext(const char* synbase, srec_context** pcontext,
157 int num_words_to_add)
158 {
159 int rc;
160 PFile* fp;
161 char buffer[MAX_LINE_LENGTH];
162 srec_context* context;
163
164 context = (srec_context*)CALLOC_CLR(1, sizeof(srec_context), "srec.graph.base");
165 if(!context)
166 return FST_FAILED_ON_MEMORY;
167 memset(context, 0, sizeof(srec_context));
168
169 context->addWordCaching_lastslot_name = 0;
170 context->addWordCaching_lastslot_num = MAXwordID;
171 context->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
172
173 sprintf(buffer, "%s.map", synbase);
174 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
175 if (!fp) return FST_FAILED_ON_INVALID_ARGS;
176 rc = FST_LoadWordMap(&context->olabels, num_words_to_add, fp);
177 pfclose(fp);
178 if (rc) return rc;
179
180 sprintf(buffer, "%s.PCLG.txt", synbase);
181 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
182 if (!fp) return FST_FAILED_ON_INVALID_ARGS;
183 rc = FST_LoadGraph(context, context->ilabels, context->olabels, num_words_to_add, fp);
184 pfclose(fp);
185 if (rc != FST_SUCCESS) return rc;
186
187 sprintf(buffer, "%s.Grev2.det.txt", synbase);
188 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
189 if (!fp) return FST_FAILED_ON_INVALID_ARGS;
190 rc = FST_LoadReverseWordGraph(context, num_words_to_add, fp);
191 pfclose(fp);
192 if (rc != FST_SUCCESS) return rc;
193
194 sprintf(buffer, "%s.params", synbase);
195 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
196 if (fp) {
197 rc = FST_LoadParams(context, fp);
198 pfclose(fp);
199 if (rc != FST_SUCCESS) return rc;
200 } else {
201 rc = FST_AssumeParams(context);
202 }
203
204 context->grmtyp = (arcID)FST_GetGrammarType(context);
205
206 *pcontext = context;
207 rc = fst_fill_node_info(context);
208 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
209 }
210
FST_UnloadContext(srec_context * context)211 void FST_UnloadContext(srec_context* context)
212 {
213 if (!context) return;
214
215 FST_UnloadWordMap(&context->ilabels);
216 FST_UnloadWordMap(&context->olabels);
217 FST_UnloadGraph(context);
218 FST_UnloadReverseWordGraph(context);
219 FREE(context);
220 }
221
FST_PrepareContext(srec_context * context)222 int FST_PrepareContext(srec_context* context)
223 {
224 nodeID i;
225 int rc = FST_SUCCESS;
226 /* after all word additions, we need to "prepare" the context */
227
228 /* first check for any changes to optional endnodes etc .. */
229 for (i = 0; i < context->num_nodes; i++)
230 if (context->FSMnode_info_list[i] == NODE_INFO_UNKNOWN)
231 break;
232 if (i != context->num_nodes)
233 rc = fst_fill_node_info(context);
234
235 context->whether_prepared = 1;
236 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
237 }
238
fst_set_wb_costs(srec_context * context,costdata wbcost)239 void fst_set_wb_costs( srec_context* context, costdata wbcost)
240 {
241 unsigned int i;
242 for(i=0; i<(unsigned int)context->FSMarc_list_len; i++) {
243 if(context->FSMarc_list[i].ilabel == WORD_BOUNDARY)
244 context->FSMarc_list[i].cost = wbcost;
245 }
246 }
247
FST_LoadParams(srec_context * context,PFile * fp)248 int FST_LoadParams(srec_context* context, PFile* fp)
249 {
250 char line[MAX_LINE_LENGTH];
251 costdata wbcost = MAXcostdata;
252 while (pfgets(line, MAX_LINE_LENGTH, fp))
253 {
254 char* key = strtok(line," \n\r\t");
255 char* val = key ? strtok(NULL," \n\r\t") : NULL;
256 if(val && !strcmp(val,"="))
257 val = key ? strtok(NULL," \n\r\t") : NULL;
258 if(!key || !key[0])
259 continue;
260 else if(val && val[0]) {
261 if(!strcmp(key,"word_penalty")) {
262 wbcost = (costdata)atoi_with_check(val, MAXcostdata);
263 if(wbcost == MAXcostdata) {
264 return FST_FAILED_ON_INVALID_ARGS;
265 }
266 } else {
267 PLogError(L("error: unknown parameter %s in .params file"), key);
268 return FST_FAILED_ON_INVALID_ARGS;
269 }
270 }
271 }
272 if(wbcost != MAXcostdata)
273 fst_set_wb_costs( context, wbcost);
274 return FST_SUCCESS;
275 }
276
FST_AssumeParams(srec_context * context)277 int FST_AssumeParams( srec_context* context)
278 {
279 fst_set_wb_costs( context, (costdata)DEFAULT_WB_COST);
280 return FST_SUCCESS;
281 }
282
283
284 /*--------------------------------------------------------------------------*
285 * *
286 * *
287 * *
288 *--------------------------------------------------------------------------*/
289
hmm_number(const char * hmm_Name,modelID hmm_ilabel_offset)290 modelID hmm_number(const char* hmm_Name, modelID hmm_ilabel_offset)
291 {
292 if (!strcmp(hmm_Name, "eps")) return EPSILON_LABEL;
293 if (!strcmp(hmm_Name, ".wb")) return WORD_BOUNDARY;
294 if (!strcmp(hmm_Name, ".ph")) return PHONE_BOUNDARY;
295 ASSERT(hmm_Name[0] == 'h' && hmm_Name[1] == 'm' && hmm_Name[2] == 'm' && hmm_Name[3]);
296 return (modelID)(hmm_ilabel_offset + (modelID)atoi_with_check(hmm_Name + 3, MAXmodelID));
297 }
298
hmm_name(modelID ilabel,modelID hmm_ilabel_offset,char * buf)299 char* hmm_name(modelID ilabel, modelID hmm_ilabel_offset, char* buf)
300 {
301 if (ilabel == EPSILON_LABEL)
302 sprintf(buf, "eps");
303 else if (ilabel == WORD_BOUNDARY)
304 sprintf(buf, ".wb");
305 else if (ilabel == PHONE_BOUNDARY)
306 sprintf(buf, ".ph");
307 else
308 sprintf(buf, "hmm%03d", ilabel - hmm_ilabel_offset);
309 return buf;
310 }
311
312 /*------------------------------------------------------------------*
313 * *
314 * words *
315 * *
316 *------------------------------------------------------------------*/
HashCmpWord(const LCHAR * key1,const LCHAR * key2)317 int HashCmpWord(const LCHAR *key1, const LCHAR *key2)
318 {
319 return LSTRCMP(key1,key2);
320 }
HashGetCode(const void * key)321 unsigned int HashGetCode(const void *key)
322 {
323 const LCHAR* k = (const LCHAR*)key;
324 unsigned int i, len, h = 0;
325 len = LSTRLEN( k);
326 for ( i = 0; i < len; i++)
327 h = 31*h + (unsigned int)k[i];
328 return h;
329 }
330
wordmap_create(wordmap ** pwmap,int num_chars,int num_words,int num_words_to_add)331 int wordmap_create(wordmap** pwmap, int num_chars, int num_words, int num_words_to_add)
332 {
333 wordmap* Interface;
334 PHashTableArgs hashArgs;
335 ESR_ReturnCode rc;
336
337 Interface = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.wordmap.base");
338 Interface->max_words = (wordID)(num_words + num_words_to_add);
339 Interface->num_words = (wordID)0;
340 /* *pwmap->words = (ptr32*) CALLOC_CLR(wmap->max_words, sizeof(ptr32), "graph.wordmap.words"); */
341 Interface->words = (char**) CALLOC_CLR(Interface->max_words, sizeof(char*), "srec.graph.wordmap.words");
342 Interface->max_chars = num_chars + num_words_to_add * AVG_CHARS_PER_WORD;
343 Interface->chars = (char*) CALLOC_CLR(Interface->max_chars, sizeof(char), "srec.graph.wordmap.chars");
344 Interface->next_chars = Interface->chars;
345 Interface->wordIDForWord = NULL;
346 *pwmap = Interface;
347
348 /* use a hashtable to save mapping between wdID and array index */
349 if (num_words_to_add >= 0)
350 {
351 hashArgs.capacity = num_words + num_words_to_add + 10;
352 if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
353 hashArgs.compFunction = HashCmpWord; // PHASH_TABLE_DEFAULT_COMP_FUNCTION;
354 hashArgs.hashFunction = HashGetCode; // PHASH_TABLE_DEFAULT_HASH_FUNCTION;
355 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
356 CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.wordmap_create()"), &Interface->wordIDForWord));
357 }
358 else
359 {
360 Interface->wordIDForWord = NULL;
361 }
362
363 return FST_SUCCESS;
364
365 CLEANUP:
366 wordmap_destroy(pwmap);
367 return rc;
368 }
369 wordID wordmap_add_word(wordmap* wmap, const char* word);
370
FST_LoadWordMap(wordmap ** pwmap,int num_words_to_add,PFile * fp)371 int FST_LoadWordMap(wordmap** pwmap, int num_words_to_add, PFile* fp)
372 {
373 wordID my_wID, wm_wID;
374 char *word, *wID_string;
375 int num_words;
376 asr_int32_t num_chars;
377 char buf[MAX_LINE_LENGTH];
378 long fpos;
379 wordmap* wmap;
380 /* figure out the number of words */
381 fpos = pftell(fp);
382 for (num_words = 0, num_chars = 0; pfgets(buf, MAX_LINE_LENGTH, fp); num_words++)
383 {
384 word = strtok(buf, " \n\r\t");
385 num_chars += strlen(word);
386 }
387 num_chars += num_words * 2; /* for null terminators */
388 pfseek(fp, fpos, SEEK_SET);
389
390 /* Alex added this to reuse this functionality elsewhere
391 destroy function for this create function is actually taken care of by the
392 FST_UnloadWordMap function
393 */
394 wordmap_create(&wmap, num_chars, num_words, num_words_to_add);
395
396 /* later replace this with wordblocks, linked list of 8 character blocks! */
397 while (pfgets(buf, MAX_LINE_LENGTH, fp))
398 {
399 word = strtok(buf, " \n\r\t");
400 wID_string = strtok(NULL, " \n\r\t");
401 my_wID = (wordID)atoi_with_check(wID_string, MAXwordID);
402
403 // if(strstr(word,SLOTNAME_INDICATOR))
404 // word = strstr(word,SLOTNAME_INDICATOR);
405 wm_wID = wordmap_add_word(wmap, word);
406 ASSERT(my_wID == wm_wID);
407 }
408 ASSERT(wmap->num_words == num_words);
409
410 for (my_wID = 1; my_wID < num_words; my_wID++)
411 {
412 if (!IS_SLOT_OLABEL(wmap->words[my_wID]))
413 break;
414 }
415 wmap->num_slots = my_wID;
416 wordmap_setbase(wmap);
417 *pwmap = wmap;
418 if(wmap->num_slots > MAX_NUM_SLOTS)
419 return FST_FAILED_ON_INVALID_ARGS;
420 else
421 return FST_SUCCESS;
422 }
423
wordmap_destroy(wordmap ** wmap)424 int wordmap_destroy(wordmap** wmap)
425 {
426 if (wmap && *wmap)
427 {
428 wordmap_clean ( *wmap );
429 if (((*wmap)->wordIDForWord)) PHashTableDestroy((*wmap)->wordIDForWord);
430 if (((*wmap)->chars)) FREE((*wmap)->chars);
431 if (((*wmap)->words)) FREE((*wmap)->words);
432 if ((*wmap)) FREE((*wmap));
433 *wmap = 0;
434 }
435 return FST_SUCCESS;
436 }
437
FST_UnloadWordMap(wordmap ** wmap)438 int FST_UnloadWordMap(wordmap** wmap)
439 {
440 return wordmap_destroy(wmap);
441 }
442
443
FST_DumpWordMap(PFile * fp,wordmap * wmap)444 int FST_DumpWordMap(PFile* fp, wordmap* wmap)
445 {
446 wordID my_wID;
447 for (my_wID = 0; my_wID < wmap->num_words; my_wID++)
448 {
449 pfprintf(fp, "%s %hu\n", wmap->words[my_wID], my_wID);
450 }
451 return FST_SUCCESS;
452 }
453
wordmap_find_index(wordmap * wmap,const char * word)454 wordID wordmap_find_index(wordmap* wmap, const char* word)
455 {
456 void *wdID_void;
457 ESR_ReturnCode rc;
458 size_t i;
459
460 if (!word)
461 return MAXwordID;
462
463 if (wmap->num_words == 0)
464 return MAXwordID;
465
466 if (wmap->wordIDForWord)
467 {
468 rc = PHashTableGetValue(wmap->wordIDForWord, word, (void**)&wdID_void);
469
470 if (rc == ESR_SUCCESS)
471 return (wordID)(int)wdID_void;
472 }
473 else
474 {
475 for (i = 0; i < wmap->num_words; i++)
476 {
477 if (!strcmp(wmap->words[i], word))
478 {
479 return (wordID)i;
480 }
481 }
482 }
483 return MAXwordID;
484 }
485
wordmap_find_rule_index(wordmap * wmap,const char * rule)486 wordID wordmap_find_rule_index(wordmap* wmap, const char* rule)
487 {
488 int i;
489 int strlen_rule = strlen(rule);
490 for (i = wmap->num_slots; --i > 0;)
491 { /* > (not >=) because eps is always 0 */
492 char* p = strstr(wmap->words[i], SLOTNAME_INDICATOR);
493 if(p != NULL){
494 if (!strcmp(p, rule)) break;
495 else if(!strncmp(p,rule,strlen_rule) && !strcmp(p+strlen_rule,SLOTNAME_INDICATOR)) break;
496 }
497 }
498 if (i == 0) return MAXwordID;
499 else return(wordID)i;
500 }
501
wordmap_find_index_in_rule(wordmap * wmap,const char * word,wordID rule)502 wordID wordmap_find_index_in_rule(wordmap* wmap, const char* word, wordID rule)
503 {
504 int len = strlen(word);
505 int rule0 = rule + '0';
506 void *wdID_void;
507 LCHAR word_dot_rule[256];
508 ESR_ReturnCode rc;
509
510 if (!word)
511 return MAXwordID;
512 LSTRCPY(word_dot_rule, word);
513 word_dot_rule[len++] = IMPORTED_RULES_DELIM;
514 word_dot_rule[len++] = (char)rule0;
515 word_dot_rule[len++] = 0;
516 rc = PHashTableGetValue(wmap->wordIDForWord, word_dot_rule, (void**)&wdID_void);
517 if (rc == ESR_SUCCESS)
518 return (wordID)(int)(wdID_void);
519 return MAXwordID;
520 }
521
strlen_with_null(const char * word)522 int strlen_with_null(const char* word)
523 {
524 #if 1 /* NT */
525 int len = strlen(word) + 1;
526 if (len % 2 == 1) len++;
527 return len;
528 #else /* C54 */
529 #endif
530 }
531
wordmap_whether_in_rule(wordmap * wmap,wordID word,wordID rule)532 int wordmap_whether_in_rule(wordmap* wmap, wordID word, wordID rule)
533 {
534 char* word_chars;
535 int len;
536 if (word > wmap->num_words) return 0;
537 word_chars = wmap->words[word];
538 len = strlen(word_chars);
539 if (word_chars[len-1] == rule + '0' && word_chars[len-2] == IMPORTED_RULES_DELIM)
540 return 1;
541 else
542 return 0;
543 }
544
wordmap_setbase(wordmap * wmap)545 void wordmap_setbase(wordmap* wmap)
546 {
547 wmap->num_base_words = wmap->num_words;
548 wmap->next_base_chars = wmap->next_chars;
549 }
550
wordmap_ceiling(wordmap * wmap)551 void wordmap_ceiling(wordmap* wmap)
552 {
553 /* this is irreversible */
554 wmap->max_words = wmap->num_words;
555 wmap->max_chars = (wmap->next_chars - wmap->chars);
556 }
557
558
wordmap_clean(wordmap * word_map)559 static ESR_ReturnCode wordmap_clean ( wordmap *word_map )
560 {
561 ESR_ReturnCode clean_status;
562 PHashTableEntry *entry;
563 PHashTableEntry *oldEntry;
564 wordID *value;
565
566 clean_status = ESR_SUCCESS;
567
568 if ( word_map->wordIDForWord == NULL )
569 return clean_status;
570
571 clean_status = PHashTableEntryGetFirst ( word_map->wordIDForWord, &entry);
572
573 while ( ( entry != NULL ) && ( clean_status == ESR_SUCCESS ) )
574 {
575 clean_status = PHashTableEntryGetKeyValue ( entry, NULL, (void **)&value );
576
577 if ( clean_status == ESR_SUCCESS )
578 {
579 oldEntry = entry;
580 clean_status = PHashTableEntryAdvance ( &entry );
581
582 if ( clean_status == ESR_SUCCESS )
583 clean_status = PHashTableEntryRemove ( oldEntry );
584 }
585 }
586 return ( clean_status );
587 }
588
589
wordmap_populate(wordmap * word_map,wordID num_words)590 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words )
591 {
592 ESR_ReturnCode populate_status;
593 wordID wdID;
594
595 populate_status = ESR_SUCCESS;
596 wdID = 0;
597
598 if ( word_map->wordIDForWord != NULL )
599 {
600 while ( ( wdID < num_words ) && ( populate_status == ESR_SUCCESS ) )
601 {
602 populate_status = PHashTablePutValue ( word_map->wordIDForWord, word_map->words[wdID],
603 (const void *)(int)wdID, NULL );
604 if ( populate_status == ESR_SUCCESS )
605 wdID++;
606 else {
607 return (populate_status);
608 }
609 }
610 }
611 return ( populate_status );
612 }
613
wordmap_reset(wordmap * wmap)614 void wordmap_reset(wordmap* wmap)
615 {
616 char** tmp_words;
617 int i;
618 ESR_ReturnCode reset_status;
619
620 if (wmap->num_base_words < wmap->num_words)
621 {
622 /*wordID i = (wordID)(wmap->num_base_words);*/
623 char* old_wmap_chars = wmap->chars;
624 int offset = wmap->next_base_chars - wmap->chars;
625 char* tmp_chars = NEW_ARRAY(char, offset, L("srec.g2g.graph.wordmap.chars"));
626 if(!tmp_chars) {
627 passert ( 0 && L("failed to reset the memory for wordmap.chars ") );
628 }
629 memcpy(tmp_chars,wmap->chars, offset * sizeof(char));
630 FREE(wmap->chars);
631
632 wmap->chars = tmp_chars;
633 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap_chars);
634 wmap->max_chars = (wordID) offset;
635 wmap->next_chars = wmap->next_base_chars;
636
637 tmp_words = (char**) CALLOC_CLR(wmap->num_base_words, sizeof(char*), "srec.graph.wordmap.words");
638 if(!tmp_words) {
639 passert ( 0 && L("failed to reset the memory for wordmap.words ") );
640 }
641 memcpy( tmp_words, wmap->words, wmap->num_base_words * sizeof(char*));
642 FREE( wmap->words);
643
644 wmap->words = tmp_words;
645 wmap->max_words = wmap->num_base_words;
646 wmap->num_words = wmap->num_base_words;
647
648 for(i=0; i<wmap->num_words; i++)
649 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap_chars);
650 }
651
652 reset_status = wordmap_clean ( wmap );
653
654 if ( reset_status == ESR_SUCCESS )
655 {
656 reset_status = wordmap_populate ( wmap, wmap->num_base_words );
657
658 if ( reset_status != ESR_SUCCESS )
659 {
660 wordmap_clean ( wmap );
661 passert ( 0 && L("wordmap_reset failed") );
662 }
663 }
664 else
665 {
666 passert ( 0 && L("wordmap_reset failed") );
667 }
668 }
669
670 #define FST_GROW_FACTOR 12/10
671 #define FST_GROW_MINCHARS 256
672 #define FST_GROW_MINWORDS 32
wordmap_add_word(wordmap * wmap,const char * word)673 wordID wordmap_add_word(wordmap* wmap, const char* word)
674 {
675 int len = strlen_with_null(word);
676 wordID wdID =0;
677
678 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
679 {
680 #if defined(FST_GROW_FACTOR)
681 int i,tmp_max_chars= wmap->max_chars * FST_GROW_FACTOR;
682 char* old_wmap__chars = wmap->chars;
683 if(tmp_max_chars - wmap->max_chars < FST_GROW_MINCHARS)
684 tmp_max_chars +=FST_GROW_MINCHARS;
685
686 char* tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
687 if(!tmp_chars) {
688 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.chars"));
689 return MAXwordID;
690 }
691 memcpy(tmp_chars,wmap->chars,wmap->max_chars * sizeof(char));
692 FREE(wmap->chars);
693
694 wmap->chars = tmp_chars;
695 wmap->next_chars = wmap->chars + (wmap->next_chars - old_wmap__chars);
696 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap__chars);
697 wmap->max_chars = (wordID)tmp_max_chars;
698 //Remove old keys --- Add WORD
699 wordmap_clean (wmap );
700
701 // adjust word pointers
702 for(i=0; i<wmap->num_words; i++)
703 {
704 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap__chars);
705 // adjust hashtable ---
706 if (wmap->wordIDForWord)
707 {
708 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
709 (const void *)i, NULL );
710 if ( rc != ESR_SUCCESS ) {
711 goto CLEANUP;
712 }
713 }
714 }
715 #else // so not defined(FST_GROW_FACTOR)
716 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
717 return MAXwordID;
718 #endif // defined(FST_GROW_FACTOR)
719 }
720 //Add word
721 if (wmap->num_words == wmap->max_words)
722 {
723 #if defined(FST_GROW_FACTOR)
724 char** tmp_words;
725 wordID tmp_max_words;
726 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR;
727
728 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
729 itmp_max_words += FST_GROW_MINWORDS;
730
731 if( itmp_max_words >= MAXwordID) {
732 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
733 return MAXwordID;
734 }
735 tmp_max_words = (wordID)itmp_max_words;
736 tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
737 if(!tmp_words) {
738 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.words"));
739 return MAXwordID;
740 }
741 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
742 FREE( wmap->words);
743 wmap->words = tmp_words;
744 wmap->max_words = tmp_max_words;
745 #else // so not defined(FST_GROW_FACTOR)
746 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
747 return MAXwordID;
748 #endif // defined(FST_GROW_FACTOR)
749 }
750 if(1)
751 {
752 strcpy(wmap->next_chars, word);
753 wmap->words[ wmap->num_words++] = wmap->next_chars;
754 wmap->next_chars += len;
755 wdID = (wordID)(wmap->num_words - (wordID)1);
756 if (wmap->wordIDForWord)
757 {
758 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
759 (const void *)(int)wdID, NULL );
760 if ( rc != ESR_SUCCESS )
761 goto CLEANUP;
762 }
763 return wdID;
764 }
765 CLEANUP:
766 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
767 return MAXwordID;
768 }
769
wordmap_add_word_in_rule(wordmap * wmap,const char * word,wordID rule)770 wordID wordmap_add_word_in_rule(wordmap* wmap, const char* word, wordID rule)
771 {
772 int len = strlen_with_null(word) + 2;
773 wordID wdID = 0;
774 wordID i;
775 //wordmap_add_word_in_rule
776 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
777 {
778 #if defined(FST_GROW_FACTOR)
779 int tmp_max_chars = wmap->max_chars * FST_GROW_FACTOR;
780 char* tmp_chars;
781 char* old_next_chars = wmap->next_chars;
782 char* old_chars = wmap->chars;
783
784 if(tmp_max_chars-wmap->max_chars < FST_GROW_MINCHARS )
785 tmp_max_chars += FST_GROW_MINCHARS;
786 if( wmap->chars + tmp_max_chars <= wmap->next_chars + len)
787 tmp_max_chars += len;
788
789 tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
790 if(!tmp_chars) {
791 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_in_rule.chars"));
792 return MAXwordID;
793 }
794 memcpy(tmp_chars, wmap->chars, wmap->max_chars * sizeof(char));
795 FREE(wmap->chars);
796
797 wmap->chars = tmp_chars;
798 wmap->next_chars = wmap->chars + (old_next_chars - old_chars) ;
799 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_chars);
800 wmap->max_chars = (wordID)tmp_max_chars;
801
802 //Remove old keys
803 wordmap_clean ( wmap );
804
805 // adjust word pointers wordmap_add_word_in_rule
806 for(i=0; i<wmap->num_words; i++)
807 {
808 wmap->words[i] = wmap->chars +(wmap->words[i] - old_chars) ;
809 // adjust hashtable ----- add in rulewordmap_add_word_in_rule ----
810 if (wmap->wordIDForWord) {
811 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
812 (void*)(int)(i), NULL );
813 if ( rc != ESR_SUCCESS )
814 goto CLEANUP;
815 }
816 }
817 #else
818 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
819 return MAXwordID;
820 #endif
821 }//wordmap_add_word_in_rule
822 if (wmap->num_words == wmap->max_words)
823 {
824 #if defined(FST_GROW_FACTOR)
825 char** tmp_words;
826 wordID tmp_max_words;
827 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR;
828 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
829 itmp_max_words += FST_GROW_MINWORDS;
830
831 if( itmp_max_words >= MAXwordID) {
832 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
833 return MAXwordID;
834 }
835
836 tmp_max_words = (wordID)itmp_max_words;
837 tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
838 if(!tmp_words) {
839 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_rule.words"));
840 return MAXwordID;
841 }
842 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
843 FREE( wmap->words);
844 wmap->words = tmp_words;
845 wmap->max_words = tmp_max_words;
846 #else
847 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
848 return MAXwordID;
849 #endif
850 }
851 if(1)
852 {
853 char *p;
854 const char *q;
855 /* word.1 */
856 for (p = wmap->next_chars, q = word; (*p = *q) != '\0'; p++, q++) ; /* basic strcpy() */
857 *p++ = IMPORTED_RULES_DELIM;
858 *p++ = (char)(rule + ((wordID)'0'));
859 *p = '\0';
860 wmap->words[ wmap->num_words++] = wmap->next_chars;
861 wmap->next_chars += len;
862 wdID = (wordID)(wmap->num_words - (wordID)1);
863 if (wmap->wordIDForWord)
864 {
865 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
866 (void*)(int)(wdID), NULL );
867 if ( rc != ESR_SUCCESS )
868 goto CLEANUP;
869 }
870 return wdID;
871 }
872 CLEANUP:
873 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
874 return MAXwordID;
875 }
876
877 /*----------------------------------------------------------------------*
878 * *
879 * API *
880 * *
881 *----------------------------------------------------------------------*/
882
FST_AttachArbdata(srec_context * fst,srec_arbdata * allophone_tree)883 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree)
884 {
885 int rc = 0;
886 unsigned int allotree__modelid;
887
888 fst->allotree = allophone_tree;
889 if( !allophone_tree)
890 return 1;
891 fst->hmm_info_for_ilabel = allophone_tree->hmm_infos - fst->hmm_ilabel_offset;
892 allotree__modelid = version_arbdata_models(allophone_tree);
893 if (allotree__modelid != 0 && fst->modelid != 0)
894 {
895 if (allotree__modelid != fst->modelid)
896 {
897 PLogError("Error: modelids disagree, sgcbaseline(%u) arbdata(%u)", fst->modelid, allotree__modelid);
898 rc = FST_FAILED_ON_INVALID_ARGS;
899 }
900 }
901
902 return rc;
903 }
904
FST_LoadGraph(srec_context * pfst,wordmap * imap,wordmap * omap,int num_words_to_add,PFile * fp)905 int FST_LoadGraph(srec_context* pfst, wordmap* imap, wordmap* omap,
906 int num_words_to_add, PFile* fp)
907 {
908 int i, rc = 0;
909 char line[MAX_LINE_LENGTH], *args[32], nargs;
910 arcID max_num_FSMarcs;
911 arcID max_num_FSMnodes;
912 nodeID from_node, last_from_node, into_node, num_nodes, max_node_number = 0;
913 FSMarc_ptr atok = (FSMarc_ptr)0;
914 FSMarc *atoken = NULL;
915 FSMnode *fr_node, *to_node;
916 costdata cost = FREEcostdata;
917 arcID num_arcs;
918
919 srec_context* fst = pfst;
920 char *ilabel_str = 0, *olabel_str = 0;
921 long fpos;
922 arcID new_arc_id;
923 asr_int32_t temp;
924
925
926 /* determine number of arcs and nodes to allocate, add 50% for dynamic */
927 fpos = pftell(fp);
928 max_num_FSMnodes = 0;
929 pfst->modelid = 0;
930 for (max_num_FSMarcs = 0; pfgets(line, sizeof(line) / sizeof(char), fp); max_num_FSMarcs++)
931 {
932 if (strstr(line, "modelid:") == line)
933 {
934 char *p;
935 pfst->modelid = strtoul(line + strlen("modelid:"), &p, 10);
936 }
937 from_node = (nodeID)atoi_with_check(line, MAXnodeID);
938 if (from_node > max_num_FSMnodes)
939 max_num_FSMnodes = from_node;
940 }
941 pfseek(fp, fpos, SEEK_SET);
942 temp = max_num_FSMnodes + 1 /*why+1?*/ + num_words_to_add * AVG_NODES_PER_WORD;
943 if (temp >= MAXnodeID)
944 {
945 max_num_FSMnodes = MAXnodeID - 1;
946 PLogMessage("Warning: using max nodes instead\n");
947 }
948 else
949 {
950 max_num_FSMnodes = (nodeID)temp;
951 }
952 temp = max_num_FSMarcs + num_words_to_add * AVG_ARCS_PER_WORD;
953 if (temp >= MAXarcID)
954 {
955 max_num_FSMarcs = MAXarcID - 1;
956 PLogMessage("Warning: using max arcs instead\n");
957 }
958 else
959 {
960 max_num_FSMarcs = (arcID)temp;
961 }
962 fst->olabels = omap;
963 if (imap)
964 {
965 /* generally no imap is specified */
966 fst->ilabels = imap;
967 fst->hmm_ilabel_offset = wordmap_find_index(fst->ilabels, "hmm0");
968 ASSERT(fst->hmm_ilabel_offset >= 0);
969 }
970 else
971 {
972 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
973 fst->ilabels->num_words = fst->ilabels->max_words = 0;
974 fst->ilabels->words = 0;
975 /* this is bad hard code, we can get this from the swiarb file, it is
976 equal to the number of phonemes (53 for enu, 39 for jpn) plus the special
977 models (eps,.wb,.ph,h#,#h,iwt,not) minus 1 ('cuz base number is 0) */
978 fst->hmm_ilabel_offset = 128; /* should match MAX_PHONEMES */
979 }
980 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(max_num_FSMarcs, sizeof(FSMarc), "srec.graph.arcs");
981 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(max_num_FSMnodes, sizeof(FSMnode), "srec.graph.nodes");
982 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(max_num_FSMnodes, sizeof(char), "srec.graph.nodeinfos");
983
984 /* setup the arc freelist */
985 fst->FSMarc_freelist = 0;
986 fst->num_arcs = 0;
987 for (i = 0; i < max_num_FSMarcs - 1; i++)
988 {
989 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
990 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
991 }
992 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
993 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
994
995 /* initialize the nodes, 'cuz reading is random order */
996 fst->num_nodes = 0;
997 for (i = 0; i < max_num_FSMnodes; i++)
998 {
999 fr_node = &fst->FSMnode_list[i];
1000 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
1001 }
1002
1003 /* 1. first load up all the information */
1004 IF_DEBUG_WDADD(printf("load graph ... 1\n"));
1005 num_arcs = 0;
1006 last_from_node = MAXnodeID;
1007 while (pfgets(line, sizeof(line) / sizeof(char), fp))
1008 {
1009 if (strstr(line, "modelid:") == line) continue;
1010 IF_DEBUG_WDADD(printf("read arc %s", line));
1011 nargs = split_line(line, args);
1012 if (nargs >= 4)
1013 {
1014 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1015 into_node = (nodeID)atoi_with_check(args[1], MAXnodeID);
1016 ilabel_str = args[2];
1017 olabel_str = args[3];
1018 cost = FREEcostdata;
1019 if (nargs == 5)
1020 PLogError(L("Warning: too many arguments on line %s"), line);
1021 }
1022 else if (nargs == 1)
1023 {
1024 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1025 into_node = MAXnodeID;
1026 ilabel_str = 0;
1027 olabel_str = 0;
1028 cost = FREEcostdata;
1029 }
1030 else
1031 {
1032 from_node = into_node = 0;
1033 PLogError("can't parse line %s\n", line);
1034 ASSERT(0);
1035 }
1036
1037 if (into_node == MAXnodeID)
1038 {
1039 fst->end_node = from_node;
1040 }
1041 else
1042 {
1043 new_arc_id = fst_get_free_arc(fst);
1044 if (new_arc_id == MAXarcID)
1045 return FST_FAILED_ON_MEMORY;
1046 atok = ARC_ItoX(new_arc_id);
1047 atoken = ARC_XtoP(atok);
1048 num_arcs++;
1049 fr_node = &fst->FSMnode_list[from_node];
1050 to_node = &fst->FSMnode_list[into_node];
1051 if (fst->ilabels->num_words == 0)
1052 {
1053 atoken->ilabel = hmm_number(ilabel_str, fst->hmm_ilabel_offset);
1054 /* Xufang: if olabel_str is a slotname, change the input label to a no-silence and no-eps HMM. It is true that the weight of slot arcs is 999 and hmm will not take effect in the runtime, but it was found that when the slot is a loop, eps and silence will cause the recursive function fst_node_has_speech_to_come into a no-step-out status.*/
1055 if(strstr(olabel_str, SLOTNAME_INDICATOR)!=NULL)
1056 atoken->ilabel = fst->hmm_ilabel_offset + SLOTLOOP_OFFSET;
1057 }
1058 else
1059 {
1060 atoken->ilabel = wordmap_find_index(fst->ilabels, ilabel_str);
1061 }
1062 atoken->olabel = wordmap_find_index(fst->olabels, olabel_str);
1063 /* if(g_fst_options.wtw_cost_override>0) {
1064 if(atoken->ilabel == WORD_BOUNDARY)
1065 atoken->cost = g_fst_options.wtw_cost_override;
1066 else
1067 atoken->cost = g_fst_options.arc_cost_override;
1068 } else */
1069 atoken->cost = cost;
1070 #if DEBUG_WDADD
1071 atoken->ilabel_str = (atoken->ilabel < fst->ilabels->num_words ? fst->ilabels->words[atoken->ilabel] : 0);
1072 atoken->olabel_str = (atoken->olabel < fst->olabels->num_words ? fst->olabels->words[atoken->olabel] : 0);
1073 #endif
1074 append_arc_leaving_node(fst, fr_node, atok);
1075 if (into_node > max_node_number)
1076 max_node_number = into_node;
1077 append_arc_arriving_node(fst, to_node, atok);
1078 atoken->fr_node = NODE_ItoX(from_node);
1079 atoken->to_node = NODE_ItoX(into_node);
1080 }
1081 last_from_node = from_node;
1082 }
1083 ASSERT(fst->num_arcs == num_arcs);
1084
1085 /* setup the node freelist */
1086 IF_DEBUG_WDADD(printf("load graph ... 6\n"));
1087 num_nodes = (nodeID)(max_node_number + 1);
1088 if( max_num_FSMnodes > num_nodes) {
1089 fst->FSMnode_freelist = num_nodes;
1090 for (i = num_nodes; i < (max_num_FSMnodes - 1); i++)
1091 {
1092 fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
1093 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1094 }
1095 if (i == (max_num_FSMnodes - 1)) {
1096 fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
1097 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1098 }
1099 } else
1100 fst->FSMnode_freelist = FSMNODE_NULL;
1101
1102 /* some book-keeping stuff */
1103 IF_DEBUG_WDADD(printf("load graph ... 7\n"));
1104 fst->num_base_arcs = fst->num_arcs = num_arcs;
1105 fst->FSMarc_list_len = max_num_FSMarcs;
1106 fst->FSMnode_list_len = max_num_FSMnodes;
1107 fst->num_base_nodes = fst->num_nodes = num_nodes;
1108 fst->start_node = 0;
1109 /* fst->end_node = 0; this is set up above */
1110
1111 /* beg and end word labels */
1112 fst->beg_silence_word = wordmap_find_index(fst->olabels, "-pau-");
1113 fst->end_silence_word = wordmap_find_index(fst->olabels, "-pau2-");
1114 fst->hack_silence_word = wordmap_find_index(fst->olabels, "silence");
1115
1116 /* set the search limitations, real ones will be registered later */
1117 fst->max_searchable_nodes = 0;
1118 fst->max_searchable_arcs = 0;
1119
1120 rc = fst_alloc_transit_points(fst);
1121 return (rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS);
1122 }
1123
FST_UnloadGraph(srec_context * pfst)1124 int FST_UnloadGraph(srec_context* pfst)
1125 {
1126 if (pfst->ilabels)
1127 FREE(pfst->ilabels);
1128 FREE(pfst->FSMarc_list);
1129 FREE(pfst->FSMnode_list);
1130 FREE(pfst->FSMnode_info_list);
1131 pfst->FSMarc_list = 0;
1132 pfst->FSMnode_list = 0;
1133 pfst->FSMnode_info_list = 0;
1134 return FST_SUCCESS;
1135 }
1136
FST_DumpGraph(srec_context * fst,PFile * fp)1137 int FST_DumpGraph(srec_context* fst, PFile* fp)
1138 {
1139 int rc = 0;
1140 nodeID i;
1141 FSMarc_ptr atok;
1142 FSMarc* atoken;
1143 nodeID from_node, into_node;
1144 FSMnode* ntoken;
1145 char *ilabel, *olabel;
1146
1147 for (i = 0; i < fst->num_nodes; i++)
1148 {
1149 from_node = i;
1150 ntoken = &fst->FSMnode_list[i];
1151 if (ntoken->first_prev_arc == FSMARC_FREE)
1152 continue;
1153 if (ntoken->un_ptr.first_next_arc != FSMARC_NULL)
1154 {
1155 for (atok = ntoken->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
1156 {
1157 char buf[32];
1158 atoken = ARC_XtoP(atok);
1159 into_node = NODE_XtoI(atoken->to_node);
1160 ilabel = fst->ilabels->num_words == 0 ?
1161 hmm_name(atoken->ilabel, fst->hmm_ilabel_offset, buf) :
1162 fst->ilabels->words[atoken->ilabel] ;
1163 olabel = fst->olabels->words[atoken->olabel];
1164
1165 if (atoken->cost != FREEcostdata)
1166 {
1167 /* regular arc */
1168 pfprintf(fp, "%hu\t%hu\t%s\t%s\t%hu\n",
1169 from_node, into_node, ilabel, olabel, atoken->cost);
1170 }
1171 else
1172 {
1173 /* regular zero cost arc */
1174 pfprintf(fp, "%hu\t%hu\t%s\t%s\n",
1175 from_node, into_node, ilabel, olabel);
1176 }
1177 }
1178 }
1179 else
1180 {
1181 pfprintf(fp, "%hu\n", from_node);
1182 }
1183 }
1184 return rc;
1185 }
1186
FST_AddWordToGrammar(srec_context * fst,const char * _slot,const char * word,const char * pron,const int cost)1187 int FST_AddWordToGrammar(srec_context* fst, const char* _slot,
1188 const char* word, const char* pron,
1189 const int cost)
1190 {
1191 int num_pron_ampersands = 0, num_prons_added = 0, num_words_added = 0;
1192 int i, pron_len, model_sequence_len;
1193 modelID model_sequence[MAX_PRON_LEN];
1194 char phoneme_sequence[MAX_PRON_LEN];
1195 wordID olabel = MAXwordID;
1196 int irc, rc = FST_SUCCESS;
1197 nodeID start_node = MAXnodeID, end_node = MAXnodeID;
1198 char veslot[MAX_WORD_LEN];
1199 #if USE_HMM_BASED_ENROLLMENT
1200 const char* Tpron;
1201 #endif
1202
1203 /* The addword from voice enroll still use @, and to test voice enroll, @ sign was removed and __ was added at the begining and ending of a word. Xufang */
1204 if(_slot[0] == '@') {
1205 strcpy(veslot,SLOTNAME_INDICATOR);
1206 strcat(veslot,_slot+1);
1207 strcat(veslot,SLOTNAME_INDICATOR);
1208 } else
1209 strcpy(veslot, _slot);
1210
1211 #if USE_COMP_STATS
1212 if (!comp_stats)
1213 comp_stats = init_comp_stats1();
1214 start_cs_clock1(&comp_stats->word_addition);
1215 #endif
1216
1217 /* we expect developers to call AddWord with the same slot many times,
1218 so we cache the slot start and end nodes and re-use on subseq calls */
1219 if( fst->addWordCaching_lastslot_num == MAXwordID )
1220 fst->addWordCaching_lastslot_name = NULL;
1221 else {
1222 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1223 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1224 ASSERT( fst->addWordCaching_lastslot_name);
1225 }
1226 if( fst->addWordCaching_lastslot_name==NULL || strcmp( fst->addWordCaching_lastslot_name, veslot)) {
1227
1228 fst->addWordCaching_lastslot_num = wordmap_find_rule_index(fst->olabels, veslot);
1229 /* olabel not found is a user error */
1230 if (fst->addWordCaching_lastslot_num == MAXwordID)
1231 {
1232 size_t i;
1233
1234 pfprintf(PSTDOUT, L("error: slot '%s' not found among ["), veslot);
1235 for (i = 1; i < (size_t) fst->olabels->num_slots; ++i)
1236 pfprintf(PSTDOUT, "%s, ", fst->olabels->words[i]);
1237 pfprintf(PSTDOUT, L("] possible\n"));
1238 rc = FST_FAILED_ON_INVALID_ARGS;
1239 goto RETRC;
1240 }
1241
1242 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1243 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1244 ASSERT(fst->addWordCaching_lastslot_name);
1245
1246 /* now find where in the graph this slot is referenced, note that
1247 there might be more than one place, but here we ignore multiples */
1248 for (i = fst->num_fsm_exit_points; --i >= 0;)
1249 {
1250 arcID arcid = fst->fsm_exit_points[i].arc_index;
1251 if (fst->FSMarc_list[arcid].olabel == fst->addWordCaching_lastslot_num)
1252 {
1253 FSMarc* arc;
1254 FSMnode* node;
1255 start_node = fst->fsm_exit_points[i].from_node_index;
1256 end_node = fst->fsm_exit_points[i].wbto_node_index;
1257 fst->addWordCaching_lastslot_needs_post_silence = ESR_TRUE;
1258 fst->addWordCaching_lastslot_ifsm_exit_point = i;
1259 node = &fst->FSMnode_list[ end_node];
1260 arc = &fst->FSMarc_list[node->un_ptr.first_next_arc];
1261 if (arc->olabel == fst->end_silence_word && arc->linkl_next_arc == MAXarcID)
1262 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
1263 break;
1264 }
1265 }
1266
1267 /* not found in the graph is an internal error, coding error */
1268 if (i < 0 || start_node>=fst->num_nodes || end_node>=fst->num_nodes)
1269 {
1270 PLogError("error: (internal) finding olabel %d %d %d\n", fst->addWordCaching_lastslot_num,
1271 start_node, end_node);
1272 goto RETRC;
1273 }
1274 } /* cached or not */
1275
1276 i = fst->addWordCaching_lastslot_ifsm_exit_point;
1277 start_node = fst->fsm_exit_points[i].from_node_index;
1278 end_node = fst->fsm_exit_points[i].wbto_node_index;
1279
1280 /* now start_node and end_node are known */
1281 if (!word || !*word || !pron || !*pron)
1282 {
1283 rc = FST_FAILED_ON_INVALID_ARGS;
1284 PLogError("error: null word/pron on input to FST_AddWordToGrammar()\n");
1285 goto RETRC;
1286 }
1287
1288 /* from here */
1289 IF_DEBUG_WDADD(printf("Adding %s %s\n", word, (const char*)pron));
1290
1291 /* loop over all prons, we break when we hit the double-null (hence the +1) */
1292 for( ; (*pron)!='\0'; pron+=(pron_len+1)) {
1293 pron_len = strlen((char*)pron);
1294 if (pron_len >= MAX_PRON_LEN)
1295 {
1296 PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1297 rc = FST_FAILED_ON_INVALID_ARGS;
1298 goto RETRC;
1299 }
1300
1301 /* check for searchability after adding, estimating at most pron_len
1302 arcs and nodes will be added */
1303 if (fst->num_arcs + pron_len > fst->max_searchable_arcs)
1304 {
1305 PLogError("error: wordadd failed on word %s due to %d arc search limit\n",
1306 word, fst->max_searchable_arcs);
1307 rc = FST_FAILED_ON_MEMORY;
1308 goto RETRC;
1309 }
1310
1311 if (fst->num_nodes + pron_len > fst->max_searchable_nodes)
1312 {
1313 PLogError("error: wordadd failed on word %s due to %d node search limit\n",
1314 word, fst->max_searchable_nodes);
1315 rc = FST_FAILED_ON_MEMORY;
1316 goto RETRC;
1317 }
1318
1319 /* add the word if necessary, !FST_SUCCESS is allowed if we're just adding
1320 an alternate pronunciation */
1321 olabel = wordmap_find_index_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1322 if (olabel == MAXwordID)
1323 {
1324 olabel = wordmap_add_word_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1325 if (olabel != MAXwordID)
1326 num_words_added++;
1327 }
1328 if (olabel == MAXwordID)
1329 {
1330 PLogError("error: wordmap_add_word failed\n");
1331 rc = FST_FAILED_ON_MEMORY;
1332 goto RETRC;
1333 }
1334
1335 #if USE_HMM_BASED_ENROLLMENT
1336 // pron should be converted to model_sequence, model_sequence_len
1337 #define VETRANS_PREFIX "wd_hmm"
1338 #define VETRANS_PREFIX_LEN 6
1339 if( LSTRSTR(pron, VETRANS_PREFIX)!=NULL) {
1340 // printf("Debug-pron: %d, %s\n",pron_len, pron);
1341 model_sequence_len=0;
1342 for(Tpron=pron; (Tpron=strstr(Tpron, VETRANS_PREFIX))!=NULL; ){
1343 // Tpron = strstr(Tpron, "wd_hmm");
1344 Tpron += VETRANS_PREFIX_LEN; // skip over "wd_hmm"
1345 // Copy hmm number (56,132,...)
1346 model_sequence[ model_sequence_len] = atoi_with_check(Tpron, MAXmodelID);
1347 model_sequence_len++;
1348 }
1349
1350
1351 /* we need to append silence at the end since we might be dealing with a slot
1352 that has ensuing speech, formerly we only dealt with ROOT which defacto
1353 was followed by the pau2 silence, so ROOT did not need its own */
1354 if (fst->addWordCaching_lastslot_needs_post_silence)
1355 model_sequence[model_sequence_len++] = 3; // <<< hmm3_sil !!! ugly hard-code here
1356
1357 /* append the word boundary */
1358 model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1359 /* translate to input label ids */
1360 for (i = 0; i < model_sequence_len; i++)
1361 {
1362 if (model_sequence[i] >= EPSILON_OFFSET)
1363 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1364 }
1365 }
1366 else
1367 #endif
1368 {
1369 pron_len = strlen((char*)pron);
1370 if (pron_len >= MAX_PRON_LEN)
1371 {
1372 PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1373 rc = FST_FAILED_ON_INVALID_ARGS;
1374 goto RETRC;
1375 }
1376
1377 for (i = 0; i < pron_len; i++)
1378 {
1379 if (pron[i] == OPTSILENCE_CODE)
1380 {
1381 phoneme_sequence[i] = SILENCE_CODE;
1382 num_pron_ampersands++;
1383 }
1384 else
1385 phoneme_sequence[i] = pron[i];
1386 }
1387 /* we need to append silence at the end since we might be dealing with a slot
1388 that has ensuing speech, formerly we only dealt with ROOT which defacto
1389 was followed by the pau2 silence, so ROOT did not need its own */
1390 if (fst->addWordCaching_lastslot_needs_post_silence)
1391 phoneme_sequence[i++] = SILENCE_CODE;
1392
1393 model_sequence_len = i;
1394 irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1395 /* check for bad sequence of phonemes */
1396 if (irc)
1397 {
1398 PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, irc);
1399 rc = FST_FAILED_ON_INVALID_ARGS;
1400 goto RETRC;
1401 }
1402 IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1403
1404 /* append the word boundary */
1405 model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1406 /* translate to input label ids */
1407 for (i = 0; i < model_sequence_len; i++)
1408 {
1409 if (model_sequence[i] >= EPSILON_OFFSET)
1410 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1411 }
1412 } /* end of ph_t ph_r .. type decoding of the model sequence */
1413
1414 /* now do the actual addition */
1415 rc = fst_add_arcs(fst, start_node, end_node, olabel, (costdata)cost, model_sequence, model_sequence_len);
1416 if (rc == FST_SUCCESS)
1417 {
1418 num_prons_added++;
1419 }
1420 else if (rc == FST_FAILED_ON_HOMONYM)
1421 {
1422 /* maybe the another pron will work? */
1423 }
1424 else
1425 {
1426 PLogMessage("error: fst_add_arcs() failed adding word %s pron %s ('&' as 'iwt')\n", word, (char*)pron);
1427 goto RETRC;
1428 }
1429
1430 /* second add the pron with no silences,
1431 this only applies to true prons, not wd_hmm333 wd_hmm222 type prons */
1432
1433 if (num_pron_ampersands > 0)
1434 {
1435 for (i = 0, model_sequence_len = 0; i < pron_len; i++)
1436 {
1437 if (pron[i] != OPTSILENCE_CODE)
1438 phoneme_sequence[model_sequence_len++] = pron[i];
1439 }
1440
1441 irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1442 /* check for bad sequence of phonemes */
1443 if (irc)
1444 {
1445 PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, rc);
1446 rc = FST_FAILED_ON_INVALID_ARGS;
1447 goto RETRC;
1448 }
1449 else
1450 {
1451 IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1452
1453 /* append the word boundary */
1454 model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1455 /* translate to input label ids */
1456 for (i = 0; i < model_sequence_len; i++)
1457 {
1458 if (model_sequence[i] >= EPSILON_OFFSET)
1459 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1460 }
1461 /* now do the actual addition */
1462 rc = fst_add_arcs(fst, start_node, end_node,
1463 olabel, (costdata)cost, model_sequence, model_sequence_len);
1464
1465 if (rc == FST_SUCCESS)
1466 {
1467 num_prons_added++;
1468 }
1469 else if (rc == FST_FAILED_ON_HOMONYM)
1470 {
1471 /* maybe another pron worked? */
1472 }
1473 else
1474 {
1475 PLogMessage("Warning: fst_add_arcs() failed while adding "
1476 "word %s pron %s (skip '&')\n", word, (char*)pron);
1477 goto RETRC;
1478 }
1479 }
1480 }
1481 }
1482
1483
1484 RETRC:
1485 /* set this to make sure that FST_Prepare gets called after add word */
1486 fst->whether_prepared = 0;
1487 #if USE_COMP_STATS
1488 end_cs_clock1(&comp_stats->word_addition, 1);
1489 #endif
1490
1491 if (rc < 0 && rc != FST_FAILED_ON_HOMONYM)
1492 return rc;
1493
1494 if (num_prons_added == 0 && num_words_added > 0)
1495 {
1496 return FST_FAILED_ON_HOMONYM;
1497 }
1498 else if (num_prons_added == 0 && num_words_added == 0)
1499 {
1500 return FST_FAILED_ON_HOMOGRAPH;
1501 }
1502 else if (num_prons_added > 0 && num_words_added == 0)
1503 {
1504 return FST_SUCCESS_ON_OLD_WORD;
1505 }
1506 else
1507 {
1508 /* if(num_prons_added>0 && num_words_added>0) */
1509 return FST_SUCCESS;
1510 }
1511 }
1512
1513 /* remove arcs leaving a node, arcs added via word add, are just discarded
1514 to be cleaned up for re-use by other functions */
remove_added_arcs_leaving(srec_context * fst,nodeID ni)1515 void remove_added_arcs_leaving(srec_context* fst, nodeID ni)
1516 {
1517 FSMnode* node = &fst->FSMnode_list[ ni];
1518 FSMarc *arc = NULL, *arc2;
1519 arcID *pai, ai, ai2;
1520 for (pai = &node->un_ptr.first_next_arc, ai = (*pai); ai != MAXarcID; pai = &arc->linkl_next_arc, ai = (*pai))
1521 {
1522 if (ai < fst->num_base_arcs)
1523 {
1524 arc = &fst->FSMarc_list[ai];
1525 }
1526 else
1527 {
1528 arc2 = &fst->FSMarc_list[ai];
1529 for (ai2 = arc2->linkl_next_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1530 ai2 = arc2->linkl_next_arc)
1531 {
1532 arc2 = &fst->FSMarc_list[ai2];
1533 }
1534 *pai = ai2;
1535 }
1536 }
1537 }
1538
1539 /* remove arcs arriving at a node, arcs added via word add, are just discarded
1540 to be cleaned up for re-use by other functions */
remove_added_arcs_arriving(srec_context * fst,nodeID ni)1541 void remove_added_arcs_arriving(srec_context* fst, nodeID ni)
1542 {
1543 FSMnode* node = &fst->FSMnode_list[ni];
1544 FSMarc *arc = NULL, *arc2;
1545 arcID *pai, ai, ai2;
1546 for (pai = &node->first_prev_arc, ai = (*pai); ai != MAXarcID;
1547 pai = &arc->linkl_prev_arc, ai = (*pai))
1548 {
1549 if (ai < fst->num_base_arcs)
1550 {
1551 arc = &fst->FSMarc_list[ai];
1552 }
1553 else
1554 {
1555 arc2 = &fst->FSMarc_list[ai];
1556 for (ai2 = arc2->linkl_prev_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1557 ai2 = arc2->linkl_prev_arc)
1558 {
1559 arc2 = &fst->FSMarc_list[ai2];
1560 }
1561 *pai = ai2;
1562 }
1563 }
1564 }
1565
1566 /* reset greammar, but resetting all the arcs, nodes and words added
1567 via the addword functions */
1568
FST_ResetGrammar(srec_context * fst)1569 int FST_ResetGrammar(srec_context* fst)
1570 {
1571 int i, rc = 0;
1572 nodeID fst_slot_start_node, fst_slot_end_node;
1573 wordID fst_slot_slotnum;
1574 arcID ai;
1575 nodeID ni2, ni3, ni4, ni3a;
1576 FSMnode_t *node, *node2, *node3;
1577 FSMarc_t *arc, *arc2, *arc3;
1578
1579 /*fst_slot_slotnum = wordmap_find_rule_index(fst->olabels, slot);*/
1580 for (fst_slot_slotnum = 1; fst_slot_slotnum < fst->olabels->num_slots;
1581 fst_slot_slotnum++)
1582 {
1583
1584 if (fst_slot_slotnum == MAXwordID)
1585 {
1586 char *slot = "";
1587 PLogError("error: slot '%s' not found among [%d,%d] possible\n", slot, 1, fst->olabels->num_slots - 1);
1588 return (rc = FST_FAILED_ON_INVALID_ARGS);
1589 }
1590
1591 /* now find where in the graph this slot is referenced, note that
1592 there might be more than one place, but here we ignore multiples */
1593 fst_slot_start_node = MAXnodeID;
1594 fst_slot_end_node = MAXnodeID;
1595 for (i = fst->num_fsm_exit_points; --i >= 0;)
1596 {
1597 ai = fst->fsm_exit_points[i].arc_index;
1598 if (fst->FSMarc_list[ai].olabel == fst_slot_slotnum)
1599 {
1600 fst_slot_start_node = fst->fsm_exit_points[i].from_node_index;
1601 fst_slot_end_node = fst->fsm_exit_points[i].wbto_node_index;
1602 }
1603 }
1604
1605 /* this 'slot' could be the root rule, which can't be removed */
1606 if (fst_slot_start_node == MAXnodeID || fst_slot_end_node == MAXnodeID)
1607 continue;
1608
1609 /*
1610 start end
1611 node node2 node3 node4
1612 o--@N-->o--sil-->o--wb-->o
1613 arc arc2 | arc3 ^
1614 | |wb
1615 \__sil->o
1616 arc4 ni3a
1617 */
1618
1619 remove_added_arcs_leaving(fst, fst_slot_start_node);
1620 node = &fst->FSMnode_list[ fst_slot_start_node];
1621 for (ai = node->un_ptr.first_next_arc; ai != MAXarcID; ai = arc->linkl_next_arc)
1622 {
1623 arc = &fst->FSMarc_list[ai];
1624 if (arc->olabel != fst_slot_slotnum)
1625 continue;
1626
1627 ni2 = arc->to_node;
1628 remove_added_arcs_arriving(fst, ni2);
1629 if (ni2 == fst_slot_end_node)
1630 continue;
1631 node2 = &fst->FSMnode_list[ni2];
1632 arc2 = &fst->FSMarc_list[ node2->un_ptr.first_next_arc];
1633
1634
1635 ni3 = arc2->to_node;
1636 remove_added_arcs_arriving(fst, ni3);
1637 if (ni3 == fst_slot_end_node)
1638 continue;
1639 node3 = &fst->FSMnode_list[ni3];
1640 arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1641 while (arc3->linkl_next_arc != MAXarcID)
1642 {
1643 arc3 = &fst->FSMarc_list[arc3->linkl_next_arc];
1644 ni3a = arc3->to_node;
1645 remove_added_arcs_arriving(fst, ni3a);
1646 }
1647 arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1648
1649 ni4 = arc3->to_node;
1650 remove_added_arcs_arriving(fst, ni4);
1651 ASSERT(ni4 == fst_slot_end_node);
1652 if (ni4 == fst_slot_end_node)
1653 continue;
1654 }
1655 }
1656
1657 /* reset the freelist for nodes */
1658 if( fst->num_nodes == fst->num_base_nodes )
1659 {}
1660 else{
1661 FSMnode *ntoken;
1662 FSMnode* tmp_FSMnode_list;
1663 FSMnode_info* tmp_FSMnode_info_list;
1664
1665 fst->FSMnode_freelist = MAXnodeID;
1666 fst->num_nodes = fst->FSMnode_list_len = fst->num_base_nodes;
1667
1668 tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
1669 if(!tmp_FSMnode_list){
1670 PLogError("ERROR: Could NOT reset the memory for srec.graph.nodes");
1671 return FST_FAILED_ON_MEMORY;
1672 }
1673 memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
1674
1675 /* scan to the end of the free list (to be re-used) */
1676 nodeID* last_free_node = (&fst->FSMnode_freelist);
1677 ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
1678
1679 for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
1680 ntoken = &tmp_FSMnode_list[ *last_free_node];
1681
1682 FREE( fst->FSMnode_list);
1683 tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
1684 if(!tmp_FSMnode_info_list){
1685 PLogError("ERROR: Could NOT reset the memory for srec.graph.nodeinfos");
1686 return FST_FAILED_ON_MEMORY;
1687 }
1688 // copy in node info
1689 memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
1690
1691
1692 FREE( fst->FSMnode_info_list);
1693 fst->FSMnode_info_list = tmp_FSMnode_info_list;
1694 fst->FSMnode_list = tmp_FSMnode_list;
1695 }
1696
1697 /*ni = fst->FSMnode_freelist = fst->num_base_nodes;
1698 node = &fst->FSMnode_list[ni];
1699 for (; ni < fst->FSMnode_list_len - 1; ni++, node++)
1700 node->un_ptr.next_node = (nodeID)(ni + 1);
1701 node->un_ptr.next_node = MAXnodeID;
1702 fst->num_nodes = fst->num_base_nodes;*/
1703
1704 /* reset the freelist for arcs */
1705 if( fst->num_arcs == fst->num_base_arcs )
1706 {}
1707 else {
1708 FSMarc* atoken = NULL;
1709 FSMarc* tmp_FSMarc_list;
1710
1711 fst->num_arcs = fst->num_base_arcs;
1712 fst->FSMarc_list_len = fst->num_base_arcs;
1713 fst->FSMarc_freelist = MAXarcID;
1714 tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
1715 if(!tmp_FSMarc_list){
1716 PLogError("ERROR: Could NOT reset the memory for srec.graph.arcs");
1717 return FST_FAILED_ON_MEMORY;
1718 }
1719 memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
1720
1721 /* scan to the end of the free list (to be re-used) */
1722 arcID* last_free_arc = &fst->FSMarc_freelist;
1723 atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
1724
1725 for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
1726 atoken = &tmp_FSMarc_list[ *last_free_arc];
1727
1728 FREE( fst->FSMarc_list);
1729 fst->FSMarc_list = tmp_FSMarc_list;
1730 }
1731
1732 /*ai = fst->FSMarc_freelist = fst->num_base_arcs;
1733 arc = &fst->FSMarc_list[ai];
1734 for (; ai < fst->FSMarc_list_len - 1; ai++, arc++)
1735 arc->linkl_next_arc = (arcID)(ai + 1);
1736 arc->linkl_next_arc = MAXarcID;
1737 fst->num_arcs = fst->num_base_arcs;
1738 fst->FSMarc_list_len = fst->num_base_arcs;*/
1739
1740 /* now remove all the added words */
1741 wordmap_reset(fst->olabels);
1742 return FST_SUCCESS;
1743 }
1744
1745 /* See the description of arc_tokens in the header file. These
1746 arcs are nodeless, which make loading from a file that
1747 references nodes a little harder. We need to do it in stages.
1748 Later, when we load from binary image it should be much simpler.
1749 */
1750
get_first_arc_leaving_node(arc_token * arc_token_list,arcID num_arcs,nodeID node)1751 arc_token_lnk get_first_arc_leaving_node(arc_token* arc_token_list,
1752 arcID num_arcs,
1753 nodeID node)
1754 {
1755 arcID i;
1756 for (i = 0; i < num_arcs; i++)
1757 {
1758 if ((nodeID)(int)arc_token_list[i].next_token_index == node)
1759 return ARC_TOKEN_LNK(arc_token_list, i);
1760 }
1761 return ARC_TOKEN_NULL;
1762 }
1763
FST_LoadReverseWordGraph(srec_context * context,int num_words_to_add,PFile * fp)1764 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add, PFile* fp)
1765 {
1766 arcID i;
1767 char line[MAX_LINE_LENGTH];
1768 char word_label_as_str[128];
1769 arcID num_alloc_arc_tokens;
1770 nodeID from_node, into_node;
1771 labelID word_label = MAXwordID;
1772 arc_token *atoken, *last_atoken;
1773 costdata cost;
1774 arcID num_arcs;
1775 arc_token *arc_token_list, *tmp;
1776 long fpos;
1777
1778 /* determine number of word arcs to allocate */
1779 fpos = pftell(fp);
1780 for (num_arcs = 0; pfgets(line, MAX_LINE_LENGTH, fp); num_arcs++) ;
1781 num_alloc_arc_tokens = num_arcs;
1782 num_alloc_arc_tokens = (arcID)(num_arcs + num_words_to_add);
1783 pfseek(fp, fpos, SEEK_SET);
1784
1785 context->arc_token_list = (arc_token*)CALLOC_CLR(num_alloc_arc_tokens, sizeof(arc_token), "srec.graph.wordgraph");
1786 arc_token_list = context->arc_token_list;
1787
1788 /* 1. first load up all the information */
1789 i = 0;
1790 while (pfgets(line, MAX_LINE_LENGTH, fp))
1791 {
1792 if (sscanf(line, "%hu\t%hu\t%s", &from_node, &into_node, word_label_as_str) == 3)
1793 {
1794 word_label = wordmap_find_index(context->olabels, word_label_as_str);
1795 // ASSERT(word_label >= 0);
1796 cost = FREEcostdata;
1797 }
1798 else if (sscanf(line, "%hu", &from_node) == 1)
1799 {
1800 into_node = MAXnodeID;
1801 word_label = MAXwordID;
1802 cost = FREEcostdata;
1803 }
1804 else
1805 {
1806 PLogError("FST_LoadReverseWordGraph() .. can't parse line %s\n", line);
1807 ASSERT(0);
1808 }
1809 atoken = &arc_token_list[i];
1810 i++;
1811 atoken->ilabel = word_label;
1812 /* atoken->olabel = WORD_EPSILON_LABEL; */
1813 /*atoken->cost = cost; cost is not used for now */
1814 #if DEBUG_ASTAR
1815 atoken->label = (word_label == MAXwordID ? 0 : wmap->words[word_label]);
1816 #endif
1817 atoken->first_next_arc = (arc_token_lnk)into_node;
1818 atoken->next_token_index = (arc_token_lnk)from_node;
1819 }
1820 num_arcs = i;
1821
1822 /* 2. now do the internal cross references */
1823 for (i = 0; i < num_arcs; i++)
1824 {
1825 atoken = &arc_token_list[i];
1826 into_node = (nodeID)atoken->first_next_arc;
1827 if (into_node == MAXnodeID)
1828 atoken->first_next_arc = ARC_TOKEN_NULL;
1829 else
1830 atoken->first_next_arc = get_first_arc_leaving_node(arc_token_list, num_arcs, (nodeID)atoken->first_next_arc);
1831 }
1832
1833 /* 3. now do more internal cross refs */
1834 last_atoken = &arc_token_list[0];
1835 for (i = 1; i < num_arcs; i++)
1836 {
1837 atoken = &arc_token_list[i];
1838 if (atoken->next_token_index != last_atoken->next_token_index)
1839 {
1840 last_atoken->next_token_index = ARC_TOKEN_NULL;
1841 }
1842 else
1843 {
1844 last_atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i);
1845 }
1846 last_atoken = atoken;
1847 }
1848 last_atoken->next_token_index = ARC_TOKEN_NULL;
1849
1850 #if DEBUG_ASTAR
1851 /* under debug, it's nice to be able to see the words leaving the
1852 destination node, they are stored sequentially in the debug ary */
1853 for (i = 0; i < num_arcs; i++)
1854 {
1855 char * p;
1856 atoken = &arc_token_list[i];
1857 atoken->debug[0] = 0;
1858 for (tmp = ARC_TOKEN_LNK(arc_token_list, atoken->first_next_arc); tmp != NULL;
1859 tmp = ARC_TOKEN_PTR(arc_token_list, tmp->next_token_index))
1860 {
1861 if (tmp->first_next_arc == ARC_TOKEN_NULL)
1862 {
1863 p = "END";
1864 }
1865 else if (!tmp->label)
1866 {
1867 p = "NULL";
1868 }
1869 else
1870 {
1871 p = tmp->label;
1872 }
1873 if (strlen(atoken->debug) + strlen(p) + 6 < 64)
1874 {
1875 strcat(atoken->debug, p);
1876 strcat(atoken->debug, " ");
1877 }
1878 else
1879 {
1880 strcat(atoken->debug, "...");
1881 break;
1882 }
1883 }
1884 }
1885 #endif
1886 context->arc_token_list_len = num_alloc_arc_tokens;
1887 if (num_alloc_arc_tokens > num_arcs)
1888 {
1889 atoken = &context->arc_token_list[num_arcs];
1890 for (i = num_arcs; i < num_alloc_arc_tokens; i++, atoken++)
1891 {
1892 atoken->first_next_arc = ARC_TOKEN_NULL;
1893 atoken->ilabel = MAXwordID;
1894 atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i + 1);/*atoken+1;*/
1895 }
1896 atoken--;
1897 atoken->next_token_index = ARC_TOKEN_NULL;
1898 context->arc_token_freelist = &context->arc_token_list[num_arcs];
1899 }
1900 else
1901 {
1902 context->arc_token_freelist = NULL;
1903 }
1904 /* the insert start is the arc that goes to the end */
1905 atoken = context->arc_token_list;
1906 atoken = ARC_TOKEN_PTR(arc_token_list, atoken->first_next_arc);
1907 for (; atoken; atoken = ARC_TOKEN_PTR(arc_token_list, atoken->next_token_index))
1908 {
1909 if (atoken->first_next_arc == ARC_TOKEN_NULL)
1910 break;
1911 tmp = ARC_TOKEN_PTR(context->arc_token_list, atoken->first_next_arc);
1912 if (tmp->ilabel == MAXwordID && tmp->first_next_arc == ARC_TOKEN_NULL)
1913 break;
1914 }
1915 context->arc_token_insert_start = atoken;
1916
1917 return FST_SUCCESS;
1918 }
1919
FST_UnloadReverseWordGraph(srec_context * context)1920 int FST_UnloadReverseWordGraph(srec_context* context)
1921 {
1922 FREE(context->arc_token_list);
1923 context->arc_token_list = 0;
1924 return FST_SUCCESS;
1925 }
1926
1927 /*----------------------------------------------------------------------*
1928 * *
1929 * general *
1930 * *
1931 *----------------------------------------------------------------------*/
1932
split_line(char * line,char ** av)1933 char split_line(char* line, char** av)
1934 {
1935 int i = 0;
1936 av[i] = strtok(line, "\n\r\t ");
1937 for (; av[i];)
1938 {
1939 av[++i] = strtok(NULL, "\n\r\t ");
1940 }
1941 return i;
1942 }
1943
atoi_with_check(const char * buf,asr_int32_t mymax)1944 asr_int32_t atoi_with_check(const char* buf, asr_int32_t mymax)
1945 {
1946 asr_int32_t ans = atol(buf);
1947 ASSERT(ans < mymax);
1948 return ans;
1949 }
1950
1951 /*----------------------------------------------------------------------*
1952 * *
1953 * fst *
1954 * *
1955 *----------------------------------------------------------------------*/
1956
fst_get_free_arc(srec_context * fst)1957 arcID fst_get_free_arc(srec_context* fst)
1958 {
1959 FSMarc* atoken;
1960 arcID atokid = fst->FSMarc_freelist;
1961 if (atokid == MAXarcID)
1962 {
1963 PLogError("error: ran out of arcs\n");
1964 return atokid;
1965 }
1966 atoken = &fst->FSMarc_list[atokid];
1967 fst->FSMarc_freelist = ARC_XtoI(atoken->linkl_next_arc);
1968 memset(atoken, 0, sizeof(FSMarc));
1969 atoken->to_node = FSMNODE_NULL;
1970 atoken->fr_node = FSMNODE_NULL;
1971 atoken->linkl_next_arc = FSMARC_NULL;
1972 atoken->linkl_prev_arc = FSMARC_NULL;
1973 atoken->ilabel = 0;
1974 atoken->olabel = 0;
1975 atoken->cost = 0;
1976 fst->num_arcs++;
1977 return atokid;
1978 }
fst_get_free_node(srec_context * fst)1979 nodeID fst_get_free_node(srec_context* fst)
1980 {
1981 FSMnode* ntoken;
1982 nodeID ntokid = fst->FSMnode_freelist;
1983 if (ntokid == MAXnodeID)
1984 {
1985 PLogError("error: ran out of nodes\n");
1986 return ntokid;
1987 }
1988 ntoken = &fst->FSMnode_list[ntokid];
1989 fst->FSMnode_freelist = NODE_XtoI(ntoken->un_ptr.next_node);
1990 ntoken->un_ptr.first_next_arc = FSMARC_NULL;
1991 ntoken->first_prev_arc = FSMARC_NULL;
1992 /* ASSERT( (int)(ntok - fst->FSMnode_list) == fst->num_nodes); */
1993 fst->num_nodes++;
1994 return ntokid;
1995 }
1996
fst_free_node(srec_context * fst,FSMnode * node)1997 int fst_free_node(srec_context* fst, FSMnode* node)
1998 {
1999 IF_DEBUG_WDADD(printf_node(fst, "freeing: ", node));
2000 node->first_prev_arc = FSMARC_NULL;
2001 node->un_ptr.next_node = fst->FSMnode_freelist;
2002 node->first_prev_arc = FSMARC_FREE;
2003 fst->FSMnode_freelist = (nodeID)(node - fst->FSMnode_list);
2004 /* fst->num_nodes--; not allowed unless we compress! */
2005 return 0;
2006 }
2007
fst_free_arc(srec_context * fst,FSMarc * arc)2008 int fst_free_arc(srec_context* fst, FSMarc* arc)
2009 {
2010 IF_DEBUG_WDADD(printf_arc(fst, "freeing: ", arc));
2011 arc->linkl_prev_arc = FSMARC_FREE;
2012 arc->linkl_next_arc = ARC_ItoX(fst->FSMarc_freelist);
2013 arc->cost = 2;
2014 arc->ilabel = 0;
2015 arc->olabel = 0;
2016 IF_DEBUG_WDADD(arc->ilabel_str = 0);
2017 IF_DEBUG_WDADD(arc->olabel_str = 0);
2018 fst->FSMarc_freelist = (arcID)(arc - fst->FSMarc_list);
2019 fst->num_arcs--;
2020 return 0;
2021 }
2022
fst_free_arc_net(srec_context * fst,FSMarc * arc)2023 int fst_free_arc_net(srec_context* fst, FSMarc* arc)
2024 {
2025 if (!arc)
2026 return 0;
2027 /* need to traverse the arc, get the 'from' nodes,
2028 then traverse the from nodes and kill arcs */
2029 /* we'll need this after an error condition */
2030 return 0;
2031 }
2032
fst_push_arc_olabel(srec_context * fst,FSMarc * arc)2033 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc)
2034 {
2035 FSMarc_ptr atok;
2036 FSMarc* atoken;
2037 FSMnode_ptr ntok = arc->to_node;
2038
2039 /* look for all arcs leaving this arc, and mark them
2040 normally there is only one, except for multipron words */
2041 for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2042 {
2043 atoken = ARC_XtoP(atok);
2044 /* cannot have multiple olabels before a boundary */
2045 if (atoken->olabel != WORD_EPSILON_LABEL)
2046 return FST_FAILED_INTERNAL;
2047 atoken->olabel = arc->olabel;
2048 #if DO_WEIGHTED_ADDWORD
2049 atoken->cost = (costdata)(atoken->cost + arc->cost);
2050 #endif
2051 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2052 }
2053 /* unlabel this arc */
2054 arc->olabel = WORD_EPSILON_LABEL;
2055 #if DO_WEIGHTED_ADDWORD
2056 arc->cost = FREEcostdata;
2057 #endif
2058 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2059 return FST_SUCCESS;
2060 }
2061
2062 #if DO_WEIGHTED_ADDWORD
fst_push_arc_cost(srec_context * fst,FSMarc * arc)2063 int fst_push_arc_cost(srec_context* fst, FSMarc* arc)
2064 {
2065 FSMarc_ptr atok;
2066 FSMarc* atoken;
2067 FSMnode_ptr ntok = arc->to_node;
2068 /* look for all arcs leaving this arc, and push the cost thereto */
2069 for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2070 {
2071 atoken = ARC_XtoP(atok);
2072 atoken->cost = (costdata)(atoken->cost + arc->cost);
2073 }
2074 arc->cost = FREEcostdata;
2075 return FST_SUCCESS;
2076 }
2077 #endif
2078
2079
fst_pull_arc_olabel(srec_context * fst,FSMarc * arc)2080 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc)
2081 {
2082 FSMarc_ptr atok;
2083 FSMarc* atoken;
2084 FSMnode_ptr fr_node = arc->fr_node;
2085 FSMarc_ptr tmp_atok;
2086 FSMarc *tmp_atoken;
2087 FSMnode *anode;
2088
2089 if (arc->olabel == WORD_EPSILON_LABEL)
2090 return FST_SUCCESS;
2091
2092 /* can the olabel be pulled earlier? */
2093 for (atok = FIRST_PREV(fr_node); atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2094 {
2095 atoken = ARC_XtoP(atok);
2096 anode = NODE_XtoP(atoken->to_node);
2097
2098 tmp_atok = anode->un_ptr.first_next_arc;
2099 if (tmp_atok != FSMARC_NULL)
2100 {
2101 tmp_atoken = ARC_XtoP(tmp_atok);
2102 if (tmp_atoken->linkl_next_arc != FSMARC_NULL)
2103 {
2104 return FST_CONTINUE;
2105 }
2106 }
2107 }
2108
2109 /* look for all arcs arriving at this arc, and mark them */
2110 for (atok = FIRST_PREV(fr_node);atok != FSMARC_NULL;atok = atoken->linkl_prev_arc)
2111 {
2112 atoken = ARC_XtoP(atok);
2113 /* cannot have multiple olabels! serious internal error!? */
2114 if (atoken->olabel != WORD_EPSILON_LABEL)
2115 {
2116 PLogError("error: internal error, in fst_pull_arc_olabel()\n");
2117 return FST_FAILED_INTERNAL;
2118 }
2119 atoken->olabel = arc->olabel;
2120 #if DO_WEIGHTED_ADDWORD
2121 atoken->cost = arc->cost;
2122 #endif
2123 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2124 }
2125 /* unlabel this arc */
2126 arc->olabel = WORD_EPSILON_LABEL;
2127 #if DO_WEIGHTED_ADDWORD
2128 arc->cost = FREEcostdata;
2129 #endif
2130 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2131 return FST_SUCCESS;
2132 }
2133
find_next_arc_with_ilabel(srec_context * fst,FSMnode * node,labelID ilabel,FSMarc ** last)2134 static FSMarc* find_next_arc_with_ilabel(srec_context* fst, FSMnode* node, labelID ilabel, FSMarc** last)
2135 {
2136 FSMarc_ptr atok;
2137 FSMarc* atoken = NULL;
2138 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2139 {
2140 atoken = ARC_XtoP(atok);
2141 if (atoken->ilabel == ilabel)
2142 return atoken;
2143 }
2144 *last = atoken; /* to this we may want to append a new arc */
2145 return NULL;
2146 }
2147
find_prev_arc_with_iolabels(srec_context * fst,FSMnode * node,labelID ilabel,labelID olabel,FSMarc ** last)2148 FSMarc* find_prev_arc_with_iolabels(srec_context* fst, FSMnode* node, labelID ilabel, labelID olabel, FSMarc** last)
2149 {
2150 FSMarc_ptr atok;
2151 FSMarc* atoken = NULL;
2152 FSMarc_ptr tmp_atok;
2153 FSMarc* tmp_atoken;
2154
2155 for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2156 {
2157 atoken = ARC_XtoP(atok);
2158 if (atoken->olabel == olabel && atoken->ilabel == ilabel)
2159 {
2160 tmp_atok = NODE_XtoP(atoken->fr_node)->un_ptr.first_next_arc;
2161 if (tmp_atok != FSMARC_NULL)
2162 {
2163 tmp_atoken = ARC_XtoP(tmp_atok);
2164
2165 if (tmp_atoken->linkl_next_arc == FSMARC_NULL)
2166 return atoken;
2167 }
2168 else
2169 return atoken;
2170 }
2171
2172 }
2173 if (last) *last = atoken;
2174 return NULL;
2175 }
2176
num_arcs_leaving(srec_context * fst,FSMnode * node)2177 int num_arcs_leaving(srec_context* fst, FSMnode* node)
2178 {
2179 int num_arcs = 0;
2180 FSMarc_ptr atok;
2181 FSMarc* atoken;
2182 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2183 {
2184 atoken = ARC_XtoP(atok);
2185 num_arcs++;
2186 }
2187 return num_arcs;
2188 }
2189
num_arcs_arriving(srec_context * fst,FSMnode * node)2190 int num_arcs_arriving(srec_context* fst, FSMnode* node)
2191 {
2192 int num_arcs = 0;
2193 FSMarc_ptr atok;
2194 FSMarc* atoken;
2195 for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2196 {
2197 atoken = ARC_XtoP(atok);
2198 num_arcs++;
2199 }
2200 return num_arcs;
2201 }
2202
num_arcs_arriving_gt_1(srec_context * fst,FSMnode * node)2203 PINLINE int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node)
2204 {
2205 FSMarc_ptr atok = node->first_prev_arc;
2206 FSMarc* atoken;
2207 if (atok == FSMARC_NULL)
2208 return 0;
2209 atoken = ARC_XtoP(atok);
2210 return atoken->linkl_prev_arc == FSMARC_NULL ? 0 : 1;
2211 }
2212
remove_arc_arriving(srec_context * fst,FSMnode * to_node,FSMarc * arc)2213 static void remove_arc_arriving(srec_context* fst, FSMnode* to_node, FSMarc* arc)
2214 {
2215 FSMarc* atoken;
2216 FSMarc_ptr* atok;
2217 for (atok = &to_node->first_prev_arc;(*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2218 {
2219 atoken = ARC_XtoP(*atok);
2220 if (atoken == arc)
2221 {
2222 (*atok) = arc->linkl_prev_arc;
2223 return;
2224 }
2225 }
2226 ASSERT(0);
2227 }
2228
split_node_for_arc(srec_context * fst,FSMarc * arc)2229 int split_node_for_arc(srec_context* fst, FSMarc* arc)
2230 {
2231 arcID new_arc_id;
2232 nodeID new_node_id;
2233 FSMnode* new_node;
2234 FSMnode* old_node;
2235 FSMarc_ptr atok, new_next, last_new_next;
2236 FSMarc *atoken, *new_next_p;
2237
2238 IF_DEBUG_WDADD(printf_arc(fst, "spliting ... ", arc));
2239
2240 new_node_id = fst_get_free_node(fst);
2241 if (new_node_id == MAXnodeID)
2242 return FST_FAILED_ON_MEMORY;
2243 new_node = &fst->FSMnode_list[ new_node_id];
2244 old_node = NODE_XtoP(arc->to_node);
2245 arc->to_node = NODE_ItoX(new_node_id);
2246 new_node->first_prev_arc = ARC_PtoX(arc);
2247 remove_arc_arriving(fst, old_node, arc);
2248 arc->linkl_prev_arc = FSMARC_NULL;
2249
2250 last_new_next = FSMARC_NULL;
2251 for (atok = old_node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2252 {
2253 atoken = ARC_XtoP(atok);
2254 new_arc_id = fst_get_free_arc(fst);
2255 if (new_arc_id == MAXarcID)
2256 return FST_FAILED_ON_MEMORY;
2257 new_next = ARC_ItoX(new_arc_id);
2258 if (last_new_next != FSMARC_NULL)
2259 LINKL_NEXT(last_new_next) = new_next;
2260 else
2261 new_node->un_ptr.first_next_arc = new_next;
2262 new_next_p = ARC_XtoP(new_next);
2263 new_next_p->ilabel = atoken->ilabel;
2264 new_next_p->to_node = atoken->to_node;
2265 new_next_p->fr_node = arc->to_node;
2266 new_next_p->olabel = atoken->olabel;
2267 IF_DEBUG_WDADD(new_next_p->ilabel_str = atoken->ilabel_str);
2268 IF_DEBUG_WDADD(new_next_p->olabel_str = atoken->olabel_str);
2269 new_next_p->linkl_next_arc = FSMARC_NULL; /* set at next loop */
2270 append_arc_arriving_node(fst, NODE_XtoP(atoken->to_node), new_next);
2271 last_new_next = new_next;
2272 }
2273 return FST_SUCCESS;
2274 }
2275
fst_add_arcs(srec_context * fst,nodeID start_node,nodeID end_node,wordID add_tree_olabel,costdata add_tree_cost,modelID * add_tree_start,int add_tree_len)2276 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
2277 wordID add_tree_olabel, costdata add_tree_cost,
2278 modelID* add_tree_start, int add_tree_len)
2279 {
2280 int rc = FST_SUCCESS;
2281 FSMarc_ptr atok = (FSMarc_ptr)0, last_atok = (FSMarc_ptr)0;
2282 FSMarc* atoken = NULL;
2283 FSMarc *next_atoken, *prev_atoken;
2284 FSMarc *append_to;
2285 FSMnode *late_start_node, *early_end_node;
2286 FSMnode *ntoken, *last_ntoken;
2287 FSMnode_ptr ntok, last_ntok;
2288 modelID *add_tree_end = add_tree_start + add_tree_len - 1;
2289 modelID *add_tree_late_start, *add_tree_early_end;
2290 modelID *add_tree;
2291 arcID new_arc_id, atokid;
2292 nodeID new_node_id, atokid_node;
2293 wordID add_tree_olabel_use = add_tree_olabel;
2294 wordID add_tree_cost_use = add_tree_cost;
2295
2296 append_to = NULL;
2297 add_tree = add_tree_start;
2298 #define FST_GROW_MINARCS 100
2299 #define FST_GROW_MINNODES 100
2300 #if defined(FST_GROW_FACTOR)
2301
2302 /* make sure we have enough arcs available */
2303 if(fst->num_arcs + add_tree_len >= fst->FSMarc_list_len)
2304 {
2305 FSMarc* tmp_FSMarc_list;
2306 arcID tmp_FSMarc_list_len;
2307 int itmp_FSMarc_list_len = fst->FSMarc_list_len*FST_GROW_FACTOR ;
2308
2309 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < FST_GROW_MINARCS)
2310 itmp_FSMarc_list_len += FST_GROW_MINARCS;
2311
2312 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < add_tree_len)
2313 itmp_FSMarc_list_len += add_tree_len;
2314
2315 if(itmp_FSMarc_list_len >= (int)MAXarcID)
2316 return FST_FAILED_ON_MEMORY;
2317
2318 tmp_FSMarc_list_len = (arcID)itmp_FSMarc_list_len;
2319
2320 tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(tmp_FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
2321 if(!tmp_FSMarc_list) {
2322 PLogError("error: Failed to extend the memory for arcs\n");
2323 return FST_FAILED_INTERNAL;
2324 }
2325 memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
2326
2327 /* initialize the new free list.
2328 head of this new free list is tmp_FSMarc_list[ fst->FSMarc_list_len] */
2329 for(atokid=fst->FSMarc_list_len; atokid<tmp_FSMarc_list_len-1; atokid++) {
2330 tmp_FSMarc_list[atokid].linkl_next_arc = ARC_ItoX(atokid+1);
2331 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2332 }
2333 tmp_FSMarc_list[atokid].linkl_next_arc = FSMARC_NULL;
2334 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2335
2336 /* scan to the end of the old free list (to be re-used) */
2337 arcID* last_free_arc = &fst->FSMarc_freelist;
2338 atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
2339
2340 for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
2341 atoken = &tmp_FSMarc_list[ *last_free_arc];
2342
2343 /* append the new free list to the current free list */
2344 *last_free_arc = fst->FSMarc_list_len;
2345
2346 FREE( fst->FSMarc_list);
2347 fst->FSMarc_list = tmp_FSMarc_list;
2348 fst->FSMarc_list_len = tmp_FSMarc_list_len;
2349 }
2350
2351 /* make sure we have enough nodes available */
2352 if(fst->num_nodes + add_tree_len >= fst->FSMnode_list_len)
2353 {
2354 FSMnode* tmp_FSMnode_list;
2355 nodeID tmp_FSMnode_list_len;
2356 FSMnode_info* tmp_FSMnode_info_list;
2357 int itmp_FSMnode_list_len = fst->FSMnode_list_len * FST_GROW_FACTOR ;
2358
2359 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < FST_GROW_MINNODES)
2360 itmp_FSMnode_list_len += FST_GROW_MINNODES;
2361
2362 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < add_tree_len)
2363 itmp_FSMnode_list_len += add_tree_len;
2364
2365 if(itmp_FSMnode_list_len >= (int)MAXnodeID)
2366 return FST_FAILED_ON_MEMORY;
2367
2368 tmp_FSMnode_list_len = (nodeID)itmp_FSMnode_list_len;
2369
2370 tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(tmp_FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
2371 if(!tmp_FSMnode_list) {
2372 PLogError("ERROR: Failed to extend the memory for nodes\n");
2373 return FST_FAILED_INTERNAL;
2374 }
2375 memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
2376
2377 /* initialize the new free node list.
2378 head of this new free list is tmp_FSMnode_list[ fst->FSMnode_list_len] */
2379 for(atokid_node=fst->FSMnode_list_len; atokid_node<tmp_FSMnode_list_len-1; atokid_node++)
2380 {
2381 tmp_FSMnode_list[atokid_node].un_ptr.next_node = NODE_ItoX(atokid_node + 1);
2382 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2383 }
2384 tmp_FSMnode_list[atokid_node].un_ptr.next_node = FSMNODE_NULL;
2385 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2386
2387 /* scan to the end of the old free list (to be re-used) */
2388 nodeID* last_free_node = (&fst->FSMnode_freelist);
2389 ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
2390
2391 for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
2392 ntoken = &tmp_FSMnode_list[ *last_free_node];
2393
2394 /* append the new free list to the current free list */
2395 *last_free_node = fst->FSMnode_list_len;
2396
2397 FREE( fst->FSMnode_list);
2398
2399 tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR( tmp_FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
2400 if(!tmp_FSMnode_info_list) {
2401 PLogError("ERROR: Failed to extend the memory for node infos\n");
2402 return FST_FAILED_INTERNAL;
2403 }
2404 // copy in old node info
2405 memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
2406
2407 // initialize the new node info
2408 for (atokid_node=fst->FSMnode_list_len; atokid_node < tmp_FSMnode_list_len; atokid_node++)
2409 tmp_FSMnode_info_list[atokid_node] = NODE_INFO_UNKNOWN;
2410
2411 FREE( fst->FSMnode_info_list);
2412 fst->FSMnode_info_list = tmp_FSMnode_info_list;
2413 fst->FSMnode_list = tmp_FSMnode_list;
2414 fst->FSMnode_list_len = tmp_FSMnode_list_len;
2415 }
2416 #endif
2417 late_start_node = &fst->FSMnode_list[start_node];
2418
2419 while (1)
2420 {
2421
2422 next_atoken = find_next_arc_with_ilabel(fst, late_start_node,
2423 *add_tree /*->ilabel*/,
2424 &append_to);
2425 if (next_atoken != NULL)
2426 {
2427 /* so next_atok->ilabel == add_tree->ilabel */
2428
2429 if (next_atoken->ilabel == WORD_BOUNDARY && *add_tree/*->ilabel*/ == WORD_BOUNDARY)
2430 {
2431 next_atoken = NULL;
2432 break; /* break as if nothing more can be shared! */
2433 // return FST_FAILED_ON_HOMONYM;
2434 }
2435
2436 if (num_arcs_arriving_gt_1(fst, NODE_XtoP(next_atoken->to_node)))
2437 {
2438 split_node_for_arc(fst, next_atoken);
2439 /* unfortunate side effect here, that if we later find out this
2440 was a homonym addition, then this expansion was useless and
2441 for now we don't undo it! */
2442 }
2443
2444 /* we shouldn't really push the olabel if it's the same for both! */
2445 if (next_atoken->olabel != add_tree_olabel_use)
2446 {
2447 if (next_atoken->olabel != WORD_EPSILON_LABEL)
2448 {
2449 if (fst_push_arc_olabel(fst, next_atoken) != FST_SUCCESS)
2450 {
2451 PLogError("error: internal error fst_push_arc_olabel()\n");
2452 return FST_FAILED_INTERNAL;
2453 }
2454 }
2455 #if DO_WEIGHTED_ADDWORD
2456 else
2457 fst_push_arc_cost(fst, next_atoken);
2458 #endif
2459 }
2460 else if (add_tree_olabel_use != WORD_EPSILON_LABEL)
2461 {
2462 /* the graph already has this word, so we just re-use the olabel
2463 and disable the setting of olabel down below */
2464 add_tree_olabel_use = WORD_EPSILON_LABEL;
2465 #if DO_WEIGHTED_ADDWORD
2466 add_tree_cost_use = FREEcostdata;
2467 #endif
2468 }
2469 add_tree++;
2470 late_start_node = NODE_XtoP(next_atoken->to_node);
2471 }
2472 else
2473 {
2474 break;
2475 }
2476 }
2477
2478 add_tree_late_start = add_tree;
2479 early_end_node = &fst->FSMnode_list[end_node];
2480
2481 if (!do_minimize)
2482 {
2483 last_ntoken = NULL;
2484 last_ntok = NODE_PtoX(late_start_node);
2485 for (; add_tree <= add_tree_end; add_tree++)
2486 {
2487 new_arc_id = fst_get_free_arc(fst);
2488 if (new_arc_id == MAXarcID)
2489 {
2490 PLogError("fst_get_free_arc() failed\n");
2491 return FST_FAILED_ON_MEMORY;
2492 }
2493 atok = ARC_ItoX(new_arc_id);
2494 atoken = ARC_XtoP(atok);
2495 if (add_tree != add_tree_end)
2496 {
2497 new_node_id = fst_get_free_node(fst);
2498 if (new_node_id == MAXnodeID)
2499 {
2500 PLogError("fst_get_free_node() failed\n");
2501 return FST_FAILED_ON_MEMORY;
2502 }
2503 ntok = NODE_ItoX(new_node_id);
2504 ntoken = NODE_XtoP(ntok);
2505 ntoken->first_prev_arc = atok;
2506 }
2507 else
2508 {
2509 ntok = FSMNODE_NULL;
2510 ntoken = NULL;
2511 }
2512 atoken->fr_node = last_ntok; /* was NODE_PtoX(late_start_node) */
2513 atoken->to_node = ntok;
2514 atoken->ilabel = *add_tree;
2515 atoken->linkl_next_arc = FSMARC_NULL;
2516 atoken->linkl_prev_arc = FSMARC_NULL;
2517 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2518 if (!last_ntoken)
2519 {
2520 append_to->linkl_next_arc = atok;
2521 atoken->olabel = add_tree_olabel_use;
2522 #if DO_WEIGHTED_ADDWORD
2523 atoken->cost = add_tree_cost_use;
2524 #endif
2525 IF_DEBUG_WDADD(atok->olabel_str = fst->olabels->words[ atoken->olabel]);
2526 }
2527 else
2528 {
2529 last_ntoken->un_ptr.first_next_arc = atok;
2530 }
2531 last_ntoken = ntoken;
2532 last_ntok = ntok;
2533 }
2534 /* add_tree_end->to_node = early_end_node;
2535 add_tree_end->linkl_next_arc = FSMARC_NULL;
2536 atok = last_arc_arriving( early_end_node);
2537 atok->linkl_prev_arc = add_tree_end;
2538 add_tree_end->linkl_prev_arc = FSMARC_NULL; */
2539 atoken->to_node = NODE_ItoX(end_node);
2540 append_arc_arriving_node(fst, &fst->FSMnode_list[end_node], atok);
2541 }
2542
2543 else /* ie. do_minimize from the rear */
2544 {
2545
2546 add_tree = add_tree_end;
2547 new_arc_id = fst_get_free_arc(fst);
2548 if (new_arc_id == MAXarcID)
2549 return FST_FAILED_ON_MEMORY;
2550 atok = ARC_ItoX(new_arc_id);
2551 atoken = ARC_XtoP(atok);
2552 atoken->olabel = add_tree_olabel_use;
2553 #if DO_WEIGHTED_ADDWORD
2554 atoken->cost = add_tree_cost_use;
2555 #endif
2556 atoken->ilabel = *add_tree_late_start;
2557 atoken->fr_node = NODE_PtoX(late_start_node);
2558 if (atoken->ilabel == WORD_BOUNDARY)
2559 {
2560 atoken->cost = (costdata)(atoken->cost + fst->wtw_average);
2561 }
2562 else
2563 {
2564 atoken->cost = add_tree_cost_use + fst->wtw_average;
2565 add_tree_cost_use = FREEcostdata;
2566 }
2567 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[ atoken->olabel]);
2568 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2569 append_arc_leaving_node(fst, late_start_node, atok);
2570
2571 last_atok = atok;
2572 while (1)
2573 {
2574
2575 if (add_tree == add_tree_late_start)
2576 break;
2577 /* if there are other arcs leaving this node then joining
2578 earlier than here will result in over-generation, so don't
2579 if( atok!=end_arc && atok->linkl_next_arc != FSMARC_NULL)
2580 break;
2581 */
2582 /*
2583 also need boundary conditions checker here
2584 */
2585
2586 /* */
2587 IF_DEBUG_WDADD(printf_node(fst, early_end_node));
2588
2589 /* if( num_arcs_leaving( add_tree_end->fr_node->first_prev_arc->to_node) >1) {
2590 printf("merge stopped due to num_arcs leaving ..\n");
2591 printf_arc(fst, add_tree_end->fr_node->first_prev_arc->to_node->first_next_arc);
2592 break;
2593 } */
2594
2595 for (atok = early_end_node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2596 {
2597 atoken = ARC_XtoP(atok);
2598 /* never pull before the slot marker */
2599 if(atoken->cost == DISABLE_ARC_COST) break;
2600 if(atoken->olabel>0 && atoken->olabel<fst->olabels->num_slots) break;
2601 fst_pull_arc_olabel(fst, atoken); /* fails are ok */
2602 }
2603
2604 prev_atoken = find_prev_arc_with_iolabels(fst, early_end_node,
2605 *add_tree /*->ilabel*/,
2606 WORD_EPSILON_LABEL,
2607 NULL /*&append_to*/);
2608 if (!prev_atoken)
2609 break;
2610 else
2611 {
2612 /* this check is now inside find_prev_arc_with_iolabels */
2613 /* if( num_arcs_leaving(prev_atok->fr_node->first_prev_arc->to_node->first_next_arc)>1)*/
2614
2615 /* now look to merge earlier still */
2616 early_end_node = NODE_XtoP(prev_atoken->fr_node);
2617 add_tree--;
2618 if (add_tree == add_tree_late_start)
2619 break;
2620 }
2621 }
2622 add_tree_early_end = add_tree;
2623 for (add_tree = add_tree_late_start + 1; add_tree <= add_tree_early_end; add_tree++)
2624 {
2625 new_node_id = fst_get_free_node(fst);
2626 if (new_node_id == MAXnodeID)
2627 return FST_FAILED_ON_MEMORY;
2628 ntok = NODE_ItoX(new_node_id);
2629 ntoken = NODE_XtoP(ntok);
2630 new_arc_id = fst_get_free_arc(fst);
2631 if (new_arc_id == MAXarcID)
2632 return FST_FAILED_ON_MEMORY;
2633 atok = ARC_ItoX(new_arc_id);
2634 atoken = ARC_XtoP(atok);
2635 atoken->ilabel = *add_tree;
2636 #if DO_WEIGHTED_ADDWORD
2637 atoken->cost = FREEcostdata; /* idea: distribute add_tree_cost here;*/
2638 #endif
2639 atoken->olabel = WORD_EPSILON_LABEL;
2640 atoken->fr_node = ntok;
2641 atoken->to_node = FSMNODE_NULL; /* filled in next loop */
2642 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[atoken->olabel]);
2643 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[atoken->ilabel]);
2644
2645 ntoken->un_ptr.first_next_arc = atok;
2646 ntoken->first_prev_arc = last_atok;
2647 TO_NODE(last_atok) = ntok;
2648 last_atok = atok;
2649 }
2650 TO_NODE(last_atok) = NODE_PtoX(early_end_node);
2651 append_arc_arriving_node(fst, early_end_node, last_atok);
2652 }
2653 return rc;
2654 }
2655
2656
append_arc_leaving_node(srec_context * fst,FSMnode * fr_node,FSMarc_ptr arc)2657 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr arc)
2658 {
2659 FSMarc_ptr* atok = &fr_node->un_ptr.first_next_arc;
2660 FSMarc* atoken = ARC_XtoP(*atok);
2661 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_next_arc)
2662 {
2663 atoken = ARC_XtoP(*atok);
2664 }
2665 *atok = arc;
2666 LINKL_NEXT(arc) = FSMARC_NULL;
2667 }
append_arc_arriving_node(srec_context * fst,FSMnode * to_node,FSMarc_ptr arc)2668 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr arc)
2669 {
2670 FSMarc_ptr* atok = &to_node->first_prev_arc;
2671 FSMarc* atoken = ARC_XtoP(*atok);
2672 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2673 {
2674 atoken = ARC_XtoP(*atok);
2675 }
2676 *atok = arc;
2677 LINKL_PREV(arc) = FSMARC_NULL;
2678 }
2679
2680
printf_node1(srec_context * fst,FSMnode * node)2681 int printf_node1(srec_context* fst, FSMnode* node)
2682 {
2683 return 0;
2684 }
printf_arc1(srec_context * fst,char * msg,FSMarc * arc)2685 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc)
2686 {
2687 char buffer[MAX_LINE_LENGTH];
2688 sprintf_arc(buffer, fst, arc);
2689 printf("%s%s\n", msg, buffer);
2690 return 0;
2691 }
sprintf_arc(char * buf,srec_context * fst,FSMarc * arc)2692 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc)
2693 {
2694 int rc;
2695 FSMnode* to_node = NODE_XtoP(arc->to_node);
2696 arcID arc_index = (arcID)(arc - fst->FSMarc_list);
2697 if (to_node->un_ptr.first_next_arc == FSMARC_NULL)
2698 rc = sprintf(buf, "arc%hu\n", arc_index);
2699 else
2700 {
2701 rc = sprintf(buf, "arc%hu\t%hu,%hu\t%s\t%s\t%hu\n",
2702 arc_index,
2703 ARC_XtoI(to_node->un_ptr.first_next_arc),
2704 arc->linkl_next_arc != FSMARC_NULL ? ARC_XtoI(arc->linkl_next_arc) : -1,
2705 fst->ilabels->words[arc->ilabel],
2706 fst->olabels->words[arc->olabel],
2707 arc->cost);
2708 }
2709 return rc;
2710 }
2711
2712 /* dumps the recognition context as a binary file,
2713 it will also store any empty space for growing */
2714
2715 #define ENCODE(SIGN,TYP) { SIGN+=sizeof(TYP); SIGN=SIGN<<3; ASSERT((shifted+=3)<32); }
2716
FST_sizes_signature()2717 asr_int32_t FST_sizes_signature()
2718 {
2719 #ifndef NDEBUG
2720 int shifted = 0;
2721 #endif
2722 asr_int32_t signature = 0;
2723 ENCODE(signature, arcID);
2724 ENCODE(signature, nodeID);
2725 ENCODE(signature, wordID);
2726 ENCODE(signature, labelID);
2727 ENCODE(signature, costdata);
2728 return signature;
2729 }
2730
FST_ContextImageSize(srec_context * context)2731 int FST_ContextImageSize(srec_context* context)
2732 {
2733 int size = 0;
2734 size += sizeof(srec_context);
2735 size += sizeof(wordmap);
2736 size += sizeof(char*) * context->olabels->max_words;
2737 size += sizeof(char) * context->olabels->max_chars;
2738 size += sizeof(wordmap);
2739 if (context->ilabels->words != NULL)
2740 size += sizeof(char*) * context->olabels->max_words;
2741 if (context->ilabels->chars != NULL)
2742 size += sizeof(char) * context->olabels->max_chars;
2743 size += sizeof(FSMarc) * context->FSMarc_list_len;
2744 size += sizeof(FSMnode) * context->FSMnode_list_len;
2745 size += sizeof(FSMnode_info) * context->FSMnode_list_len;
2746 size += sizeof(arc_token) * context->arc_token_list_len;
2747 return size;
2748 }
2749
serializeWordMapV2(wordmap * wordmap,PFile * fp)2750 ESR_ReturnCode serializeWordMapV2(wordmap *wordmap, PFile* fp)
2751 {
2752 unsigned int i = 0;
2753 unsigned int nfields;
2754 unsigned int tmp2[32];
2755
2756 i = 0;
2757 tmp2[i++] = wordmap->num_words;
2758 tmp2[i++] = wordmap->num_slots;
2759 tmp2[i++] = wordmap->max_words;
2760 tmp2[i++] = wordmap->num_base_words;
2761 tmp2[i++] = wordmap->max_chars;
2762 tmp2[i++] = (wordmap->next_chars - wordmap->chars);
2763 tmp2[i++] = (wordmap->next_base_chars - wordmap->chars);
2764 nfields = i;
2765 if (pfwrite(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2766 return ESR_WRITE_ERROR;
2767
2768 if (pfwrite(wordmap->chars, sizeof(char), wordmap->max_chars, fp) != (size_t)wordmap->max_chars)
2769 return ESR_WRITE_ERROR;
2770
2771 return ESR_SUCCESS;
2772 }
2773
deserializeWordMapV2(wordmap ** pwordmap,PFile * fp)2774 ESR_ReturnCode deserializeWordMapV2(wordmap **pwordmap, PFile* fp)
2775 {
2776 unsigned int i = 0;
2777 unsigned int nfields;
2778 unsigned int tmp2[32];
2779 wordmap *awordmap;
2780 char *p;
2781 ESR_ReturnCode rc = ESR_SUCCESS;
2782 unsigned int next_chars_idx, next_base_chars_idx;
2783
2784 awordmap = NEW(wordmap, L("srec.g2g.graph.wordmap.base"));
2785 if (awordmap == NULL)
2786 {
2787 PLogError("NEW failed on srec.g2g.graph.wordmap.base\n");
2788 rc = ESR_OUT_OF_MEMORY;
2789 goto CLEANUP;
2790 }
2791 awordmap->wordIDForWord = NULL; // early break to cleanup needs this
2792
2793 nfields = 7;
2794 if (pfread(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2795 {
2796 PLogError("pfread failed when reading nfields\n");
2797 rc = ESR_READ_ERROR;
2798 goto CLEANUP;
2799 }
2800
2801 i = 0;
2802 awordmap->num_words = (wordID)tmp2[i++];
2803 awordmap->num_slots = (wordID)tmp2[i++];
2804 awordmap->max_words = (wordID)tmp2[i++];
2805 awordmap->num_base_words = (wordID)tmp2[i++];
2806 awordmap->max_chars = tmp2[i++];
2807 next_chars_idx = tmp2[i++];
2808 next_base_chars_idx = tmp2[i++];
2809 ASSERT(nfields == i);
2810
2811 awordmap->words = NEW_ARRAY(char*, awordmap->max_words, L("srec.g2g.graph.wordmap.words"));
2812 if (awordmap->words == NULL)
2813 {
2814 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.words %d\n", awordmap->max_words);
2815 rc = ESR_OUT_OF_MEMORY;
2816 goto CLEANUP;
2817 }
2818 awordmap->chars = NEW_ARRAY(char, awordmap->max_chars, L("srec.g2g.graph.wordmap.chars"));
2819 if (awordmap->chars == NULL)
2820 {
2821 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.chars %d\n", awordmap->max_chars);
2822 rc = ESR_OUT_OF_MEMORY;
2823 goto CLEANUP;
2824 }
2825 awordmap->next_chars = awordmap->chars + next_chars_idx;
2826 awordmap->next_base_chars = awordmap->chars + next_base_chars_idx;
2827
2828 if ((i=pfread(awordmap->chars, sizeof(char), awordmap->max_chars, fp)) != (size_t)awordmap->max_chars)
2829 {
2830 PLogError("pfread failed while reading %d chars\n", awordmap->max_chars);
2831 rc = ESR_READ_ERROR;
2832 goto CLEANUP;
2833 }
2834
2835
2836 p = awordmap->chars;
2837 ASSERT((int)p % 2 == 0);
2838 nfields = 0;
2839 if (nfields < awordmap->num_words)
2840 awordmap->words[nfields++] = p;
2841 for (; p < awordmap->next_chars; p++) // was next_base_chars
2842 {
2843 if ( ( *p ) == '\0' )
2844 {
2845 if (nfields == awordmap->num_words) // was num_base_words
2846 break;
2847 if (((int)p) % 2 == 0) p++; /* so that words begin on even byte bound */
2848 awordmap->words[nfields++] = p + 1;
2849 }
2850 }
2851 ASSERT(nfields == awordmap->num_words); // was num_base_words
2852
2853 if (awordmap->max_words >= awordmap->num_base_words)
2854 {
2855 PHashTableArgs hashArgs;
2856 hashArgs.capacity = awordmap->max_words;
2857 if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
2858 hashArgs.compFunction = HashCmpWord; //PHASH_TABLE_DEFAULT_COMP_FUNCTION;
2859 hashArgs.hashFunction = HashGetCode; //PHASH_TABLE_DEFAULT_HASH_FUNCTION;
2860 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
2861 CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.deserializeWordMap()"), &awordmap->wordIDForWord));
2862
2863 rc = wordmap_populate ( awordmap, awordmap->num_words );
2864
2865 if (rc != ESR_SUCCESS)
2866 {
2867 wordmap_clean ( awordmap );
2868 goto CLEANUP;
2869 }
2870 }
2871 else
2872 {
2873 awordmap->wordIDForWord = NULL;
2874 }
2875
2876 /* success */
2877 *pwordmap = awordmap;
2878 return ESR_SUCCESS;
2879
2880 CLEANUP:
2881 if (awordmap != NULL)
2882 {
2883 if (awordmap->wordIDForWord != NULL)
2884 PHashTableDestroy(awordmap->wordIDForWord);
2885 if (awordmap->words != NULL) FREE(awordmap->words);
2886 if (awordmap->chars != NULL) FREE(awordmap->chars);
2887 FREE(awordmap);
2888 }
2889 return rc;
2890 }
2891
serializeArcToken(arc_token * token_base,int i,PFile * fp)2892 static ESR_ReturnCode serializeArcToken(arc_token *token_base,
2893 int i,
2894 PFile* fp)
2895 {
2896 arc_token *token = token_base + i;
2897 asr_uint32_t idx;
2898
2899 if (pfwrite(&token->ilabel, 2, 1, fp) != 1)
2900 return ESR_WRITE_ERROR;
2901
2902 if (pfwrite(&token->olabel, 2, 1, fp) != 1)
2903 return ESR_WRITE_ERROR;
2904
2905 /* if (pfwrite(&token->cost, 2, 1, fp) != 1)
2906 return ESR_WRITE_ERROR; */
2907 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->first_next_arc), token_base);
2908
2909 if (pfwrite(&idx, 4, 1, fp) != 1)
2910 return ESR_WRITE_ERROR;
2911
2912 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->next_token_index), token_base);
2913
2914 if (pfwrite(&idx, 4, 1, fp) != 1)
2915 return ESR_WRITE_ERROR;
2916
2917 return ESR_SUCCESS;
2918 }
2919
serializeArcTokenInfo(srec_context * context,PFile * fp)2920 static ESR_ReturnCode serializeArcTokenInfo(srec_context *context,
2921 PFile* fp)
2922 {
2923 int i;
2924 asr_uint32_t idx;
2925 ESR_ReturnCode rc;
2926
2927 if (pfwrite(&context->arc_token_list_len, 2, 1, fp) != 1)
2928 return ESR_WRITE_ERROR;
2929
2930 idx = PTR_TO_IDX(context->arc_token_freelist, context->arc_token_list);
2931
2932 if (pfwrite(&idx, 4, 1, fp) != 1)
2933 return ESR_WRITE_ERROR;
2934
2935 idx = PTR_TO_IDX(context->arc_token_insert_start, context->arc_token_list);
2936
2937 if (pfwrite(&idx, 4, 1, fp) != 1)
2938 return ESR_WRITE_ERROR;
2939
2940 for (i = 0; i < context->arc_token_list_len; ++i)
2941 {
2942 rc = serializeArcToken(context->arc_token_list, i, fp);
2943 if (rc != ESR_SUCCESS) return rc;
2944 }
2945 return ESR_SUCCESS;
2946 }
2947
deserializeArcToken(arc_token * token_base,int i,PFile * fp)2948 static ESR_ReturnCode deserializeArcToken(arc_token *token_base,
2949 int i,
2950 PFile* fp)
2951 {
2952 arc_token *token = token_base + i;
2953 asr_uint32_t idx[2];
2954 asr_uint16_t labels[2];
2955
2956 if (pfread(labels, 2, 2, fp) != 2)
2957 return ESR_READ_ERROR;
2958
2959 token->ilabel = labels[0];
2960 token->olabel = labels[1];
2961
2962 /* if (pfread(&token->cost, 2, 1, fp) != 1)
2963 return ESR_READ_ERROR; */
2964
2965 if (pfread(idx, 4, 2, fp) != 2)
2966 return ESR_READ_ERROR;
2967
2968 token->first_next_arc = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[0], token_base));
2969 token->next_token_index = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[1], token_base));
2970
2971 return ESR_SUCCESS;
2972 }
2973
deserializeArcTokenInfo(srec_context * context,PFile * fp)2974 static ESR_ReturnCode deserializeArcTokenInfo(srec_context *context,
2975 PFile* fp)
2976 {
2977 ESR_ReturnCode rc;
2978 int i;
2979 asr_uint32_t idx;
2980
2981 if (pfread(&context->arc_token_list_len, 2, 1, fp) != 1) {
2982 PLogError("pfread failed in deserializeArcTokenInfo()\n");
2983 return ESR_READ_ERROR;
2984 }
2985
2986 context->arc_token_list = NEW_ARRAY(arc_token, context->arc_token_list_len,
2987 L("srec.graph.wordgraph"));
2988
2989 if (context->arc_token_list == NULL)
2990 return ESR_OUT_OF_MEMORY;
2991
2992 if (pfread(&idx, 4, 1, fp) != 1)
2993 {
2994 rc = ESR_READ_ERROR;
2995 goto CLEANUP;
2996 }
2997
2998 context->arc_token_freelist =
2999 IDX_TO_PTR(idx, context->arc_token_list);
3000
3001 if (pfread(&idx, 4, 1, fp) != 1)
3002 {
3003 rc = ESR_READ_ERROR;
3004 goto CLEANUP;
3005 }
3006
3007 context->arc_token_insert_start =
3008 IDX_TO_PTR(idx, context->arc_token_list);
3009
3010 for (i = 0; i < context->arc_token_list_len; ++i)
3011 {
3012 rc = deserializeArcToken(context->arc_token_list, i, fp);
3013 if (rc != ESR_SUCCESS) goto CLEANUP;
3014 }
3015 return ESR_SUCCESS;
3016
3017 CLEANUP:
3018 FREE(context->arc_token_list);
3019 context->arc_token_list =
3020 context->arc_token_freelist =
3021 context->arc_token_insert_start = NULL;
3022 return rc;
3023 }
3024
FST_GetGrammarType(srec_context * context)3025 int FST_GetGrammarType(srec_context* context)
3026 {
3027 arc_token *t, *u;
3028 wordID expected_wdid;
3029 /* 0 1 0
3030 1 2 4
3031 ...
3032 1 2 1316
3033 2
3034 */
3035 t = context->arc_token_list;
3036 if (t->ilabel != WORD_EPSILON_LABEL)
3037 return GrammarTypeBNF;
3038 if (t->next_token_index)
3039 return GrammarTypeBNF;
3040 t = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3041 expected_wdid = NUM_ITEMLIST_HDRWDS;
3042 for (; t; t = ARC_TOKEN_PTR(context->arc_token_list, t->next_token_index))
3043 {
3044 if (t->ilabel != expected_wdid)
3045 return GrammarTypeBNF;
3046 u = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3047 if (u != NULL && u->ilabel != MAXwordID)
3048 return GrammarTypeBNF;
3049 expected_wdid++;
3050 }
3051 if (expected_wdid != context->olabels->num_words)
3052 return GrammarTypeBNF;
3053 return GrammarTypeItemList;
3054 }
3055
FST_DumpContextAsImageV2(srec_context * context,PFile * fp)3056 int FST_DumpContextAsImageV2(srec_context* context, PFile* fp)
3057 {
3058 asr_uint32_t header[4];
3059 arcID tmp[32], i, j, nfields;
3060 FSMarc* arc;
3061 ESR_ReturnCode rc;
3062 #if !defined(DO_ARCS_IO_IN_ARC_ORDER)
3063 FSMnode* node;
3064 arcID arcid, num_arcs, num_arcs_written;
3065 #endif
3066
3067 /* Write header information. */
3068 header[0] = 0;
3069 header[1] = IMAGE_FORMAT_V2;
3070 header[2] = context->modelid;
3071 header[3] = context->grmtyp;
3072 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3073 PLogMessage("FST_DumpContextAsImageV2() fstgrmtyp %d", context->grmtyp);
3074 #endif
3075
3076 if (pfwrite(header, sizeof(header[0]), 4, fp) != 4)
3077 {
3078 PLogError("FST_DumpContextAsImage: Could not write header.\n");
3079 return FST_FAILED_INTERNAL;
3080 }
3081
3082 /* arcs and nodes */
3083 i = 0;
3084 tmp[i++] = context->num_arcs;
3085 tmp[i++] = context->FSMarc_list_len;
3086 tmp[i++] = context->num_base_arcs;
3087 tmp[i++] = context->FSMarc_freelist;
3088 tmp[i++] = context->max_searchable_arcs;
3089
3090 tmp[i++] = context->num_nodes;
3091 tmp[i++] = context->FSMnode_list_len;
3092 tmp[i++] = context->num_base_nodes;
3093 tmp[i++] = context->FSMnode_freelist;
3094 tmp[i++] = context->start_node;
3095 tmp[i++] = context->end_node;
3096 tmp[i++] = context->max_searchable_nodes;
3097
3098 tmp[i++] = context->beg_silence_word;
3099 tmp[i++] = context->end_silence_word;
3100 tmp[i++] = context->hack_silence_word;
3101 tmp[i++] = context->hmm_ilabel_offset;
3102
3103 nfields = i;
3104
3105 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3106 return ESR_WRITE_ERROR;
3107 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3108 PLogMessage("G2G done WR hdrs %d", pftell(fp));
3109 #endif
3110
3111 #if defined(DO_ARCS_IO_IN_ARC_ORDER)
3112 if( 1) {
3113 i = 0;
3114 tmp[i++] = context->end_node;
3115 tmp[i++] = MAXnodeID;
3116 tmp[i++] = MAXmodelID;
3117 tmp[i++] = MAXwordID;
3118 tmp[i++] = FREEcostdata;
3119 nfields = i;
3120 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3121 return ESR_WRITE_ERROR;
3122 }
3123 for(j=0; j<context->num_arcs; j++) {
3124 arc = &context->FSMarc_list[j];
3125 i = 0;
3126 tmp[i++] = arc->fr_node;
3127 tmp[i++] = arc->to_node;
3128 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3129 tmp[i++] = arc->ilabel;
3130 tmp[i++] = arc->olabel;
3131 tmp[i++] = arc->cost;
3132 nfields = i;
3133 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3134 return ESR_WRITE_ERROR;
3135 }
3136 #else
3137 num_arcs_written = 0;
3138 for (j = 0; j < context->num_nodes; j++) {
3139 node = &context->FSMnode_list[j];
3140 num_arcs = (arcID)num_arcs_leaving(context, node);
3141 arcid = node->un_ptr.first_next_arc;
3142 if (arcid == MAXarcID) {
3143 i = 0;
3144 tmp[i++] = (arcID)j;
3145 tmp[i++] = MAXarcID;
3146 tmp[i++] = MAXarcID;
3147 tmp[i++] = MAXarcID;
3148 tmp[i++] = MAXarcID;
3149 nfields = i;
3150 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3151 return ESR_WRITE_ERROR;
3152 /* num_arcs_written++; end node don't have a to-arc */
3153 }
3154 else {
3155 for (; arcid != MAXarcID; arcid = arc->linkl_next_arc) {
3156 arc = &context->FSMarc_list[arcid];
3157 i = 0;
3158 tmp[i++] = (arcID)j;
3159 tmp[i++] = arc->to_node;
3160 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3161 tmp[i++] = arc->ilabel;
3162 tmp[i++] = arc->olabel;
3163 tmp[i++] = arc->cost;
3164 nfields = i;
3165 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3166 return ESR_WRITE_ERROR;
3167 num_arcs_written++;
3168 }
3169 }
3170 }
3171 ASSERT(num_arcs_written == context->num_arcs);
3172 #endif
3173
3174 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3175 PLogMessage("G2G done WR arcs %d", pftell(fp));
3176 #endif
3177
3178 /* node info .. will now be calculated on line */
3179 /* wrapup_cost .. will now be calculated on line */
3180
3181 /* exit points for slots */
3182 if (1)
3183 {
3184 srec_fsm_exit_point *p, *q;
3185 if (pfwrite(&context->num_fsm_exit_points, 2, 1, fp) != 1)
3186 return ESR_WRITE_ERROR;
3187 p = context->fsm_exit_points;
3188 q = p + MAX_NUM_SLOTS;
3189 while (p < q)
3190 {
3191 if (pfwrite(&p->from_node_index, 2, 1, fp) != 1)
3192 return ESR_WRITE_ERROR;
3193 if (pfwrite(&p->arc_index, 2, 1, fp) != 1)
3194 return ESR_WRITE_ERROR;
3195 if (pfwrite(&p->wbto_node_index, 2, 1, fp) != 1)
3196 return ESR_WRITE_ERROR;
3197 ++p;
3198 }
3199 }
3200 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3201 PLogMessage("G2G done WR exits %d", pftell(fp));
3202 #endif
3203 /* no entry points stuff */
3204
3205 /* no saving ilabels */
3206
3207 /* now load the ilabels */
3208 rc = serializeWordMapV2(context->olabels, fp);
3209 if (rc != ESR_SUCCESS)
3210 return rc;
3211 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3212 PLogMessage("G2G done WR wmap %d", pftell(fp));
3213 #endif
3214
3215 if (context->grmtyp != GrammarTypeItemList)
3216 {
3217 if (serializeArcTokenInfo(context, fp) != ESR_SUCCESS)
3218 {
3219 PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3220 return FST_FAILED_INTERNAL;
3221 }
3222 }
3223 else
3224 {
3225 /* nothing, at decoding time, we'll just construct the graph
3226 from the word list, ie just this ..
3227 0 1 0
3228 1 2 4
3229 ...
3230 1 2 1316
3231 2
3232 ie
3233 0 1 0
3234 for(i=4;i<nwords;i++) { "1 2 i" }
3235 2
3236 because words 0-3 are eps,-pau-,-pau2-,slot@grammarname.grxml
3237 */
3238 }
3239
3240 /* Write header information. */
3241 header[0] = pftell(fp);
3242 header[1] = IMAGE_FORMAT_V2;
3243
3244 /* goto beginning of file. */
3245 if (pfseek(fp, 0, SEEK_SET))
3246 {
3247 PLogError("FST_DumpContextAsImage: could not reposition for header.\n");
3248 return FST_FAILED_INTERNAL;
3249 }
3250
3251 if (pfwrite(header, 4, 2, fp) != 2)
3252 {
3253 PLogError("FST_DumpContextAsImage: Could not write header.\n");
3254 return FST_FAILED_INTERNAL;
3255 }
3256
3257 /* reset pointer at end of file. */
3258 if (pfseek(fp, 0, SEEK_END))
3259 {
3260 PLogError("FST_DumpContextAsImage: could not reposition file pointer at end.\n");
3261 return FST_FAILED_INTERNAL;
3262 }
3263
3264 return FST_SUCCESS;
3265 }
3266
3267
FST_LoadContextFromImageV2(srec_context * fst,PFile * fp)3268 int FST_LoadContextFromImageV2(srec_context* fst, PFile* fp)
3269 {
3270 asr_int32_t header[6];
3271 arcID tmp[32], num_arcs, new_arc_id;
3272 unsigned int i, nfields;
3273 srec_fsm_exit_point *p, *q;
3274 arcID max_num_FSMarcs;
3275 nodeID max_num_FSMnodes;
3276 FSMarc_ptr atok = (FSMarc_ptr)0;
3277 FSMarc *atoken = NULL;
3278 FSMnode *fr_node, *to_node;
3279 int rc = FST_SUCCESS;
3280 ESR_ReturnCode esr_rc;
3281 ESR_BOOL seenFinalNode = ESR_FALSE;
3282
3283 /* read header information. */
3284 if (pfread(&header[2], sizeof(header[0]), 2, fp) != 2)
3285 {
3286 PLogError("FST_DumpContextAsImage: Could not read header.\n");
3287 return FST_FAILED_INTERNAL;
3288 }
3289 fst->modelid = header[2];
3290 fst->grmtyp = header[3];
3291
3292 nfields = 16;
3293 if (pfread(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3294 return ESR_READ_ERROR;
3295
3296 /* arcs and nodes */
3297 i = 0;
3298 fst->num_arcs = max_num_FSMarcs = tmp[i++];
3299 fst->FSMarc_list_len = tmp[i++];
3300 fst->num_base_arcs = tmp[i++];
3301 fst->FSMarc_freelist = tmp[i++];
3302 fst->max_searchable_arcs = tmp[i++] * 0; /*5*/
3303 fst->num_nodes = max_num_FSMnodes = tmp[i++];
3304 fst->FSMnode_list_len = tmp[i++];
3305 fst->num_base_nodes = tmp[i++];
3306 fst->FSMnode_freelist = tmp[i++];
3307 fst->start_node = tmp[i++];/*10*/
3308 fst->end_node = tmp[i++];
3309 fst->max_searchable_nodes = tmp[i++] * 0;
3310 fst->beg_silence_word = tmp[i++];
3311 fst->end_silence_word = tmp[i++];
3312 fst->hack_silence_word = tmp[i++]; /*15*/
3313 fst->hmm_ilabel_offset = tmp[i++]; /* 16 */
3314
3315 fst->addWordCaching_lastslot_name = 0;
3316 fst->addWordCaching_lastslot_num = MAXwordID;
3317 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
3318
3319 ASSERT(i == nfields);
3320 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3321 PLogMessage("G2G done RD hdrs %d", pftell(fp));
3322 #endif
3323
3324 ASSERT(fst->num_arcs >= fst->num_base_arcs); // was ==
3325 ASSERT(fst->num_nodes >= fst->num_base_nodes); // was ==
3326
3327 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
3328 if (!fst->FSMarc_list)
3329 {
3330 rc = FST_FAILED_ON_MEMORY;
3331 PLogError("CALLOC_CLR fst->FSMarc_list \n");
3332 goto CLEANUP;
3333 }
3334 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
3335 if (!fst->FSMnode_list)
3336 {
3337 rc = FST_FAILED_ON_MEMORY;
3338 PLogError("CALLOC_CLR fst->FSMnode_list failed\n");
3339 goto CLEANUP;
3340 }
3341 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
3342
3343 if (!fst->FSMnode_info_list)
3344 {
3345 rc = FST_FAILED_ON_MEMORY;
3346 PLogError("CALLOC_CLR fst->FSMnode_info_list failed\n");
3347 goto CLEANUP;
3348 }
3349
3350 /* setup the arc freelist */
3351 fst->FSMarc_freelist = 0;
3352 fst->num_arcs = 0;
3353 for (i = 0; i < (unsigned)fst->FSMarc_list_len - 1; i++)
3354 {
3355 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
3356 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3357 }
3358 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
3359 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3360
3361 /* initialize the nodes, 'cuz reading is random order */
3362 for (i = 0; i < max_num_FSMnodes; i++)
3363 {
3364 fr_node = &fst->FSMnode_list[i];
3365 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
3366 }
3367
3368 /* 1. first load up all the information */
3369 num_arcs = 0;
3370 for (i = 0; i < max_num_FSMarcs || !seenFinalNode; i++)
3371 {
3372 if (i > max_num_FSMarcs && !seenFinalNode)
3373 {
3374 PLogError("Final node never encountered");
3375 rc = ESR_INVALID_STATE;
3376 goto CLEANUP;
3377 }
3378 nfields = 5;
3379 if (pfread(tmp,sizeof(tmp[0]),nfields,fp) != nfields)
3380 {
3381 PLogError("reading arc");
3382 rc = ESR_INVALID_STATE;
3383 goto CLEANUP;
3384 }
3385
3386 if (tmp[1] == MAXnodeID)
3387 {
3388 seenFinalNode = ESR_TRUE;
3389 if(fst->end_node != tmp[0]) {
3390 PLogError("error with arc %d->%d ilabel %d olabel %d cost %d\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4]);
3391 ASSERT(fst->end_node == tmp[0]);
3392 }
3393 i--;
3394 }
3395 else
3396 {
3397 new_arc_id = fst_get_free_arc(fst);
3398 if (new_arc_id == MAXarcID)
3399 return FST_FAILED_ON_MEMORY;
3400 atok = ARC_ItoX(new_arc_id);
3401 atoken = ARC_XtoP(atok);
3402 num_arcs++;
3403
3404 ASSERT(tmp[0] < fst->num_nodes);
3405 ASSERT(tmp[1] < fst->num_nodes);
3406 fr_node = &fst->FSMnode_list[ tmp[0]];
3407 to_node = &fst->FSMnode_list[ tmp[1]];
3408 atoken->ilabel = tmp[2];
3409 atoken->olabel = tmp[3];
3410 atoken->cost = tmp[4];
3411 append_arc_leaving_node(fst, fr_node, atok);
3412 append_arc_arriving_node(fst, to_node, atok);
3413 atoken->fr_node = NODE_ItoX(tmp[0]);
3414 atoken->to_node = NODE_ItoX(tmp[1]);
3415 }
3416 }
3417 ASSERT(fst->num_arcs == num_arcs);
3418 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3419 PLogMessage("G2G done RD arcs %d", pftell(fp));
3420 #endif
3421
3422 /* setup the node freelist */
3423 if (fst->num_nodes < fst->FSMnode_list_len)
3424 {
3425 fst->FSMnode_freelist = fst->num_nodes;
3426 for (i = fst->num_nodes; i < (unsigned)fst->FSMnode_list_len - 1; i++)
3427 {
3428 fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
3429 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3430 }
3431 fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
3432 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3433 }
3434 else
3435 {
3436 fst->FSMnode_freelist = MAXnodeID;
3437 }
3438
3439 /* node info .. will now be calculated on line */
3440 /* wrapup_cost .. will now be calculated on line */
3441 fst->whether_prepared = 0;
3442 fst_fill_node_info(fst);
3443 for (i = 0; i < fst->num_nodes; i++)
3444 fst->FSMnode_info_list[i] = NODE_INFO_UNKNOWN;
3445
3446 /* exit points for slots */
3447 if (pfread(&fst->num_fsm_exit_points, 2, 1, fp) != 1)
3448 return ESR_READ_ERROR;
3449 p = fst->fsm_exit_points;
3450 q = p + MAX_NUM_SLOTS;
3451 while (p < q)
3452 {
3453 if (pfread(&p->from_node_index, 2, 1, fp) != 1)
3454 return ESR_WRITE_ERROR;
3455 if (pfread(&p->arc_index, 2, 1, fp) != 1)
3456 return ESR_WRITE_ERROR;
3457 if (pfread(&p->wbto_node_index, 2, 1, fp) != 1)
3458 return ESR_WRITE_ERROR;
3459 ++p;
3460 }
3461 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3462 PLogMessage("G2G done RD exits %d", pftell(fp));
3463 #endif
3464
3465 /* no entry points stuff */
3466
3467 /* ilabels were not saved, create them as empty */
3468 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
3469 fst->ilabels->num_words = fst->ilabels->max_words = 0;
3470 fst->ilabels->words = 0;
3471
3472 /* now save the olabels */
3473 esr_rc = deserializeWordMapV2(&fst->olabels, fp);
3474 if (esr_rc != ESR_SUCCESS)
3475 {
3476 PLogError("deserializeWordMapV2() failed\n");
3477 rc = FST_FAILED_INTERNAL;
3478 goto CLEANUP;
3479 }
3480 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3481 PLogMessage("G2G done RD wmap %d", pftell(fp));
3482 #endif
3483
3484 /* arctokeninfo */
3485 if (fst->grmtyp != GrammarTypeItemList)
3486 {
3487 if (deserializeArcTokenInfo(fst, fp) != ESR_SUCCESS)
3488 {
3489 PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3490 rc = FST_FAILED_INTERNAL;
3491 goto CLEANUP;
3492 }
3493 }
3494 else
3495 {
3496 /* here we just construct an isolated list, but we should scrap it later */
3497 wordID wdid;
3498 arc_token *atl, *final, *atli;
3499 fst->arc_token_list_len = fst->olabels->num_words + 2 - NUM_ITEMLIST_HDRWDS;
3500 atl = NEW_ARRAY(arc_token, fst->arc_token_list_len, L("srec.graph.wordgraph"));
3501 if(!atl) {
3502 PLogError("fst->arc_token_list_len failed\n");
3503 goto CLEANUP;
3504 }
3505 atl->first_next_arc = ARC_TOKEN_LNK(atl, 1);
3506 atl->next_token_index = ARC_TOKEN_NULL;
3507 final = atl + fst->arc_token_list_len - 1;
3508 for (i = 1, wdid = NUM_ITEMLIST_HDRWDS; wdid < fst->olabels->num_words; i++, wdid++)
3509 {
3510 atli = &atl[i];
3511 atli->ilabel = wdid;
3512 atli->olabel = wdid;/*not used*/
3513 atli->next_token_index = ARC_TOKEN_LNK(atl, (i + 1));
3514 atli->first_next_arc = ARC_TOKEN_LNK(atl, (fst->arc_token_list_len - 1)); /*final*/
3515 }
3516 ASSERT(atl + i == final);
3517 final->next_token_index = ARC_TOKEN_NULL;
3518 final->first_next_arc = ARC_TOKEN_NULL;
3519 final->ilabel = final->olabel = 0;
3520 fst->arc_token_list = atl;
3521 }
3522 return FST_SUCCESS;
3523 CLEANUP:
3524 return FST_FAILED_INTERNAL;
3525 }
3526
3527
3528 int FST_LoadContextFromImageV2(srec_context* context, PFile* fp);
3529
FST_LoadContextFromImage(srec_context ** pcontext,PFile * fp)3530 int FST_LoadContextFromImage(srec_context** pcontext, PFile* fp)
3531 {
3532 srec_context* context = NULL;
3533 /* used to force correct data alignment */
3534 asr_int32_t header[2];
3535 int rc = FST_SUCCESS;
3536
3537 /*printf("FST_LoadContexFromImage\n");*/
3538 if (!fp)
3539 {
3540 PLogError("FST_LoadContextImage() failed; bad file pointer\n");
3541 return FST_FAILED_ON_INVALID_ARGS;
3542 }
3543
3544 context = NEW(srec_context, L("srec.graph.binary"));
3545 if (context == NULL)
3546 {
3547 PLogError("FST_LoadContextFromImage: out of memory while allocating context.\n");
3548 return FST_FAILED_ON_MEMORY;
3549 }
3550 memset(context, 0, sizeof(srec_context));
3551
3552 /* header */
3553 if (pfread(header, 4, 2, fp) != 2)
3554 {
3555 PLogError("FST_LoadContextFromImage: failed reading header.\n");
3556 rc = FST_FAILED_ON_INVALID_ARGS;
3557 goto CLEANUP;
3558 }
3559
3560 if (header[1] == IMAGE_FORMAT_V2)
3561 {
3562 rc = FST_LoadContextFromImageV2(context, fp);
3563 if (rc != FST_SUCCESS)
3564 goto CLEANUP;
3565 }
3566 else
3567 {
3568 PLogError("FST_LoadContextFromImage() failed on image_format\n");
3569 goto CLEANUP;
3570 }
3571 *pcontext = context;
3572 return FST_SUCCESS;
3573 CLEANUP:
3574 if (context) FREE(context);
3575 *pcontext = 0;
3576 return rc;
3577 }
3578
3579
FST_DumpReverseWordGraph(srec_context * context,PFile * fp)3580 int FST_DumpReverseWordGraph(srec_context* context, PFile* fp)
3581 {
3582 /* not implemented, use FST_DumpSyntaxAsImage() for now */
3583 return 1;
3584 }
3585
3586 /* this belongs part of the srec_context */
3587 typedef struct
3588 {
3589 asr_int32_t image_format;
3590 asr_int32_t image_size;
3591 asr_int32_t sizes_signature;
3592 }
3593 context_image_header;
3594
3595
FST_LoadContextFromStreamImage(srec_context ** pcontext,char * buffer,asr_int32_t buffer_len)3596 int FST_LoadContextFromStreamImage(srec_context** pcontext, char* buffer, asr_int32_t buffer_len)
3597 {
3598 return FST_FAILED_INTERNAL;
3599 }
3600
3601 typedef struct nodeID_list_t
3602 {
3603 nodeID *nodes, num_nodes, num_alloc;
3604 }
3605 nodeID_list;
3606
fst_node_has_speech_to_come(srec_context * context,nodeID node_index)3607 int fst_node_has_speech_to_come(srec_context* context, nodeID node_index)
3608 {
3609 /* recursive func ... try to keep stack use low! */
3610 /* later we should cache this information:
3611 need 2bits: endn opte regl dunno */
3612 FSMarc* arc;
3613 arcID arc_index = context->FSMnode_list[ node_index].un_ptr.first_next_arc;
3614 for (; arc_index != MAXarcID; arc_index = arc->linkl_next_arc)
3615 {
3616 arc = &context->FSMarc_list[arc_index];
3617 if (arc->ilabel >= context->hmm_ilabel_offset + EPSILON_OFFSET + NUM_SILENCE_HMMS)
3618 {
3619 return 1;
3620 }
3621 else if (arc->ilabel < EPSILON_OFFSET)
3622 {
3623 if (fst_node_has_speech_to_come(context, arc->to_node))
3624 {
3625 return 1;
3626 }
3627 }
3628 else if (IS_SILENCE_ILABEL(arc->ilabel, context))
3629 {
3630 /* another silence! */
3631 if (fst_node_has_speech_to_come(context, arc->to_node))
3632 return 1;
3633 }
3634 }
3635 return 0;
3636 }
3637
fst_fill_node_info(srec_context * context)3638 int fst_fill_node_info(srec_context* context)
3639 {
3640 char* infos = context->FSMnode_info_list;
3641 nodeID_list optends_obj;
3642 nodeID_list *optends = &optends_obj;
3643
3644 FSMnode* node;
3645 FSMarc* arc;
3646 nodeID i, j, node_index;
3647 arcID arc_index;
3648 costdata wrapup_cost;
3649 int wrapup_cost_inconsistencies;
3650 /* int num_near_end_nodes; nodes within EPS or silence to the end, these
3651 are called NODE_INFO_ENDNODES, so that they can be excluded from the
3652 comparison with end_node score in the eos detection */
3653
3654 optends->num_nodes = 1;
3655 optends->num_alloc = 8192; /* scaled up from 512 to support dynamic grammar add word of 5000 */
3656 optends->nodes = (nodeID*)CALLOC(optends->num_alloc, sizeof(nodeID), "srec.tmp.optendnodes");
3657 optends->nodes[0] = context->end_node;
3658 /* num_near_end_nodes = 0; */
3659
3660 for (i = 0; i < optends->num_nodes; i++)
3661 {
3662 node_index = optends->nodes[i];
3663 node = &context->FSMnode_list[ node_index];
3664 for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3665 arc_index = arc->linkl_prev_arc)
3666 {
3667 arc = &context->FSMarc_list[arc_index];
3668 if (arc->fr_node != node_index)
3669 {
3670
3671 if (IS_SILENCE_ILABEL(arc->ilabel, context) || arc->ilabel < EPSILON_OFFSET)
3672 {
3673 /* ok, fr_node goes to the end via silence, check for dups */
3674 for (j = 0; j < optends->num_nodes; j++)
3675 if (optends->nodes[j] == arc->fr_node) break;
3676 /* append it to the list */
3677 if (j == optends->num_nodes)
3678 {
3679 if (optends->num_nodes < optends->num_alloc)
3680 optends->nodes[ optends->num_nodes++] = arc->fr_node;
3681 else
3682 {
3683 ASSERT(0 && "allocated too few optend nodes");
3684 return 0;
3685 }
3686 }
3687 }
3688 }
3689 }
3690 }
3691
3692 /* now set the info array, default is regular */
3693 for (i = 0; i < context->num_nodes; i++)
3694 infos[i] = NODE_INFO_REGULAR;
3695 /* also set the other nodes, we'll check whether nodes were added
3696 via these flags */
3697 for (; i < context->FSMnode_list_len; i++)
3698 infos[i] = NODE_INFO_UNKNOWN;
3699 infos[ context->end_node] = NODE_INFO_ENDNODE;
3700
3701 /* get rid of direct to end nodes, etc */
3702 for (i = 0, j = 0; i < optends->num_nodes; i++)
3703 {
3704 optends->nodes[j] = optends->nodes[i];
3705 if (fst_node_has_speech_to_come(context, optends->nodes[i]))
3706 {
3707 j++;
3708 infos[ optends->nodes[i]] = NODE_INFO_OPTENDN;
3709 }
3710 else
3711 {
3712 infos[ optends->nodes[i]] = NODE_INFO_ENDNODE;
3713 /* num_near_end_nodes++; */
3714 }
3715 }
3716 optends->num_nodes = j;
3717
3718 /* printf("get_oppend_nodes (%d)", optends->num_nodes);
3719 for(i=0; i<optends->num_nodes; i++) printf(" %d", optends->nodes[i]);
3720 printf("\n");
3721 */
3722
3723 FREE(optends->nodes);
3724
3725 /* find the wrapup cost */
3726 node = &context->FSMnode_list[ context->end_node];
3727 wrapup_cost = MAXcostdata;
3728 wrapup_cost_inconsistencies = 0;
3729 for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3730 arc_index = arc->linkl_prev_arc)
3731 {
3732 arc = &context->FSMarc_list[ arc_index];
3733 if (IS_SILENCE_ILABEL(arc->ilabel, context) &&
3734 arc->olabel == context->end_silence_word)
3735 {
3736 if (wrapup_cost == MAXcostdata)
3737 wrapup_cost = arc->cost;
3738 else if (context->wrapup_cost != arc->cost)
3739 {
3740 wrapup_cost = arc->cost;
3741 wrapup_cost_inconsistencies++;
3742 }
3743 }
3744 }
3745 #define MIN_WRAPUP_COST 100
3746 context->wtw_average = wrapup_cost;
3747 if (context->wtw_average > 200)
3748 context->wtw_average = 200;
3749 if (context->wrapup_cost < MIN_WRAPUP_COST)
3750 context->wrapup_cost = MIN_WRAPUP_COST;
3751 /*log_report("context->wrapup_cost %d (%d)\n", context->wrapup_cost,
3752 wrapup_cost_inconsistencies);*/
3753 return 0;
3754 }
3755
fst_alloc_transit_points(srec_context * context)3756 int fst_alloc_transit_points(srec_context* context)
3757 {
3758 arcID i;
3759 wordID num_slots = context->olabels->num_slots;
3760 wordID olabel;
3761 asr_int16_t nfxps = 0;
3762 FSMarc* arc;
3763 FSMnode* node;
3764
3765 context->num_fsm_exit_points = 0;
3766 /* slot 0 is invalid, it is the "eps", so num_slots==1 means no slots! */
3767 if (num_slots == 1)
3768 return 0;
3769
3770 /* scan through to finds arc that carry rule references */
3771 for (i = 0; i < context->num_arcs; i++)
3772 {
3773 olabel = context->FSMarc_list[i].olabel;
3774 /* (wordID)(olabel-1) < (num_slots-1) ?? might work */
3775 if (olabel > 0 && olabel < num_slots)
3776 {
3777 context->FSMarc_list[i].cost = DISABLE_ARC_COST;
3778 if (nfxps >= MAX_NUM_SLOTS)
3779 {
3780 PLogError("error: too many fsm exit points in fsm, too many public rules referenced from here\n");
3781 return 0;
3782 }
3783 context->fsm_exit_points[nfxps].arc_index = i;
3784 context->fsm_exit_points[nfxps].from_node_index = context->FSMarc_list[i].fr_node;
3785 /* skip over the trailing silence and .wb labels */
3786 for (arc = &context->FSMarc_list[i]; arc->ilabel != WORD_BOUNDARY;)
3787 {
3788 node = &context->FSMnode_list[arc->to_node];
3789 arc = &context->FSMarc_list[node->un_ptr.first_next_arc];
3790 }
3791 context->fsm_exit_points[nfxps].wbto_node_index = arc->to_node;
3792 nfxps++;
3793
3794 } /* olabel<num_slots */
3795 } /* i<num_arcs */
3796 context->num_fsm_exit_points = nfxps;
3797 return 0;
3798 }
3799
3800
3801 /*
3802 cross-word modeling:
3803 for now we'll assume silence context on either side, later we'll create
3804 a graph with the right fan out on the edges of each slot. At that point we
3805 may have routines like ... if(is_dead_end(arc)) continue;
3806
3807 word removal:
3808 we'll use refcounters on arcs, we can search the whole graph for the
3809 olabel, then remove forward and remove backwards
3810
3811 word addition:
3812 glenville ^ hmm183 hmm222 hmm162 hmm246 hmm346 hmm191 hmm219
3813
3814 set imap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/model.map
3815 set omap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/namesnnumsSC.map
3816 wintel/debug/addword.exe -min
3817 $SWISDK/bin/fsmcompileD -t -i $imap -o $omap -F out.PCLG out.PCLG.txt
3818 $SWISDK/bin/fsmdrawD -i $imap -o $omap out.PCLG > out.PCLG.dot
3819 dotty out.PCLG.dot
3820
3821 need
3822 - a sanity check
3823 - remove word
3824 - multiple pronunciations handling
3825 - homonyms handling, other boundary conditions checking
3826 - measure the speed
3827
3828 alan_adams al~#ad}z
3829 alan_adams al~ad}z
3830 paul_adams p{l#ad}z
3831
3832
3833
3834 */
3835