1 /*---------------------------------------------------------------------------*
2 * srec_context.c *
3 * *
4 * Copyright 2007, 2008 Nuance Communciations, Inc. *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20
21 #include "stdlib.h"
22 #include "string.h"
23 #include "buildopt.h"
24 #include "setting.h"
25 #include "passert.h"
26 #include "pendian.h"
27 #include "portable.h"
28 #include "pstdio.h"
29 #include "ptypes.h"
30 #include "srec_context.h"
31 #include "srec_sizes.h"
32 #include "search_network.h"
33 #include "srec_arb.h"
34 #include "hmm_desc.h"
35 #if USE_COMP_STATS
36 #include "comp_stats.h"
37 #endif
38
39 #ifdef SET_RCSID
40 static const char *rcsid = 0 ? (const char *) &rcsid :
41 "$Id: srec_context.c,v 1.84.4.54 2008/05/15 20:06:39 dahan Exp $";
42 #endif
43
44 #include <stdint.h>
45
46 /* these are constants for now, we need to figure out how to make
47 this more flexible, possible ideas are:
48 (1) to specify these via API functions such as CR_LoadSyntax(),
49 (2) use defaults that would have been set the main parfile,
50 (3) put values for these fields in an actual grammar
51 */
52
53 #define AVG_ARCS_PER_WORD 10
54 #define AVG_NODES_PER_WORD 7
55 #define AVG_CHARS_PER_WORD 18
56 #define MAX_PRON_LEN 1024
57 #define MAX_WORD_LEN 128
58 #define DO_WEIGHTED_ADDWORD 1
59 #define SLOTLOOP_OFFSET 64
60 #define SLOTNAME_INDICATOR "__"
61 #define RULE_INDICATOR '@'
62 #define MAX_LINE_LENGTH 512
63 #define DEFAULT_WB_COST 40
64 #define DISABLE_ARC_COST 999
65 #define DO_ARCS_IO_IN_ARC_ORDER 1
66
67 #define IS_SILENCE_ILABEL(ilabel,context) (ilabel >= context->hmm_ilabel_offset+EPSILON_OFFSET && ilabel<context->hmm_ilabel_offset+EPSILON_OFFSET+NUM_SILENCE_HMMS)
68 // looking to match filename.grxml.Names@__Names__
69 // see also grxmlcompile.cpp where these names are constructed
70 #define IS_SLOT_OLABEL(wd) (strchr(wd,IMPORTED_RULES_DELIM)<strstr(wd,SLOTNAME_INDICATOR) && strlen(wd)>4 && !strcmp(wd+strlen(wd)-2,SLOTNAME_INDICATOR) )
71
72 /*
73 SYNTAX : DATA_ALIGN(pointer, data type, bytes_filled)
74 EXAMPLE: We need to cast a memory address to a (wordmap*)
75 so we call DATA_ALIGN(memptr, wordmap, filled),
76 where FILLED contains the number of bytes that were used to align the pointer
77 */
78 #if (CPU & CPU_ARM)||(CPU & CPU_STRONGARM)
79 /* under IPAQ it appears we need to align the structures
80 to certain boundaries. More attention needed here for other
81 platforms ! */
82 #define DATA_ALIGN(x, y, z) if ((int) (x) % sizeof(y)) { \
83 (z)=sizeof(y) - ((int) (x) % sizeof(y)); \
84 (x) += (z); /* force correct data alignment */ \
85 } else z=0;
86 #else
87 #define DATA_ALIGN(x, y, z)
88 #endif
89
90 static const int do_minimize = 1;
91 #define PTR_TO_IDX(ptr, base) ((asr_uint32_t) (ptr == NULL ? 0xFFFFFFFFu : (asr_uint32_t)(ptr - base)))
92 #define IDX_TO_PTR(idx, base) (idx == 0xFFFFFFFFu ? NULL : base + idx)
93
94 /* prototypes */
95
96 /*----------------------------------------------------------------------*
97 * *
98 * internal prototypes *
99 * *
100 *----------------------------------------------------------------------*/
101
102 /* general use functions */
103 char split_line(char* line, char** av);
104 asr_int32_t atoi_with_check(const char* buf, asr_int32_t max);
105 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc);
106 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc);
107 int printf_node1(srec_context* fst, FSMnode* node);
108
109 /* fst_* functions are internal fst functions */
110 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
111 wordID olabel, costdata cost,
112 modelID* model_sequence, int num_models);
113 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc);
114 int fst_push_arc_cost(srec_context* fst, FSMarc* arc);
115 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc);
116 int fst_free_arc(srec_context* fst, FSMarc* arc);
117 int fst_free_node(srec_context* fst, FSMnode* node);
118 int fst_free_arc_net(srec_context* fst, FSMarc* arc);
119 arcID fst_get_free_arc(srec_context* fst);
120 nodeID fst_get_free_node(srec_context* fst);
121 int fst_pack_arc_usage(srec_context* fst);
122
123 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr atok);
124 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr atok);
125 int num_arcs_leaving(srec_context* fst, FSMnode* node);
126
127 int num_arcs_arriving(srec_context* fst, FSMnode* node);
128 int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node);
129 int fst_fill_node_info(srec_context* context);
130 int fst_alloc_transit_points(srec_context* context);
131
132 /* higher-level functions */
133 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add,
134 PFile* fp);
135 int FST_LoadParams(srec_context* context, PFile* fp);
136 int FST_AssumeParams(srec_context* context);
137 int FST_UnloadReverseWordGraph(srec_context* context);
138 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree);
139
140 static ESR_ReturnCode wordmap_clean ( wordmap *word_map );
141 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words );
142
143 /*--------------------------------------------------------------------------*
144 * *
145 * *
146 * *
147 *--------------------------------------------------------------------------*/
148
FST_IsVoiceEnrollment(srec_context * context)149 int FST_IsVoiceEnrollment(srec_context* context)
150 {
151 if (context->olabels == NULL) return 0;
152 if (context->olabels->num_words < 2) return 0;
153 if (strstr(context->olabels->words[1], "enroll")) return 1;
154 return 0;
155 }
156
FST_LoadContext(const char * synbase,srec_context ** pcontext,int num_words_to_add)157 int FST_LoadContext(const char* synbase, srec_context** pcontext,
158 int num_words_to_add)
159 {
160 int rc;
161 PFile* fp;
162 char buffer[MAX_LINE_LENGTH];
163 srec_context* context;
164
165 context = (srec_context*)CALLOC_CLR(1, sizeof(srec_context), "srec.graph.base");
166 if(!context)
167 return FST_FAILED_ON_MEMORY;
168 memset(context, 0, sizeof(srec_context));
169
170 context->addWordCaching_lastslot_name = 0;
171 context->addWordCaching_lastslot_num = MAXwordID;
172 context->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
173
174 sprintf(buffer, "%s.map", synbase);
175 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
176 if (!fp) return FST_FAILED_ON_INVALID_ARGS;
177 rc = FST_LoadWordMap(&context->olabels, num_words_to_add, fp);
178 pfclose(fp);
179 if (rc) return rc;
180
181 sprintf(buffer, "%s.PCLG.txt", synbase);
182 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
183 if (!fp) return FST_FAILED_ON_INVALID_ARGS;
184 rc = FST_LoadGraph(context, context->ilabels, context->olabels, num_words_to_add, fp);
185 pfclose(fp);
186 if (rc != FST_SUCCESS) return rc;
187
188 sprintf(buffer, "%s.Grev2.det.txt", synbase);
189 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
190 if (!fp) return FST_FAILED_ON_INVALID_ARGS;
191 rc = FST_LoadReverseWordGraph(context, num_words_to_add, fp);
192 pfclose(fp);
193 if (rc != FST_SUCCESS) return rc;
194
195 sprintf(buffer, "%s.params", synbase);
196 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
197 if (fp) {
198 rc = FST_LoadParams(context, fp);
199 pfclose(fp);
200 if (rc != FST_SUCCESS) return rc;
201 } else {
202 rc = FST_AssumeParams(context);
203 }
204
205 context->grmtyp = (arcID)FST_GetGrammarType(context);
206
207 *pcontext = context;
208 rc = fst_fill_node_info(context);
209 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
210 }
211
FST_UnloadContext(srec_context * context)212 void FST_UnloadContext(srec_context* context)
213 {
214 if (!context) return;
215
216 FST_UnloadWordMap(&context->ilabels);
217 FST_UnloadWordMap(&context->olabels);
218 FST_UnloadGraph(context);
219 FST_UnloadReverseWordGraph(context);
220 FREE(context);
221 }
222
FST_PrepareContext(srec_context * context)223 int FST_PrepareContext(srec_context* context)
224 {
225 nodeID i;
226 int rc = FST_SUCCESS;
227 /* after all word additions, we need to "prepare" the context */
228
229 /* first check for any changes to optional endnodes etc .. */
230 for (i = 0; i < context->num_nodes; i++)
231 if (context->FSMnode_info_list[i] == NODE_INFO_UNKNOWN)
232 break;
233 if (i != context->num_nodes)
234 rc = fst_fill_node_info(context);
235
236 context->whether_prepared = 1;
237 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS;
238 }
239
fst_set_wb_costs(srec_context * context,costdata wbcost)240 void fst_set_wb_costs( srec_context* context, costdata wbcost)
241 {
242 unsigned int i;
243 for(i=0; i<(unsigned int)context->FSMarc_list_len; i++) {
244 if(context->FSMarc_list[i].ilabel == WORD_BOUNDARY)
245 context->FSMarc_list[i].cost = wbcost;
246 }
247 }
248
FST_LoadParams(srec_context * context,PFile * fp)249 int FST_LoadParams(srec_context* context, PFile* fp)
250 {
251 char line[MAX_LINE_LENGTH];
252 costdata wbcost = MAXcostdata;
253 while (pfgets(line, MAX_LINE_LENGTH, fp))
254 {
255 char* key = strtok(line," \n\r\t");
256 char* val = key ? strtok(NULL," \n\r\t") : NULL;
257 if(val && !strcmp(val,"="))
258 val = key ? strtok(NULL," \n\r\t") : NULL;
259 if(!key || !key[0])
260 continue;
261 else if(val && val[0]) {
262 if(!strcmp(key,"word_penalty")) {
263 wbcost = (costdata)atoi_with_check(val, MAXcostdata);
264 if(wbcost == MAXcostdata) {
265 return FST_FAILED_ON_INVALID_ARGS;
266 }
267 } else {
268 PLogError(L("error: unknown parameter %s in .params file"), key);
269 return FST_FAILED_ON_INVALID_ARGS;
270 }
271 }
272 }
273 if(wbcost != MAXcostdata)
274 fst_set_wb_costs( context, wbcost);
275 return FST_SUCCESS;
276 }
277
FST_AssumeParams(srec_context * context)278 int FST_AssumeParams( srec_context* context)
279 {
280 fst_set_wb_costs( context, (costdata)DEFAULT_WB_COST);
281 return FST_SUCCESS;
282 }
283
284
285 /*--------------------------------------------------------------------------*
286 * *
287 * *
288 * *
289 *--------------------------------------------------------------------------*/
290
hmm_number(const char * hmm_Name,modelID hmm_ilabel_offset)291 modelID hmm_number(const char* hmm_Name, modelID hmm_ilabel_offset)
292 {
293 if (!strcmp(hmm_Name, "eps")) return EPSILON_LABEL;
294 if (!strcmp(hmm_Name, ".wb")) return WORD_BOUNDARY;
295 if (!strcmp(hmm_Name, ".ph")) return PHONE_BOUNDARY;
296 ASSERT(hmm_Name[0] == 'h' && hmm_Name[1] == 'm' && hmm_Name[2] == 'm' && hmm_Name[3]);
297 return (modelID)(hmm_ilabel_offset + (modelID)atoi_with_check(hmm_Name + 3, MAXmodelID));
298 }
299
hmm_name(modelID ilabel,modelID hmm_ilabel_offset,char * buf)300 char* hmm_name(modelID ilabel, modelID hmm_ilabel_offset, char* buf)
301 {
302 if (ilabel == EPSILON_LABEL)
303 sprintf(buf, "eps");
304 else if (ilabel == WORD_BOUNDARY)
305 sprintf(buf, ".wb");
306 else if (ilabel == PHONE_BOUNDARY)
307 sprintf(buf, ".ph");
308 else
309 sprintf(buf, "hmm%03d", ilabel - hmm_ilabel_offset);
310 return buf;
311 }
312
313 /*------------------------------------------------------------------*
314 * *
315 * words *
316 * *
317 *------------------------------------------------------------------*/
HashCmpWord(const LCHAR * key1,const LCHAR * key2)318 int HashCmpWord(const LCHAR *key1, const LCHAR *key2)
319 {
320 return LSTRCMP(key1,key2);
321 }
HashGetCode(const void * key)322 unsigned int HashGetCode(const void *key)
323 {
324 const LCHAR* k = (const LCHAR*)key;
325 unsigned int i, len, h = 0;
326 len = LSTRLEN( k);
327 for ( i = 0; i < len; i++)
328 h = 31*h + (unsigned int)k[i];
329 return h;
330 }
331
wordmap_create(wordmap ** pwmap,int num_chars,int num_words,int num_words_to_add)332 int wordmap_create(wordmap** pwmap, int num_chars, int num_words, int num_words_to_add)
333 {
334 wordmap* Interface;
335 PHashTableArgs hashArgs;
336 ESR_ReturnCode rc;
337
338 Interface = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.wordmap.base");
339 Interface->max_words = (wordID)(num_words + num_words_to_add);
340 Interface->num_words = (wordID)0;
341 /* *pwmap->words = (ptr32*) CALLOC_CLR(wmap->max_words, sizeof(ptr32), "graph.wordmap.words"); */
342 Interface->words = (char**) CALLOC_CLR(Interface->max_words, sizeof(char*), "srec.graph.wordmap.words");
343 Interface->max_chars = num_chars + num_words_to_add * AVG_CHARS_PER_WORD;
344 Interface->chars = (char*) CALLOC_CLR(Interface->max_chars, sizeof(char), "srec.graph.wordmap.chars");
345 Interface->next_chars = Interface->chars;
346 Interface->wordIDForWord = NULL;
347 *pwmap = Interface;
348
349 /* use a hashtable to save mapping between wdID and array index */
350 if (num_words_to_add >= 0)
351 {
352 hashArgs.capacity = num_words + num_words_to_add + 10;
353 if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
354 hashArgs.compFunction = HashCmpWord; // PHASH_TABLE_DEFAULT_COMP_FUNCTION;
355 hashArgs.hashFunction = HashGetCode; // PHASH_TABLE_DEFAULT_HASH_FUNCTION;
356 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
357 CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.wordmap_create()"), &Interface->wordIDForWord));
358 }
359 else
360 {
361 Interface->wordIDForWord = NULL;
362 }
363
364 return FST_SUCCESS;
365
366 CLEANUP:
367 wordmap_destroy(pwmap);
368 return rc;
369 }
370 wordID wordmap_add_word(wordmap* wmap, const char* word);
371
FST_LoadWordMap(wordmap ** pwmap,int num_words_to_add,PFile * fp)372 int FST_LoadWordMap(wordmap** pwmap, int num_words_to_add, PFile* fp)
373 {
374 wordID my_wID, wm_wID;
375 char *word, *wID_string;
376 int num_words;
377 asr_int32_t num_chars;
378 char buf[MAX_LINE_LENGTH];
379 long fpos;
380 wordmap* wmap;
381 /* figure out the number of words */
382 fpos = pftell(fp);
383 for (num_words = 0, num_chars = 0; pfgets(buf, MAX_LINE_LENGTH, fp); num_words++)
384 {
385 word = strtok(buf, " \n\r\t");
386 num_chars += strlen(word);
387 }
388 num_chars += num_words * 2; /* for null terminators */
389 pfseek(fp, fpos, SEEK_SET);
390
391 /* Alex added this to reuse this functionality elsewhere
392 destroy function for this create function is actually taken care of by the
393 FST_UnloadWordMap function
394 */
395 wordmap_create(&wmap, num_chars, num_words, num_words_to_add);
396
397 /* later replace this with wordblocks, linked list of 8 character blocks! */
398 while (pfgets(buf, MAX_LINE_LENGTH, fp))
399 {
400 word = strtok(buf, " \n\r\t");
401 wID_string = strtok(NULL, " \n\r\t");
402 my_wID = (wordID)atoi_with_check(wID_string, MAXwordID);
403
404 // if(strstr(word,SLOTNAME_INDICATOR))
405 // word = strstr(word,SLOTNAME_INDICATOR);
406 wm_wID = wordmap_add_word(wmap, word);
407 ASSERT(my_wID == wm_wID);
408 }
409 ASSERT(wmap->num_words == num_words);
410
411 for (my_wID = 1; my_wID < num_words; my_wID++)
412 {
413 if (!IS_SLOT_OLABEL(wmap->words[my_wID]))
414 break;
415 }
416 wmap->num_slots = my_wID;
417 wordmap_setbase(wmap);
418 *pwmap = wmap;
419 if(wmap->num_slots > MAX_NUM_SLOTS)
420 return FST_FAILED_ON_INVALID_ARGS;
421 else
422 return FST_SUCCESS;
423 }
424
wordmap_destroy(wordmap ** wmap)425 int wordmap_destroy(wordmap** wmap)
426 {
427 if (wmap && *wmap)
428 {
429 wordmap_clean ( *wmap );
430 if (((*wmap)->wordIDForWord)) PHashTableDestroy((*wmap)->wordIDForWord);
431 if (((*wmap)->chars)) FREE((*wmap)->chars);
432 if (((*wmap)->words)) FREE((*wmap)->words);
433 if ((*wmap)) FREE((*wmap));
434 *wmap = 0;
435 }
436 return FST_SUCCESS;
437 }
438
FST_UnloadWordMap(wordmap ** wmap)439 int FST_UnloadWordMap(wordmap** wmap)
440 {
441 return wordmap_destroy(wmap);
442 }
443
444
FST_DumpWordMap(PFile * fp,wordmap * wmap)445 int FST_DumpWordMap(PFile* fp, wordmap* wmap)
446 {
447 wordID my_wID;
448 for (my_wID = 0; my_wID < wmap->num_words; my_wID++)
449 {
450 pfprintf(fp, "%s %hu\n", wmap->words[my_wID], my_wID);
451 }
452 return FST_SUCCESS;
453 }
454
wordmap_find_index(wordmap * wmap,const char * word)455 wordID wordmap_find_index(wordmap* wmap, const char* word)
456 {
457 void *wdID_void;
458 ESR_ReturnCode rc;
459 size_t i;
460
461 if (!word)
462 return MAXwordID;
463
464 if (wmap->num_words == 0)
465 return MAXwordID;
466
467 if (wmap->wordIDForWord)
468 {
469 rc = PHashTableGetValue(wmap->wordIDForWord, word, (void**)&wdID_void);
470
471 if (rc == ESR_SUCCESS)
472 return (wordID)(uintptr_t)wdID_void;
473 }
474 else
475 {
476 for (i = 0; i < wmap->num_words; i++)
477 {
478 if (!strcmp(wmap->words[i], word))
479 {
480 return (wordID)i;
481 }
482 }
483 }
484 return MAXwordID;
485 }
486
wordmap_find_rule_index(wordmap * wmap,const char * rule)487 wordID wordmap_find_rule_index(wordmap* wmap, const char* rule)
488 {
489 int i;
490 int strlen_rule = strlen(rule);
491 for (i = wmap->num_slots; --i > 0;)
492 { /* > (not >=) because eps is always 0 */
493 char* p = strstr(wmap->words[i], SLOTNAME_INDICATOR);
494 if(p != NULL){
495 if (!strcmp(p, rule)) break;
496 else if(!strncmp(p,rule,strlen_rule) && !strcmp(p+strlen_rule,SLOTNAME_INDICATOR)) break;
497 }
498 }
499 if (i == 0) return MAXwordID;
500 else return(wordID)i;
501 }
502
wordmap_find_index_in_rule(wordmap * wmap,const char * word,wordID rule)503 wordID wordmap_find_index_in_rule(wordmap* wmap, const char* word, wordID rule)
504 {
505 int len = strlen(word);
506 int rule0 = rule + '0';
507 void *wdID_void;
508 LCHAR word_dot_rule[256];
509 ESR_ReturnCode rc;
510
511 if (!word)
512 return MAXwordID;
513 LSTRCPY(word_dot_rule, word);
514 word_dot_rule[len++] = IMPORTED_RULES_DELIM;
515 word_dot_rule[len++] = (char)rule0;
516 word_dot_rule[len++] = 0;
517 rc = PHashTableGetValue(wmap->wordIDForWord, word_dot_rule, (void**)&wdID_void);
518 if (rc == ESR_SUCCESS)
519 return (wordID)(uintptr_t)(wdID_void);
520 return MAXwordID;
521 }
522
strlen_with_null(const char * word)523 int strlen_with_null(const char* word)
524 {
525 #if 1 /* NT */
526 int len = strlen(word) + 1;
527 if (len % 2 == 1) len++;
528 return len;
529 #else /* C54 */
530 #endif
531 }
532
wordmap_whether_in_rule(wordmap * wmap,wordID word,wordID rule)533 int wordmap_whether_in_rule(wordmap* wmap, wordID word, wordID rule)
534 {
535 char* word_chars;
536 int len;
537 if (word > wmap->num_words) return 0;
538 word_chars = wmap->words[word];
539 len = strlen(word_chars);
540 if (word_chars[len-1] == rule + '0' && word_chars[len-2] == IMPORTED_RULES_DELIM)
541 return 1;
542 else
543 return 0;
544 }
545
wordmap_setbase(wordmap * wmap)546 void wordmap_setbase(wordmap* wmap)
547 {
548 wmap->num_base_words = wmap->num_words;
549 wmap->next_base_chars = wmap->next_chars;
550 }
551
wordmap_ceiling(wordmap * wmap)552 void wordmap_ceiling(wordmap* wmap)
553 {
554 /* this is irreversible */
555 wmap->max_words = wmap->num_words;
556 wmap->max_chars = (wmap->next_chars - wmap->chars);
557 }
558
559
wordmap_clean(wordmap * word_map)560 static ESR_ReturnCode wordmap_clean ( wordmap *word_map )
561 {
562 ESR_ReturnCode clean_status;
563 PHashTableEntry *entry;
564 PHashTableEntry *oldEntry;
565 wordID *value;
566
567 clean_status = ESR_SUCCESS;
568
569 if ( word_map->wordIDForWord == NULL )
570 return clean_status;
571
572 clean_status = PHashTableEntryGetFirst ( word_map->wordIDForWord, &entry);
573
574 while ( ( entry != NULL ) && ( clean_status == ESR_SUCCESS ) )
575 {
576 clean_status = PHashTableEntryGetKeyValue ( entry, NULL, (void **)&value );
577
578 if ( clean_status == ESR_SUCCESS )
579 {
580 oldEntry = entry;
581 clean_status = PHashTableEntryAdvance ( &entry );
582
583 if ( clean_status == ESR_SUCCESS )
584 clean_status = PHashTableEntryRemove ( oldEntry );
585 }
586 }
587 return ( clean_status );
588 }
589
590
wordmap_populate(wordmap * word_map,wordID num_words)591 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words )
592 {
593 ESR_ReturnCode populate_status;
594 wordID wdID;
595
596 populate_status = ESR_SUCCESS;
597 wdID = 0;
598
599 if ( word_map->wordIDForWord != NULL )
600 {
601 while ( ( wdID < num_words ) && ( populate_status == ESR_SUCCESS ) )
602 {
603 populate_status = PHashTablePutValue ( word_map->wordIDForWord, word_map->words[wdID],
604 (const void *)(uintptr_t)wdID, NULL );
605 if ( populate_status == ESR_SUCCESS )
606 wdID++;
607 else {
608 return (populate_status);
609 }
610 }
611 }
612 return ( populate_status );
613 }
614
wordmap_reset(wordmap * wmap)615 void wordmap_reset(wordmap* wmap)
616 {
617 char** tmp_words;
618 int i;
619 ESR_ReturnCode reset_status;
620
621 if (wmap->num_base_words < wmap->num_words)
622 {
623 /*wordID i = (wordID)(wmap->num_base_words);*/
624 char* old_wmap_chars = wmap->chars;
625 int offset = wmap->next_base_chars - wmap->chars;
626 char* tmp_chars = NEW_ARRAY(char, offset, L("srec.g2g.graph.wordmap.chars"));
627 if(!tmp_chars) {
628 passert ( 0 && L("failed to reset the memory for wordmap.chars ") );
629 }
630 memcpy(tmp_chars,wmap->chars, offset * sizeof(char));
631 FREE(wmap->chars);
632
633 wmap->chars = tmp_chars;
634 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap_chars);
635 wmap->max_chars = (wordID) offset;
636 wmap->next_chars = wmap->next_base_chars;
637
638 tmp_words = (char**) CALLOC_CLR(wmap->num_base_words, sizeof(char*), "srec.graph.wordmap.words");
639 if(!tmp_words) {
640 passert ( 0 && L("failed to reset the memory for wordmap.words ") );
641 }
642 memcpy( tmp_words, wmap->words, wmap->num_base_words * sizeof(char*));
643 FREE( wmap->words);
644
645 wmap->words = tmp_words;
646 wmap->max_words = wmap->num_base_words;
647 wmap->num_words = wmap->num_base_words;
648
649 for(i=0; i<wmap->num_words; i++)
650 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap_chars);
651 }
652
653 reset_status = wordmap_clean ( wmap );
654
655 if ( reset_status == ESR_SUCCESS )
656 {
657 reset_status = wordmap_populate ( wmap, wmap->num_base_words );
658
659 if ( reset_status != ESR_SUCCESS )
660 {
661 wordmap_clean ( wmap );
662 passert ( 0 && L("wordmap_reset failed") );
663 }
664 }
665 else
666 {
667 passert ( 0 && L("wordmap_reset failed") );
668 }
669 }
670
671 #define FST_GROW_FACTOR 12/10
672 #define FST_GROW_MINCHARS 256
673 #define FST_GROW_MINWORDS 32
wordmap_add_word(wordmap * wmap,const char * word)674 wordID wordmap_add_word(wordmap* wmap, const char* word)
675 {
676 int len = strlen_with_null(word);
677 wordID wdID =0;
678
679 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
680 {
681 #if defined(FST_GROW_FACTOR)
682 uintptr_t i,tmp_max_chars= wmap->max_chars * FST_GROW_FACTOR;
683 char* old_wmap__chars = wmap->chars;
684 if(tmp_max_chars - wmap->max_chars < FST_GROW_MINCHARS)
685 tmp_max_chars +=FST_GROW_MINCHARS;
686
687 char* tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
688 if(!tmp_chars) {
689 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.chars"));
690 return MAXwordID;
691 }
692 memcpy(tmp_chars,wmap->chars,wmap->max_chars * sizeof(char));
693 FREE(wmap->chars);
694
695 wmap->chars = tmp_chars;
696 wmap->next_chars = wmap->chars + (wmap->next_chars - old_wmap__chars);
697 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap__chars);
698 wmap->max_chars = (wordID)tmp_max_chars;
699 //Remove old keys --- Add WORD
700 wordmap_clean (wmap );
701
702 // adjust word pointers
703 for(i=0; i<wmap->num_words; i++)
704 {
705 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap__chars);
706 // adjust hashtable ---
707 if (wmap->wordIDForWord)
708 {
709 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
710 (const void *)i, NULL );
711 if ( rc != ESR_SUCCESS ) {
712 goto CLEANUP;
713 }
714 }
715 }
716 #else // so not defined(FST_GROW_FACTOR)
717 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
718 return MAXwordID;
719 #endif // defined(FST_GROW_FACTOR)
720 }
721 //Add word
722 if (wmap->num_words == wmap->max_words)
723 {
724 #if defined(FST_GROW_FACTOR)
725 char** tmp_words;
726 wordID tmp_max_words;
727 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR;
728
729 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
730 itmp_max_words += FST_GROW_MINWORDS;
731
732 if( itmp_max_words >= MAXwordID) {
733 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
734 return MAXwordID;
735 }
736 tmp_max_words = (wordID)itmp_max_words;
737 tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
738 if(!tmp_words) {
739 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.words"));
740 return MAXwordID;
741 }
742 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
743 FREE( wmap->words);
744 wmap->words = tmp_words;
745 wmap->max_words = tmp_max_words;
746 #else // so not defined(FST_GROW_FACTOR)
747 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
748 return MAXwordID;
749 #endif // defined(FST_GROW_FACTOR)
750 }
751 if(1)
752 {
753 strcpy(wmap->next_chars, word);
754 wmap->words[ wmap->num_words++] = wmap->next_chars;
755 wmap->next_chars += len;
756 wdID = (wordID)(wmap->num_words - (wordID)1);
757 if (wmap->wordIDForWord)
758 {
759 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
760 (const void *)(uintptr_t)wdID, NULL );
761 if ( rc != ESR_SUCCESS )
762 goto CLEANUP;
763 }
764 return wdID;
765 }
766 CLEANUP:
767 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
768 return MAXwordID;
769 }
770
wordmap_add_word_in_rule(wordmap * wmap,const char * word,wordID rule)771 wordID wordmap_add_word_in_rule(wordmap* wmap, const char* word, wordID rule)
772 {
773 int len = strlen_with_null(word) + 2;
774 wordID wdID = 0;
775 wordID i;
776 //wordmap_add_word_in_rule
777 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars)
778 {
779 #if defined(FST_GROW_FACTOR)
780 int tmp_max_chars = wmap->max_chars * FST_GROW_FACTOR;
781 char* tmp_chars;
782 char* old_next_chars = wmap->next_chars;
783 char* old_chars = wmap->chars;
784
785 if(tmp_max_chars-wmap->max_chars < FST_GROW_MINCHARS )
786 tmp_max_chars += FST_GROW_MINCHARS;
787 if( wmap->chars + tmp_max_chars <= wmap->next_chars + len)
788 tmp_max_chars += len;
789
790 tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars"));
791 if(!tmp_chars) {
792 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_in_rule.chars"));
793 return MAXwordID;
794 }
795 memcpy(tmp_chars, wmap->chars, wmap->max_chars * sizeof(char));
796 FREE(wmap->chars);
797
798 wmap->chars = tmp_chars;
799 wmap->next_chars = wmap->chars + (old_next_chars - old_chars) ;
800 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_chars);
801 wmap->max_chars = (wordID)tmp_max_chars;
802
803 //Remove old keys
804 wordmap_clean ( wmap );
805
806 // adjust word pointers wordmap_add_word_in_rule
807 for(i=0; i<wmap->num_words; i++)
808 {
809 wmap->words[i] = wmap->chars +(wmap->words[i] - old_chars) ;
810 // adjust hashtable ----- add in rulewordmap_add_word_in_rule ----
811 if (wmap->wordIDForWord) {
812 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i],
813 (void*)(uintptr_t)(i), NULL );
814 if ( rc != ESR_SUCCESS )
815 goto CLEANUP;
816 }
817 }
818 #else
819 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars);
820 return MAXwordID;
821 #endif
822 }//wordmap_add_word_in_rule
823 if (wmap->num_words == wmap->max_words)
824 {
825 #if defined(FST_GROW_FACTOR)
826 char** tmp_words;
827 wordID tmp_max_words;
828 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR;
829 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS)
830 itmp_max_words += FST_GROW_MINWORDS;
831
832 if( itmp_max_words >= MAXwordID) {
833 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
834 return MAXwordID;
835 }
836
837 tmp_max_words = (wordID)itmp_max_words;
838 tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words");
839 if(!tmp_words) {
840 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_rule.words"));
841 return MAXwordID;
842 }
843 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*));
844 FREE( wmap->words);
845 wmap->words = tmp_words;
846 wmap->max_words = tmp_max_words;
847 #else
848 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words);
849 return MAXwordID;
850 #endif
851 }
852 if(1)
853 {
854 char *p;
855 const char *q;
856 /* word.1 */
857 for (p = wmap->next_chars, q = word; (*p = *q) != '\0'; p++, q++) ; /* basic strcpy() */
858 *p++ = IMPORTED_RULES_DELIM;
859 *p++ = (char)(rule + ((wordID)'0'));
860 *p = '\0';
861 wmap->words[ wmap->num_words++] = wmap->next_chars;
862 wmap->next_chars += len;
863 wdID = (wordID)(wmap->num_words - (wordID)1);
864 if (wmap->wordIDForWord)
865 {
866 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID],
867 (void*)(uintptr_t)(wdID), NULL );
868 if ( rc != ESR_SUCCESS )
869 goto CLEANUP;
870 }
871 return wdID;
872 }
873 CLEANUP:
874 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID);
875 return MAXwordID;
876 }
877
878 /*----------------------------------------------------------------------*
879 * *
880 * API *
881 * *
882 *----------------------------------------------------------------------*/
883
FST_AttachArbdata(srec_context * fst,srec_arbdata * allophone_tree)884 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree)
885 {
886 int rc = 0;
887 unsigned int allotree__modelid;
888
889 fst->allotree = allophone_tree;
890 if( !allophone_tree)
891 return 1;
892 fst->hmm_info_for_ilabel = allophone_tree->hmm_infos - fst->hmm_ilabel_offset;
893 allotree__modelid = version_arbdata_models(allophone_tree);
894 if (allotree__modelid != 0 && fst->modelid != 0)
895 {
896 if (allotree__modelid != fst->modelid)
897 {
898 PLogError("Error: modelids disagree, sgcbaseline(%u) arbdata(%u)", fst->modelid, allotree__modelid);
899 rc = FST_FAILED_ON_INVALID_ARGS;
900 }
901 }
902
903 return rc;
904 }
905
FST_LoadGraph(srec_context * pfst,wordmap * imap,wordmap * omap,int num_words_to_add,PFile * fp)906 int FST_LoadGraph(srec_context* pfst, wordmap* imap, wordmap* omap,
907 int num_words_to_add, PFile* fp)
908 {
909 int i, rc = 0;
910 char line[MAX_LINE_LENGTH], *args[32], nargs;
911 arcID max_num_FSMarcs;
912 arcID max_num_FSMnodes;
913 nodeID from_node, last_from_node, into_node, num_nodes, max_node_number = 0;
914 FSMarc_ptr atok = (FSMarc_ptr)0;
915 FSMarc *atoken = NULL;
916 FSMnode *fr_node, *to_node;
917 costdata cost = FREEcostdata;
918 arcID num_arcs;
919
920 srec_context* fst = pfst;
921 char *ilabel_str = 0, *olabel_str = 0;
922 long fpos;
923 arcID new_arc_id;
924 asr_int32_t temp;
925
926
927 /* determine number of arcs and nodes to allocate, add 50% for dynamic */
928 fpos = pftell(fp);
929 max_num_FSMnodes = 0;
930 pfst->modelid = 0;
931 for (max_num_FSMarcs = 0; pfgets(line, sizeof(line) / sizeof(char), fp); max_num_FSMarcs++)
932 {
933 if (strstr(line, "modelid:") == line)
934 {
935 char *p;
936 pfst->modelid = strtoul(line + strlen("modelid:"), &p, 10);
937 }
938 from_node = (nodeID)atoi_with_check(line, MAXnodeID);
939 if (from_node > max_num_FSMnodes)
940 max_num_FSMnodes = from_node;
941 }
942 pfseek(fp, fpos, SEEK_SET);
943 temp = max_num_FSMnodes + 1 /*why+1?*/ + num_words_to_add * AVG_NODES_PER_WORD;
944 if (temp >= MAXnodeID)
945 {
946 max_num_FSMnodes = MAXnodeID - 1;
947 PLogMessage("Warning: using max nodes instead\n");
948 }
949 else
950 {
951 max_num_FSMnodes = (nodeID)temp;
952 }
953 temp = max_num_FSMarcs + num_words_to_add * AVG_ARCS_PER_WORD;
954 if (temp >= MAXarcID)
955 {
956 max_num_FSMarcs = MAXarcID - 1;
957 PLogMessage("Warning: using max arcs instead\n");
958 }
959 else
960 {
961 max_num_FSMarcs = (arcID)temp;
962 }
963 fst->olabels = omap;
964 if (imap)
965 {
966 /* generally no imap is specified */
967 fst->ilabels = imap;
968 fst->hmm_ilabel_offset = wordmap_find_index(fst->ilabels, "hmm0");
969 ASSERT(fst->hmm_ilabel_offset >= 0);
970 }
971 else
972 {
973 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
974 fst->ilabels->num_words = fst->ilabels->max_words = 0;
975 fst->ilabels->words = 0;
976 /* this is bad hard code, we can get this from the swiarb file, it is
977 equal to the number of phonemes (53 for enu, 39 for jpn) plus the special
978 models (eps,.wb,.ph,h#,#h,iwt,not) minus 1 ('cuz base number is 0) */
979 fst->hmm_ilabel_offset = 128; /* should match MAX_PHONEMES */
980 }
981 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(max_num_FSMarcs, sizeof(FSMarc), "srec.graph.arcs");
982 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(max_num_FSMnodes, sizeof(FSMnode), "srec.graph.nodes");
983 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(max_num_FSMnodes, sizeof(char), "srec.graph.nodeinfos");
984
985 /* setup the arc freelist */
986 fst->FSMarc_freelist = 0;
987 fst->num_arcs = 0;
988 for (i = 0; i < max_num_FSMarcs - 1; i++)
989 {
990 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
991 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
992 }
993 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
994 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
995
996 /* initialize the nodes, 'cuz reading is random order */
997 fst->num_nodes = 0;
998 for (i = 0; i < max_num_FSMnodes; i++)
999 {
1000 fr_node = &fst->FSMnode_list[i];
1001 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
1002 }
1003
1004 /* 1. first load up all the information */
1005 IF_DEBUG_WDADD(printf("load graph ... 1\n"));
1006 num_arcs = 0;
1007 last_from_node = MAXnodeID;
1008 while (pfgets(line, sizeof(line) / sizeof(char), fp))
1009 {
1010 if (strstr(line, "modelid:") == line) continue;
1011 IF_DEBUG_WDADD(printf("read arc %s", line));
1012 nargs = split_line(line, args);
1013 if (nargs >= 4)
1014 {
1015 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1016 into_node = (nodeID)atoi_with_check(args[1], MAXnodeID);
1017 ilabel_str = args[2];
1018 olabel_str = args[3];
1019 cost = FREEcostdata;
1020 if (nargs == 5)
1021 PLogError(L("Warning: too many arguments on line %s"), line);
1022 }
1023 else if (nargs == 1)
1024 {
1025 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID);
1026 into_node = MAXnodeID;
1027 ilabel_str = 0;
1028 olabel_str = 0;
1029 cost = FREEcostdata;
1030 }
1031 else
1032 {
1033 from_node = into_node = 0;
1034 PLogError("can't parse line %s\n", line);
1035 ASSERT(0);
1036 }
1037
1038 if (into_node == MAXnodeID)
1039 {
1040 fst->end_node = from_node;
1041 }
1042 else
1043 {
1044 new_arc_id = fst_get_free_arc(fst);
1045 if (new_arc_id == MAXarcID)
1046 return FST_FAILED_ON_MEMORY;
1047 atok = ARC_ItoX(new_arc_id);
1048 atoken = ARC_XtoP(atok);
1049 num_arcs++;
1050 fr_node = &fst->FSMnode_list[from_node];
1051 to_node = &fst->FSMnode_list[into_node];
1052 if (fst->ilabels->num_words == 0)
1053 {
1054 atoken->ilabel = hmm_number(ilabel_str, fst->hmm_ilabel_offset);
1055 /* Xufang: if olabel_str is a slotname, change the input label to a no-silence and no-eps HMM. It is true that the weight of slot arcs is 999 and hmm will not take effect in the runtime, but it was found that when the slot is a loop, eps and silence will cause the recursive function fst_node_has_speech_to_come into a no-step-out status.*/
1056 if(strstr(olabel_str, SLOTNAME_INDICATOR)!=NULL)
1057 atoken->ilabel = fst->hmm_ilabel_offset + SLOTLOOP_OFFSET;
1058 }
1059 else
1060 {
1061 atoken->ilabel = wordmap_find_index(fst->ilabels, ilabel_str);
1062 }
1063 atoken->olabel = wordmap_find_index(fst->olabels, olabel_str);
1064 /* if(g_fst_options.wtw_cost_override>0) {
1065 if(atoken->ilabel == WORD_BOUNDARY)
1066 atoken->cost = g_fst_options.wtw_cost_override;
1067 else
1068 atoken->cost = g_fst_options.arc_cost_override;
1069 } else */
1070 atoken->cost = cost;
1071 #if DEBUG_WDADD
1072 atoken->ilabel_str = (atoken->ilabel < fst->ilabels->num_words ? fst->ilabels->words[atoken->ilabel] : 0);
1073 atoken->olabel_str = (atoken->olabel < fst->olabels->num_words ? fst->olabels->words[atoken->olabel] : 0);
1074 #endif
1075 append_arc_leaving_node(fst, fr_node, atok);
1076 if (into_node > max_node_number)
1077 max_node_number = into_node;
1078 append_arc_arriving_node(fst, to_node, atok);
1079 atoken->fr_node = NODE_ItoX(from_node);
1080 atoken->to_node = NODE_ItoX(into_node);
1081 }
1082 last_from_node = from_node;
1083 }
1084 ASSERT(fst->num_arcs == num_arcs);
1085
1086 /* setup the node freelist */
1087 IF_DEBUG_WDADD(printf("load graph ... 6\n"));
1088 num_nodes = (nodeID)(max_node_number + 1);
1089 if( max_num_FSMnodes > num_nodes) {
1090 fst->FSMnode_freelist = num_nodes;
1091 for (i = num_nodes; i < (max_num_FSMnodes - 1); i++)
1092 {
1093 fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
1094 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1095 }
1096 if (i == (max_num_FSMnodes - 1)) {
1097 fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
1098 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
1099 }
1100 } else
1101 fst->FSMnode_freelist = FSMNODE_NULL;
1102
1103 /* some book-keeping stuff */
1104 IF_DEBUG_WDADD(printf("load graph ... 7\n"));
1105 fst->num_base_arcs = fst->num_arcs = num_arcs;
1106 fst->FSMarc_list_len = max_num_FSMarcs;
1107 fst->FSMnode_list_len = max_num_FSMnodes;
1108 fst->num_base_nodes = fst->num_nodes = num_nodes;
1109 fst->start_node = 0;
1110 /* fst->end_node = 0; this is set up above */
1111
1112 /* beg and end word labels */
1113 fst->beg_silence_word = wordmap_find_index(fst->olabels, "-pau-");
1114 fst->end_silence_word = wordmap_find_index(fst->olabels, "-pau2-");
1115 fst->hack_silence_word = wordmap_find_index(fst->olabels, "silence");
1116
1117 /* set the search limitations, real ones will be registered later */
1118 fst->max_searchable_nodes = 0;
1119 fst->max_searchable_arcs = 0;
1120
1121 rc = fst_alloc_transit_points(fst);
1122 return (rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS);
1123 }
1124
FST_UnloadGraph(srec_context * pfst)1125 int FST_UnloadGraph(srec_context* pfst)
1126 {
1127 if (pfst->ilabels)
1128 FREE(pfst->ilabels);
1129 FREE(pfst->FSMarc_list);
1130 FREE(pfst->FSMnode_list);
1131 FREE(pfst->FSMnode_info_list);
1132 pfst->FSMarc_list = 0;
1133 pfst->FSMnode_list = 0;
1134 pfst->FSMnode_info_list = 0;
1135 return FST_SUCCESS;
1136 }
1137
FST_DumpGraph(srec_context * fst,PFile * fp)1138 int FST_DumpGraph(srec_context* fst, PFile* fp)
1139 {
1140 int rc = 0;
1141 nodeID i;
1142 FSMarc_ptr atok;
1143 FSMarc* atoken;
1144 nodeID from_node, into_node;
1145 FSMnode* ntoken;
1146 char *ilabel, *olabel;
1147
1148 for (i = 0; i < fst->num_nodes; i++)
1149 {
1150 from_node = i;
1151 ntoken = &fst->FSMnode_list[i];
1152 if (ntoken->first_prev_arc == FSMARC_FREE)
1153 continue;
1154 if (ntoken->un_ptr.first_next_arc != FSMARC_NULL)
1155 {
1156 for (atok = ntoken->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
1157 {
1158 char buf[32];
1159 atoken = ARC_XtoP(atok);
1160 into_node = NODE_XtoI(atoken->to_node);
1161 ilabel = fst->ilabels->num_words == 0 ?
1162 hmm_name(atoken->ilabel, fst->hmm_ilabel_offset, buf) :
1163 fst->ilabels->words[atoken->ilabel] ;
1164 olabel = fst->olabels->words[atoken->olabel];
1165
1166 if (atoken->cost != FREEcostdata)
1167 {
1168 /* regular arc */
1169 pfprintf(fp, "%hu\t%hu\t%s\t%s\t%hu\n",
1170 from_node, into_node, ilabel, olabel, atoken->cost);
1171 }
1172 else
1173 {
1174 /* regular zero cost arc */
1175 pfprintf(fp, "%hu\t%hu\t%s\t%s\n",
1176 from_node, into_node, ilabel, olabel);
1177 }
1178 }
1179 }
1180 else
1181 {
1182 pfprintf(fp, "%hu\n", from_node);
1183 }
1184 }
1185 return rc;
1186 }
1187
FST_AddWordToGrammar(srec_context * fst,const char * _slot,const char * word,const char * pron,const int cost)1188 int FST_AddWordToGrammar(srec_context* fst, const char* _slot,
1189 const char* word, const char* pron,
1190 const int cost)
1191 {
1192 int num_pron_ampersands = 0, num_prons_added = 0, num_words_added = 0;
1193 int i, pron_len, model_sequence_len;
1194 modelID model_sequence[MAX_PRON_LEN];
1195 char phoneme_sequence[MAX_PRON_LEN];
1196 wordID olabel = MAXwordID;
1197 int irc, rc = FST_SUCCESS;
1198 nodeID start_node = MAXnodeID, end_node = MAXnodeID;
1199 char veslot[MAX_WORD_LEN];
1200 #if USE_HMM_BASED_ENROLLMENT
1201 const char* Tpron;
1202 #endif
1203
1204 /* The addword from voice enroll still use @, and to test voice enroll, @ sign was removed and __ was added at the begining and ending of a word. Xufang */
1205 if(_slot[0] == '@') {
1206 strcpy(veslot,SLOTNAME_INDICATOR);
1207 strcat(veslot,_slot+1);
1208 strcat(veslot,SLOTNAME_INDICATOR);
1209 } else
1210 strcpy(veslot, _slot);
1211
1212 #if USE_COMP_STATS
1213 if (!comp_stats)
1214 comp_stats = init_comp_stats1();
1215 start_cs_clock1(&comp_stats->word_addition);
1216 #endif
1217
1218 /* we expect developers to call AddWord with the same slot many times,
1219 so we cache the slot start and end nodes and re-use on subseq calls */
1220 if( fst->addWordCaching_lastslot_num == MAXwordID )
1221 fst->addWordCaching_lastslot_name = NULL;
1222 else {
1223 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1224 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1225 ASSERT( fst->addWordCaching_lastslot_name);
1226 }
1227 if( fst->addWordCaching_lastslot_name==NULL || strcmp( fst->addWordCaching_lastslot_name, veslot)) {
1228
1229 fst->addWordCaching_lastslot_num = wordmap_find_rule_index(fst->olabels, veslot);
1230 /* olabel not found is a user error */
1231 if (fst->addWordCaching_lastslot_num == MAXwordID)
1232 {
1233 size_t i;
1234
1235 pfprintf(PSTDOUT, L("error: slot '%s' not found among ["), veslot);
1236 for (i = 1; i < (size_t) fst->olabels->num_slots; ++i)
1237 pfprintf(PSTDOUT, "%s, ", fst->olabels->words[i]);
1238 pfprintf(PSTDOUT, L("] possible\n"));
1239 rc = FST_FAILED_ON_INVALID_ARGS;
1240 goto RETRC;
1241 }
1242
1243 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num];
1244 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR);
1245 ASSERT(fst->addWordCaching_lastslot_name);
1246
1247 /* now find where in the graph this slot is referenced, note that
1248 there might be more than one place, but here we ignore multiples */
1249 for (i = fst->num_fsm_exit_points; --i >= 0;)
1250 {
1251 arcID arcid = fst->fsm_exit_points[i].arc_index;
1252 if (fst->FSMarc_list[arcid].olabel == fst->addWordCaching_lastslot_num)
1253 {
1254 FSMarc* arc;
1255 FSMnode* node;
1256 start_node = fst->fsm_exit_points[i].from_node_index;
1257 end_node = fst->fsm_exit_points[i].wbto_node_index;
1258 fst->addWordCaching_lastslot_needs_post_silence = ESR_TRUE;
1259 fst->addWordCaching_lastslot_ifsm_exit_point = i;
1260 node = &fst->FSMnode_list[ end_node];
1261 arc = &fst->FSMarc_list[node->un_ptr.first_next_arc];
1262 if (arc->olabel == fst->end_silence_word && arc->linkl_next_arc == MAXarcID)
1263 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
1264 break;
1265 }
1266 }
1267
1268 /* not found in the graph is an internal error, coding error */
1269 if (i < 0 || start_node>=fst->num_nodes || end_node>=fst->num_nodes)
1270 {
1271 PLogError("error: (internal) finding olabel %d %d %d\n", fst->addWordCaching_lastslot_num,
1272 start_node, end_node);
1273 goto RETRC;
1274 }
1275 } /* cached or not */
1276
1277 i = fst->addWordCaching_lastslot_ifsm_exit_point;
1278 start_node = fst->fsm_exit_points[i].from_node_index;
1279 end_node = fst->fsm_exit_points[i].wbto_node_index;
1280
1281 /* now start_node and end_node are known */
1282 if (!word || !*word || !pron || !*pron)
1283 {
1284 rc = FST_FAILED_ON_INVALID_ARGS;
1285 PLogError("error: null word/pron on input to FST_AddWordToGrammar()\n");
1286 goto RETRC;
1287 }
1288
1289 /* from here */
1290 IF_DEBUG_WDADD(printf("Adding %s %s\n", word, (const char*)pron));
1291
1292 /* loop over all prons, we break when we hit the double-null (hence the +1) */
1293 for( ; (*pron)!='\0'; pron+=(pron_len+1)) {
1294 pron_len = strlen((char*)pron);
1295 if (pron_len >= MAX_PRON_LEN)
1296 {
1297 PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1298 rc = FST_FAILED_ON_INVALID_ARGS;
1299 goto RETRC;
1300 }
1301
1302 /* check for searchability after adding, estimating at most pron_len
1303 arcs and nodes will be added */
1304 if (fst->num_arcs + pron_len > fst->max_searchable_arcs)
1305 {
1306 PLogError("error: wordadd failed on word %s due to %d arc search limit\n",
1307 word, fst->max_searchable_arcs);
1308 rc = FST_FAILED_ON_MEMORY;
1309 goto RETRC;
1310 }
1311
1312 if (fst->num_nodes + pron_len > fst->max_searchable_nodes)
1313 {
1314 PLogError("error: wordadd failed on word %s due to %d node search limit\n",
1315 word, fst->max_searchable_nodes);
1316 rc = FST_FAILED_ON_MEMORY;
1317 goto RETRC;
1318 }
1319
1320 /* add the word if necessary, !FST_SUCCESS is allowed if we're just adding
1321 an alternate pronunciation */
1322 olabel = wordmap_find_index_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1323 if (olabel == MAXwordID)
1324 {
1325 olabel = wordmap_add_word_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num);
1326 if (olabel != MAXwordID)
1327 num_words_added++;
1328 }
1329 if (olabel == MAXwordID)
1330 {
1331 PLogError("error: wordmap_add_word failed\n");
1332 rc = FST_FAILED_ON_MEMORY;
1333 goto RETRC;
1334 }
1335
1336 #if USE_HMM_BASED_ENROLLMENT
1337 // pron should be converted to model_sequence, model_sequence_len
1338 #define VETRANS_PREFIX "wd_hmm"
1339 #define VETRANS_PREFIX_LEN 6
1340 if( LSTRSTR(pron, VETRANS_PREFIX)!=NULL) {
1341 // printf("Debug-pron: %d, %s\n",pron_len, pron);
1342 model_sequence_len=0;
1343 for(Tpron=pron; (Tpron=strstr(Tpron, VETRANS_PREFIX))!=NULL; ){
1344 // Tpron = strstr(Tpron, "wd_hmm");
1345 Tpron += VETRANS_PREFIX_LEN; // skip over "wd_hmm"
1346 // Copy hmm number (56,132,...)
1347 model_sequence[ model_sequence_len] = atoi_with_check(Tpron, MAXmodelID);
1348 model_sequence_len++;
1349 }
1350
1351
1352 /* we need to append silence at the end since we might be dealing with a slot
1353 that has ensuing speech, formerly we only dealt with ROOT which defacto
1354 was followed by the pau2 silence, so ROOT did not need its own */
1355 if (fst->addWordCaching_lastslot_needs_post_silence)
1356 model_sequence[model_sequence_len++] = 3; // <<< hmm3_sil !!! ugly hard-code here
1357
1358 /* append the word boundary */
1359 model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1360 /* translate to input label ids */
1361 for (i = 0; i < model_sequence_len; i++)
1362 {
1363 if (model_sequence[i] >= EPSILON_OFFSET)
1364 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1365 }
1366 }
1367 else
1368 #endif
1369 {
1370 pron_len = strlen((char*)pron);
1371 if (pron_len >= MAX_PRON_LEN)
1372 {
1373 PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
1374 rc = FST_FAILED_ON_INVALID_ARGS;
1375 goto RETRC;
1376 }
1377
1378 for (i = 0; i < pron_len; i++)
1379 {
1380 if (pron[i] == OPTSILENCE_CODE)
1381 {
1382 phoneme_sequence[i] = SILENCE_CODE;
1383 num_pron_ampersands++;
1384 }
1385 else
1386 phoneme_sequence[i] = pron[i];
1387 }
1388 /* we need to append silence at the end since we might be dealing with a slot
1389 that has ensuing speech, formerly we only dealt with ROOT which defacto
1390 was followed by the pau2 silence, so ROOT did not need its own */
1391 if (fst->addWordCaching_lastslot_needs_post_silence)
1392 phoneme_sequence[i++] = SILENCE_CODE;
1393
1394 model_sequence_len = i;
1395 irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1396 /* check for bad sequence of phonemes */
1397 if (irc)
1398 {
1399 PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, irc);
1400 rc = FST_FAILED_ON_INVALID_ARGS;
1401 goto RETRC;
1402 }
1403 IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1404
1405 /* append the word boundary */
1406 model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1407 /* translate to input label ids */
1408 for (i = 0; i < model_sequence_len; i++)
1409 {
1410 if (model_sequence[i] >= EPSILON_OFFSET)
1411 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1412 }
1413 } /* end of ph_t ph_r .. type decoding of the model sequence */
1414
1415 /* now do the actual addition */
1416 rc = fst_add_arcs(fst, start_node, end_node, olabel, (costdata)cost, model_sequence, model_sequence_len);
1417 if (rc == FST_SUCCESS)
1418 {
1419 num_prons_added++;
1420 }
1421 else if (rc == FST_FAILED_ON_HOMONYM)
1422 {
1423 /* maybe the another pron will work? */
1424 }
1425 else
1426 {
1427 PLogMessage("error: fst_add_arcs() failed adding word %s pron %s ('&' as 'iwt')\n", word, (char*)pron);
1428 goto RETRC;
1429 }
1430
1431 /* second add the pron with no silences,
1432 this only applies to true prons, not wd_hmm333 wd_hmm222 type prons */
1433
1434 if (num_pron_ampersands > 0)
1435 {
1436 for (i = 0, model_sequence_len = 0; i < pron_len; i++)
1437 {
1438 if (pron[i] != OPTSILENCE_CODE)
1439 phoneme_sequence[model_sequence_len++] = pron[i];
1440 }
1441
1442 irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
1443 /* check for bad sequence of phonemes */
1444 if (irc)
1445 {
1446 PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, rc);
1447 rc = FST_FAILED_ON_INVALID_ARGS;
1448 goto RETRC;
1449 }
1450 else
1451 {
1452 IF_DEBUG_WDADD(printf("model_sequence ...\n"));
1453
1454 /* append the word boundary */
1455 model_sequence[model_sequence_len++] = WORD_BOUNDARY;
1456 /* translate to input label ids */
1457 for (i = 0; i < model_sequence_len; i++)
1458 {
1459 if (model_sequence[i] >= EPSILON_OFFSET)
1460 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
1461 }
1462 /* now do the actual addition */
1463 rc = fst_add_arcs(fst, start_node, end_node,
1464 olabel, (costdata)cost, model_sequence, model_sequence_len);
1465
1466 if (rc == FST_SUCCESS)
1467 {
1468 num_prons_added++;
1469 }
1470 else if (rc == FST_FAILED_ON_HOMONYM)
1471 {
1472 /* maybe another pron worked? */
1473 }
1474 else
1475 {
1476 PLogMessage("Warning: fst_add_arcs() failed while adding "
1477 "word %s pron %s (skip '&')\n", word, (char*)pron);
1478 goto RETRC;
1479 }
1480 }
1481 }
1482 }
1483
1484
1485 RETRC:
1486 /* set this to make sure that FST_Prepare gets called after add word */
1487 fst->whether_prepared = 0;
1488 #if USE_COMP_STATS
1489 end_cs_clock1(&comp_stats->word_addition, 1);
1490 #endif
1491
1492 if (rc < 0 && rc != FST_FAILED_ON_HOMONYM)
1493 return rc;
1494
1495 if (num_prons_added == 0 && num_words_added > 0)
1496 {
1497 return FST_FAILED_ON_HOMONYM;
1498 }
1499 else if (num_prons_added == 0 && num_words_added == 0)
1500 {
1501 return FST_FAILED_ON_HOMOGRAPH;
1502 }
1503 else if (num_prons_added > 0 && num_words_added == 0)
1504 {
1505 return FST_SUCCESS_ON_OLD_WORD;
1506 }
1507 else
1508 {
1509 /* if(num_prons_added>0 && num_words_added>0) */
1510 return FST_SUCCESS;
1511 }
1512 }
1513
1514 /* remove arcs leaving a node, arcs added via word add, are just discarded
1515 to be cleaned up for re-use by other functions */
remove_added_arcs_leaving(srec_context * fst,nodeID ni)1516 void remove_added_arcs_leaving(srec_context* fst, nodeID ni)
1517 {
1518 FSMnode* node = &fst->FSMnode_list[ ni];
1519 FSMarc *arc = NULL, *arc2;
1520 arcID *pai, ai, ai2;
1521 for (pai = &node->un_ptr.first_next_arc, ai = (*pai); ai != MAXarcID; pai = &arc->linkl_next_arc, ai = (*pai))
1522 {
1523 if (ai < fst->num_base_arcs)
1524 {
1525 arc = &fst->FSMarc_list[ai];
1526 }
1527 else
1528 {
1529 arc2 = &fst->FSMarc_list[ai];
1530 for (ai2 = arc2->linkl_next_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1531 ai2 = arc2->linkl_next_arc)
1532 {
1533 arc2 = &fst->FSMarc_list[ai2];
1534 }
1535 *pai = ai2;
1536 }
1537 }
1538 }
1539
1540 /* remove arcs arriving at a node, arcs added via word add, are just discarded
1541 to be cleaned up for re-use by other functions */
remove_added_arcs_arriving(srec_context * fst,nodeID ni)1542 void remove_added_arcs_arriving(srec_context* fst, nodeID ni)
1543 {
1544 FSMnode* node = &fst->FSMnode_list[ni];
1545 FSMarc *arc = NULL, *arc2;
1546 arcID *pai, ai, ai2;
1547 for (pai = &node->first_prev_arc, ai = (*pai); ai != MAXarcID;
1548 pai = &arc->linkl_prev_arc, ai = (*pai))
1549 {
1550 if (ai < fst->num_base_arcs)
1551 {
1552 arc = &fst->FSMarc_list[ai];
1553 }
1554 else
1555 {
1556 arc2 = &fst->FSMarc_list[ai];
1557 for (ai2 = arc2->linkl_prev_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
1558 ai2 = arc2->linkl_prev_arc)
1559 {
1560 arc2 = &fst->FSMarc_list[ai2];
1561 }
1562 *pai = ai2;
1563 }
1564 }
1565 }
1566
1567 /* reset greammar, but resetting all the arcs, nodes and words added
1568 via the addword functions */
1569
FST_ResetGrammar(srec_context * fst)1570 int FST_ResetGrammar(srec_context* fst)
1571 {
1572 int i, rc = 0;
1573 nodeID fst_slot_start_node, fst_slot_end_node;
1574 wordID fst_slot_slotnum;
1575 arcID ai;
1576 nodeID ni2, ni3, ni4, ni3a;
1577 FSMnode_t *node, *node2, *node3;
1578 FSMarc_t *arc, *arc2, *arc3;
1579
1580 /*fst_slot_slotnum = wordmap_find_rule_index(fst->olabels, slot);*/
1581 for (fst_slot_slotnum = 1; fst_slot_slotnum < fst->olabels->num_slots;
1582 fst_slot_slotnum++)
1583 {
1584
1585 if (fst_slot_slotnum == MAXwordID)
1586 {
1587 char *slot = "";
1588 PLogError("error: slot '%s' not found among [%d,%d] possible\n", slot, 1, fst->olabels->num_slots - 1);
1589 return (rc = FST_FAILED_ON_INVALID_ARGS);
1590 }
1591
1592 /* now find where in the graph this slot is referenced, note that
1593 there might be more than one place, but here we ignore multiples */
1594 fst_slot_start_node = MAXnodeID;
1595 fst_slot_end_node = MAXnodeID;
1596 for (i = fst->num_fsm_exit_points; --i >= 0;)
1597 {
1598 ai = fst->fsm_exit_points[i].arc_index;
1599 if (fst->FSMarc_list[ai].olabel == fst_slot_slotnum)
1600 {
1601 fst_slot_start_node = fst->fsm_exit_points[i].from_node_index;
1602 fst_slot_end_node = fst->fsm_exit_points[i].wbto_node_index;
1603 }
1604 }
1605
1606 /* this 'slot' could be the root rule, which can't be removed */
1607 if (fst_slot_start_node == MAXnodeID || fst_slot_end_node == MAXnodeID)
1608 continue;
1609
1610 /*
1611 start end
1612 node node2 node3 node4
1613 o--@N-->o--sil-->o--wb-->o
1614 arc arc2 | arc3 ^
1615 | |wb
1616 \__sil->o
1617 arc4 ni3a
1618 */
1619
1620 remove_added_arcs_leaving(fst, fst_slot_start_node);
1621 node = &fst->FSMnode_list[ fst_slot_start_node];
1622 for (ai = node->un_ptr.first_next_arc; ai != MAXarcID; ai = arc->linkl_next_arc)
1623 {
1624 arc = &fst->FSMarc_list[ai];
1625 if (arc->olabel != fst_slot_slotnum)
1626 continue;
1627
1628 ni2 = arc->to_node;
1629 remove_added_arcs_arriving(fst, ni2);
1630 if (ni2 == fst_slot_end_node)
1631 continue;
1632 node2 = &fst->FSMnode_list[ni2];
1633 arc2 = &fst->FSMarc_list[ node2->un_ptr.first_next_arc];
1634
1635
1636 ni3 = arc2->to_node;
1637 remove_added_arcs_arriving(fst, ni3);
1638 if (ni3 == fst_slot_end_node)
1639 continue;
1640 node3 = &fst->FSMnode_list[ni3];
1641 arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1642 while (arc3->linkl_next_arc != MAXarcID)
1643 {
1644 arc3 = &fst->FSMarc_list[arc3->linkl_next_arc];
1645 ni3a = arc3->to_node;
1646 remove_added_arcs_arriving(fst, ni3a);
1647 }
1648 arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc];
1649
1650 ni4 = arc3->to_node;
1651 remove_added_arcs_arriving(fst, ni4);
1652 ASSERT(ni4 == fst_slot_end_node);
1653 if (ni4 == fst_slot_end_node)
1654 continue;
1655 }
1656 }
1657
1658 /* reset the freelist for nodes */
1659 if( fst->num_nodes == fst->num_base_nodes )
1660 {}
1661 else{
1662 FSMnode *ntoken;
1663 FSMnode* tmp_FSMnode_list;
1664 FSMnode_info* tmp_FSMnode_info_list;
1665
1666 fst->FSMnode_freelist = MAXnodeID;
1667 fst->num_nodes = fst->FSMnode_list_len = fst->num_base_nodes;
1668
1669 tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
1670 if(!tmp_FSMnode_list){
1671 PLogError("ERROR: Could NOT reset the memory for srec.graph.nodes");
1672 return FST_FAILED_ON_MEMORY;
1673 }
1674 memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
1675
1676 /* scan to the end of the free list (to be re-used) */
1677 nodeID* last_free_node = (&fst->FSMnode_freelist);
1678 ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
1679
1680 for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
1681 ntoken = &tmp_FSMnode_list[ *last_free_node];
1682
1683 FREE( fst->FSMnode_list);
1684 tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
1685 if(!tmp_FSMnode_info_list){
1686 PLogError("ERROR: Could NOT reset the memory for srec.graph.nodeinfos");
1687 return FST_FAILED_ON_MEMORY;
1688 }
1689 // copy in node info
1690 memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
1691
1692
1693 FREE( fst->FSMnode_info_list);
1694 fst->FSMnode_info_list = tmp_FSMnode_info_list;
1695 fst->FSMnode_list = tmp_FSMnode_list;
1696 }
1697
1698 /*ni = fst->FSMnode_freelist = fst->num_base_nodes;
1699 node = &fst->FSMnode_list[ni];
1700 for (; ni < fst->FSMnode_list_len - 1; ni++, node++)
1701 node->un_ptr.next_node = (nodeID)(ni + 1);
1702 node->un_ptr.next_node = MAXnodeID;
1703 fst->num_nodes = fst->num_base_nodes;*/
1704
1705 /* reset the freelist for arcs */
1706 if( fst->num_arcs == fst->num_base_arcs )
1707 {}
1708 else {
1709 FSMarc* atoken = NULL;
1710 FSMarc* tmp_FSMarc_list;
1711
1712 fst->num_arcs = fst->num_base_arcs;
1713 fst->FSMarc_list_len = fst->num_base_arcs;
1714 fst->FSMarc_freelist = MAXarcID;
1715 tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
1716 if(!tmp_FSMarc_list){
1717 PLogError("ERROR: Could NOT reset the memory for srec.graph.arcs");
1718 return FST_FAILED_ON_MEMORY;
1719 }
1720 memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
1721
1722 /* scan to the end of the free list (to be re-used) */
1723 arcID* last_free_arc = &fst->FSMarc_freelist;
1724 atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
1725
1726 for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
1727 atoken = &tmp_FSMarc_list[ *last_free_arc];
1728
1729 FREE( fst->FSMarc_list);
1730 fst->FSMarc_list = tmp_FSMarc_list;
1731 }
1732
1733 /*ai = fst->FSMarc_freelist = fst->num_base_arcs;
1734 arc = &fst->FSMarc_list[ai];
1735 for (; ai < fst->FSMarc_list_len - 1; ai++, arc++)
1736 arc->linkl_next_arc = (arcID)(ai + 1);
1737 arc->linkl_next_arc = MAXarcID;
1738 fst->num_arcs = fst->num_base_arcs;
1739 fst->FSMarc_list_len = fst->num_base_arcs;*/
1740
1741 /* now remove all the added words */
1742 wordmap_reset(fst->olabels);
1743 return FST_SUCCESS;
1744 }
1745
1746 /* See the description of arc_tokens in the header file. These
1747 arcs are nodeless, which make loading from a file that
1748 references nodes a little harder. We need to do it in stages.
1749 Later, when we load from binary image it should be much simpler.
1750 */
1751
get_first_arc_leaving_node(arc_token * arc_token_list,arcID num_arcs,nodeID node)1752 arc_token_lnk get_first_arc_leaving_node(arc_token* arc_token_list,
1753 arcID num_arcs,
1754 nodeID node)
1755 {
1756 arcID i;
1757 for (i = 0; i < num_arcs; i++)
1758 {
1759 if ((nodeID)(int)arc_token_list[i].next_token_index == node)
1760 return ARC_TOKEN_LNK(arc_token_list, i);
1761 }
1762 return ARC_TOKEN_NULL;
1763 }
1764
FST_LoadReverseWordGraph(srec_context * context,int num_words_to_add,PFile * fp)1765 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add, PFile* fp)
1766 {
1767 arcID i;
1768 char line[MAX_LINE_LENGTH];
1769 char word_label_as_str[128];
1770 arcID num_alloc_arc_tokens;
1771 nodeID from_node, into_node;
1772 labelID word_label = MAXwordID;
1773 arc_token *atoken, *last_atoken;
1774 costdata cost;
1775 arcID num_arcs;
1776 arc_token *arc_token_list, *tmp;
1777 long fpos;
1778
1779 /* determine number of word arcs to allocate */
1780 fpos = pftell(fp);
1781 for (num_arcs = 0; pfgets(line, MAX_LINE_LENGTH, fp); num_arcs++) ;
1782 num_alloc_arc_tokens = num_arcs;
1783 num_alloc_arc_tokens = (arcID)(num_arcs + num_words_to_add);
1784 pfseek(fp, fpos, SEEK_SET);
1785
1786 context->arc_token_list = (arc_token*)CALLOC_CLR(num_alloc_arc_tokens, sizeof(arc_token), "srec.graph.wordgraph");
1787 arc_token_list = context->arc_token_list;
1788
1789 /* 1. first load up all the information */
1790 i = 0;
1791 while (pfgets(line, MAX_LINE_LENGTH, fp))
1792 {
1793 if (sscanf(line, "%hu\t%hu\t%s", &from_node, &into_node, word_label_as_str) == 3)
1794 {
1795 word_label = wordmap_find_index(context->olabels, word_label_as_str);
1796 // ASSERT(word_label >= 0);
1797 cost = FREEcostdata;
1798 }
1799 else if (sscanf(line, "%hu", &from_node) == 1)
1800 {
1801 into_node = MAXnodeID;
1802 word_label = MAXwordID;
1803 cost = FREEcostdata;
1804 }
1805 else
1806 {
1807 PLogError("FST_LoadReverseWordGraph() .. can't parse line %s\n", line);
1808 ASSERT(0);
1809 }
1810 atoken = &arc_token_list[i];
1811 i++;
1812 atoken->ilabel = word_label;
1813 /* atoken->olabel = WORD_EPSILON_LABEL; */
1814 /*atoken->cost = cost; cost is not used for now */
1815 #if DEBUG_ASTAR
1816 atoken->label = (word_label == MAXwordID ? 0 : wmap->words[word_label]);
1817 #endif
1818 atoken->first_next_arc = (arc_token_lnk)into_node;
1819 atoken->next_token_index = (arc_token_lnk)from_node;
1820 }
1821 num_arcs = i;
1822
1823 /* 2. now do the internal cross references */
1824 for (i = 0; i < num_arcs; i++)
1825 {
1826 atoken = &arc_token_list[i];
1827 into_node = (nodeID)atoken->first_next_arc;
1828 if (into_node == MAXnodeID)
1829 atoken->first_next_arc = ARC_TOKEN_NULL;
1830 else
1831 atoken->first_next_arc = get_first_arc_leaving_node(arc_token_list, num_arcs, (nodeID)atoken->first_next_arc);
1832 }
1833
1834 /* 3. now do more internal cross refs */
1835 last_atoken = &arc_token_list[0];
1836 for (i = 1; i < num_arcs; i++)
1837 {
1838 atoken = &arc_token_list[i];
1839 if (atoken->next_token_index != last_atoken->next_token_index)
1840 {
1841 last_atoken->next_token_index = ARC_TOKEN_NULL;
1842 }
1843 else
1844 {
1845 last_atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i);
1846 }
1847 last_atoken = atoken;
1848 }
1849 last_atoken->next_token_index = ARC_TOKEN_NULL;
1850
1851 #if DEBUG_ASTAR
1852 /* under debug, it's nice to be able to see the words leaving the
1853 destination node, they are stored sequentially in the debug ary */
1854 for (i = 0; i < num_arcs; i++)
1855 {
1856 char * p;
1857 atoken = &arc_token_list[i];
1858 atoken->debug[0] = 0;
1859 for (tmp = ARC_TOKEN_LNK(arc_token_list, atoken->first_next_arc); tmp != NULL;
1860 tmp = ARC_TOKEN_PTR(arc_token_list, tmp->next_token_index))
1861 {
1862 if (tmp->first_next_arc == ARC_TOKEN_NULL)
1863 {
1864 p = "END";
1865 }
1866 else if (!tmp->label)
1867 {
1868 p = "NULL";
1869 }
1870 else
1871 {
1872 p = tmp->label;
1873 }
1874 if (strlen(atoken->debug) + strlen(p) + 6 < 64)
1875 {
1876 strcat(atoken->debug, p);
1877 strcat(atoken->debug, " ");
1878 }
1879 else
1880 {
1881 strcat(atoken->debug, "...");
1882 break;
1883 }
1884 }
1885 }
1886 #endif
1887 context->arc_token_list_len = num_alloc_arc_tokens;
1888 if (num_alloc_arc_tokens > num_arcs)
1889 {
1890 atoken = &context->arc_token_list[num_arcs];
1891 for (i = num_arcs; i < num_alloc_arc_tokens; i++, atoken++)
1892 {
1893 atoken->first_next_arc = ARC_TOKEN_NULL;
1894 atoken->ilabel = MAXwordID;
1895 atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i + 1);/*atoken+1;*/
1896 }
1897 atoken--;
1898 atoken->next_token_index = ARC_TOKEN_NULL;
1899 context->arc_token_freelist = &context->arc_token_list[num_arcs];
1900 }
1901 else
1902 {
1903 context->arc_token_freelist = NULL;
1904 }
1905 /* the insert start is the arc that goes to the end */
1906 atoken = context->arc_token_list;
1907 atoken = ARC_TOKEN_PTR(arc_token_list, atoken->first_next_arc);
1908 for (; atoken; atoken = ARC_TOKEN_PTR(arc_token_list, atoken->next_token_index))
1909 {
1910 if (atoken->first_next_arc == ARC_TOKEN_NULL)
1911 break;
1912 tmp = ARC_TOKEN_PTR(context->arc_token_list, atoken->first_next_arc);
1913 if (tmp->ilabel == MAXwordID && tmp->first_next_arc == ARC_TOKEN_NULL)
1914 break;
1915 }
1916 context->arc_token_insert_start = atoken;
1917
1918 return FST_SUCCESS;
1919 }
1920
FST_UnloadReverseWordGraph(srec_context * context)1921 int FST_UnloadReverseWordGraph(srec_context* context)
1922 {
1923 FREE(context->arc_token_list);
1924 context->arc_token_list = 0;
1925 return FST_SUCCESS;
1926 }
1927
1928 /*----------------------------------------------------------------------*
1929 * *
1930 * general *
1931 * *
1932 *----------------------------------------------------------------------*/
1933
split_line(char * line,char ** av)1934 char split_line(char* line, char** av)
1935 {
1936 int i = 0;
1937 av[i] = strtok(line, "\n\r\t ");
1938 for (; av[i];)
1939 {
1940 av[++i] = strtok(NULL, "\n\r\t ");
1941 }
1942 return i;
1943 }
1944
atoi_with_check(const char * buf,asr_int32_t mymax)1945 asr_int32_t atoi_with_check(const char* buf, asr_int32_t mymax)
1946 {
1947 asr_int32_t ans = atol(buf);
1948 ASSERT(ans < mymax);
1949 return ans;
1950 }
1951
1952 /*----------------------------------------------------------------------*
1953 * *
1954 * fst *
1955 * *
1956 *----------------------------------------------------------------------*/
1957
fst_get_free_arc(srec_context * fst)1958 arcID fst_get_free_arc(srec_context* fst)
1959 {
1960 FSMarc* atoken;
1961 arcID atokid = fst->FSMarc_freelist;
1962 if (atokid == MAXarcID)
1963 {
1964 PLogError("error: ran out of arcs\n");
1965 return atokid;
1966 }
1967 atoken = &fst->FSMarc_list[atokid];
1968 fst->FSMarc_freelist = ARC_XtoI(atoken->linkl_next_arc);
1969 memset(atoken, 0, sizeof(FSMarc));
1970 atoken->to_node = FSMNODE_NULL;
1971 atoken->fr_node = FSMNODE_NULL;
1972 atoken->linkl_next_arc = FSMARC_NULL;
1973 atoken->linkl_prev_arc = FSMARC_NULL;
1974 atoken->ilabel = 0;
1975 atoken->olabel = 0;
1976 atoken->cost = 0;
1977 fst->num_arcs++;
1978 return atokid;
1979 }
fst_get_free_node(srec_context * fst)1980 nodeID fst_get_free_node(srec_context* fst)
1981 {
1982 FSMnode* ntoken;
1983 nodeID ntokid = fst->FSMnode_freelist;
1984 if (ntokid == MAXnodeID)
1985 {
1986 PLogError("error: ran out of nodes\n");
1987 return ntokid;
1988 }
1989 ntoken = &fst->FSMnode_list[ntokid];
1990 fst->FSMnode_freelist = NODE_XtoI(ntoken->un_ptr.next_node);
1991 ntoken->un_ptr.first_next_arc = FSMARC_NULL;
1992 ntoken->first_prev_arc = FSMARC_NULL;
1993 /* ASSERT( (int)(ntok - fst->FSMnode_list) == fst->num_nodes); */
1994 fst->num_nodes++;
1995 return ntokid;
1996 }
1997
fst_free_node(srec_context * fst,FSMnode * node)1998 int fst_free_node(srec_context* fst, FSMnode* node)
1999 {
2000 IF_DEBUG_WDADD(printf_node(fst, "freeing: ", node));
2001 node->first_prev_arc = FSMARC_NULL;
2002 node->un_ptr.next_node = fst->FSMnode_freelist;
2003 node->first_prev_arc = FSMARC_FREE;
2004 fst->FSMnode_freelist = (nodeID)(node - fst->FSMnode_list);
2005 /* fst->num_nodes--; not allowed unless we compress! */
2006 return 0;
2007 }
2008
fst_free_arc(srec_context * fst,FSMarc * arc)2009 int fst_free_arc(srec_context* fst, FSMarc* arc)
2010 {
2011 IF_DEBUG_WDADD(printf_arc(fst, "freeing: ", arc));
2012 arc->linkl_prev_arc = FSMARC_FREE;
2013 arc->linkl_next_arc = ARC_ItoX(fst->FSMarc_freelist);
2014 arc->cost = 2;
2015 arc->ilabel = 0;
2016 arc->olabel = 0;
2017 IF_DEBUG_WDADD(arc->ilabel_str = 0);
2018 IF_DEBUG_WDADD(arc->olabel_str = 0);
2019 fst->FSMarc_freelist = (arcID)(arc - fst->FSMarc_list);
2020 fst->num_arcs--;
2021 return 0;
2022 }
2023
fst_free_arc_net(srec_context * fst,FSMarc * arc)2024 int fst_free_arc_net(srec_context* fst, FSMarc* arc)
2025 {
2026 if (!arc)
2027 return 0;
2028 /* need to traverse the arc, get the 'from' nodes,
2029 then traverse the from nodes and kill arcs */
2030 /* we'll need this after an error condition */
2031 return 0;
2032 }
2033
fst_push_arc_olabel(srec_context * fst,FSMarc * arc)2034 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc)
2035 {
2036 FSMarc_ptr atok;
2037 FSMarc* atoken;
2038 FSMnode_ptr ntok = arc->to_node;
2039
2040 /* look for all arcs leaving this arc, and mark them
2041 normally there is only one, except for multipron words */
2042 for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2043 {
2044 atoken = ARC_XtoP(atok);
2045 /* cannot have multiple olabels before a boundary */
2046 if (atoken->olabel != WORD_EPSILON_LABEL)
2047 return FST_FAILED_INTERNAL;
2048 atoken->olabel = arc->olabel;
2049 #if DO_WEIGHTED_ADDWORD
2050 atoken->cost = (costdata)(atoken->cost + arc->cost);
2051 #endif
2052 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2053 }
2054 /* unlabel this arc */
2055 arc->olabel = WORD_EPSILON_LABEL;
2056 #if DO_WEIGHTED_ADDWORD
2057 arc->cost = FREEcostdata;
2058 #endif
2059 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2060 return FST_SUCCESS;
2061 }
2062
2063 #if DO_WEIGHTED_ADDWORD
fst_push_arc_cost(srec_context * fst,FSMarc * arc)2064 int fst_push_arc_cost(srec_context* fst, FSMarc* arc)
2065 {
2066 FSMarc_ptr atok;
2067 FSMarc* atoken;
2068 FSMnode_ptr ntok = arc->to_node;
2069 /* look for all arcs leaving this arc, and push the cost thereto */
2070 for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2071 {
2072 atoken = ARC_XtoP(atok);
2073 atoken->cost = (costdata)(atoken->cost + arc->cost);
2074 }
2075 arc->cost = FREEcostdata;
2076 return FST_SUCCESS;
2077 }
2078 #endif
2079
2080
fst_pull_arc_olabel(srec_context * fst,FSMarc * arc)2081 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc)
2082 {
2083 FSMarc_ptr atok;
2084 FSMarc* atoken;
2085 FSMnode_ptr fr_node = arc->fr_node;
2086 FSMarc_ptr tmp_atok;
2087 FSMarc *tmp_atoken;
2088 FSMnode *anode;
2089
2090 if (arc->olabel == WORD_EPSILON_LABEL)
2091 return FST_SUCCESS;
2092
2093 /* can the olabel be pulled earlier? */
2094 for (atok = FIRST_PREV(fr_node); atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2095 {
2096 atoken = ARC_XtoP(atok);
2097 anode = NODE_XtoP(atoken->to_node);
2098
2099 tmp_atok = anode->un_ptr.first_next_arc;
2100 if (tmp_atok != FSMARC_NULL)
2101 {
2102 tmp_atoken = ARC_XtoP(tmp_atok);
2103 if (tmp_atoken->linkl_next_arc != FSMARC_NULL)
2104 {
2105 return FST_CONTINUE;
2106 }
2107 }
2108 }
2109
2110 /* look for all arcs arriving at this arc, and mark them */
2111 for (atok = FIRST_PREV(fr_node);atok != FSMARC_NULL;atok = atoken->linkl_prev_arc)
2112 {
2113 atoken = ARC_XtoP(atok);
2114 /* cannot have multiple olabels! serious internal error!? */
2115 if (atoken->olabel != WORD_EPSILON_LABEL)
2116 {
2117 PLogError("error: internal error, in fst_pull_arc_olabel()\n");
2118 return FST_FAILED_INTERNAL;
2119 }
2120 atoken->olabel = arc->olabel;
2121 #if DO_WEIGHTED_ADDWORD
2122 atoken->cost = arc->cost;
2123 #endif
2124 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str);
2125 }
2126 /* unlabel this arc */
2127 arc->olabel = WORD_EPSILON_LABEL;
2128 #if DO_WEIGHTED_ADDWORD
2129 arc->cost = FREEcostdata;
2130 #endif
2131 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]);
2132 return FST_SUCCESS;
2133 }
2134
find_next_arc_with_ilabel(srec_context * fst,FSMnode * node,labelID ilabel,FSMarc ** last)2135 static FSMarc* find_next_arc_with_ilabel(srec_context* fst, FSMnode* node, labelID ilabel, FSMarc** last)
2136 {
2137 FSMarc_ptr atok;
2138 FSMarc* atoken = NULL;
2139 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2140 {
2141 atoken = ARC_XtoP(atok);
2142 if (atoken->ilabel == ilabel)
2143 return atoken;
2144 }
2145 *last = atoken; /* to this we may want to append a new arc */
2146 return NULL;
2147 }
2148
find_prev_arc_with_iolabels(srec_context * fst,FSMnode * node,labelID ilabel,labelID olabel,FSMarc ** last)2149 FSMarc* find_prev_arc_with_iolabels(srec_context* fst, FSMnode* node, labelID ilabel, labelID olabel, FSMarc** last)
2150 {
2151 FSMarc_ptr atok;
2152 FSMarc* atoken = NULL;
2153 FSMarc_ptr tmp_atok;
2154 FSMarc* tmp_atoken;
2155
2156 for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2157 {
2158 atoken = ARC_XtoP(atok);
2159 if (atoken->olabel == olabel && atoken->ilabel == ilabel)
2160 {
2161 tmp_atok = NODE_XtoP(atoken->fr_node)->un_ptr.first_next_arc;
2162 if (tmp_atok != FSMARC_NULL)
2163 {
2164 tmp_atoken = ARC_XtoP(tmp_atok);
2165
2166 if (tmp_atoken->linkl_next_arc == FSMARC_NULL)
2167 return atoken;
2168 }
2169 else
2170 return atoken;
2171 }
2172
2173 }
2174 if (last) *last = atoken;
2175 return NULL;
2176 }
2177
num_arcs_leaving(srec_context * fst,FSMnode * node)2178 int num_arcs_leaving(srec_context* fst, FSMnode* node)
2179 {
2180 int num_arcs = 0;
2181 FSMarc_ptr atok;
2182 FSMarc* atoken;
2183 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2184 {
2185 atoken = ARC_XtoP(atok);
2186 num_arcs++;
2187 }
2188 return num_arcs;
2189 }
2190
num_arcs_arriving(srec_context * fst,FSMnode * node)2191 int num_arcs_arriving(srec_context* fst, FSMnode* node)
2192 {
2193 int num_arcs = 0;
2194 FSMarc_ptr atok;
2195 FSMarc* atoken;
2196 for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2197 {
2198 atoken = ARC_XtoP(atok);
2199 num_arcs++;
2200 }
2201 return num_arcs;
2202 }
2203
num_arcs_arriving_gt_1(srec_context * fst,FSMnode * node)2204 PINLINE int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node)
2205 {
2206 FSMarc_ptr atok = node->first_prev_arc;
2207 FSMarc* atoken;
2208 if (atok == FSMARC_NULL)
2209 return 0;
2210 atoken = ARC_XtoP(atok);
2211 return atoken->linkl_prev_arc == FSMARC_NULL ? 0 : 1;
2212 }
2213
remove_arc_arriving(srec_context * fst,FSMnode * to_node,FSMarc * arc)2214 static void remove_arc_arriving(srec_context* fst, FSMnode* to_node, FSMarc* arc)
2215 {
2216 FSMarc* atoken;
2217 FSMarc_ptr* atok;
2218 for (atok = &to_node->first_prev_arc;(*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2219 {
2220 atoken = ARC_XtoP(*atok);
2221 if (atoken == arc)
2222 {
2223 (*atok) = arc->linkl_prev_arc;
2224 return;
2225 }
2226 }
2227 ASSERT(0);
2228 }
2229
split_node_for_arc(srec_context * fst,FSMarc * arc)2230 int split_node_for_arc(srec_context* fst, FSMarc* arc)
2231 {
2232 arcID new_arc_id;
2233 nodeID new_node_id;
2234 FSMnode* new_node;
2235 FSMnode* old_node;
2236 FSMarc_ptr atok, new_next, last_new_next;
2237 FSMarc *atoken, *new_next_p;
2238
2239 IF_DEBUG_WDADD(printf_arc(fst, "spliting ... ", arc));
2240
2241 new_node_id = fst_get_free_node(fst);
2242 if (new_node_id == MAXnodeID)
2243 return FST_FAILED_ON_MEMORY;
2244 new_node = &fst->FSMnode_list[ new_node_id];
2245 old_node = NODE_XtoP(arc->to_node);
2246 arc->to_node = NODE_ItoX(new_node_id);
2247 new_node->first_prev_arc = ARC_PtoX(arc);
2248 remove_arc_arriving(fst, old_node, arc);
2249 arc->linkl_prev_arc = FSMARC_NULL;
2250
2251 last_new_next = FSMARC_NULL;
2252 for (atok = old_node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
2253 {
2254 atoken = ARC_XtoP(atok);
2255 new_arc_id = fst_get_free_arc(fst);
2256 if (new_arc_id == MAXarcID)
2257 return FST_FAILED_ON_MEMORY;
2258 new_next = ARC_ItoX(new_arc_id);
2259 if (last_new_next != FSMARC_NULL)
2260 LINKL_NEXT(last_new_next) = new_next;
2261 else
2262 new_node->un_ptr.first_next_arc = new_next;
2263 new_next_p = ARC_XtoP(new_next);
2264 new_next_p->ilabel = atoken->ilabel;
2265 new_next_p->to_node = atoken->to_node;
2266 new_next_p->fr_node = arc->to_node;
2267 new_next_p->olabel = atoken->olabel;
2268 IF_DEBUG_WDADD(new_next_p->ilabel_str = atoken->ilabel_str);
2269 IF_DEBUG_WDADD(new_next_p->olabel_str = atoken->olabel_str);
2270 new_next_p->linkl_next_arc = FSMARC_NULL; /* set at next loop */
2271 append_arc_arriving_node(fst, NODE_XtoP(atoken->to_node), new_next);
2272 last_new_next = new_next;
2273 }
2274 return FST_SUCCESS;
2275 }
2276
fst_add_arcs(srec_context * fst,nodeID start_node,nodeID end_node,wordID add_tree_olabel,costdata add_tree_cost,modelID * add_tree_start,int add_tree_len)2277 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
2278 wordID add_tree_olabel, costdata add_tree_cost,
2279 modelID* add_tree_start, int add_tree_len)
2280 {
2281 int rc = FST_SUCCESS;
2282 FSMarc_ptr atok = (FSMarc_ptr)0, last_atok = (FSMarc_ptr)0;
2283 FSMarc* atoken = NULL;
2284 FSMarc *next_atoken, *prev_atoken;
2285 FSMarc *append_to;
2286 FSMnode *late_start_node, *early_end_node;
2287 FSMnode *ntoken, *last_ntoken;
2288 FSMnode_ptr ntok, last_ntok;
2289 modelID *add_tree_end = add_tree_start + add_tree_len - 1;
2290 modelID *add_tree_late_start, *add_tree_early_end;
2291 modelID *add_tree;
2292 arcID new_arc_id, atokid;
2293 nodeID new_node_id, atokid_node;
2294 wordID add_tree_olabel_use = add_tree_olabel;
2295 wordID add_tree_cost_use = add_tree_cost;
2296
2297 append_to = NULL;
2298 add_tree = add_tree_start;
2299 #define FST_GROW_MINARCS 100
2300 #define FST_GROW_MINNODES 100
2301 #if defined(FST_GROW_FACTOR)
2302
2303 /* make sure we have enough arcs available */
2304 if(fst->num_arcs + add_tree_len >= fst->FSMarc_list_len)
2305 {
2306 FSMarc* tmp_FSMarc_list;
2307 arcID tmp_FSMarc_list_len;
2308 int itmp_FSMarc_list_len = fst->FSMarc_list_len*FST_GROW_FACTOR ;
2309
2310 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < FST_GROW_MINARCS)
2311 itmp_FSMarc_list_len += FST_GROW_MINARCS;
2312
2313 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < add_tree_len)
2314 itmp_FSMarc_list_len += add_tree_len;
2315
2316 if(itmp_FSMarc_list_len >= (int)MAXarcID)
2317 return FST_FAILED_ON_MEMORY;
2318
2319 tmp_FSMarc_list_len = (arcID)itmp_FSMarc_list_len;
2320
2321 tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(tmp_FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
2322 if(!tmp_FSMarc_list) {
2323 PLogError("error: Failed to extend the memory for arcs\n");
2324 return FST_FAILED_INTERNAL;
2325 }
2326 memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));
2327
2328 /* initialize the new free list.
2329 head of this new free list is tmp_FSMarc_list[ fst->FSMarc_list_len] */
2330 for(atokid=fst->FSMarc_list_len; atokid<tmp_FSMarc_list_len-1; atokid++) {
2331 tmp_FSMarc_list[atokid].linkl_next_arc = ARC_ItoX(atokid+1);
2332 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2333 }
2334 tmp_FSMarc_list[atokid].linkl_next_arc = FSMARC_NULL;
2335 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE;
2336
2337 /* scan to the end of the old free list (to be re-used) */
2338 arcID* last_free_arc = &fst->FSMarc_freelist;
2339 atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ;
2340
2341 for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
2342 atoken = &tmp_FSMarc_list[ *last_free_arc];
2343
2344 /* append the new free list to the current free list */
2345 *last_free_arc = fst->FSMarc_list_len;
2346
2347 FREE( fst->FSMarc_list);
2348 fst->FSMarc_list = tmp_FSMarc_list;
2349 fst->FSMarc_list_len = tmp_FSMarc_list_len;
2350 }
2351
2352 /* make sure we have enough nodes available */
2353 if(fst->num_nodes + add_tree_len >= fst->FSMnode_list_len)
2354 {
2355 FSMnode* tmp_FSMnode_list;
2356 nodeID tmp_FSMnode_list_len;
2357 FSMnode_info* tmp_FSMnode_info_list;
2358 int itmp_FSMnode_list_len = fst->FSMnode_list_len * FST_GROW_FACTOR ;
2359
2360 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < FST_GROW_MINNODES)
2361 itmp_FSMnode_list_len += FST_GROW_MINNODES;
2362
2363 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < add_tree_len)
2364 itmp_FSMnode_list_len += add_tree_len;
2365
2366 if(itmp_FSMnode_list_len >= (int)MAXnodeID)
2367 return FST_FAILED_ON_MEMORY;
2368
2369 tmp_FSMnode_list_len = (nodeID)itmp_FSMnode_list_len;
2370
2371 tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(tmp_FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
2372 if(!tmp_FSMnode_list) {
2373 PLogError("ERROR: Failed to extend the memory for nodes\n");
2374 return FST_FAILED_INTERNAL;
2375 }
2376 memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));
2377
2378 /* initialize the new free node list.
2379 head of this new free list is tmp_FSMnode_list[ fst->FSMnode_list_len] */
2380 for(atokid_node=fst->FSMnode_list_len; atokid_node<tmp_FSMnode_list_len-1; atokid_node++)
2381 {
2382 tmp_FSMnode_list[atokid_node].un_ptr.next_node = NODE_ItoX(atokid_node + 1);
2383 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2384 }
2385 tmp_FSMnode_list[atokid_node].un_ptr.next_node = FSMNODE_NULL;
2386 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE;
2387
2388 /* scan to the end of the old free list (to be re-used) */
2389 nodeID* last_free_node = (&fst->FSMnode_freelist);
2390 ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ;
2391
2392 for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
2393 ntoken = &tmp_FSMnode_list[ *last_free_node];
2394
2395 /* append the new free list to the current free list */
2396 *last_free_node = fst->FSMnode_list_len;
2397
2398 FREE( fst->FSMnode_list);
2399
2400 tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR( tmp_FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
2401 if(!tmp_FSMnode_info_list) {
2402 PLogError("ERROR: Failed to extend the memory for node infos\n");
2403 return FST_FAILED_INTERNAL;
2404 }
2405 // copy in old node info
2406 memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));
2407
2408 // initialize the new node info
2409 for (atokid_node=fst->FSMnode_list_len; atokid_node < tmp_FSMnode_list_len; atokid_node++)
2410 tmp_FSMnode_info_list[atokid_node] = NODE_INFO_UNKNOWN;
2411
2412 FREE( fst->FSMnode_info_list);
2413 fst->FSMnode_info_list = tmp_FSMnode_info_list;
2414 fst->FSMnode_list = tmp_FSMnode_list;
2415 fst->FSMnode_list_len = tmp_FSMnode_list_len;
2416 }
2417 #endif
2418 late_start_node = &fst->FSMnode_list[start_node];
2419
2420 while (1)
2421 {
2422
2423 next_atoken = find_next_arc_with_ilabel(fst, late_start_node,
2424 *add_tree /*->ilabel*/,
2425 &append_to);
2426 if (next_atoken != NULL)
2427 {
2428 /* so next_atok->ilabel == add_tree->ilabel */
2429
2430 if (next_atoken->ilabel == WORD_BOUNDARY && *add_tree/*->ilabel*/ == WORD_BOUNDARY)
2431 {
2432 next_atoken = NULL;
2433 break; /* break as if nothing more can be shared! */
2434 // return FST_FAILED_ON_HOMONYM;
2435 }
2436
2437 if (num_arcs_arriving_gt_1(fst, NODE_XtoP(next_atoken->to_node)))
2438 {
2439 split_node_for_arc(fst, next_atoken);
2440 /* unfortunate side effect here, that if we later find out this
2441 was a homonym addition, then this expansion was useless and
2442 for now we don't undo it! */
2443 }
2444
2445 /* we shouldn't really push the olabel if it's the same for both! */
2446 if (next_atoken->olabel != add_tree_olabel_use)
2447 {
2448 if (next_atoken->olabel != WORD_EPSILON_LABEL)
2449 {
2450 if (fst_push_arc_olabel(fst, next_atoken) != FST_SUCCESS)
2451 {
2452 PLogError("error: internal error fst_push_arc_olabel()\n");
2453 return FST_FAILED_INTERNAL;
2454 }
2455 }
2456 #if DO_WEIGHTED_ADDWORD
2457 else
2458 fst_push_arc_cost(fst, next_atoken);
2459 #endif
2460 }
2461 else if (add_tree_olabel_use != WORD_EPSILON_LABEL)
2462 {
2463 /* the graph already has this word, so we just re-use the olabel
2464 and disable the setting of olabel down below */
2465 add_tree_olabel_use = WORD_EPSILON_LABEL;
2466 #if DO_WEIGHTED_ADDWORD
2467 add_tree_cost_use = FREEcostdata;
2468 #endif
2469 }
2470 add_tree++;
2471 late_start_node = NODE_XtoP(next_atoken->to_node);
2472 }
2473 else
2474 {
2475 break;
2476 }
2477 }
2478
2479 add_tree_late_start = add_tree;
2480 early_end_node = &fst->FSMnode_list[end_node];
2481
2482 if (!do_minimize)
2483 {
2484 last_ntoken = NULL;
2485 last_ntok = NODE_PtoX(late_start_node);
2486 for (; add_tree <= add_tree_end; add_tree++)
2487 {
2488 new_arc_id = fst_get_free_arc(fst);
2489 if (new_arc_id == MAXarcID)
2490 {
2491 PLogError("fst_get_free_arc() failed\n");
2492 return FST_FAILED_ON_MEMORY;
2493 }
2494 atok = ARC_ItoX(new_arc_id);
2495 atoken = ARC_XtoP(atok);
2496 if (add_tree != add_tree_end)
2497 {
2498 new_node_id = fst_get_free_node(fst);
2499 if (new_node_id == MAXnodeID)
2500 {
2501 PLogError("fst_get_free_node() failed\n");
2502 return FST_FAILED_ON_MEMORY;
2503 }
2504 ntok = NODE_ItoX(new_node_id);
2505 ntoken = NODE_XtoP(ntok);
2506 ntoken->first_prev_arc = atok;
2507 }
2508 else
2509 {
2510 ntok = FSMNODE_NULL;
2511 ntoken = NULL;
2512 }
2513 atoken->fr_node = last_ntok; /* was NODE_PtoX(late_start_node) */
2514 atoken->to_node = ntok;
2515 atoken->ilabel = *add_tree;
2516 atoken->linkl_next_arc = FSMARC_NULL;
2517 atoken->linkl_prev_arc = FSMARC_NULL;
2518 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2519 if (!last_ntoken)
2520 {
2521 append_to->linkl_next_arc = atok;
2522 atoken->olabel = add_tree_olabel_use;
2523 #if DO_WEIGHTED_ADDWORD
2524 atoken->cost = add_tree_cost_use;
2525 #endif
2526 IF_DEBUG_WDADD(atok->olabel_str = fst->olabels->words[ atoken->olabel]);
2527 }
2528 else
2529 {
2530 last_ntoken->un_ptr.first_next_arc = atok;
2531 }
2532 last_ntoken = ntoken;
2533 last_ntok = ntok;
2534 }
2535 /* add_tree_end->to_node = early_end_node;
2536 add_tree_end->linkl_next_arc = FSMARC_NULL;
2537 atok = last_arc_arriving( early_end_node);
2538 atok->linkl_prev_arc = add_tree_end;
2539 add_tree_end->linkl_prev_arc = FSMARC_NULL; */
2540 atoken->to_node = NODE_ItoX(end_node);
2541 append_arc_arriving_node(fst, &fst->FSMnode_list[end_node], atok);
2542 }
2543
2544 else /* ie. do_minimize from the rear */
2545 {
2546
2547 add_tree = add_tree_end;
2548 new_arc_id = fst_get_free_arc(fst);
2549 if (new_arc_id == MAXarcID)
2550 return FST_FAILED_ON_MEMORY;
2551 atok = ARC_ItoX(new_arc_id);
2552 atoken = ARC_XtoP(atok);
2553 atoken->olabel = add_tree_olabel_use;
2554 #if DO_WEIGHTED_ADDWORD
2555 atoken->cost = add_tree_cost_use;
2556 #endif
2557 atoken->ilabel = *add_tree_late_start;
2558 atoken->fr_node = NODE_PtoX(late_start_node);
2559 if (atoken->ilabel == WORD_BOUNDARY)
2560 {
2561 atoken->cost = (costdata)(atoken->cost + fst->wtw_average);
2562 }
2563 else
2564 {
2565 atoken->cost = add_tree_cost_use + fst->wtw_average;
2566 add_tree_cost_use = FREEcostdata;
2567 }
2568 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[ atoken->olabel]);
2569 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]);
2570 append_arc_leaving_node(fst, late_start_node, atok);
2571
2572 last_atok = atok;
2573 while (1)
2574 {
2575
2576 if (add_tree == add_tree_late_start)
2577 break;
2578 /* if there are other arcs leaving this node then joining
2579 earlier than here will result in over-generation, so don't
2580 if( atok!=end_arc && atok->linkl_next_arc != FSMARC_NULL)
2581 break;
2582 */
2583 /*
2584 also need boundary conditions checker here
2585 */
2586
2587 /* */
2588 IF_DEBUG_WDADD(printf_node(fst, early_end_node));
2589
2590 /* if( num_arcs_leaving( add_tree_end->fr_node->first_prev_arc->to_node) >1) {
2591 printf("merge stopped due to num_arcs leaving ..\n");
2592 printf_arc(fst, add_tree_end->fr_node->first_prev_arc->to_node->first_next_arc);
2593 break;
2594 } */
2595
2596 for (atok = early_end_node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc)
2597 {
2598 atoken = ARC_XtoP(atok);
2599 /* never pull before the slot marker */
2600 if(atoken->cost == DISABLE_ARC_COST) break;
2601 if(atoken->olabel>0 && atoken->olabel<fst->olabels->num_slots) break;
2602 fst_pull_arc_olabel(fst, atoken); /* fails are ok */
2603 }
2604
2605 prev_atoken = find_prev_arc_with_iolabels(fst, early_end_node,
2606 *add_tree /*->ilabel*/,
2607 WORD_EPSILON_LABEL,
2608 NULL /*&append_to*/);
2609 if (!prev_atoken)
2610 break;
2611 else
2612 {
2613 /* this check is now inside find_prev_arc_with_iolabels */
2614 /* if( num_arcs_leaving(prev_atok->fr_node->first_prev_arc->to_node->first_next_arc)>1)*/
2615
2616 /* now look to merge earlier still */
2617 early_end_node = NODE_XtoP(prev_atoken->fr_node);
2618 add_tree--;
2619 if (add_tree == add_tree_late_start)
2620 break;
2621 }
2622 }
2623 add_tree_early_end = add_tree;
2624 for (add_tree = add_tree_late_start + 1; add_tree <= add_tree_early_end; add_tree++)
2625 {
2626 new_node_id = fst_get_free_node(fst);
2627 if (new_node_id == MAXnodeID)
2628 return FST_FAILED_ON_MEMORY;
2629 ntok = NODE_ItoX(new_node_id);
2630 ntoken = NODE_XtoP(ntok);
2631 new_arc_id = fst_get_free_arc(fst);
2632 if (new_arc_id == MAXarcID)
2633 return FST_FAILED_ON_MEMORY;
2634 atok = ARC_ItoX(new_arc_id);
2635 atoken = ARC_XtoP(atok);
2636 atoken->ilabel = *add_tree;
2637 #if DO_WEIGHTED_ADDWORD
2638 atoken->cost = FREEcostdata; /* idea: distribute add_tree_cost here;*/
2639 #endif
2640 atoken->olabel = WORD_EPSILON_LABEL;
2641 atoken->fr_node = ntok;
2642 atoken->to_node = FSMNODE_NULL; /* filled in next loop */
2643 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[atoken->olabel]);
2644 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[atoken->ilabel]);
2645
2646 ntoken->un_ptr.first_next_arc = atok;
2647 ntoken->first_prev_arc = last_atok;
2648 TO_NODE(last_atok) = ntok;
2649 last_atok = atok;
2650 }
2651 TO_NODE(last_atok) = NODE_PtoX(early_end_node);
2652 append_arc_arriving_node(fst, early_end_node, last_atok);
2653 }
2654 return rc;
2655 }
2656
2657
append_arc_leaving_node(srec_context * fst,FSMnode * fr_node,FSMarc_ptr arc)2658 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr arc)
2659 {
2660 FSMarc_ptr* atok = &fr_node->un_ptr.first_next_arc;
2661 FSMarc* atoken = ARC_XtoP(*atok);
2662 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_next_arc)
2663 {
2664 atoken = ARC_XtoP(*atok);
2665 }
2666 *atok = arc;
2667 LINKL_NEXT(arc) = FSMARC_NULL;
2668 }
append_arc_arriving_node(srec_context * fst,FSMnode * to_node,FSMarc_ptr arc)2669 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr arc)
2670 {
2671 FSMarc_ptr* atok = &to_node->first_prev_arc;
2672 FSMarc* atoken = ARC_XtoP(*atok);
2673 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc)
2674 {
2675 atoken = ARC_XtoP(*atok);
2676 }
2677 *atok = arc;
2678 LINKL_PREV(arc) = FSMARC_NULL;
2679 }
2680
2681
printf_node1(srec_context * fst,FSMnode * node)2682 int printf_node1(srec_context* fst, FSMnode* node)
2683 {
2684 return 0;
2685 }
printf_arc1(srec_context * fst,char * msg,FSMarc * arc)2686 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc)
2687 {
2688 char buffer[MAX_LINE_LENGTH];
2689 sprintf_arc(buffer, fst, arc);
2690 printf("%s%s\n", msg, buffer);
2691 return 0;
2692 }
sprintf_arc(char * buf,srec_context * fst,FSMarc * arc)2693 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc)
2694 {
2695 int rc;
2696 FSMnode* to_node = NODE_XtoP(arc->to_node);
2697 arcID arc_index = (arcID)(arc - fst->FSMarc_list);
2698 if (to_node->un_ptr.first_next_arc == FSMARC_NULL)
2699 rc = sprintf(buf, "arc%hu\n", arc_index);
2700 else
2701 {
2702 rc = sprintf(buf, "arc%hu\t%hu,%hu\t%s\t%s\t%hu\n",
2703 arc_index,
2704 ARC_XtoI(to_node->un_ptr.first_next_arc),
2705 arc->linkl_next_arc != FSMARC_NULL ? ARC_XtoI(arc->linkl_next_arc) : -1,
2706 fst->ilabels->words[arc->ilabel],
2707 fst->olabels->words[arc->olabel],
2708 arc->cost);
2709 }
2710 return rc;
2711 }
2712
2713 /* dumps the recognition context as a binary file,
2714 it will also store any empty space for growing */
2715
2716 #define ENCODE(SIGN,TYP) { SIGN+=sizeof(TYP); SIGN=SIGN<<3; ASSERT((shifted+=3)<32); }
2717
FST_sizes_signature()2718 asr_int32_t FST_sizes_signature()
2719 {
2720 #ifndef NDEBUG
2721 int shifted = 0;
2722 #endif
2723 asr_int32_t signature = 0;
2724 ENCODE(signature, arcID);
2725 ENCODE(signature, nodeID);
2726 ENCODE(signature, wordID);
2727 ENCODE(signature, labelID);
2728 ENCODE(signature, costdata);
2729 return signature;
2730 }
2731
FST_ContextImageSize(srec_context * context)2732 int FST_ContextImageSize(srec_context* context)
2733 {
2734 int size = 0;
2735 size += sizeof(srec_context);
2736 size += sizeof(wordmap);
2737 size += sizeof(char*) * context->olabels->max_words;
2738 size += sizeof(char) * context->olabels->max_chars;
2739 size += sizeof(wordmap);
2740 if (context->ilabels->words != NULL)
2741 size += sizeof(char*) * context->olabels->max_words;
2742 if (context->ilabels->chars != NULL)
2743 size += sizeof(char) * context->olabels->max_chars;
2744 size += sizeof(FSMarc) * context->FSMarc_list_len;
2745 size += sizeof(FSMnode) * context->FSMnode_list_len;
2746 size += sizeof(FSMnode_info) * context->FSMnode_list_len;
2747 size += sizeof(arc_token) * context->arc_token_list_len;
2748 return size;
2749 }
2750
serializeWordMapV2(wordmap * wordmap,PFile * fp)2751 ESR_ReturnCode serializeWordMapV2(wordmap *wordmap, PFile* fp)
2752 {
2753 unsigned int i = 0;
2754 unsigned int nfields;
2755 unsigned int tmp2[32];
2756
2757 i = 0;
2758 tmp2[i++] = wordmap->num_words;
2759 tmp2[i++] = wordmap->num_slots;
2760 tmp2[i++] = wordmap->max_words;
2761 tmp2[i++] = wordmap->num_base_words;
2762 tmp2[i++] = wordmap->max_chars;
2763 tmp2[i++] = (wordmap->next_chars - wordmap->chars);
2764 tmp2[i++] = (wordmap->next_base_chars - wordmap->chars);
2765 nfields = i;
2766 if (pfwrite(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2767 return ESR_WRITE_ERROR;
2768
2769 if (pfwrite(wordmap->chars, sizeof(char), wordmap->max_chars, fp) != (size_t)wordmap->max_chars)
2770 return ESR_WRITE_ERROR;
2771
2772 return ESR_SUCCESS;
2773 }
2774
deserializeWordMapV2(wordmap ** pwordmap,PFile * fp)2775 ESR_ReturnCode deserializeWordMapV2(wordmap **pwordmap, PFile* fp)
2776 {
2777 unsigned int i = 0;
2778 unsigned int nfields;
2779 unsigned int tmp2[32];
2780 wordmap *awordmap;
2781 char *p;
2782 ESR_ReturnCode rc = ESR_SUCCESS;
2783 unsigned int next_chars_idx, next_base_chars_idx;
2784
2785 awordmap = NEW(wordmap, L("srec.g2g.graph.wordmap.base"));
2786 if (awordmap == NULL)
2787 {
2788 PLogError("NEW failed on srec.g2g.graph.wordmap.base\n");
2789 rc = ESR_OUT_OF_MEMORY;
2790 goto CLEANUP;
2791 }
2792 awordmap->wordIDForWord = NULL; // early break to cleanup needs this
2793
2794 nfields = 7;
2795 if (pfread(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields)
2796 {
2797 PLogError("pfread failed when reading nfields\n");
2798 rc = ESR_READ_ERROR;
2799 goto CLEANUP;
2800 }
2801
2802 i = 0;
2803 awordmap->num_words = (wordID)tmp2[i++];
2804 awordmap->num_slots = (wordID)tmp2[i++];
2805 awordmap->max_words = (wordID)tmp2[i++];
2806 awordmap->num_base_words = (wordID)tmp2[i++];
2807 awordmap->max_chars = tmp2[i++];
2808 next_chars_idx = tmp2[i++];
2809 next_base_chars_idx = tmp2[i++];
2810 ASSERT(nfields == i);
2811
2812 awordmap->words = NEW_ARRAY(char*, awordmap->max_words, L("srec.g2g.graph.wordmap.words"));
2813 if (awordmap->words == NULL)
2814 {
2815 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.words %d\n", awordmap->max_words);
2816 rc = ESR_OUT_OF_MEMORY;
2817 goto CLEANUP;
2818 }
2819 awordmap->chars = NEW_ARRAY(char, awordmap->max_chars, L("srec.g2g.graph.wordmap.chars"));
2820 if (awordmap->chars == NULL)
2821 {
2822 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.chars %d\n", awordmap->max_chars);
2823 rc = ESR_OUT_OF_MEMORY;
2824 goto CLEANUP;
2825 }
2826 awordmap->next_chars = awordmap->chars + next_chars_idx;
2827 awordmap->next_base_chars = awordmap->chars + next_base_chars_idx;
2828
2829 if ((i=pfread(awordmap->chars, sizeof(char), awordmap->max_chars, fp)) != (size_t)awordmap->max_chars)
2830 {
2831 PLogError("pfread failed while reading %d chars\n", awordmap->max_chars);
2832 rc = ESR_READ_ERROR;
2833 goto CLEANUP;
2834 }
2835
2836
2837 p = awordmap->chars;
2838 ASSERT((uintptr_t)p % 2 == 0);
2839 nfields = 0;
2840 if (nfields < awordmap->num_words)
2841 awordmap->words[nfields++] = p;
2842 for (; p < awordmap->next_chars; p++) // was next_base_chars
2843 {
2844 if ( ( *p ) == '\0' )
2845 {
2846 if (nfields == awordmap->num_words) // was num_base_words
2847 break;
2848 if (((uintptr_t)p) % 2 == 0) p++; /* so that words begin on even byte bound */
2849 awordmap->words[nfields++] = p + 1;
2850 }
2851 }
2852 ASSERT(nfields == awordmap->num_words); // was num_base_words
2853
2854 if (awordmap->max_words >= awordmap->num_base_words)
2855 {
2856 PHashTableArgs hashArgs;
2857 hashArgs.capacity = awordmap->max_words;
2858 if(hashArgs.capacity%2==0) hashArgs.capacity += 1;
2859 hashArgs.compFunction = HashCmpWord; //PHASH_TABLE_DEFAULT_COMP_FUNCTION;
2860 hashArgs.hashFunction = HashGetCode; //PHASH_TABLE_DEFAULT_HASH_FUNCTION;
2861 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR;
2862 CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.deserializeWordMap()"), &awordmap->wordIDForWord));
2863
2864 rc = wordmap_populate ( awordmap, awordmap->num_words );
2865
2866 if (rc != ESR_SUCCESS)
2867 {
2868 wordmap_clean ( awordmap );
2869 goto CLEANUP;
2870 }
2871 }
2872 else
2873 {
2874 awordmap->wordIDForWord = NULL;
2875 }
2876
2877 /* success */
2878 *pwordmap = awordmap;
2879 return ESR_SUCCESS;
2880
2881 CLEANUP:
2882 if (awordmap != NULL)
2883 {
2884 if (awordmap->wordIDForWord != NULL)
2885 PHashTableDestroy(awordmap->wordIDForWord);
2886 if (awordmap->words != NULL) FREE(awordmap->words);
2887 if (awordmap->chars != NULL) FREE(awordmap->chars);
2888 FREE(awordmap);
2889 }
2890 return rc;
2891 }
2892
serializeArcToken(arc_token * token_base,int i,PFile * fp)2893 static ESR_ReturnCode serializeArcToken(arc_token *token_base,
2894 int i,
2895 PFile* fp)
2896 {
2897 arc_token *token = token_base + i;
2898 asr_uint32_t idx;
2899
2900 if (pfwrite(&token->ilabel, 2, 1, fp) != 1)
2901 return ESR_WRITE_ERROR;
2902
2903 if (pfwrite(&token->olabel, 2, 1, fp) != 1)
2904 return ESR_WRITE_ERROR;
2905
2906 /* if (pfwrite(&token->cost, 2, 1, fp) != 1)
2907 return ESR_WRITE_ERROR; */
2908 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->first_next_arc), token_base);
2909
2910 if (pfwrite(&idx, 4, 1, fp) != 1)
2911 return ESR_WRITE_ERROR;
2912
2913 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->next_token_index), token_base);
2914
2915 if (pfwrite(&idx, 4, 1, fp) != 1)
2916 return ESR_WRITE_ERROR;
2917
2918 return ESR_SUCCESS;
2919 }
2920
serializeArcTokenInfo(srec_context * context,PFile * fp)2921 static ESR_ReturnCode serializeArcTokenInfo(srec_context *context,
2922 PFile* fp)
2923 {
2924 int i;
2925 asr_uint32_t idx;
2926 ESR_ReturnCode rc;
2927
2928 if (pfwrite(&context->arc_token_list_len, 2, 1, fp) != 1)
2929 return ESR_WRITE_ERROR;
2930
2931 idx = PTR_TO_IDX(context->arc_token_freelist, context->arc_token_list);
2932
2933 if (pfwrite(&idx, 4, 1, fp) != 1)
2934 return ESR_WRITE_ERROR;
2935
2936 idx = PTR_TO_IDX(context->arc_token_insert_start, context->arc_token_list);
2937
2938 if (pfwrite(&idx, 4, 1, fp) != 1)
2939 return ESR_WRITE_ERROR;
2940
2941 for (i = 0; i < context->arc_token_list_len; ++i)
2942 {
2943 rc = serializeArcToken(context->arc_token_list, i, fp);
2944 if (rc != ESR_SUCCESS) return rc;
2945 }
2946 return ESR_SUCCESS;
2947 }
2948
deserializeArcToken(arc_token * token_base,int i,PFile * fp)2949 static ESR_ReturnCode deserializeArcToken(arc_token *token_base,
2950 int i,
2951 PFile* fp)
2952 {
2953 arc_token *token = token_base + i;
2954 asr_uint32_t idx[2];
2955 asr_uint16_t labels[2];
2956
2957 if (pfread(labels, 2, 2, fp) != 2)
2958 return ESR_READ_ERROR;
2959
2960 token->ilabel = labels[0];
2961 token->olabel = labels[1];
2962
2963 /* if (pfread(&token->cost, 2, 1, fp) != 1)
2964 return ESR_READ_ERROR; */
2965
2966 if (pfread(idx, 4, 2, fp) != 2)
2967 return ESR_READ_ERROR;
2968
2969 token->first_next_arc = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[0], token_base));
2970 token->next_token_index = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[1], token_base));
2971
2972 return ESR_SUCCESS;
2973 }
2974
deserializeArcTokenInfo(srec_context * context,PFile * fp)2975 static ESR_ReturnCode deserializeArcTokenInfo(srec_context *context,
2976 PFile* fp)
2977 {
2978 ESR_ReturnCode rc;
2979 int i;
2980 asr_uint32_t idx;
2981
2982 if (pfread(&context->arc_token_list_len, 2, 1, fp) != 1) {
2983 PLogError("pfread failed in deserializeArcTokenInfo()\n");
2984 return ESR_READ_ERROR;
2985 }
2986
2987 context->arc_token_list = NEW_ARRAY(arc_token, context->arc_token_list_len,
2988 L("srec.graph.wordgraph"));
2989
2990 if (context->arc_token_list == NULL)
2991 return ESR_OUT_OF_MEMORY;
2992
2993 if (pfread(&idx, 4, 1, fp) != 1)
2994 {
2995 rc = ESR_READ_ERROR;
2996 goto CLEANUP;
2997 }
2998
2999 context->arc_token_freelist =
3000 IDX_TO_PTR(idx, context->arc_token_list);
3001
3002 if (pfread(&idx, 4, 1, fp) != 1)
3003 {
3004 rc = ESR_READ_ERROR;
3005 goto CLEANUP;
3006 }
3007
3008 context->arc_token_insert_start =
3009 IDX_TO_PTR(idx, context->arc_token_list);
3010
3011 for (i = 0; i < context->arc_token_list_len; ++i)
3012 {
3013 rc = deserializeArcToken(context->arc_token_list, i, fp);
3014 if (rc != ESR_SUCCESS) goto CLEANUP;
3015 }
3016 return ESR_SUCCESS;
3017
3018 CLEANUP:
3019 FREE(context->arc_token_list);
3020 context->arc_token_list =
3021 context->arc_token_freelist =
3022 context->arc_token_insert_start = NULL;
3023 return rc;
3024 }
3025
FST_GetGrammarType(srec_context * context)3026 int FST_GetGrammarType(srec_context* context)
3027 {
3028 arc_token *t, *u;
3029 wordID expected_wdid;
3030 /* 0 1 0
3031 1 2 4
3032 ...
3033 1 2 1316
3034 2
3035 */
3036 t = context->arc_token_list;
3037 if (t->ilabel != WORD_EPSILON_LABEL)
3038 return GrammarTypeBNF;
3039 if (t->next_token_index)
3040 return GrammarTypeBNF;
3041 t = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3042 expected_wdid = NUM_ITEMLIST_HDRWDS;
3043 for (; t; t = ARC_TOKEN_PTR(context->arc_token_list, t->next_token_index))
3044 {
3045 if (t->ilabel != expected_wdid)
3046 return GrammarTypeBNF;
3047 u = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc);
3048 if (u != NULL && u->ilabel != MAXwordID)
3049 return GrammarTypeBNF;
3050 expected_wdid++;
3051 }
3052 if (expected_wdid != context->olabels->num_words)
3053 return GrammarTypeBNF;
3054 return GrammarTypeItemList;
3055 }
3056
FST_DumpContextAsImageV2(srec_context * context,PFile * fp)3057 int FST_DumpContextAsImageV2(srec_context* context, PFile* fp)
3058 {
3059 asr_uint32_t header[4];
3060 arcID tmp[32], i, j, nfields;
3061 FSMarc* arc;
3062 ESR_ReturnCode rc;
3063 #if !defined(DO_ARCS_IO_IN_ARC_ORDER)
3064 FSMnode* node;
3065 arcID arcid, num_arcs, num_arcs_written;
3066 #endif
3067
3068 /* Write header information. */
3069 header[0] = 0;
3070 header[1] = IMAGE_FORMAT_V2;
3071 header[2] = context->modelid;
3072 header[3] = context->grmtyp;
3073 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3074 PLogMessage("FST_DumpContextAsImageV2() fstgrmtyp %d", context->grmtyp);
3075 #endif
3076
3077 if (pfwrite(header, sizeof(header[0]), 4, fp) != 4)
3078 {
3079 PLogError("FST_DumpContextAsImage: Could not write header.\n");
3080 return FST_FAILED_INTERNAL;
3081 }
3082
3083 /* arcs and nodes */
3084 i = 0;
3085 tmp[i++] = context->num_arcs;
3086 tmp[i++] = context->FSMarc_list_len;
3087 tmp[i++] = context->num_base_arcs;
3088 tmp[i++] = context->FSMarc_freelist;
3089 tmp[i++] = context->max_searchable_arcs;
3090
3091 tmp[i++] = context->num_nodes;
3092 tmp[i++] = context->FSMnode_list_len;
3093 tmp[i++] = context->num_base_nodes;
3094 tmp[i++] = context->FSMnode_freelist;
3095 tmp[i++] = context->start_node;
3096 tmp[i++] = context->end_node;
3097 tmp[i++] = context->max_searchable_nodes;
3098
3099 tmp[i++] = context->beg_silence_word;
3100 tmp[i++] = context->end_silence_word;
3101 tmp[i++] = context->hack_silence_word;
3102 tmp[i++] = context->hmm_ilabel_offset;
3103
3104 nfields = i;
3105
3106 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3107 return ESR_WRITE_ERROR;
3108 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3109 PLogMessage("G2G done WR hdrs %d", pftell(fp));
3110 #endif
3111
3112 #if defined(DO_ARCS_IO_IN_ARC_ORDER)
3113 if( 1) {
3114 i = 0;
3115 tmp[i++] = context->end_node;
3116 tmp[i++] = MAXnodeID;
3117 tmp[i++] = MAXmodelID;
3118 tmp[i++] = MAXwordID;
3119 tmp[i++] = FREEcostdata;
3120 nfields = i;
3121 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3122 return ESR_WRITE_ERROR;
3123 }
3124 for(j=0; j<context->num_arcs; j++) {
3125 arc = &context->FSMarc_list[j];
3126 i = 0;
3127 tmp[i++] = arc->fr_node;
3128 tmp[i++] = arc->to_node;
3129 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3130 tmp[i++] = arc->ilabel;
3131 tmp[i++] = arc->olabel;
3132 tmp[i++] = arc->cost;
3133 nfields = i;
3134 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3135 return ESR_WRITE_ERROR;
3136 }
3137 #else
3138 num_arcs_written = 0;
3139 for (j = 0; j < context->num_nodes; j++) {
3140 node = &context->FSMnode_list[j];
3141 num_arcs = (arcID)num_arcs_leaving(context, node);
3142 arcid = node->un_ptr.first_next_arc;
3143 if (arcid == MAXarcID) {
3144 i = 0;
3145 tmp[i++] = (arcID)j;
3146 tmp[i++] = MAXarcID;
3147 tmp[i++] = MAXarcID;
3148 tmp[i++] = MAXarcID;
3149 tmp[i++] = MAXarcID;
3150 nfields = i;
3151 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3152 return ESR_WRITE_ERROR;
3153 /* num_arcs_written++; end node don't have a to-arc */
3154 }
3155 else {
3156 for (; arcid != MAXarcID; arcid = arc->linkl_next_arc) {
3157 arc = &context->FSMarc_list[arcid];
3158 i = 0;
3159 tmp[i++] = (arcID)j;
3160 tmp[i++] = arc->to_node;
3161 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID);
3162 tmp[i++] = arc->ilabel;
3163 tmp[i++] = arc->olabel;
3164 tmp[i++] = arc->cost;
3165 nfields = i;
3166 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3167 return ESR_WRITE_ERROR;
3168 num_arcs_written++;
3169 }
3170 }
3171 }
3172 ASSERT(num_arcs_written == context->num_arcs);
3173 #endif
3174
3175 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3176 PLogMessage("G2G done WR arcs %d", pftell(fp));
3177 #endif
3178
3179 /* node info .. will now be calculated on line */
3180 /* wrapup_cost .. will now be calculated on line */
3181
3182 /* exit points for slots */
3183 if (1)
3184 {
3185 srec_fsm_exit_point *p, *q;
3186 if (pfwrite(&context->num_fsm_exit_points, 2, 1, fp) != 1)
3187 return ESR_WRITE_ERROR;
3188 p = context->fsm_exit_points;
3189 q = p + MAX_NUM_SLOTS;
3190 while (p < q)
3191 {
3192 if (pfwrite(&p->from_node_index, 2, 1, fp) != 1)
3193 return ESR_WRITE_ERROR;
3194 if (pfwrite(&p->arc_index, 2, 1, fp) != 1)
3195 return ESR_WRITE_ERROR;
3196 if (pfwrite(&p->wbto_node_index, 2, 1, fp) != 1)
3197 return ESR_WRITE_ERROR;
3198 ++p;
3199 }
3200 }
3201 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3202 PLogMessage("G2G done WR exits %d", pftell(fp));
3203 #endif
3204 /* no entry points stuff */
3205
3206 /* no saving ilabels */
3207
3208 /* now load the ilabels */
3209 rc = serializeWordMapV2(context->olabels, fp);
3210 if (rc != ESR_SUCCESS)
3211 return rc;
3212 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3213 PLogMessage("G2G done WR wmap %d", pftell(fp));
3214 #endif
3215
3216 if (context->grmtyp != GrammarTypeItemList)
3217 {
3218 if (serializeArcTokenInfo(context, fp) != ESR_SUCCESS)
3219 {
3220 PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3221 return FST_FAILED_INTERNAL;
3222 }
3223 }
3224 else
3225 {
3226 /* nothing, at decoding time, we'll just construct the graph
3227 from the word list, ie just this ..
3228 0 1 0
3229 1 2 4
3230 ...
3231 1 2 1316
3232 2
3233 ie
3234 0 1 0
3235 for(i=4;i<nwords;i++) { "1 2 i" }
3236 2
3237 because words 0-3 are eps,-pau-,-pau2-,slot@grammarname.grxml
3238 */
3239 }
3240
3241 /* Write header information. */
3242 header[0] = pftell(fp);
3243 header[1] = IMAGE_FORMAT_V2;
3244
3245 /* goto beginning of file. */
3246 if (pfseek(fp, 0, SEEK_SET))
3247 {
3248 PLogError("FST_DumpContextAsImage: could not reposition for header.\n");
3249 return FST_FAILED_INTERNAL;
3250 }
3251
3252 if (pfwrite(header, 4, 2, fp) != 2)
3253 {
3254 PLogError("FST_DumpContextAsImage: Could not write header.\n");
3255 return FST_FAILED_INTERNAL;
3256 }
3257
3258 /* reset pointer at end of file. */
3259 if (pfseek(fp, 0, SEEK_END))
3260 {
3261 PLogError("FST_DumpContextAsImage: could not reposition file pointer at end.\n");
3262 return FST_FAILED_INTERNAL;
3263 }
3264
3265 return FST_SUCCESS;
3266 }
3267
3268
FST_LoadContextFromImageV2(srec_context * fst,PFile * fp)3269 int FST_LoadContextFromImageV2(srec_context* fst, PFile* fp)
3270 {
3271 asr_int32_t header[6];
3272 arcID tmp[32], num_arcs, new_arc_id;
3273 unsigned int i, nfields;
3274 srec_fsm_exit_point *p, *q;
3275 arcID max_num_FSMarcs;
3276 nodeID max_num_FSMnodes;
3277 FSMarc_ptr atok = (FSMarc_ptr)0;
3278 FSMarc *atoken = NULL;
3279 FSMnode *fr_node, *to_node;
3280 int rc = FST_SUCCESS;
3281 ESR_ReturnCode esr_rc;
3282 ESR_BOOL seenFinalNode = ESR_FALSE;
3283
3284 /* read header information. */
3285 if (pfread(&header[2], sizeof(header[0]), 2, fp) != 2)
3286 {
3287 PLogError("FST_DumpContextAsImage: Could not read header.\n");
3288 return FST_FAILED_INTERNAL;
3289 }
3290 fst->modelid = header[2];
3291 fst->grmtyp = header[3];
3292
3293 nfields = 16;
3294 if (pfread(tmp, sizeof(tmp[0]), nfields, fp) != nfields)
3295 return ESR_READ_ERROR;
3296
3297 /* arcs and nodes */
3298 i = 0;
3299 fst->num_arcs = max_num_FSMarcs = tmp[i++];
3300 fst->FSMarc_list_len = tmp[i++];
3301 fst->num_base_arcs = tmp[i++];
3302 fst->FSMarc_freelist = tmp[i++];
3303 fst->max_searchable_arcs = tmp[i++] * 0; /*5*/
3304 fst->num_nodes = max_num_FSMnodes = tmp[i++];
3305 fst->FSMnode_list_len = tmp[i++];
3306 fst->num_base_nodes = tmp[i++];
3307 fst->FSMnode_freelist = tmp[i++];
3308 fst->start_node = tmp[i++];/*10*/
3309 fst->end_node = tmp[i++];
3310 fst->max_searchable_nodes = tmp[i++] * 0;
3311 fst->beg_silence_word = tmp[i++];
3312 fst->end_silence_word = tmp[i++];
3313 fst->hack_silence_word = tmp[i++]; /*15*/
3314 fst->hmm_ilabel_offset = tmp[i++]; /* 16 */
3315
3316 fst->addWordCaching_lastslot_name = 0;
3317 fst->addWordCaching_lastslot_num = MAXwordID;
3318 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;
3319
3320 ASSERT(i == nfields);
3321 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3322 PLogMessage("G2G done RD hdrs %d", pftell(fp));
3323 #endif
3324
3325 ASSERT(fst->num_arcs >= fst->num_base_arcs); // was ==
3326 ASSERT(fst->num_nodes >= fst->num_base_nodes); // was ==
3327
3328 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
3329 if (!fst->FSMarc_list)
3330 {
3331 rc = FST_FAILED_ON_MEMORY;
3332 PLogError("CALLOC_CLR fst->FSMarc_list \n");
3333 goto CLEANUP;
3334 }
3335 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
3336 if (!fst->FSMnode_list)
3337 {
3338 rc = FST_FAILED_ON_MEMORY;
3339 PLogError("CALLOC_CLR fst->FSMnode_list failed\n");
3340 goto CLEANUP;
3341 }
3342 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
3343
3344 if (!fst->FSMnode_info_list)
3345 {
3346 rc = FST_FAILED_ON_MEMORY;
3347 PLogError("CALLOC_CLR fst->FSMnode_info_list failed\n");
3348 goto CLEANUP;
3349 }
3350
3351 /* setup the arc freelist */
3352 fst->FSMarc_freelist = 0;
3353 fst->num_arcs = 0;
3354 for (i = 0; i < (unsigned)fst->FSMarc_list_len - 1; i++)
3355 {
3356 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1);
3357 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3358 }
3359 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL;
3360 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE;
3361
3362 /* initialize the nodes, 'cuz reading is random order */
3363 for (i = 0; i < max_num_FSMnodes; i++)
3364 {
3365 fr_node = &fst->FSMnode_list[i];
3366 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL;
3367 }
3368
3369 /* 1. first load up all the information */
3370 num_arcs = 0;
3371 for (i = 0; i < max_num_FSMarcs || !seenFinalNode; i++)
3372 {
3373 if (i > max_num_FSMarcs && !seenFinalNode)
3374 {
3375 PLogError("Final node never encountered");
3376 rc = ESR_INVALID_STATE;
3377 goto CLEANUP;
3378 }
3379 nfields = 5;
3380 if (pfread(tmp,sizeof(tmp[0]),nfields,fp) != nfields)
3381 {
3382 PLogError("reading arc");
3383 rc = ESR_INVALID_STATE;
3384 goto CLEANUP;
3385 }
3386
3387 if (tmp[1] == MAXnodeID)
3388 {
3389 seenFinalNode = ESR_TRUE;
3390 if(fst->end_node != tmp[0]) {
3391 PLogError("error with arc %d->%d ilabel %d olabel %d cost %d\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4]);
3392 ASSERT(fst->end_node == tmp[0]);
3393 }
3394 i--;
3395 }
3396 else
3397 {
3398 new_arc_id = fst_get_free_arc(fst);
3399 if (new_arc_id == MAXarcID)
3400 return FST_FAILED_ON_MEMORY;
3401 atok = ARC_ItoX(new_arc_id);
3402 atoken = ARC_XtoP(atok);
3403 num_arcs++;
3404
3405 ASSERT(tmp[0] < fst->num_nodes);
3406 ASSERT(tmp[1] < fst->num_nodes);
3407 fr_node = &fst->FSMnode_list[ tmp[0]];
3408 to_node = &fst->FSMnode_list[ tmp[1]];
3409 atoken->ilabel = tmp[2];
3410 atoken->olabel = tmp[3];
3411 atoken->cost = tmp[4];
3412 append_arc_leaving_node(fst, fr_node, atok);
3413 append_arc_arriving_node(fst, to_node, atok);
3414 atoken->fr_node = NODE_ItoX(tmp[0]);
3415 atoken->to_node = NODE_ItoX(tmp[1]);
3416 }
3417 }
3418 ASSERT(fst->num_arcs == num_arcs);
3419 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3420 PLogMessage("G2G done RD arcs %d", pftell(fp));
3421 #endif
3422
3423 /* setup the node freelist */
3424 if (fst->num_nodes < fst->FSMnode_list_len)
3425 {
3426 fst->FSMnode_freelist = fst->num_nodes;
3427 for (i = fst->num_nodes; i < (unsigned)fst->FSMnode_list_len - 1; i++)
3428 {
3429 fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
3430 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3431 }
3432 fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
3433 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
3434 }
3435 else
3436 {
3437 fst->FSMnode_freelist = MAXnodeID;
3438 }
3439
3440 /* node info .. will now be calculated on line */
3441 /* wrapup_cost .. will now be calculated on line */
3442 fst->whether_prepared = 0;
3443 fst_fill_node_info(fst);
3444 for (i = 0; i < fst->num_nodes; i++)
3445 fst->FSMnode_info_list[i] = NODE_INFO_UNKNOWN;
3446
3447 /* exit points for slots */
3448 if (pfread(&fst->num_fsm_exit_points, 2, 1, fp) != 1)
3449 return ESR_READ_ERROR;
3450 p = fst->fsm_exit_points;
3451 q = p + MAX_NUM_SLOTS;
3452 while (p < q)
3453 {
3454 if (pfread(&p->from_node_index, 2, 1, fp) != 1)
3455 return ESR_WRITE_ERROR;
3456 if (pfread(&p->arc_index, 2, 1, fp) != 1)
3457 return ESR_WRITE_ERROR;
3458 if (pfread(&p->wbto_node_index, 2, 1, fp) != 1)
3459 return ESR_WRITE_ERROR;
3460 ++p;
3461 }
3462 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3463 PLogMessage("G2G done RD exits %d", pftell(fp));
3464 #endif
3465
3466 /* no entry points stuff */
3467
3468 /* ilabels were not saved, create them as empty */
3469 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap");
3470 fst->ilabels->num_words = fst->ilabels->max_words = 0;
3471 fst->ilabels->words = 0;
3472
3473 /* now save the olabels */
3474 esr_rc = deserializeWordMapV2(&fst->olabels, fp);
3475 if (esr_rc != ESR_SUCCESS)
3476 {
3477 PLogError("deserializeWordMapV2() failed\n");
3478 rc = FST_FAILED_INTERNAL;
3479 goto CLEANUP;
3480 }
3481 #ifdef SREC_ENGINE_VERBOSE_LOGGING
3482 PLogMessage("G2G done RD wmap %d", pftell(fp));
3483 #endif
3484
3485 /* arctokeninfo */
3486 if (fst->grmtyp != GrammarTypeItemList)
3487 {
3488 if (deserializeArcTokenInfo(fst, fp) != ESR_SUCCESS)
3489 {
3490 PLogError("FST_DumpContextAsImage: could not write arc token data.\n");
3491 rc = FST_FAILED_INTERNAL;
3492 goto CLEANUP;
3493 }
3494 }
3495 else
3496 {
3497 /* here we just construct an isolated list, but we should scrap it later */
3498 wordID wdid;
3499 arc_token *atl, *final, *atli;
3500 fst->arc_token_list_len = fst->olabels->num_words + 2 - NUM_ITEMLIST_HDRWDS;
3501 atl = NEW_ARRAY(arc_token, fst->arc_token_list_len, L("srec.graph.wordgraph"));
3502 if(!atl) {
3503 PLogError("fst->arc_token_list_len failed\n");
3504 goto CLEANUP;
3505 }
3506 atl->first_next_arc = ARC_TOKEN_LNK(atl, 1);
3507 atl->next_token_index = ARC_TOKEN_NULL;
3508 final = atl + fst->arc_token_list_len - 1;
3509 for (i = 1, wdid = NUM_ITEMLIST_HDRWDS; wdid < fst->olabels->num_words; i++, wdid++)
3510 {
3511 atli = &atl[i];
3512 atli->ilabel = wdid;
3513 atli->olabel = wdid;/*not used*/
3514 atli->next_token_index = ARC_TOKEN_LNK(atl, (i + 1));
3515 atli->first_next_arc = ARC_TOKEN_LNK(atl, (fst->arc_token_list_len - 1)); /*final*/
3516 }
3517 ASSERT(atl + i == final);
3518 final->next_token_index = ARC_TOKEN_NULL;
3519 final->first_next_arc = ARC_TOKEN_NULL;
3520 final->ilabel = final->olabel = 0;
3521 fst->arc_token_list = atl;
3522 }
3523 return FST_SUCCESS;
3524 CLEANUP:
3525 return FST_FAILED_INTERNAL;
3526 }
3527
3528
3529 int FST_LoadContextFromImageV2(srec_context* context, PFile* fp);
3530
FST_LoadContextFromImage(srec_context ** pcontext,PFile * fp)3531 int FST_LoadContextFromImage(srec_context** pcontext, PFile* fp)
3532 {
3533 srec_context* context = NULL;
3534 /* used to force correct data alignment */
3535 asr_int32_t header[2];
3536 int rc = FST_SUCCESS;
3537
3538 /*printf("FST_LoadContexFromImage\n");*/
3539 if (!fp)
3540 {
3541 PLogError("FST_LoadContextImage() failed; bad file pointer\n");
3542 return FST_FAILED_ON_INVALID_ARGS;
3543 }
3544
3545 context = NEW(srec_context, L("srec.graph.binary"));
3546 if (context == NULL)
3547 {
3548 PLogError("FST_LoadContextFromImage: out of memory while allocating context.\n");
3549 return FST_FAILED_ON_MEMORY;
3550 }
3551 memset(context, 0, sizeof(srec_context));
3552
3553 /* header */
3554 if (pfread(header, 4, 2, fp) != 2)
3555 {
3556 PLogError("FST_LoadContextFromImage: failed reading header.\n");
3557 rc = FST_FAILED_ON_INVALID_ARGS;
3558 goto CLEANUP;
3559 }
3560
3561 if (header[1] == IMAGE_FORMAT_V2)
3562 {
3563 rc = FST_LoadContextFromImageV2(context, fp);
3564 if (rc != FST_SUCCESS)
3565 goto CLEANUP;
3566 }
3567 else
3568 {
3569 PLogError("FST_LoadContextFromImage() failed on image_format\n");
3570 goto CLEANUP;
3571 }
3572 *pcontext = context;
3573 return FST_SUCCESS;
3574 CLEANUP:
3575 if (context) FREE(context);
3576 *pcontext = 0;
3577 return rc;
3578 }
3579
3580
FST_DumpReverseWordGraph(srec_context * context,PFile * fp)3581 int FST_DumpReverseWordGraph(srec_context* context, PFile* fp)
3582 {
3583 /* not implemented, use FST_DumpSyntaxAsImage() for now */
3584 return 1;
3585 }
3586
3587 /* this belongs part of the srec_context */
3588 typedef struct
3589 {
3590 asr_int32_t image_format;
3591 asr_int32_t image_size;
3592 asr_int32_t sizes_signature;
3593 }
3594 context_image_header;
3595
3596
FST_LoadContextFromStreamImage(srec_context ** pcontext,char * buffer,asr_int32_t buffer_len)3597 int FST_LoadContextFromStreamImage(srec_context** pcontext, char* buffer, asr_int32_t buffer_len)
3598 {
3599 return FST_FAILED_INTERNAL;
3600 }
3601
3602 typedef struct nodeID_list_t
3603 {
3604 nodeID *nodes, num_nodes, num_alloc;
3605 }
3606 nodeID_list;
3607
fst_node_has_speech_to_come(srec_context * context,nodeID node_index)3608 int fst_node_has_speech_to_come(srec_context* context, nodeID node_index)
3609 {
3610 /* recursive func ... try to keep stack use low! */
3611 /* later we should cache this information:
3612 need 2bits: endn opte regl dunno */
3613 FSMarc* arc;
3614 arcID arc_index = context->FSMnode_list[ node_index].un_ptr.first_next_arc;
3615 for (; arc_index != MAXarcID; arc_index = arc->linkl_next_arc)
3616 {
3617 arc = &context->FSMarc_list[arc_index];
3618 if (arc->ilabel >= context->hmm_ilabel_offset + EPSILON_OFFSET + NUM_SILENCE_HMMS)
3619 {
3620 return 1;
3621 }
3622 else if (arc->ilabel < EPSILON_OFFSET)
3623 {
3624 if (fst_node_has_speech_to_come(context, arc->to_node))
3625 {
3626 return 1;
3627 }
3628 }
3629 else if (IS_SILENCE_ILABEL(arc->ilabel, context))
3630 {
3631 /* another silence! */
3632 if (fst_node_has_speech_to_come(context, arc->to_node))
3633 return 1;
3634 }
3635 }
3636 return 0;
3637 }
3638
fst_fill_node_info(srec_context * context)3639 int fst_fill_node_info(srec_context* context)
3640 {
3641 char* infos = context->FSMnode_info_list;
3642 nodeID_list optends_obj;
3643 nodeID_list *optends = &optends_obj;
3644
3645 FSMnode* node;
3646 FSMarc* arc;
3647 nodeID i, j, node_index;
3648 arcID arc_index;
3649 costdata wrapup_cost;
3650 int wrapup_cost_inconsistencies;
3651 /* int num_near_end_nodes; nodes within EPS or silence to the end, these
3652 are called NODE_INFO_ENDNODES, so that they can be excluded from the
3653 comparison with end_node score in the eos detection */
3654
3655 optends->num_nodes = 1;
3656 optends->num_alloc = 8192; /* scaled up from 512 to support dynamic grammar add word of 5000 */
3657 optends->nodes = (nodeID*)CALLOC(optends->num_alloc, sizeof(nodeID), "srec.tmp.optendnodes");
3658 optends->nodes[0] = context->end_node;
3659 /* num_near_end_nodes = 0; */
3660
3661 for (i = 0; i < optends->num_nodes; i++)
3662 {
3663 node_index = optends->nodes[i];
3664 node = &context->FSMnode_list[ node_index];
3665 for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3666 arc_index = arc->linkl_prev_arc)
3667 {
3668 arc = &context->FSMarc_list[arc_index];
3669 if (arc->fr_node != node_index)
3670 {
3671
3672 if (IS_SILENCE_ILABEL(arc->ilabel, context) || arc->ilabel < EPSILON_OFFSET)
3673 {
3674 /* ok, fr_node goes to the end via silence, check for dups */
3675 for (j = 0; j < optends->num_nodes; j++)
3676 if (optends->nodes[j] == arc->fr_node) break;
3677 /* append it to the list */
3678 if (j == optends->num_nodes)
3679 {
3680 if (optends->num_nodes < optends->num_alloc)
3681 optends->nodes[ optends->num_nodes++] = arc->fr_node;
3682 else
3683 {
3684 ASSERT(0 && "allocated too few optend nodes");
3685 return 0;
3686 }
3687 }
3688 }
3689 }
3690 }
3691 }
3692
3693 /* now set the info array, default is regular */
3694 for (i = 0; i < context->num_nodes; i++)
3695 infos[i] = NODE_INFO_REGULAR;
3696 /* also set the other nodes, we'll check whether nodes were added
3697 via these flags */
3698 for (; i < context->FSMnode_list_len; i++)
3699 infos[i] = NODE_INFO_UNKNOWN;
3700 infos[ context->end_node] = NODE_INFO_ENDNODE;
3701
3702 /* get rid of direct to end nodes, etc */
3703 for (i = 0, j = 0; i < optends->num_nodes; i++)
3704 {
3705 optends->nodes[j] = optends->nodes[i];
3706 if (fst_node_has_speech_to_come(context, optends->nodes[i]))
3707 {
3708 j++;
3709 infos[ optends->nodes[i]] = NODE_INFO_OPTENDN;
3710 }
3711 else
3712 {
3713 infos[ optends->nodes[i]] = NODE_INFO_ENDNODE;
3714 /* num_near_end_nodes++; */
3715 }
3716 }
3717 optends->num_nodes = j;
3718
3719 /* printf("get_oppend_nodes (%d)", optends->num_nodes);
3720 for(i=0; i<optends->num_nodes; i++) printf(" %d", optends->nodes[i]);
3721 printf("\n");
3722 */
3723
3724 FREE(optends->nodes);
3725
3726 /* find the wrapup cost */
3727 node = &context->FSMnode_list[ context->end_node];
3728 wrapup_cost = MAXcostdata;
3729 wrapup_cost_inconsistencies = 0;
3730 for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
3731 arc_index = arc->linkl_prev_arc)
3732 {
3733 arc = &context->FSMarc_list[ arc_index];
3734 if (IS_SILENCE_ILABEL(arc->ilabel, context) &&
3735 arc->olabel == context->end_silence_word)
3736 {
3737 if (wrapup_cost == MAXcostdata)
3738 wrapup_cost = arc->cost;
3739 else if (context->wrapup_cost != arc->cost)
3740 {
3741 wrapup_cost = arc->cost;
3742 wrapup_cost_inconsistencies++;
3743 }
3744 }
3745 }
3746 #define MIN_WRAPUP_COST 100
3747 context->wtw_average = wrapup_cost;
3748 if (context->wtw_average > 200)
3749 context->wtw_average = 200;
3750 if (context->wrapup_cost < MIN_WRAPUP_COST)
3751 context->wrapup_cost = MIN_WRAPUP_COST;
3752 /*log_report("context->wrapup_cost %d (%d)\n", context->wrapup_cost,
3753 wrapup_cost_inconsistencies);*/
3754 return 0;
3755 }
3756
fst_alloc_transit_points(srec_context * context)3757 int fst_alloc_transit_points(srec_context* context)
3758 {
3759 arcID i;
3760 wordID num_slots = context->olabels->num_slots;
3761 wordID olabel;
3762 asr_int16_t nfxps = 0;
3763 FSMarc* arc;
3764 FSMnode* node;
3765
3766 context->num_fsm_exit_points = 0;
3767 /* slot 0 is invalid, it is the "eps", so num_slots==1 means no slots! */
3768 if (num_slots == 1)
3769 return 0;
3770
3771 /* scan through to finds arc that carry rule references */
3772 for (i = 0; i < context->num_arcs; i++)
3773 {
3774 olabel = context->FSMarc_list[i].olabel;
3775 /* (wordID)(olabel-1) < (num_slots-1) ?? might work */
3776 if (olabel > 0 && olabel < num_slots)
3777 {
3778 context->FSMarc_list[i].cost = DISABLE_ARC_COST;
3779 if (nfxps >= MAX_NUM_SLOTS)
3780 {
3781 PLogError("error: too many fsm exit points in fsm, too many public rules referenced from here\n");
3782 return 0;
3783 }
3784 context->fsm_exit_points[nfxps].arc_index = i;
3785 context->fsm_exit_points[nfxps].from_node_index = context->FSMarc_list[i].fr_node;
3786 /* skip over the trailing silence and .wb labels */
3787 for (arc = &context->FSMarc_list[i]; arc->ilabel != WORD_BOUNDARY;)
3788 {
3789 node = &context->FSMnode_list[arc->to_node];
3790 arc = &context->FSMarc_list[node->un_ptr.first_next_arc];
3791 }
3792 context->fsm_exit_points[nfxps].wbto_node_index = arc->to_node;
3793 nfxps++;
3794
3795 } /* olabel<num_slots */
3796 } /* i<num_arcs */
3797 context->num_fsm_exit_points = nfxps;
3798 return 0;
3799 }
3800
3801
3802 /*
3803 cross-word modeling:
3804 for now we'll assume silence context on either side, later we'll create
3805 a graph with the right fan out on the edges of each slot. At that point we
3806 may have routines like ... if(is_dead_end(arc)) continue;
3807
3808 word removal:
3809 we'll use refcounters on arcs, we can search the whole graph for the
3810 olabel, then remove forward and remove backwards
3811
3812 word addition:
3813 glenville ^ hmm183 hmm222 hmm162 hmm246 hmm346 hmm191 hmm219
3814
3815 set imap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/model.map
3816 set omap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/namesnnumsSC.map
3817 wintel/debug/addword.exe -min
3818 $SWISDK/bin/fsmcompileD -t -i $imap -o $omap -F out.PCLG out.PCLG.txt
3819 $SWISDK/bin/fsmdrawD -i $imap -o $omap out.PCLG > out.PCLG.dot
3820 dotty out.PCLG.dot
3821
3822 need
3823 - a sanity check
3824 - remove word
3825 - multiple pronunciations handling
3826 - homonyms handling, other boundary conditions checking
3827 - measure the speed
3828
3829 alan_adams al~#ad}z
3830 alan_adams al~ad}z
3831 paul_adams p{l#ad}z
3832
3833
3834
3835 */
3836