1 /*---------------------------------------------------------------------------*
2 * text_parser.c *
3 * *
4 * Copyright 2007, 2008 Nuance Communciations, Inc. *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20 #include"pstdio.h"
21 #include"srec_context.h"
22 #include"astar.h"
23
24 #include "passert.h"
25 #include "portable.h"
26
27
28 #define MAX_LOCAL_LEN 256
29 #define PARSE_PASS 0
30 #define PARSE_FAIL 1
31
32
/*
 * Checks whether the space-separated words of a transcription trace a path
 * through the grammar's arc tokens.  The transcription is consumed from its
 * last word back to its first, one word per loop iteration.
 *
 * context        grammar context; its olabels, arc_token_list and
 *                beg_silence_word members are read here
 * atok           arc token at which the search starts
 * transcription  NUL-terminated text to check
 * tlen           number of leading characters of transcription to consider
 *
 * Returns PARSE_PASS when the word starting at transcription[0] is matched
 * by an arc token that has a successor whose ilabel is MAXwordID.  Returns
 * PARSE_FAIL otherwise: transcription too long, a word with no matching
 * arc, or no such successor after the first word.
 */
static int check_word_path(srec_context* context, arc_token* atok,
                           const char* transcription, int tlen)
{
  char word[MAX_LOCAL_LEN];

  if (strlen(transcription) >= MAX_LOCAL_LEN - 1)
  {
    PLogError("Transcription too long [%s]\n", transcription);
    return PARSE_FAIL;
  }

  for (;;)
  {
    const char* word_start = transcription;
    arc_token* matched;
    arc_token* succ;
    wordID id;
    size_t n;
    int reaches_final = 0;

    /* find the first character of the last word inside the first tlen chars */
    if (tlen > 0)
    {
      const char* scan = transcription + tlen - 1;

      while (scan > transcription && *scan != ' ')
        scan--;
      if (scan > transcription)
        word_start = scan + 1;
    }

    /* copy that word, stopping at the next space or at the terminator */
    for (n = 0; ; n++)
    {
      if (n >= MAX_LOCAL_LEN)
      {
        PLogError("Word too long in transcription [%s]\n", transcription);
        return PARSE_FAIL;
      }
      if (word_start[n] == ' ' || word_start[n] == '\0')
      {
        word[n] = '\0';
        break;
      }
      word[n] = word_start[n];
    }

    id = wordmap_find_index(context->olabels, word);
    if (id < MAXwordID)
    {
      matched = get_arc_for_word(atok, id, context, context->beg_silence_word);
    }
    else
    {
      /* NOTE(review): the pointer into transcription is passed here, not the
         terminated copy; presumably the callee finds the word end itself -
         confirm against get_arc_for_word_without_slot_annotation() */
      matched = get_arc_for_word_without_slot_annotation(atok, word_start, context, context->beg_silence_word);
    }
    if (!matched)
      return PARSE_FAIL;

    /* look through every arc chained from the matched one for an ilabel of
       MAXwordID */
    for (succ = ARC_TOKEN_PTR(context->arc_token_list, matched->first_next_arc);
         succ != NULL;
         succ = ARC_TOKEN_PTR(context->arc_token_list, succ->next_token_index))
    {
      if (succ->ilabel == MAXwordID)
        reaches_final = 1;
    }

    /* the word just handled was the first one: the verdict is final */
    if (word_start == transcription)
      return reaches_final ? PARSE_PASS : PARSE_FAIL;

    /* shrink the window to end at the space in front of this word */
    tlen--;
    while (transcription[tlen] != ' ' && tlen > 0)
      tlen--;

    atok = matched;
  }
}
109
/*
 * Checks a transcription against the grammar, starting from the first entry
 * of context->arc_token_list.  Trailing spaces are left out of the length
 * handed to check_word_path(), whose result is returned unchanged.
 */
int FST_CheckPath_Simple(srec_context* context, const char* transcription)
{
  int len = strlen(transcription);

  /* back up over any trailing spaces */
  while (len > 0 && transcription[len - 1] == ' ')
    len--;

  return check_word_path(context, &context->arc_token_list[0], transcription, len);
}
121
FST_CheckPath_Complex(srec_context * context,const char * transcription,char * literal,size_t max_literal_len)122 int FST_CheckPath_Complex(srec_context* context, const char* transcription,
123 char* literal, size_t max_literal_len)
124 {
125 int i, j, rc;
126 int num_spaces;
127 char copy_of_transcription[MAX_LOCAL_LEN];
128 char* spaces[24], *p; /* can't go too high here!! */
129 ASSERT(strlen(transcription) < MAX_LOCAL_LEN);
130
131 strcpy(copy_of_transcription, transcription);
132 for (num_spaces = 0, p = copy_of_transcription; *p; p++)
133 {
134 if (*p == ' ')
135 {
136 if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
137 {
138 PLogError("FST_CheckPath_Complex() failed on too many words\n");
139 return PARSE_FAIL;
140 }
141 spaces[num_spaces++] = p;
142 }
143 }
144
145 if (num_spaces == 0)
146 {
147 rc = FST_CheckPath_Simple(context, transcription);
148 if (rc == PARSE_PASS)
149 {
150 ASSERT(strlen(copy_of_transcription) < max_literal_len);
151 strcpy(literal, copy_of_transcription);
152 }
153 return rc;
154 }
155
156 for (i = 0; i < (1 << num_spaces); i++)
157 {
158 /* find the space pointers */
159 for (j = 0; j < num_spaces; j++)
160 *spaces[j] = i & (1 << j) ? '_' : ' ';
161 /* check each word, potentially within a rule! */
162 for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
163 {
164 wordID k, wdid = wordmap_find_index(context->olabels, p);
165 if (wdid < MAXwordID) continue;
166 for (k = 1; k < context->olabels->num_slots; k++)
167 {
168 wdid = wordmap_find_index_in_rule(context->olabels, p, k);
169 if (wdid < MAXwordID) break;
170 }
171 if (wdid == MAXwordID)
172 goto next_i;
173 }
174 /* fix the nulls back */
175 for (j = 0; j < num_spaces; j++)
176 *spaces[j] = i & (1 << j) ? '_' : ' ';
177 rc = FST_CheckPath_Simple(context, copy_of_transcription);
178 if (rc == PARSE_PASS)
179 {
180 ASSERT(strlen(copy_of_transcription) < max_literal_len);
181 strcpy(literal, copy_of_transcription);
182 return rc;
183 }
184 next_i:
185 continue;
186 }
187 return PARSE_FAIL;
188 }
189
190 static void clean_up_sentence(char* s);
191
FST_CheckPath(srec_context * context,const char * transcription,char * literal,size_t max_literal_len)192 int FST_CheckPath(srec_context* context, const char* transcription,
193 char* literal, size_t max_literal_len)
194 {
195 char mytranscription[256];
196 passert(strlen(transcription) < sizeof(mytranscription));
197 strcpy(mytranscription, transcription);
198 clean_up_sentence(mytranscription);
199 if (!context->arc_token_list)
200 return 2;
201 else
202 return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len);
203 }
204
/*
 * Normalizes a sentence in place:
 *   - a speech code "[...]" (from '[' through the next ']') counts as spaces,
 *     as does a stray ']'; an unterminated '[' blanks the rest of the string
 *   - leading and trailing spaces are removed
 *   - each run of spaces between words becomes one space
 *
 * Only ' ' is treated as a separator.  The result is never longer than the
 * input, and nothing past the original terminator is read or written.
 */
static void clean_up_sentence(char* s)
{
  const char* src;
  char* dst = s;
  int in_code = 0;        /* inside a '[' ... ']' speech code */
  int pending_space = 0;  /* a separator is owed before the next word char */

  for (src = s; *src != '\0'; src++)
  {
    char c = *src;

    if (c == '[')
      in_code = 1;
    if (in_code || c == ']')
    {
      if (c == ']')
        in_code = 0;
      c = ' ';
    }

    if (c == ' ')
    {
      /* spaces before the first word are dropped outright */
      pending_space = (dst != s);
      continue;
    }

    /* emitting the separator lazily means trailing spaces never get written */
    if (pending_space)
    {
      *dst++ = ' ';
      pending_space = 0;
    }
    *dst++ = c;
  }
  *dst = '\0';
}
237
238
239
240