• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*---------------------------------------------------------------------------*
2  *  text_parser.c  *
3  *                                                                           *
4  *  Copyright 2007, 2008 Nuance Communications, Inc.                               *
5  *                                                                           *
6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7  *  you may not use this file except in compliance with the License.         *
8  *                                                                           *
9  *  You may obtain a copy of the License at                                  *
10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
11  *                                                                           *
12  *  Unless required by applicable law or agreed to in writing, software      *
13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15  *  See the License for the specific language governing permissions and      *
16  *  limitations under the License.                                           *
17  *                                                                           *
18  *---------------------------------------------------------------------------*/
19 
20 #include"pstdio.h"
21 #include"srec_context.h"
22 #include"astar.h"
23 
24 #include "passert.h"
25 #include "portable.h"
26 
27 
/* Size of the on-stack buffers that hold a copy of a transcription or of a
 * single word; transcriptions that do not fit are rejected. */
#define MAX_LOCAL_LEN 256
/* Result codes returned by the path-checking functions in this file. */
#define PARSE_PASS 0
#define PARSE_FAIL 1
31 
32 
/*
 * Match the words of a transcription against the arc-token graph, consuming
 * the words from the LAST one to the FIRST one.
 *
 *   context        recognition context supplying the word map (olabels), the
 *                  arc token list and the begin-silence word
 *   atok           arc token to start matching from
 *   transcription  space-separated words, NUL terminated
 *   tlen           number of leading characters of transcription to consider
 *
 * Returns PARSE_PASS when every word could be matched and the arc matched for
 * the first word has a successor whose ilabel is MAXwordID; PARSE_FAIL
 * otherwise (including when the transcription is too long for the local
 * buffer).
 */
static int check_word_path(srec_context* context, arc_token* atok,
                           const char* transcription, int tlen)
{
  if (strlen(transcription) >= MAX_LOCAL_LEN - 1)
  {
    PLogError("Transcription too long [%s]\n", transcription);
    return PARSE_FAIL;
  }

  for (;;)
  {
    char        word_buf[MAX_LOCAL_LEN]; /* NUL-terminated copy of the current word */
    const char* word_start = transcription;
    arc_token*  matched;
    arc_token*  succ;
    wordID      id;
    int         n;
    int         ends_here = 0;

    /* Find the first character of the last word inside the first tlen chars. */
    if (tlen > 0)
    {
      word_start = transcription + tlen - 1;
      while (word_start > transcription)
      {
        if (*word_start == ' ')
        {
          word_start++;
          break;
        }
        word_start--;
      }
    }

    /* Copy that word, up to the following space or the end of the string. */
    for (n = 0; ; n++)
    {
      if (n >= MAX_LOCAL_LEN)
      {
        PLogError("Word too long in transcription [%s]\n", transcription);
        return PARSE_FAIL;
      }
      if (word_start[n] == ' ' || word_start[n] == '\0')
      {
        word_buf[n] = 0;
        break;
      }
      word_buf[n] = word_start[n];
    }

    /* An id below MAXwordID is treated as "found in the word map"; otherwise
     * the lookup is retried by spelling, without slot annotation. */
    id = wordmap_find_index(context->olabels, word_buf);
    if (id < MAXwordID)
    {
      matched = get_arc_for_word(atok, id, context, context->beg_silence_word);
    }
    else
    {
      matched = get_arc_for_word_without_slot_annotation(atok, word_start, context, context->beg_silence_word);
    }
    if (!matched)
    {
      return PARSE_FAIL;
    }

    /* Does any arc following the matched one carry the MAXwordID label? */
    succ = ARC_TOKEN_PTR(context->arc_token_list, matched->first_next_arc);
    while (succ != NULL)
    {
      if (succ->ilabel == MAXwordID)
      {
        ends_here = 1;
      }
      succ = ARC_TOKEN_PTR(context->arc_token_list, succ->next_token_index);
    }

    /* The first word of the transcription was just handled: verdict time. */
    if (word_start == transcription)
    {
      return ends_here ? PARSE_PASS : PARSE_FAIL;
    }

    /* Shrink tlen back to the space that precedes the word just handled. */
    for (tlen--; transcription[tlen] != ' ' && tlen > 0; tlen--)
    {
    }

    atok = matched;
  }
}
109 
/*
 * Check a transcription against the grammar starting at the first entry of
 * the context's arc token list. Trailing spaces are excluded from the length
 * handed to check_word_path().
 *
 * Returns whatever check_word_path() returns (PARSE_PASS or PARSE_FAIL).
 */
int FST_CheckPath_Simple(srec_context* context, const char* transcription)
{
  int end = strlen(transcription);

  /* ignore trailing spaces */
  while (end > 0 && transcription[end - 1] == ' ')
  {
    end--;
  }

  return check_word_path(context, &context->arc_token_list[0], transcription, end);
}
121 
FST_CheckPath_Complex(srec_context * context,const char * transcription,char * literal,size_t max_literal_len)122 int FST_CheckPath_Complex(srec_context* context, const char* transcription,
123                           char* literal, size_t max_literal_len)
124 {
125   int i, j, rc;
126   int num_spaces;
127   char copy_of_transcription[MAX_LOCAL_LEN];
128   char* spaces[24], *p; /* can't go too high here!! */
129   ASSERT(strlen(transcription) < MAX_LOCAL_LEN);
130 
131   strcpy(copy_of_transcription, transcription);
132   for (num_spaces = 0, p = copy_of_transcription; *p; p++)
133   {
134     if (*p == ' ')
135     {
136       if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
137       {
138         PLogError("FST_CheckPath_Complex() failed on too many words\n");
139         return PARSE_FAIL;
140       }
141       spaces[num_spaces++] = p;
142     }
143   }
144 
145   if (num_spaces == 0)
146   {
147     rc = FST_CheckPath_Simple(context, transcription);
148     if (rc == PARSE_PASS)
149     {
150       ASSERT(strlen(copy_of_transcription) < max_literal_len);
151       strcpy(literal, copy_of_transcription);
152     }
153     return rc;
154   }
155 
156   for (i = 0; i < (1 << num_spaces); i++)
157   {
158     /* find the space pointers */
159     for (j = 0; j < num_spaces; j++)
160       *spaces[j] = i & (1 << j) ? '_' : ' ';
161     /* check each word, potentially within a rule! */
162     for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
163     {
164       wordID k, wdid = wordmap_find_index(context->olabels, p);
165       if (wdid < MAXwordID) continue;
166       for (k = 1; k < context->olabels->num_slots; k++)
167       {
168         wdid = wordmap_find_index_in_rule(context->olabels, p, k);
169         if (wdid < MAXwordID) break;
170       }
171       if (wdid == MAXwordID)
172         goto next_i;
173     }
174     /* fix the nulls back */
175     for (j = 0; j < num_spaces; j++)
176       *spaces[j] = i & (1 << j) ? '_' : ' ';
177     rc = FST_CheckPath_Simple(context, copy_of_transcription);
178     if (rc == PARSE_PASS)
179     {
180       ASSERT(strlen(copy_of_transcription) < max_literal_len);
181       strcpy(literal, copy_of_transcription);
182       return rc;
183     }
184 next_i:
185     continue;
186   }
187   return PARSE_FAIL;
188 }
189 
190 static void clean_up_sentence(char* s);
191 
FST_CheckPath(srec_context * context,const char * transcription,char * literal,size_t max_literal_len)192 int FST_CheckPath(srec_context* context, const char* transcription,
193                   char* literal, size_t max_literal_len)
194 {
195   char mytranscription[256];
196   passert(strlen(transcription) < sizeof(mytranscription));
197   strcpy(mytranscription, transcription);
198   clean_up_sentence(mytranscription);
199   if (!context->arc_token_list)
200     return 2;
201   else
202     return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len);
203 }
204 
/*
 * Normalize a sentence in place:
 *   - every "[...]" speech code, brackets included, is turned into spaces;
 *     a '[' without a matching ']' blanks everything up to the end of the
 *     string, and a stray ']' is turned into a space;
 *   - leading and trailing spaces are removed;
 *   - runs of spaces between words are collapsed to a single space.
 *
 * Only the space character is treated as a separator. The result is never
 * longer than the input, so it is written over the input as it is scanned.
 */
static void clean_up_sentence(char* s)
{
  const char* in;
  char*       out = s;
  int         in_code = 0;        /* inside a "[...]" speech code */
  int         pending_space = 0;  /* a separator is owed before the next word */

  for (in = s; *in; in++)
  {
    char c = *in;

    if (in_code)
    {
      if (c == ']') in_code = 0;
      c = ' ';
    }
    else if (c == '[')
    {
      in_code = 1;
      c = ' ';
    }
    else if (c == ']')
    {
      c = ' ';
    }

    if (c == ' ')
    {
      /* Spaces are emitted lazily: none before the first word, and none
       * after the last one because nothing ever flushes them. */
      pending_space = (out != s);
    }
    else
    {
      if (pending_space)
      {
        *out++ = ' ';
        pending_space = 0;
      }
      *out++ = c;
    }
  }
  *out = '\0';
}
237 
238 
239 
240