• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*---------------------------------------------------------------------------*
2  *  voc_read.c  *
3  *                                                                           *
4  *  Copyright 2007, 2008 Nuance Communications, Inc.                               *
5  *                                                                           *
6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7  *  you may not use this file except in compliance with the License.         *
8  *                                                                           *
9  *  You may obtain a copy of the License at                                  *
10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
11  *                                                                           *
12  *  Unless required by applicable law or agreed to in writing, software      *
13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15  *  See the License for the specific language governing permissions and      *
16  *  limitations under the License.                                           *
17  *                                                                           *
18  *---------------------------------------------------------------------------*/
19 
20 
21 #ifndef _RTT
22 #include <stdio.h>
23 #endif
24 #include <stdlib.h>
25 #include <math.h>
26 #include <assert.h>
27 
28 #if defined(__cplusplus) && defined(_MSC_VER)
29 extern "C"
30 {
31 #include <string.h>
32 }
33 #else
34 #include <string.h>
35 #endif
36 
37 #include <sys/types.h>
38 #include <sys/stat.h>
39 #ifdef _WIN32
40 #define stat _stat
41 #else
42 #include <unistd.h>
43 #endif
44 
45 
46 #include <fcntl.h>
47 #include <sys/mman.h>
48 
49 #include <zipfile/zipfile.h>
50 
51 
52 #include "hmmlib.h"
53 #include "duk_io.h"
54 #include "LCHAR.h"
55 #include "portable.h"
56 
57 #include "memmove.h"
58 
59 static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";
60 
61 
62 #define cr_or_nl(ch) ((ch) == '\n' || (ch) == '\r')
63 
64 
65 #ifndef _RTT
66 
67 /**
68  *  Read word models and their phoneme transcriptions from .ok or .voc files.
69  *  returns -1 on error
70  */
read_word_transcription(const LCHAR * basename,vocab_info * voc,ESR_Locale * locale)71 int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
72 {
73   const char *ok;
74   ESR_ReturnCode rc;
75   int result;
76   int i;
77   char token[256];
78 
79   ASSERT(voc);
80 
81   if (basename == NULL || strlen(basename) == 0) {
82     PLogError("Error: invalid arg to read_word_transcription()\n");
83     goto CLEANUP;
84   }
85 
86   if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) {
87     PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
88     goto CLEANUP;
89   }
90 
91   /* this assumption eliminates simplifies bounds checking when parsing */
92   if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) {
93     PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
94     goto CLEANUP;
95   }
96 
97   /* set up point to walk the data */
98   ok = voc->ok_file_data;
99 
100   /* verify the header */
101   i = 0;
102   while (*ok != '=') {
103     if (cr_or_nl(*ok)) {
104       PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
105       goto CLEANUP;
106     }
107     token[i++] = *ok++;
108   }
109   token[i] = 0;
110   ok++;
111   CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
112   if (result != 0)
113   {
114     PLogError(L("%s was missing #LANG=en-us header"), basename);
115     goto CLEANUP;
116   }
117   i = 0;
118   while (!cr_or_nl(*ok)) token[i++] = *ok++;
119   token[i] = 0;
120   ok++;
121   CHKLOG(rc, ESR_str2locale(token, locale));
122 
123   /* set up first and last entries */
124   voc->first_entry = strchr(voc->ok_file_data, '\n') + 1;
125   voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 2;
126   while (*voc->last_entry != '\n') voc->last_entry--; /* header forces termination */
127   voc->last_entry++;
128 
129   /* determine if there are any upper case entries */
130   voc->hasUpper = 1;
131   while (ok < voc->ok_file_data + voc->ok_file_data_length) {
132     int ch = *ok;
133     if ('A' <= ch && ch <= 'Z') {
134       voc->hasUpper = 1;
135       break;
136     }
137     else if ('Z' < ch) {
138       voc->hasUpper = 0;
139       break;
140     }
141     /* scan to the next entry */
142     while (*ok++ != '\n') ;
143   }
144 
145   return 0;
146 
147 CLEANUP:
148   delete_word_transcription(voc);
149 
150   PLogError(L("read_word_transcription: failed to read '%s'"), basename);
151 
152   return -1;
153 }
154 #endif
155 
/*
 * Compare 'label' against the label that starts the vocabulary line 'entry'.
 * The label is terminated with 0 and the entry terminated with ' '.
 *
 * Returns 0 when they match, otherwise the difference between the first
 * pair of differing characters, with the end of 'label' counting as ' '.
 */
static int kompare(const char* label, const char* entry) {
  /* stop at the label's terminator so a 0 byte inside the entry can never
   * carry the scan past the end of 'label' */
  while (*label && *label == *entry) {
    label++;
    entry++;
  }
  return (*label ? *label : ' ') - *entry;
}
164 
get_prons(const vocab_info * voc,const char * label,char * prons,int prons_len)165 int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
166   int num_prons;
167   const char* low;
168   const char* middle;
169   const char* high;
170 
171   //PLogError(L("get_prons '%s'"), label);
172 
173   /* dictionaries are usually lower case, so do this for speed */
174   if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;
175 
176   /* binary search to find matching entry */
177   low = voc->first_entry;
178   high = voc->last_entry;
179   while (1) {
180     /* pick a point in the middle and align to next entry */
181     middle = low + ((high - low) >> 1) - 1;
182     while (*middle++ != '\n') ;
183 
184     /* compare 'label' to 'middle' */
185     int diff = kompare(label, middle);
186     if (diff == 0) break;
187 
188     /* nothing found */
189     if (low == high) return 0;
190 
191     /* 'middle' aligned to 'high', so move 'high' down */
192     if (middle == high) {
193       high -= 2;
194       while (*high != '\n') high--;
195       high++;
196       continue;
197     }
198 
199     if (diff > 0) low = middle;
200     else high = middle;
201   }
202 
203   /* back up to find the first entry equal to 'label' */
204   low = middle;
205   while (voc->first_entry < low) {
206     const char* lo;
207     for (lo = low - 2; *lo != '\n'; lo--) ;
208     lo++;
209     if (kompare(label, lo)) break;
210     low = lo;
211   }
212 
213   /* move forward to the last entry equal to 'label' */
214   high = middle;
215   while (high < voc->last_entry) {
216     const char* hi;
217     for (hi = high; *hi != '\n'; hi++) ;
218     hi++;
219     if (kompare(label, hi)) break;
220     high = hi;
221   }
222 
223   /* loop over all the entries */
224   num_prons = 0;
225   while (low <= high) {
226     /* scan over the label */
227     while (*low++ != ' ') ;
228 
229     /* skip the whitespace */
230     while (*low == ' ') low++;
231 
232     /* copy the pron */
233     while (*low != '\n') {
234       if (--prons_len <= 2) return -1;
235       *prons++ = *low++;
236     }
237     *prons++ = 0;
238     low++;
239     num_prons++;
240   }
241   *prons++ = 0;
242 
243   return num_prons;
244 }
245 
/**
 *  Release the vocabulary data held by 'voc': unmap the file data (if any)
 *  with munmap_zip() and clear the data pointer, its length, and the
 *  first/last entry pointers.
 */
void delete_word_transcription(vocab_info* voc)
{
  ASSERT(voc);

  if (voc->ok_file_data != NULL) {
    munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
  }

  voc->ok_file_data = NULL;
  voc->ok_file_data_length = 0;
  voc->first_entry = 0;
  voc->last_entry = 0;
}
256 
257 
258 /**************************************************/
259 /* may want to move these functions to 'portable' */
260 /**************************************************/
261 
/* returns 1 when 'string' ends with the suffix 'end', 0 otherwise */
static int endeql(const char* string, const char* end) {
  size_t string_len = strlen(string);
  size_t end_len = strlen(end);

  /* a suffix longer than the string can never match */
  if (end_len > string_len) return 0;

  return strcmp(string + (string_len - end_len), end) == 0;
}
265 
/*
 * decompress_entry requires an oversize destination buffer, so every
 * mapping length in this file is padded by size / 1000 plus one byte.
 */
static size_t inflateSize(size_t size) {
  size_t slack = size / 1000 + 1;

  return size + slack;
}
270 
mmap_zip(const char * fname,void ** buf,size_t * size)271 int mmap_zip(const char* fname, void** buf, size_t* size) {
272     int fd = -1;
273     struct stat statbuf;
274     zipfile_t zf = 0;
275     zipentry_t ze = 0;
276     char entryname[FILENAME_MAX];
277     size_t size2 = 0;
278     void* buf2 = 0;
279 
280     /* open data file, determine size, map it, and close fd */
281     fd = open(fname, O_RDONLY);
282     if (fd < 0) goto FAILED;
283 
284     /* determine length */
285     if (fstat(fd, &statbuf) < 0) goto FAILED;
286 
287     /* mmap it */
288     *size = statbuf.st_size;
289     *buf = mmap(0, inflateSize(statbuf.st_size), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
290     if (*buf == MAP_FAILED) goto FAILED;
291 
292     /* close fd, since we can */
293     close(fd);
294     fd = -1;
295 
296     /* if not a zip file, we are done! */
297     if (!endeql(fname, ".zip")) return 0;
298 
299     /* set up zipfiler */
300     zf = init_zipfile(*buf, *size);
301     if (!zf) goto FAILED;
302 
303     /* get entry */
304     strcpy(entryname, strrchr(fname, '/') ? strrchr(fname, '/') + 1 : fname);
305     entryname[strlen(entryname) - strlen(".zip")] = 0;
306     ze = lookup_zipentry(zf, entryname);
307     if (!ze) goto FAILED;
308 
309     /* mmap anon memory to hold unzipped entry */
310     size2 = get_zipentry_size(ze);
311     buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
312     if (buf2 == (void*)-1) goto FAILED;
313 
314     /* unzip entry */
315     if (decompress_zipentry(ze, buf2, size2)) goto FAILED;
316 
317     /* release unzipper */
318     release_zipfile(zf);
319     zf = 0;
320 
321     /* release mmapped file */
322     munmap(*buf, inflateSize(*size));
323 
324     /* set return values */
325     *buf = buf2;
326     *size = size2;
327 
328     return 0;
329 
330 FAILED:
331     if (fd != -1) close(fd);
332     if (zf) release_zipfile(zf);
333     if (buf2) munmap(buf2, inflateSize(size2));
334     if (*buf && *buf != (void*)-1) munmap(*buf, inflateSize(*size));
335     *buf = 0;
336     *size = 0;
337     return -1;
338 }
339 
/*
 * Release a mapping obtained from mmap_zip(); 'size' is the data length
 * mmap_zip() reported, which is padded with inflateSize() here just as it
 * was when the mapping was created.  Returns the result of munmap().
 */
int munmap_zip(void* buf, size_t size) {
    size_t mapped_len = inflateSize(size);

    return munmap(buf, mapped_len);
}
343 
344