1 /*---------------------------------------------------------------------------*
2 * voc_read.c *
3 * *
4 * Copyright 2007, 2008 Nuance Communications, Inc. *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20
21 #ifndef _RTT
22 #include <stdio.h>
23 #endif
24 #include <stdlib.h>
25 #include <math.h>
26 #include <assert.h>
27
28 #if defined(__cplusplus) && defined(_MSC_VER)
29 extern "C"
30 {
31 #include <string.h>
32 }
33 #else
34 #include <string.h>
35 #endif
36
37 #include <sys/types.h>
38 #include <sys/stat.h>
39 #ifdef _WIN32
40 #define stat _stat
41 #else
42 #include <unistd.h>
43 #endif
44
45
46 #include <fcntl.h>
47 #include <sys/mman.h>
48
49 #include <zipfile/zipfile.h>
50
51
52 #include "hmmlib.h"
53 #include "duk_io.h"
54 #include "LCHAR.h"
55 #include "portable.h"
56
57 #include "memmove.h"
58
/* RCS/CVS "$Id$" keyword string recording the revision of this file. */
static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";

/*
 * Evaluates to non-zero when ch is a line feed or a carriage return.
 * The argument may be evaluated twice, so pass expressions without
 * side effects.
 */
#define cr_or_nl(ch) (((ch) == '\n') || ((ch) == '\r'))
63
64
65 #ifndef _RTT
66
67 /**
68 * Read word models and their phoneme transcriptions from .ok or .voc files.
69 * returns -1 on error
70 */
read_word_transcription(const LCHAR * basename,vocab_info * voc,ESR_Locale * locale)71 int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
72 {
73 const char *ok;
74 ESR_ReturnCode rc;
75 int result;
76 int i;
77 char token[256];
78
79 ASSERT(voc);
80
81 if (basename == NULL || strlen(basename) == 0) {
82 PLogError("Error: invalid arg to read_word_transcription()\n");
83 goto CLEANUP;
84 }
85
86 if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) {
87 PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
88 goto CLEANUP;
89 }
90
91 /* this assumption eliminates simplifies bounds checking when parsing */
92 if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) {
93 PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
94 goto CLEANUP;
95 }
96
97 /* set up point to walk the data */
98 ok = voc->ok_file_data;
99
100 /* verify the header */
101 i = 0;
102 while (*ok != '=') {
103 if (cr_or_nl(*ok)) {
104 PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
105 goto CLEANUP;
106 }
107 token[i++] = *ok++;
108 }
109 token[i] = 0;
110 ok++;
111 CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
112 if (result != 0)
113 {
114 PLogError(L("%s was missing #LANG=en-us header"), basename);
115 goto CLEANUP;
116 }
117 i = 0;
118 while (!cr_or_nl(*ok)) token[i++] = *ok++;
119 token[i] = 0;
120 ok++;
121 CHKLOG(rc, ESR_str2locale(token, locale));
122
123 /* set up first and last entries */
124 voc->first_entry = strchr(voc->ok_file_data, '\n') + 1;
125 voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 2;
126 while (*voc->last_entry != '\n') voc->last_entry--; /* header forces termination */
127 voc->last_entry++;
128
129 /* determine if there are any upper case entries */
130 voc->hasUpper = 1;
131 while (ok < voc->ok_file_data + voc->ok_file_data_length) {
132 int ch = *ok;
133 if ('A' <= ch && ch <= 'Z') {
134 voc->hasUpper = 1;
135 break;
136 }
137 else if ('Z' < ch) {
138 voc->hasUpper = 0;
139 break;
140 }
141 /* scan to the next entry */
142 while (*ok++ != '\n') ;
143 }
144
145 return 0;
146
147 CLEANUP:
148 delete_word_transcription(voc);
149
150 PLogError(L("read_word_transcription: failed to read '%s'"), basename);
151
152 return -1;
153 }
154 #endif
155
/*
 * Compare a lookup label with the label at the start of a dictionary entry.
 *
 * The label is terminated with 0 and the entry's label is terminated with
 * ' '. The end of 'label' is treated as if it were a ' ' so that an exact
 * match yields 0.
 *
 * Returns 0 when the labels are equal, otherwise the difference between the
 * first pair of characters that differ (label minus entry).
 */
static int kompare(const char* label, const char* entry) {
  /* stop at the end of the label even if the entry holds a NUL there too,
   * so neither string is read past its terminator */
  while (*label != '\0' && *label == *entry) {
    label++;
    entry++;
  }
  return (*label ? *label : ' ') - *entry;
}
164
get_prons(const vocab_info * voc,const char * label,char * prons,int prons_len)165 int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
166 int num_prons;
167 const char* low;
168 const char* middle;
169 const char* high;
170
171 //PLogError(L("get_prons '%s'"), label);
172
173 /* dictionaries are usually lower case, so do this for speed */
174 if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;
175
176 /* binary search to find matching entry */
177 low = voc->first_entry;
178 high = voc->last_entry;
179 while (1) {
180 /* pick a point in the middle and align to next entry */
181 middle = low + ((high - low) >> 1) - 1;
182 while (*middle++ != '\n') ;
183
184 /* compare 'label' to 'middle' */
185 int diff = kompare(label, middle);
186 if (diff == 0) break;
187
188 /* nothing found */
189 if (low == high) return 0;
190
191 /* 'middle' aligned to 'high', so move 'high' down */
192 if (middle == high) {
193 high -= 2;
194 while (*high != '\n') high--;
195 high++;
196 continue;
197 }
198
199 if (diff > 0) low = middle;
200 else high = middle;
201 }
202
203 /* back up to find the first entry equal to 'label' */
204 low = middle;
205 while (voc->first_entry < low) {
206 const char* lo;
207 for (lo = low - 2; *lo != '\n'; lo--) ;
208 lo++;
209 if (kompare(label, lo)) break;
210 low = lo;
211 }
212
213 /* move forward to the last entry equal to 'label' */
214 high = middle;
215 while (high < voc->last_entry) {
216 const char* hi;
217 for (hi = high; *hi != '\n'; hi++) ;
218 hi++;
219 if (kompare(label, hi)) break;
220 high = hi;
221 }
222
223 /* loop over all the entries */
224 num_prons = 0;
225 while (low <= high) {
226 /* scan over the label */
227 while (*low++ != ' ') ;
228
229 /* skip the whitespace */
230 while (*low == ' ') low++;
231
232 /* copy the pron */
233 while (*low != '\n') {
234 if (--prons_len <= 2) return -1;
235 *prons++ = *low++;
236 }
237 *prons++ = 0;
238 low++;
239 num_prons++;
240 }
241 *prons++ = 0;
242
243 return num_prons;
244 }
245
/*
 * Release the dictionary data held by 'voc' and reset its fields.
 *
 * If voc->ok_file_data is set it is unmapped with munmap_zip(); afterwards
 * the data pointer, its length and both entry pointers are cleared.
 */
void delete_word_transcription(vocab_info* voc)
{
  ASSERT(voc);

  /* unmap first, while the pointer and the length are still available */
  if (voc->ok_file_data != NULL) {
    munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
  }

  voc->ok_file_data = NULL;
  voc->ok_file_data_length = 0;
  voc->first_entry = NULL;
  voc->last_entry = NULL;
}
256
257
258 /**************************************************/
259 /* may want to move these functions to 'portable' */
260 /**************************************************/
261
/* Returns 1 when 'string' ends with 'end', 0 otherwise. */
static int endeql(const char* string, const char* end) {
  const size_t string_len = strlen(string);
  const size_t end_len = strlen(end);

  /* a suffix cannot be longer than the string itself */
  if (end_len > string_len) {
    return 0;
  }
  return strcmp(string + (string_len - end_len), end) == 0;
}
265
/*
 * decompress_entry requires an oversize destination buffer, so mappings are
 * sized with this helper: the requested size plus one thousandth of it
 * (rounded down) plus one byte.
 */
static size_t inflateSize(size_t size) {
  const size_t slack = size / 1000 + 1;
  return size + slack;
}
270
mmap_zip(const char * fname,void ** buf,size_t * size)271 int mmap_zip(const char* fname, void** buf, size_t* size) {
272 int fd = -1;
273 struct stat statbuf;
274 zipfile_t zf = 0;
275 zipentry_t ze = 0;
276 char entryname[FILENAME_MAX];
277 size_t size2 = 0;
278 void* buf2 = 0;
279
280 /* open data file, determine size, map it, and close fd */
281 fd = open(fname, O_RDONLY);
282 if (fd < 0) goto FAILED;
283
284 /* determine length */
285 if (fstat(fd, &statbuf) < 0) goto FAILED;
286
287 /* mmap it */
288 *size = statbuf.st_size;
289 *buf = mmap(0, inflateSize(statbuf.st_size), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
290 if (*buf == MAP_FAILED) goto FAILED;
291
292 /* close fd, since we can */
293 close(fd);
294 fd = -1;
295
296 /* if not a zip file, we are done! */
297 if (!endeql(fname, ".zip")) return 0;
298
299 /* set up zipfiler */
300 zf = init_zipfile(*buf, *size);
301 if (!zf) goto FAILED;
302
303 /* get entry */
304 strcpy(entryname, strrchr(fname, '/') ? strrchr(fname, '/') + 1 : fname);
305 entryname[strlen(entryname) - strlen(".zip")] = 0;
306 ze = lookup_zipentry(zf, entryname);
307 if (!ze) goto FAILED;
308
309 /* mmap anon memory to hold unzipped entry */
310 size2 = get_zipentry_size(ze);
311 buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
312 if (buf2 == (void*)-1) goto FAILED;
313
314 /* unzip entry */
315 if (decompress_zipentry(ze, buf2, size2)) goto FAILED;
316
317 /* release unzipper */
318 release_zipfile(zf);
319 zf = 0;
320
321 /* release mmapped file */
322 munmap(*buf, inflateSize(*size));
323
324 /* set return values */
325 *buf = buf2;
326 *size = size2;
327
328 return 0;
329
330 FAILED:
331 if (fd != -1) close(fd);
332 if (zf) release_zipfile(zf);
333 if (buf2) munmap(buf2, inflateSize(size2));
334 if (*buf && *buf != (void*)-1) munmap(*buf, inflateSize(*size));
335 *buf = 0;
336 *size = 0;
337 return -1;
338 }
339
/*
 * Release a mapping of 'size' data bytes at 'buf'. The length handed to
 * munmap() is inflateSize(size); the result of munmap() is returned as is.
 */
int munmap_zip(void* buf, size_t size) {
  const size_t mapped_len = inflateSize(size);

  return munmap(buf, mapped_len);
}
343
344