1 /*
2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 /**
17 * @file picoklex.c
18 *
19 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
20 * All rights reserved.
21 *
22 * History:
23 * - 2009-04-20 -- initial version
24 *
25 */
26 #include "picoos.h"
27 #include "picodbg.h"
28 #include "picodata.h"
29 #include "picoknow.h"
30 #include "picoklex.h"
31
32 #ifdef __cplusplus
33 extern "C" {
34 #endif
35 #if 0
36 }
37 #endif
38
39 /* ************************************************************/
40 /* lexicon */
41 /* ************************************************************/
42
43 /**
44 * @addtogroup picolex
45 *
46 overview:
47 - lex consists of optional searchindex and a non-empty list of lexblocks
48 - lexblocks are fixed size, at the start of a block there is also the
49 start of an entry
50 - using the searchindex a unambiguous lexblock can be determined which
51 contains the entry (or there is no entry)
52 - one lex entry has POS GRAPH PHON, all mandatory, but
53 - PHON can be empty string -> no pronunciation in the resulting TTS output
54 - PHON can be :G2P -> use G2P later to add pronunciation
55 - (POS,GRAPH) is a uniq key (only one entry allowed)
56 - (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and
57 differing POS and differing PHON possible)
58 - for one graph we can have two or three solutions from the lex
59 which all need to be passed on the the next PU
60 - in this case GRAPH, POS, and PHON all must be available in lex
61
62 sizing:
63 - 3 bytes entry index -> 16MB addressable
64 - 2 bytes searchindex nr -> 64K blocks possible
65 - 5 bytes per searchindex entry
66 - 3 bytes for graph-prefix
67 - 2 bytes blockadr in searchindex -> 64K blocks possible
68 - lexblock size 512B:
69 - 32M possible
70 - with ~20 bytes per entry
71 -> max. average of ~26 entries to be searched per lookup
72 - overhead of ~10 bytes per block to sync with
73 block boundaries
74 - examples:
75 - 500KB lex -> 1000 blocks,
76 1000 entries in searchindex, ~25.6K lex-entries,
77 - ~5KB searchindex
78 ~10KB overhead for block sync
79 - 100KB lex -> 200 blocks,
80 200 entries in searchindex, ~5.1K lex-entries,
81 - ~1KB searchindex
82 ~2KB overhead for block sync
83
84 pil-file: lexicon knowledge base in binary form
85
86 lex-kb = content
87
88 content = searchindex {lexblock}1:NRBLOCKS2
89
90 lexblock = {lexentry}1: (lexblock size is fixed 512Bytes)
91
92 searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2
93
94 lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1
95 LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2
96
97 - special cases:
98 - PHON is empty string (no pronunciation in the resulting TTS output):
99 lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 2 POS1
100 - PHON can be :G2P -> use G2P later to add pronunciation:
101 lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 3 POS1 <reserved-phon-val=5>
102 - multi-byte values always little endian
103 */
104
105
106 /* ************************************************************/
107 /* lexicon data defines */
108 /* may not be changed with current implementation */
109 /* ************************************************************/
110
111 /* nr bytes of nrblocks info */
112 #define PICOKLEX_LEX_NRBLOCKS_SIZE 2
113
114 /* search index entry: - nr graphs
115 - nr bytes of block index
116 - nr bytes per entry, NRGRAPHS*INDSIZE */
117 #define PICOKLEX_LEX_SIE_NRGRAPHS 3
118 #define PICOKLEX_LEX_SIE_INDSIZE 2
119 #define PICOKLEX_LEX_SIE_SIZE 5
120
121 /* nr of bytes per lexblock */
122 #define PICOKLEX_LEXBLOCK_SIZE 512
123
124
125 /* reserved values in klex to indicate :G2P needed for a lexentry */
126 #define PICOKLEX_NEEDS_G2P 5
127
128
129 /* ************************************************************/
130 /* lexicon type and loading */
131 /* ************************************************************/
132
133 /** object : LexKnowledgeBase
134 * shortcut : klex
135 * derived from : picoknow_KnowledgeBase
136 */
137
138 typedef struct klex_subobj *klex_SubObj;
139
140 typedef struct klex_subobj
141 {
142 picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */
143 picoos_uint8 *searchind;
144 picoos_uint8 *lexblocks;
145 } klex_subobj_t;
146
147
klexInitialize(register picoknow_KnowledgeBase this,picoos_Common common)148 static pico_status_t klexInitialize(register picoknow_KnowledgeBase this,
149 picoos_Common common)
150 {
151 picoos_uint32 curpos = 0;
152 klex_subobj_t *klex;
153
154 PICODBG_DEBUG(("start"));
155
156 /* check whether (this->size != 0) done before calling this function */
157
158 if (NULL == this || NULL == this->subObj) {
159 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
160 NULL, NULL);
161 }
162 klex = (klex_subobj_t *) this->subObj;
163
164 if (PICO_OK == picoos_read_mem_pi_uint16(this->base, &curpos,
165 &(klex->nrblocks))) {
166 if (klex->nrblocks > 0) {
167 PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos));
168 klex->searchind = this->base + curpos;
169 } else {
170 klex->searchind = NULL;
171 }
172 klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE +
173 (klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE));
174 return PICO_OK;
175 } else {
176 return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT,
177 NULL, NULL);
178 }
179 }
180
181
klexSubObjDeallocate(register picoknow_KnowledgeBase this,picoos_MemoryManager mm)182 static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this,
183 picoos_MemoryManager mm)
184 {
185 if (NULL != this) {
186 picoos_deallocate(mm, (void *) &this->subObj);
187 }
188 return PICO_OK;
189 }
190
191
192 /* we don't offer a specialized constructor for a LexKnowledgeBase but
193 * instead a "specializer" of an allready existing generic
194 * picoknow_KnowledgeBase */
195
picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,picoos_Common common)196 pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,
197 picoos_Common common)
198 {
199 if (NULL == this) {
200 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
201 NULL, NULL);
202 }
203 if (this->size > 0) {
204 this->subDeallocate = klexSubObjDeallocate;
205 this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t));
206 if (NULL == this->subObj) {
207 return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
208 NULL, NULL);
209 }
210 return klexInitialize(this, common);
211 } else {
212 /* some dummy klex */
213 return PICO_OK;
214 }
215 }
216
217 /* for now we don't need to do anything special for the main lex */
218 /*
219 pico_status_t picoklex_specializeMainLexKnowledgeBase(
220 picoknow_KnowledgeBase this,
221 picoos_Common common)
222 {
223 return picoklex_specializeLexKnowledgeBase(this,common);
224 }
225 */
226
227
228 /* ************************************************************/
229 /* lexicon getLex */
230 /* ************************************************************/
231
picoklex_getLex(picoknow_KnowledgeBase this)232 picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this)
233 {
234 if (NULL == this) {
235 return NULL;
236 } else {
237 return (picoklex_Lex) this->subObj;
238 }
239 }
240
241
242 /* ************************************************************/
243 /* functions on searchindex */
244 /* ************************************************************/
245
246
klex_getSearchIndexVal(const klex_SubObj this,picoos_uint16 index)247 static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this,
248 picoos_uint16 index)
249 {
250 picoos_uint32 pos, val;
251 pos = index * PICOKLEX_LEX_SIE_SIZE;
252 val = this->searchind[pos];
253 val = (val << 8) + this->searchind[pos + 1];
254 val = (val << 8) + this->searchind[pos + 2];
255 return val;
256 }
257
258
259 /* Determine first lexblock containing entries for specified
260 grapheme. */
261
klex_getLexblockNr(const klex_SubObj this,const picoos_uint8 * graphsi)262 static picoos_uint16 klex_getLexblockNr(const klex_SubObj this,
263 const picoos_uint8 *graphsi) {
264 /* graphsi is of len PICOKLEX_LEX_SI_NGRAPHS */
265 picoos_int32 low, mid, high;
266 picoos_uint32 searchval, indval;
267
268 /* PICOKLEX_LEX_SIE_NRGRAPHS */
269
270 /* convert graph-prefix to number with 'lexicographic' ordering */
271 searchval = graphsi[0];
272 searchval = (searchval << 8) + graphsi[1];
273 searchval = (searchval << 8) + graphsi[2];
274
275 low = 0;
276 high = this->nrblocks;
277
278 /* do binary search */
279 while (low < high) {
280 mid = (low + high) / 2;
281 indval = klex_getSearchIndexVal(this, mid);
282 if (indval < searchval) {
283 low = mid + 1;
284 } else {
285 high = mid;
286 }
287 }
288 PICODBG_ASSERT(high == low);
289 /* low points to the first entry greater than or equal to searchval */
290
291 if (low < this->nrblocks) {
292 indval = klex_getSearchIndexVal(this, low);
293 if (indval > searchval) {
294 low--;
295 /* if there are identical elements in the search index we have
296 to move to the first one */
297 if (low > 0) {
298 indval = klex_getSearchIndexVal(this, low);
299 while (indval == klex_getSearchIndexVal(this, low-1)) {
300 low--;
301 }
302 }
303 }
304 } else {
305 low = this->nrblocks - 1;
306 }
307
308 #if defined(PICO_DEBUG)
309 {
310 picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE;
311 PICODBG_DEBUG(("binary search result is %c%c%c (%d)",
312 this->searchind[pos], this->searchind[pos + 1],
313 this->searchind[pos + 2], low));
314 }
315 #endif
316
317 return (picoos_uint16) low;
318 }
319
320
321 /* Determine number of adjacent lexblocks containing entries for
322 the same grapheme search prefix (identified by search index). */
323
klex_getLexblockRange(const klex_SubObj this,picoos_uint16 index)324 static picoos_uint16 klex_getLexblockRange(const klex_SubObj this,
325 picoos_uint16 index)
326 {
327 picoos_uint16 count;
328 picoos_uint32 sval1, sval2;
329
330 sval1 = klex_getSearchIndexVal(this, index);
331
332 #if defined(PICO_DEBUG)
333 /* 'index' must point to first lexblock of its kind */
334 if (index > 0) {
335 sval2 = klex_getSearchIndexVal(this, index - 1);
336 PICODBG_ASSERT(sval1 != sval2);
337 }
338 #endif
339
340 index++;
341 sval2 = klex_getSearchIndexVal(this, index);
342
343 count = 1;
344 while (sval1 == sval2) {
345 count++;
346 index++;
347 sval2 = klex_getSearchIndexVal(this, index);
348 }
349
350 return count;
351 }
352
353
354 /* ************************************************************/
355 /* functions on single lexblock */
356 /* ************************************************************/
357
klex_lexMatch(picoos_uint8 * lexentry,const picoos_uint8 * graph,const picoos_uint16 graphlen)358 static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry,
359 const picoos_uint8 *graph,
360 const picoos_uint16 graphlen) {
361 picoos_uint8 i;
362 picoos_uint8 lexlen;
363 picoos_uint8 *lexgraph;
364
365 lexlen = lexentry[0] - 1;
366 lexgraph = &(lexentry[1]);
367 for (i=0; (i<graphlen) && (i<lexlen); i++) {
368 PICODBG_TRACE(("%d|%d graph|lex: %c|%c", graphlen, lexlen,
369 graph[i], lexgraph[i]));
370 if (lexgraph[i] < graph[i]) {
371 return -1;
372 } else if (lexgraph[i] > graph[i]) {
373 return 1;
374 }
375 }
376 if (graphlen == lexlen) {
377 return 0;
378 } else if (lexlen < graphlen) {
379 return -1;
380 } else {
381 return 1;
382 }
383 }
384
385
klex_setLexResult(const picoos_uint8 * lexentry,const picoos_uint32 lexpos,picoklex_lexl_result_t * lexres)386 static void klex_setLexResult(const picoos_uint8 *lexentry,
387 const picoos_uint32 lexpos,
388 picoklex_lexl_result_t *lexres) {
389 picoos_uint8 i;
390
391 /* check if :G2P */
392 if ((2 < (lexentry[lexentry[0]])) && ((lexentry[lexentry[0] + 2]) == PICOKLEX_NEEDS_G2P)) {
393 /* set pos */
394 lexres->posind[0] = lexentry[lexentry[0] + 1];
395 /* set rest */
396 lexres->phonfound = FALSE;
397 lexres->posindlen = 1;
398 lexres->nrres = 1;
399 PICODBG_DEBUG(("result %d :G2P", lexres->nrres));
400 } else {
401 i = lexres->nrres * (PICOKLEX_POSIND_SIZE);
402 lexres->posindlen += PICOKLEX_POSIND_SIZE;
403 lexres->phonfound = TRUE;
404 /* set pos */
405 lexres->posind[i++] = lexentry[lexentry[0] + 1];
406 /* set ind, PICOKLEX_IND_SIZE */
407 lexres->posind[i++] = 0x000000ff & (lexpos);
408 lexres->posind[i++] = 0x000000ff & (lexpos >> 8);
409 lexres->posind[i] = 0x000000ff & (lexpos >> 16);
410 lexres->nrres++;
411 PICODBG_DEBUG(("result %d", lexres->nrres));
412 }
413 }
414
415
klex_lexblockLookup(klex_SubObj this,const picoos_uint32 lexposStart,const picoos_uint32 lexposEnd,const picoos_uint8 * graph,const picoos_uint16 graphlen,picoklex_lexl_result_t * lexres)416 static void klex_lexblockLookup(klex_SubObj this,
417 const picoos_uint32 lexposStart,
418 const picoos_uint32 lexposEnd,
419 const picoos_uint8 *graph,
420 const picoos_uint16 graphlen,
421 picoklex_lexl_result_t *lexres) {
422 picoos_uint32 lexpos;
423 picoos_int8 rv;
424
425 lexres->nrres = 0;
426
427 lexpos = lexposStart;
428 rv = -1;
429 while ((rv < 0) && (lexpos < lexposEnd)) {
430
431 rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen);
432
433 if (rv == 0) { /* found */
434 klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres);
435 if (lexres->phonfound) {
436 /* look for more results, up to MAX_NRRES, don't even
437 check if more results would be available */
438 while ((lexres->nrres < PICOKLEX_MAX_NRRES) &&
439 (lexpos < lexposEnd)) {
440 lexpos += this->lexblocks[lexpos];
441 lexpos += this->lexblocks[lexpos];
442 /* if there are no more entries in this block, advance
443 to next block by skipping all zeros */
444 while ((this->lexblocks[lexpos] == 0) &&
445 (lexpos < lexposEnd)) {
446 lexpos++;
447 }
448 if (lexpos < lexposEnd) {
449 if (klex_lexMatch(&(this->lexblocks[lexpos]), graph,
450 graphlen) == 0) {
451 klex_setLexResult(&(this->lexblocks[lexpos]),
452 lexpos, lexres);
453 } else {
454 /* no more results, quit loop */
455 lexpos = lexposEnd;
456 }
457 }
458 }
459 } else {
460 /* :G2P mark */
461 }
462 } else if (rv < 0) {
463 /* not found, goto next entry */
464 lexpos += this->lexblocks[lexpos];
465 lexpos += this->lexblocks[lexpos];
466 /* if there are no more entries in this block, advance
467 to next block by skipping all zeros */
468 while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) {
469 lexpos++;
470 }
471 } else {
472 /* rv > 0, not found, won't show up later in block */
473 }
474 }
475 }
476
477
478 /* ************************************************************/
479 /* lexicon lookup functions */
480 /* ************************************************************/
481
picoklex_lexLookup(const picoklex_Lex this,const picoos_uint8 * graph,const picoos_uint16 graphlen,picoklex_lexl_result_t * lexres)482 picoos_uint8 picoklex_lexLookup(const picoklex_Lex this,
483 const picoos_uint8 *graph,
484 const picoos_uint16 graphlen,
485 picoklex_lexl_result_t *lexres) {
486 picoos_uint16 lbnr, lbc;
487 picoos_uint32 lexposStart, lexposEnd;
488 picoos_uint8 i;
489 picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS];
490 klex_SubObj klex = (klex_SubObj) this;
491
492 if (NULL == klex) {
493 PICODBG_ERROR(("no lexicon loaded"));
494 /* no exception here needed, already checked at initialization */
495 return FALSE;
496 }
497
498 lexres->nrres = 0;
499 lexres->posindlen = 0;
500 lexres->phonfound = FALSE;
501
502 for (i = 0; i<PICOKLEX_LEX_SIE_NRGRAPHS; i++) {
503 if (i < graphlen) {
504 tgraph[i] = graph[i];
505 } else {
506 tgraph[i] = '\0';
507 }
508 }
509 PICODBG_DEBUG(("tgraph: %c%c%c", tgraph[0],tgraph[1],tgraph[2]));
510
511 if ((klex->nrblocks) == 0) {
512 /* no searchindex, no lexblock */
513 PICODBG_WARN(("no searchindex, no lexblock"));
514 return FALSE;
515 } else {
516 lbnr = klex_getLexblockNr(klex, tgraph);
517 PICODBG_ASSERT(lbnr < klex->nrblocks);
518 lbc = klex_getLexblockRange(klex, lbnr);
519 PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks));
520 }
521 PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc));
522
523 lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE;
524 lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE;
525
526 PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd));
527 klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres);
528 PICODBG_DEBUG(("lookup done, %d found", lexres->nrres));
529
530 return (lexres->nrres > 0);
531 }
532
533
picoklex_lexIndLookup(const picoklex_Lex this,const picoos_uint8 * ind,const picoos_uint8 indlen,picoos_uint8 * pos,picoos_uint8 ** phon,picoos_uint8 * phonlen)534 picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this,
535 const picoos_uint8 *ind,
536 const picoos_uint8 indlen,
537 picoos_uint8 *pos,
538 picoos_uint8 **phon,
539 picoos_uint8 *phonlen) {
540 picoos_uint32 pentry;
541 klex_SubObj klex = (klex_SubObj) this;
542
543 /* check indlen */
544 if (indlen != PICOKLEX_IND_SIZE) {
545 return FALSE;
546 }
547
548 /* PICOKLEX_IND_SIZE */
549 pentry = 0x000000ff & (ind[0]);
550 pentry |= ((picoos_uint32)(ind[1]) << 8);
551 pentry |= ((picoos_uint32)(ind[2]) << 16);
552
553 /* check ind if it is within lexblocks byte stream, if not, return FALSE */
554 if (pentry >= ((picoos_uint32)klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE)) {
555 return FALSE;
556 }
557
558 pentry += (klex->lexblocks[pentry]);
559 *phonlen = (klex->lexblocks[pentry++]) - 2;
560 *pos = klex->lexblocks[pentry++];
561 *phon = &(klex->lexblocks[pentry]);
562
563 PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen));
564 return TRUE;
565 }
566
567 #ifdef __cplusplus
568 }
569 #endif
570
571
572 /* end */
573