1 /*
2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 /**
17 * @file picotok.c
18 *
19 * tokenizer
20 *
21 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
22 * All rights reserved.
23 *
24 * History:
25 * - 2009-04-20 -- initial version
26 *
27 */
28
29
30 /* ************************************************************/
31 /* tokenisation and markup handling */
32 /* ************************************************************/
33
34 /** @addtogroup picotok
35 @b tokenisation_overview
36
37 markup handling overview:
38
39 The following markups are recognized
40 - ignore
41 - speed
42 - pitch
43 - volume
44 - voice
45 - preproccontext
46 - mark
47 - play
48 - usesig
49 - genfile
50 - sentence
51 - s
52 - paragraph
53 - p
54 - break
 - spell (pauses between letters)
56 - phoneme
57
58 All markups which are recognized but are not yet implemented in pico
59 system have the mark.
60 */
61
62
63 #include "picodefs.h"
64 #include "picoos.h"
65 #include "picobase.h"
66 #include "picodbg.h"
67 #include "picodata.h"
68 #include "picotok.h"
69 #include "picoktab.h"
70
71 #ifdef __cplusplus
72 extern "C" {
73 #endif
74 #if 0
75 }
76 #endif
77
78 /* *****************************************************************************/
79
/* Size in bytes of the input-side string buffers (tokenStr, markupTagName, saveFile). */
#define IN_BUF_SIZE 255
/* Size of the internal output buffer: one input buffer's worth of bytes plus
   three item heads plus three extra bytes. The replacement list is
   parenthesized so that the sum is not torn apart by operator precedence when
   the macro is used inside a larger expression (e.g. 2 * OUT_BUF_SIZE). */
#define OUT_BUF_SIZE (IN_BUF_SIZE + 3 * PICODATA_ITEM_HEADSIZE + 3)

/* Size of a 'Word' buffer (markup string, attribute names and values). */
#define MARKUP_STRING_BUF_SIZE (IN_BUF_SIZE*5)
/* Number of attribute slots in a MarkupParams array. */
#define MAX_NR_MARKUP_PARAMS 6
#define MARKUP_HANDLING_DISABLED 0
#define MARKUP_HANDLING_ENABLED 1
#define EOL '\n'
88
89
90 typedef picoos_int8 pico_tokenSubType;
91 typedef picoos_uint8 pico_tokenType;
92
93 /** @todo : consider adding these specialized exception codes: */
94
95 #define PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE PICO_ERR_OTHER
96 #define PICO_ERR_INVALID_MARKUP_TAG PICO_ERR_OTHER
97 #define PICO_ERR_INTERNAL_LIMIT PICO_ERR_OTHER
98
/* Identifies a markup tag. tok_markupTagId maps a tag name to one of these
   values and returns MIDummyEnd for a name it does not know. MIDummyEnd + 1
   is also used as the length of the markupLevel array in tok_subobj. */
typedef enum {MIDummyStart, MIIgnore,
    MIPitch, MISpeed, MIVolume,
    MIVoice, MIPreprocContext, MIMarker,
    MIPlay, MIUseSig, MIGenFile, MIParagraph,
    MISentence, MIBreak, MISpell, MIPhoneme, MIItem, MISpeaker, MIDummyEnd
} MarkupId;
/* States of the markup tag parser (stored in tok_subobj.markupState).
   NOTE(review): the state transitions live outside this part of the file;
   the meaning of each state is inferred from its name only - confirm there. */
typedef enum {MSNotInMarkup, MSGotStart, MSExpectingmarkupTagName, MSInmarkupTagName,
    MSGotmarkupTagName, MSInAttrName, MSGotAttrName, MSGotEqual, MSInAttrValue,
    MSInAttrValueEscaped, MSGotAttrValue, MSGotEndSlash, MSGotEnd,
    MSError, MSErrorTooLong, MSErrorSyntax
} MarkupState;
/* Error kinds recorded for a markup tag (stored in tok_subobj.markupTagErr);
   MENone is the first enumerator. */
typedef enum {MENone, MEMissingStart, MEUnknownTag, MEIdent, MEMissingEqual,
    MEMissingQuote, MEMissingEnd, MEUnexpectedChar, MEInterprete
} MarkupParseError;

/* Kind of markup tag (stored in tok_subobj.markupTagType).
   NOTE(review): presumably start tag, end tag and empty-element tag - confirm
   against the parser. */
typedef enum {MTNone, MTStart, MTEnd, MTEmpty} MarkupTagType;
115
116 #define UTF_CHAR_COMPLETE 2
117 #define UTF_CHAR_INCOMPLETE 1
118 #define UTF_CHAR_MALFORMED 0
119
120 #define TOK_MARKUP_KW_IGNORE (picoos_uchar*)"ignore"
121 #define TOK_MARKUP_KW_SPEED (picoos_uchar*)"speed"
122 #define TOK_MARKUP_KW_PITCH (picoos_uchar*)"pitch"
123 #define TOK_MARKUP_KW_VOLUME (picoos_uchar*)"volume"
124 #define TOK_MARKUP_KW_VOICE (picoos_uchar*)"voice"
125 #define TOK_MARKUP_KW_CONTEXT (picoos_uchar*)"preproccontext"
126 #define TOK_MARKUP_KW_MARK (picoos_uchar*)"mark"
127 #define TOK_MARKUP_KW_PLAY (picoos_uchar*)"play"
128 #define TOK_MARKUP_KW_USESIG (picoos_uchar*)"usesig"
129 #define TOK_MARKUP_KW_GENFILE (picoos_uchar*)"genfile"
130 #define TOK_MARKUP_KW_SENTENCE (picoos_uchar*)"sentence"
131 #define TOK_MARKUP_KW_S (picoos_uchar*)"s"
132 #define TOK_MARKUP_KW_PARAGRAPH (picoos_uchar*)"paragraph"
133 #define TOK_MARKUP_KW_P (picoos_uchar*)"p"
134 #define TOK_MARKUP_KW_BREAK (picoos_uchar*)"break"
135 #define TOK_MARKUP_KW_SPELL (picoos_uchar*)"spell"
136 #define TOK_MARKUP_KW_PHONEME (picoos_uchar*)"phoneme"
137 #define TOK_MARKUP_KW_ITEM (picoos_uchar*)"item"
138 #define TOK_MARKUP_KW_SPEAKER (picoos_uchar*)"speaker"
139
140 #define KWLevel (picoos_uchar *)"level"
141 #define KWName (picoos_uchar *)"name"
142 #define KWProsDomain (picoos_uchar *)"prosodydomain"
143 #define KWTime (picoos_uchar *)"time"
144 #define KWMode (picoos_uchar *)"mode"
145 #define KWSB (picoos_uchar *)"sb"
146 #define KWPB (picoos_uchar *)"pb"
147 #define KWFile (picoos_uchar *)"file"
148 #define KWType (picoos_uchar *)"type"
149 #define KWF0Beg (picoos_uchar *)"f0beg"
150 #define KWF0End (picoos_uchar *)"f0end"
151 #define KWXFadeBeg (picoos_uchar *)"xfadebeg"
152 #define KWXFadeEnd (picoos_uchar *)"xfadeend"
153 #define KWAlphabet (picoos_uchar *)"alphabet"
154 #define KWPH (picoos_uchar *)"ph"
155 #define KWOrthMode (picoos_uchar *)"orthmode"
156 #define KWIgnorePunct (picoos_uchar *)"ignorepunct"
157 #define KWInfo1 (picoos_uchar *)"info1"
158 #define KWInfo2 (picoos_uchar *)"info2"
159 #define KWDATA (picoos_uchar *)"data"
160
161 #define PICO_SPEED_MIN 20
162 #define PICO_SPEED_MAX 500
163 #define PICO_SPEED_DEFAULT 100
164 #define PICO_SPEED_FACTOR_MIN 500
165 #define PICO_SPEED_FACTOR_MAX 2000
166
167 #define PICO_PITCH_MIN 50
168 #define PICO_PITCH_MAX 200
169 #define PICO_PITCH_DEFAULT 100
170 #define PICO_PITCH_FACTOR_MIN 500
171 #define PICO_PITCH_FACTOR_MAX 2000
172 #define PICO_PITCH_ADD_MIN -100
173 #define PICO_PITCH_ADD_MAX 100
174 #define PICO_PITCH_ADD_DEFAULT 0
175
176 #define PICO_VOLUME_MIN 0
177 #define PICO_VOLUME_MAX 500
178 #define PICO_VOLUME_DEFAULT 100
179 #define PICO_VOLUME_FACTOR_MIN 500
180 #define PICO_VOLUME_FACTOR_MAX 2000
181
182 #define PICO_SPEAKER_MIN 20
183 #define PICO_SPEAKER_MAX 180
184 #define PICO_SPEAKER_DEFAULT 100
185 #define PICO_SPEAKER_FACTOR_MIN 500
186 #define PICO_SPEAKER_FACTOR_MAX 2000
187
188 #define PICO_CONTEXT_DEFAULT (picoos_uchar*)"DEFAULT"
189
190 #define PARAGRAPH_PAUSE_DUR 500
191 #define SPELL_WITH_PHRASE_BREAK 1
192 #define SPELL_WITH_SENTENCE_BREAK 2
193
194 /* *****************************************************************************/
195
196 #define TOK_PUNC_FLUSH (picoos_char) '\0'
197
198 typedef picoos_uchar Word[MARKUP_STRING_BUF_SIZE];
199
200
201 struct MarkupParam {
202 Word paramId;
203 Word paramVal;
204 };
205
206 typedef struct MarkupParam MarkupParams[MAX_NR_MARKUP_PARAMS];
207
208 typedef picoos_uchar utf8char0c[5]; /* one more than needed so it is ended always with 0c*/
209
/** subobject : TokenizeUnit
 * shortcut : tok
 *
 * Working state of one tokenizer instance.
 */
typedef struct tok_subobj
{
    /* ignore nesting depth: raised by tok_startIgnore, lowered (never below 0)
       by tok_endIgnore; tok_putItem emits non-flush items only while <= 0 */
    picoos_int32 ignLevel;

    /* UTF-8 character currently being assembled by tok_putToUtf */
    utf8char0c utf;
    picoos_int32 utfpos; /* number of bytes stored in utf so far */
    picoos_int32 utflen; /* expected byte count of the character; set to 0 on a bad continuation byte */

    /* markup tag being parsed; markupParams holds the attribute name/value
       pairs read by tok_interpretMarkup.
       NOTE(review): the remaining markup fields are maintained by code outside
       this part of the file - confirm their exact roles there. */
    MarkupParams markupParams;
    picoos_int32 nrMarkupParams;
    MarkupState markupState;
    picoos_uchar markupStr[MARKUP_STRING_BUF_SIZE];
    picoos_int32 markupPos;
    picoos_int32 markupLevel[MIDummyEnd+1]; /* one slot per MarkupId value */
    picoos_uchar markupTagName[IN_BUF_SIZE];
    MarkupTagType markupTagType;
    MarkupParseError markupTagErr;

    picoos_int32 strPos;
    picoos_uchar strDelim;
    picoos_bool isFileAttr;

    pico_tokenType tokenType;
    pico_tokenSubType tokenSubType;

    picoos_int32 tokenPos;
    picoos_uchar tokenStr[IN_BUF_SIZE];

    picoos_int32 nrEOL;

    picoos_bool markupHandlingMode; /* to be moved ??? */
    picoos_bool aborted; /* to be moved ??? */

    picoos_bool start;

    picoos_uint8 outBuf[OUT_BUF_SIZE]; /* internal output buffer */
    picoos_uint16 outReadPos; /* next pos to read from outBuf */
    picoos_uint16 outWritePos; /* next pos to write to outBuf */

    /* file name given by the last "genfile" start tag; empty string when no
       save is active (set and cleared in tok_interpretMarkup) */
    picoos_uchar saveFile[IN_BUF_SIZE];
    Word phonemes;

    picotrns_SimpleTransducer transducer;

    /* kbs */

    picoktab_Graphs graphTab;
    picokfst_FST xsampa_parser;
    picokfst_FST svoxpa_parser;
    picokfst_FST xsampa2svoxpa_mapper;



} tok_subobj_t;
267
268 /* *****************************************************************************/
269
270 static void tok_treatMarkupAsSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok);
271 static void tok_treatChar (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar ch, picoos_bool markupHandling);
272 static void tok_treatMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok);
273 static void tok_putToMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[]);
274 static void tok_treatSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok);
275 static MarkupId tok_markupTagId (picoos_uchar tagId[]);
276
277 /* *****************************************************************************/
278
tok_strEqual(picoos_uchar * str1,picoos_uchar * str2)279 static picoos_bool tok_strEqual(picoos_uchar * str1, picoos_uchar * str2)
280 {
281 return (picoos_strcmp((picoos_char*)str1, (picoos_char*)str2) == 0);
282 }
283
/* Normalizes the blanks of 'str' in place: leading and trailing blanks are
   removed and every run of blanks inside the string is replaced by exactly
   one blank. Only the character ' ' counts as a blank. */
static void tok_reduceBlanks(picoos_uchar * str)
{
    const picoos_uchar blank = (picoos_uchar)' ';
    picoos_uchar * src = str;   /* next character to read */
    picoos_uchar * dst = str;   /* next position to write; never ahead of src */

    while (*src != 0) {
        if (*src != blank) {
            *dst++ = *src++;
            continue;
        }
        /* start of a run of blanks: keep one, unless nothing was written yet */
        if (dst != str) {
            *dst++ = blank;
        }
        do {
            src++;
        } while (*src == blank);
    }

    /* a run at the end of the string left a single blank behind; drop it */
    if ((dst != str) && (dst[-1] == ' ')) {
        dst--;
    }
    *dst = 0;
}
316
317
/* Enters one more level of "ignore" by raising tok->ignLevel by one. */
static void tok_startIgnore (tok_subobj_t * tok)
{
    tok->ignLevel += 1;
}
322
323
/* Leaves one level of "ignore". The level is never lowered below zero, so an
   unmatched end has no effect. */
static void tok_endIgnore (tok_subobj_t * tok)
{
    if (tok->ignLevel <= 0) {
        return;
    }
    tok->ignLevel -= 1;
}
330
331
/* Looks up the attribute named 'paramId' among the MAX_NR_MARKUP_PARAMS
   entries of 'params' and converts its value with picoos_atoi.

   params     : attribute name/value pairs of the current markup tag
   paramId    : attribute name to search for
   paramVal   : out; converted value of the first matching entry, -1 if none
   paramFound : out; TRUE if an entry with that name exists, FALSE otherwise */
static void tok_getParamIntVal (MarkupParams params, picoos_uchar paramId[], picoos_int32 * paramVal, picoos_bool * paramFound)
{
    int idx;

    for (idx = 0; idx < MAX_NR_MARKUP_PARAMS; idx++) {
        if (tok_strEqual(paramId, params[idx].paramId)) {
            (*paramVal) = picoos_atoi((picoos_char*)params[idx].paramVal);
            (*paramFound) = TRUE;
            return;
        }
    }
    (*paramVal) = -1;
    (*paramFound) = FALSE;
}
347
348
349
/* Looks up the attribute named 'paramId' among the MAX_NR_MARKUP_PARAMS
   entries of 'params' and copies its value to 'paramStrVal'.

   params      : attribute name/value pairs of the current markup tag
   paramId     : attribute name to search for
   paramStrVal : out; value of the first matching entry, empty string if none
   paramFound  : out; TRUE if an entry with that name exists, FALSE otherwise

   NOTE(review): the copy is done with picoos_strcpy, i.e. without a length
   limit, while a stored value is a 'Word' of MARKUP_STRING_BUF_SIZE bytes.
   'paramStrVal' must therefore be able to hold a full Word; callers that pass
   a smaller buffer can be overrun by a long attribute value. Fixing this needs
   a size parameter and a change at every call site. */
static void tok_getParamStrVal (MarkupParams params, picoos_uchar paramId[], picoos_uchar paramStrVal[], picoos_bool * paramFound)
{
    int idx;

    for (idx = 0; idx < MAX_NR_MARKUP_PARAMS; idx++) {
        if (tok_strEqual(paramId, params[idx].paramId)) {
            picoos_strcpy((picoos_char*)paramStrVal, (picoos_char*)params[idx].paramVal);
            (*paramFound) = TRUE;
            return;
        }
    }
    paramStrVal[0] = 0;
    (*paramFound) = FALSE;
}
365
366
/* Looks up the attribute named 'paramId' and, if the alphabet is acceptable,
   copies its value to 'phones'.

   params     : attribute name/value pairs of the current markup tag
   paramId    : attribute name to search for
   alphabet   : alphabet name; the value is copied only when this equals
                PICODATA_XSAMPA or is the empty string
   phones     : out; the attribute value (copied with picoos_strlcpy, limited
                by 'phoneslen'), or the empty string when nothing was copied
   phoneslen  : size limit handed to picoos_strlcpy
   paramFound : out; TRUE if the attribute exists (even when its value was not
                copied because of the alphabet), FALSE otherwise */
static void tok_getParamPhonesStr (MarkupParams params, picoos_uchar paramId[], picoos_uchar alphabet[], picoos_uchar phones[], picoos_int32 phoneslen, picoos_bool * paramFound)
{
    int idx;
    picoos_bool copied = FALSE;

    for (idx = 0; idx < MAX_NR_MARKUP_PARAMS; idx++) {
        if (tok_strEqual(paramId, params[idx].paramId)) {
            break;
        }
    }

    if (idx >= MAX_NR_MARKUP_PARAMS) {
        (*paramFound) = FALSE;
    } else {
        if (tok_strEqual(alphabet, PICODATA_XSAMPA) || tok_strEqual(alphabet, (picoos_uchar*)"")) {
            picoos_strlcpy((picoos_char*)phones, (picoos_char*)params[idx].paramVal, phoneslen);
            copied = TRUE;
        }
        (*paramFound) = TRUE;
    }

    if (!copied) {
        phones[0] = 0;
    }
}
393
394
/* Resets all MAX_NR_MARKUP_PARAMS entries of 'params' by making both the
   attribute name and the attribute value the empty string. */
static void tok_clearMarkupParams (MarkupParams params)
{
    struct MarkupParam * entry;

    for (entry = params; entry < params + MAX_NR_MARKUP_PARAMS; entry++) {
        entry->paramId[0] = 0;
        entry->paramVal[0] = 0;
    }
}
404
405
/* Parses a duration such as "250ms" or "2 s" into milliseconds.

   durStr : 0-terminated input; a decimal number followed by the unit "s" or
            "ms". Blanks around the number and the unit are tolerated.
   dur    : out; the duration in milliseconds, 0 if the string is not accepted
   done   : out; TRUE if the string was accepted, FALSE otherwise

   The digits are scanned in the local, blank-reduced copy. Scanning 'durStr'
   while blanking out the copy at the same index (as was done before) wrote
   past the end of the copy for long digit strings and failed on leading
   blanks. The number saturates instead of overflowing. */
static void tok_getDur (picoos_uchar durStr[], picoos_uint32 * dur, picoos_bool * done)
{
    /* saturation limit for the number; 1000 * maxNum still fits in 32 bits
       and 10 * maxNum + 9 cannot overflow either */
    const picoos_uint32 maxNum = 4000000;
    picoos_uint32 num = 0;
    int i = 0;
    picoos_uchar tmpWord[IN_BUF_SIZE];

    picoos_strlcpy((picoos_char*)tmpWord, (picoos_char*)durStr, sizeof(tmpWord));
    tok_reduceBlanks(tmpWord);
    /* consume the leading digits; the loop ends at the latest at the
       terminating 0 of tmpWord */
    while ((tmpWord[i] >= '0') && (tmpWord[i] <= '9')) {
        num = 10 * num + (picoos_uint32)(tmpWord[i] - '0');
        if (num > maxNum) {
            num = maxNum;
        }
        tmpWord[i] = ' ';
        i++;
    }
    /* what remains after removing the blanked-out digits must be the unit */
    tok_reduceBlanks(tmpWord);
    if (tok_strEqual(tmpWord, (picoos_uchar*)"s")) {
        (*dur) = (1000 * num);
        (*done) = TRUE;
    } else if (tok_strEqual(tmpWord,(picoos_uchar*)"ms")) {
        (*dur) = num;
        (*done) = TRUE;
    } else {
        (*dur) = 0;
        (*done) = FALSE;
    }
}
432
433
tok_putToUtf(tok_subobj_t * tok,picoos_uchar ch)434 static picoos_int32 tok_putToUtf (tok_subobj_t * tok, picoos_uchar ch)
435 {
436 if (tok->utfpos < PICOBASE_UTF8_MAXLEN) {
437 tok->utf[tok->utfpos] = ch;
438 if (tok->utfpos == 0) {
439 tok->utflen = picobase_det_utf8_length(ch);
440 } else if (((ch < (picoos_uchar)'\200') || (ch >= (picoos_uchar)'\300'))) {
441 tok->utflen = 0;
442 }
443 (tok->utfpos)++;
444 if ((tok->utfpos == tok->utflen)) {
445 if ((tok->utfpos < PICOBASE_UTF8_MAXLEN)) {
446 tok->utf[tok->utfpos] = 0;
447 }
448 return UTF_CHAR_COMPLETE;
449 } else if (tok->utfpos < tok->utflen) {
450 return UTF_CHAR_INCOMPLETE;
451 } else {
452 return UTF_CHAR_MALFORMED;
453 }
454 } else {
455 return UTF_CHAR_MALFORMED;
456 }
457 }
458
459
tok_isRelative(picoos_uchar strval[],picoos_uint32 * val)460 static picoos_bool tok_isRelative (picoos_uchar strval[], picoos_uint32 * val)
461 {
462 picoos_int32 len;
463 picoos_bool rel;
464
465 rel = FALSE;
466 len = picoos_strlen((picoos_char*)strval);
467 if (len > 0) {
468 if (strval[len - 1] == '%') {
469 strval[len - 1] = 0;
470 if ((strval[0] == '+') || (strval[0] == '-')) {
471 (*val) = 1000 + (picoos_atoi((picoos_char*)strval) * 10);
472 } else {
473 (*val) = picoos_atoi((picoos_char*)strval) * 10;
474 }
475 rel = TRUE;
476 }
477 }
478 return rel;
479 }
480
481
/* Appends the four head bytes of an item (type, info1, info2, content length)
   to tok->outBuf. The caller has to make sure that there is room. */
static void tok_putItemHead (tok_subobj_t * tok, picoos_uint8 itemType,
                             picoos_uint8 info1, picoos_uint8 info2, picoos_uint8 len)
{
    tok->outBuf[tok->outWritePos++] = itemType;
    tok->outBuf[tok->outWritePos++] = info1;
    tok->outBuf[tok->outWritePos++] = info2;
    tok->outBuf[tok->outWritePos++] = len;
}

/* Appends an item whose content is the 0-terminated string 'str' (without the
   terminating 0). Nothing is written, and a debug warning is issued, if the
   string does not fit into the one-byte length field or into the buffer. */
static void tok_putStrItem (tok_subobj_t * tok, picoos_uint8 itemType,
                            picoos_uint8 info1, picoos_uint8 info2, picoos_uchar str[])
{
    picoos_int32 len, i;

    len = picoos_strlen((picoos_char*)str);
    if (len > 255) {
        /* the length is stored in a single byte; a longer string would be
           written with a wrong length and corrupt the item stream */
        PICODBG_WARN(("tok_putItem: item content too long"));
    } else if (tok->outWritePos + 4 + len < OUT_BUF_SIZE) {
        tok_putItemHead(tok, itemType, info1, info2, (picoos_uint8)len);
        for (i=0; i<len; i++) {
            tok->outBuf[tok->outWritePos++] = str[i];
        }
    } else {
        PICODBG_WARN(("tok_putItem: output buffer too small"));
    }
}

/* Appends one item to the internal output buffer tok->outBuf.

   this     : processing unit (not used here)
   tok      : tokenizer state; outBuf and outWritePos are updated
   itemType : PICODATA_ITEM_CMD or PICODATA_ITEM_TOKEN
   info1    : for commands the command kind (PICODATA_ITEMINFO1_CMD_*)
   info2    : second info byte, stored as given
   val      : 16 bit value for the speed, pitch, volume, spell, sil and
              speaker commands; stored low byte first
   str      : content for tokens and for the commands that carry a string

   A flush command is emitted regardless of the ignore level and has no
   content. All other items are emitted only while tok->ignLevel <= 0.
   An item that does not fit is dropped with a debug warning; this now also
   holds for the flush command, which was written unchecked before. Unknown
   item or command types only produce a debug warning. */
static void tok_putItem (picodata_ProcessingUnit this, tok_subobj_t * tok,
                         picoos_uint8 itemType, picoos_uint8 info1, picoos_uint8 info2,
                         picoos_uint16 val,
                         picoos_uchar str[])
{
    if ((itemType == PICODATA_ITEM_CMD) && (info1 == PICODATA_ITEMINFO1_CMD_FLUSH)) {
        /* reject only a write that would run past the end of the buffer, so
           that every flush which could be stored before is still stored */
        if (tok->outWritePos + 4 <= OUT_BUF_SIZE) {
            tok_putItemHead(tok, itemType, info1, info2, 0);
        } else {
            PICODBG_WARN(("tok_putItem: output buffer too small"));
        }
        return;
    }

    if (tok->ignLevel > 0) {
        return;
    }

    switch (itemType) {
        case PICODATA_ITEM_CMD:
            switch (info1) {
                case PICODATA_ITEMINFO1_CMD_CONTEXT:
                case PICODATA_ITEMINFO1_CMD_VOICE:
                case PICODATA_ITEMINFO1_CMD_MARKER:
                case PICODATA_ITEMINFO1_CMD_PLAY:
                case PICODATA_ITEMINFO1_CMD_SAVE:
                case PICODATA_ITEMINFO1_CMD_UNSAVE:
                case PICODATA_ITEMINFO1_CMD_PROSDOMAIN:
                case PICODATA_ITEMINFO1_CMD_PHONEME:
                    /* commands with string content */
                    tok_putStrItem(tok, itemType, info1, info2, str);
                    break;
                case PICODATA_ITEMINFO1_CMD_IGNSIG:
                case PICODATA_ITEMINFO1_CMD_IGNORE:
                    /* commands without content */
                    if (tok->outWritePos + 4 < OUT_BUF_SIZE) {
                        tok_putItemHead(tok, itemType, info1, info2, 0);
                    }
                    else {
                        PICODBG_WARN(("tok_putItem: output buffer too small"));
                    }
                    break;
                case PICODATA_ITEMINFO1_CMD_SPEED:
                case PICODATA_ITEMINFO1_CMD_PITCH:
                case PICODATA_ITEMINFO1_CMD_VOLUME:
                case PICODATA_ITEMINFO1_CMD_SPELL:
                case PICODATA_ITEMINFO1_CMD_SIL:
                case PICODATA_ITEMINFO1_CMD_SPEAKER:
                    /* commands with a 16 bit value, low byte first */
                    if (tok->outWritePos + 4 + 2 < OUT_BUF_SIZE) {
                        tok_putItemHead(tok, itemType, info1, info2, 2);
                        tok->outBuf[tok->outWritePos++] = val % 256;
                        tok->outBuf[tok->outWritePos++] = val / 256;
                    }
                    else {
                        PICODBG_WARN(("tok_putItem: output buffer too small"));
                    }
                    break;
                default:
                    PICODBG_WARN(("tok_putItem: unknown command type"));
            }
            break;
        case PICODATA_ITEM_TOKEN:
            tok_putStrItem(tok, itemType, info1, info2, str);
            break;
        default:
            PICODBG_WARN(("tok_putItem: unknown item type"));
    }
}
575
576
/* Appends a raw item (head plus 'len' content bytes) to tok->outBuf.

   this  : processing unit (not used here)
   tok   : tokenizer state; outBuf and outWritePos are updated
   type  : item type; nothing is written unless is_valid_itemtype accepts it
   info1 : first info byte, stored as given
   info2 : second info byte, stored as given
   len   : number of content bytes
   data  : the content bytes

   Unlike tok_putItem, the ignore level is not consulted here.
   The item is written only if head and content fit into the buffer; without
   this check a nearly full buffer was overrun. An item that does not fit is
   dropped with a debug warning. */
static void tok_putItem2 (picodata_ProcessingUnit this, tok_subobj_t * tok,
                          picoos_uint8 type,
                          picoos_uint8 info1, picoos_uint8 info2,
                          picoos_uint8 len,
                          picoos_uint8 data[])
{
    picoos_int32 i;

    if (!is_valid_itemtype(type)) {
        return;
    }
    /* 4 head bytes plus the content must end inside outBuf */
    if (tok->outWritePos + 4 + len > OUT_BUF_SIZE) {
        PICODBG_WARN(("tok_putItem2: output buffer too small"));
        return;
    }
    tok->outBuf[tok->outWritePos++] = type;
    tok->outBuf[tok->outWritePos++] = info1;
    tok->outBuf[tok->outWritePos++] = info2;
    tok->outBuf[tok->outWritePos++] = len;
    for (i=0; i<len; i++) {
        tok->outBuf[tok->outWritePos++] = data[i];
    }
}
595
596
tok_markupTagId(picoos_uchar tagId[])597 static MarkupId tok_markupTagId (picoos_uchar tagId[])
598 {
599 if (picoos_strstr(tagId,(picoos_char *)"svox:") == (picoos_char *)tagId) {
600 tagId+=5;
601 }
602 if (tok_strEqual(tagId, TOK_MARKUP_KW_IGNORE)) {
603 return MIIgnore;
604 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPEED)) {
605 return MISpeed;
606 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PITCH)) {
607 return MIPitch;
608 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_VOLUME)) {
609 return MIVolume;
610 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPEAKER)) {
611 return MISpeaker;
612 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_VOICE)) {
613 return MIVoice;
614 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_CONTEXT)) {
615 return MIPreprocContext;
616 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_MARK)) {
617 return MIMarker;
618 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PLAY)) {
619 return MIPlay;
620 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_USESIG)) {
621 return MIUseSig;
622 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_GENFILE)) {
623 return MIGenFile;
624 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SENTENCE) || tok_strEqual(tagId, TOK_MARKUP_KW_S)) {
625 return MISentence;
626 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PARAGRAPH) || tok_strEqual(tagId, TOK_MARKUP_KW_P)) {
627 return MIParagraph;
628 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_BREAK)) {
629 return MIBreak;
630 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPELL)) {
631 return MISpell;
632 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PHONEME)) {
633 return MIPhoneme;
634 } else if (tok_strEqual(tagId, TOK_MARKUP_KW_ITEM)) {
635 return MIItem;
636 } else {
637 return MIDummyEnd;
638 }
639 }
640
641
/* Forces '*value' into the range [min, max].

   this      : processing unit; its exception manager receives the warning
   value     : in/out; replaced by 'min' or 'max' when it lies outside
   min, max  : inclusive limits
   valueType : name of the value, used in the warning text

   A value inside the range is left alone and nothing is reported. Otherwise a
   PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE warning carrying the offending value is
   raised before the value is replaced. */
static void tok_checkLimits (picodata_ProcessingUnit this, picoos_uint32 * value, picoos_uint32 min, picoos_uint32 max, picoos_uchar valueType[])
{
    picoos_uint32 requested = (*value);

    if ((requested >= min) && (requested <= max)) {
        return;
    }
    picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE, (picoos_char*)"", (picoos_char*)"attempt to set illegal value %i for %s", requested, valueType);
    /* below the lower limit takes precedence; otherwise it is above the upper one */
    (*value) = (requested < min) ? min : max;
}
653
654
655
656 /*
657
658 static void tok_checkRealLimits (picodata_ProcessingUnit this, picoos_single * value, picoos_single min, picoos_single max, picoos_uchar valueType[])
659 {
660 if ((((*value) < min) || ((*value) > max))) {
661 picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE, (picoos_char*)"", (picoos_char*)"attempt to set illegal value %f for %s", *value, valueType);
662 if (((*value) < min)) {
663 (*value) = min;
664 } else if (((*value) > max)) {
665 (*value) = max;
666 }
667 }
668 }
669 */
670
671 #define VAL_STR_LEN 21
672
tok_interpretMarkup(picodata_ProcessingUnit this,tok_subobj_t * tok,picoos_bool isStartTag,MarkupId mId)673 static void tok_interpretMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_bool isStartTag, MarkupId mId)
674 {
675 picoos_bool done;
676 picoos_int32 ival;
677 picoos_uint32 uval;
678 picoos_int32 ival2;
679 picoos_uchar valStr[VAL_STR_LEN];
680 picoos_uchar valStr2[VAL_STR_LEN];
681 picoos_uchar valStr3[VAL_STR_LEN];
682 picoos_int32 i2;
683 picoos_uint32 dur;
684 picoos_bool done1;
685 picoos_bool paramFound;
686 picoos_uint8 type, info1, info2;
687 picoos_uint8 data[256];
688 picoos_int32 pos, n, len;
689 picoos_uchar part[10];
690
691 done = FALSE;
692 switch (mId) {
693 case MIIgnore:
694 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
695 tok_startIgnore(tok);
696 done = TRUE;
697 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
698 tok_endIgnore(tok);
699 done = TRUE;
700 }
701 break;
702 case MISpeed:
703 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
704 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
705 tok_checkLimits(this, & uval, PICO_SPEED_FACTOR_MIN, PICO_SPEED_FACTOR_MAX,(picoos_uchar*)"relative speed factor");
706 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
707 } else {
708 uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
709 tok_checkLimits(this, & uval, PICO_SPEED_MIN, PICO_SPEED_MAX,(picoos_uchar*)"speed");
710 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
711 }
712 done = TRUE;
713 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
714 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_SPEED_DEFAULT, (picoos_uchar*)"");
715 done = TRUE;
716 }
717 break;
718 case MIPitch:
719 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
720 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
721 tok_checkLimits(this, & uval,PICO_PITCH_FACTOR_MIN,PICO_PITCH_FACTOR_MAX, (picoos_uchar*)"relative pitch factor");
722 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
723 } else {
724 uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
725 tok_checkLimits(this, & uval,PICO_PITCH_MIN,PICO_PITCH_MAX, (picoos_uchar*)"pitch");
726 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH,PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
727 }
728 done = TRUE;
729 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
730 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH,PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_PITCH_DEFAULT, (picoos_uchar*)"");
731 done = TRUE;
732 }
733 break;
734 case MIVolume:
735 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
736 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
737 tok_checkLimits(this, & uval, PICO_VOLUME_FACTOR_MIN, PICO_VOLUME_FACTOR_MAX, (picoos_uchar*)"relative volume factor");
738 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
739 } else {
740 uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
741 tok_checkLimits(this, & uval, PICO_VOLUME_MIN, PICO_VOLUME_MAX, (picoos_uchar*)"volume");
742 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
743 }
744 done = TRUE;
745 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
746 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_VOLUME_DEFAULT, (picoos_uchar*)"");
747 done = TRUE;
748 }
749 break;
750 case MISpeaker:
751 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
752 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
753 tok_checkLimits(this, & uval, PICO_SPEAKER_FACTOR_MIN, PICO_SPEAKER_FACTOR_MAX, (picoos_uchar*)"relative speaker factor");
754 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
755 } else {
756 uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
757 tok_checkLimits(this, & uval, PICO_SPEAKER_MIN, PICO_SPEAKER_MAX, (picoos_uchar*)"volume");
758 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
759 }
760 done = TRUE;
761 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
762 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_SPEAKER_DEFAULT, (picoos_uchar*)"");
763 done = TRUE;
764 }
765 break;
766
767 case MIVoice:
768 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) {
769 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOICE, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal);
770 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
771 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 0, 0, (picoos_uchar*)"");
772 done = TRUE;
773 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
774 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOICE, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
775 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
776 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 0, 0, (picoos_uchar*)"");
777 done = TRUE;
778 }
779 break;
780 case MIPreprocContext:
781 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) {
782 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_CONTEXT, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal);
783 done = TRUE;
784 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
785 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_CONTEXT, PICODATA_ITEMINFO2_NA, 0, PICO_CONTEXT_DEFAULT);
786 done = TRUE;
787 }
788 break;
789 case MIMarker:
790 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) {
791 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_MARKER, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal);
792 done = TRUE;
793 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
794 done = TRUE;
795 }
796 break;
797 case MISentence:
798 if (isStartTag) {
799 tok_getParamStrVal(tok->markupParams, KWProsDomain, (picoos_uchar*)valStr, & paramFound);
800 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
801 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 2, 0, valStr);
802 done = TRUE;
803 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
804 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
805 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 2, 0, (picoos_uchar*)"");
806 done = TRUE;
807 }
808 break;
809 case MIParagraph:
810 if (isStartTag) {
811 tok_getParamStrVal(tok->markupParams, KWProsDomain, (picoos_uchar*)valStr, & paramFound);
812 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
813 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 1, 0, valStr);
814 done = TRUE;
815 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
816 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
817 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SIL, PICODATA_ITEMINFO2_NA, PARAGRAPH_PAUSE_DUR, (picoos_uchar*)"");
818 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 1, 0, (picoos_uchar*)"");
819 done = TRUE;
820 }
821 break;
822 case MIBreak:
823 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWTime)) {
824 tok_getDur(tok->markupParams[0].paramVal, & dur, & done1);
825 tok_checkLimits (this, &dur, 0, 65535, (picoos_uchar*)"time");
826 if (done1) {
827 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SIL, PICODATA_ITEMINFO2_NA, dur, (picoos_uchar*)"");
828 done = TRUE;
829 }
830 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
831 done = TRUE;
832 }
833 break;
834 case MISpell:
835 if (isStartTag) {
836 if (tok_strEqual(tok->markupParams[0].paramId, KWMode)) {
837 if (tok_strEqual(tok->markupParams[0].paramVal, KWPB)) {
838 uval = SPELL_WITH_PHRASE_BREAK;
839 } else if (tok_strEqual(tok->markupParams[0].paramVal, KWSB)) {
840 uval = SPELL_WITH_SENTENCE_BREAK;
841 } else {
842 tok_getDur(tok->markupParams[0].paramVal, & uval, & done1);
843 tok_checkLimits (this, & uval, 0, 65535, (picoos_uchar*)"time");
844 if (done1) {
845 done = TRUE;
846 }
847 }
848 } else {
849 uval = SPELL_WITH_PHRASE_BREAK;
850 }
851 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPELL, PICODATA_ITEMINFO2_CMD_START, uval, (picoos_uchar*)"");
852 done = TRUE;
853 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
854 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPELL, PICODATA_ITEMINFO2_CMD_END, 0, (picoos_uchar*)"");
855 done = TRUE;
856 }
857 break;
858 case MIGenFile:
859 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) {
860 if (tok->saveFile[0] != 0) {
861 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_UNSAVE,
862 picodata_getPuTypeFromExtension(tok->saveFile, /*input*/FALSE), 0, tok->saveFile);
863 tok->saveFile[0] = 0;
864 }
865 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SAVE,
866 picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/FALSE), 0, tok->markupParams[0].paramVal);
867 picoos_strcpy((picoos_char*)tok->saveFile, (picoos_char*)tok->markupParams[0].paramVal);
868 done = TRUE;
869 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
870 if (tok->saveFile[0] != 0) {
871 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_UNSAVE,
872 picodata_getPuTypeFromExtension(tok->saveFile, /*input*/FALSE), 0, (picoos_uchar*)"");
873 tok->saveFile[0] = 0;
874 }
875 done = TRUE;
876 }
877 break;
878 case MIPlay:
879 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) {
880 if (picoos_FileExists(this->common, (picoos_char*)tok->markupParams[0].paramVal)) {
881 tok_getParamIntVal(tok->markupParams,KWF0Beg,& ival,& paramFound);
882 tok_getParamIntVal(tok->markupParams,KWF0End,& ival2,& paramFound);
883 tok_getParamStrVal(tok->markupParams,KWAlphabet,valStr3,& paramFound);
884 tok_getParamPhonesStr(tok->markupParams,KWXFadeBeg,valStr3,valStr,VAL_STR_LEN,& paramFound);
885 tok_getParamPhonesStr(tok->markupParams,KWXFadeEnd,valStr3,valStr2,VAL_STR_LEN,& paramFound);
886 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PLAY,
887 picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/TRUE), 0, tok->markupParams[0].paramVal);
888 tok_startIgnore(tok);
889 } else {
890 if (tok->ignLevel > 0) {
891 tok_startIgnore(tok);
892 } else {
893 picoos_emRaiseWarning(this->common->em, PICO_EXC_CANT_OPEN_FILE, (picoos_char*)"", (picoos_char*)"file '%s' not found; synthesizing enclosed text instead\n", tok->markupParams[0].paramVal);
894 }
895 }
896 done = TRUE;
897 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
898 tok_endIgnore(tok);
899 done = TRUE;
900 }
901 break;
902 case MIUseSig:
903 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) {
904 if (picoos_FileExists(this->common, (picoos_char*)tok->markupParams[0].paramVal)) {
905 tok_getParamIntVal(tok->markupParams,KWF0Beg,& ival,& paramFound);
906 tok_getParamIntVal(tok->markupParams,KWF0End,& ival2,& paramFound);
907 tok_getParamStrVal(tok->markupParams,KWAlphabet,valStr3, & paramFound);
908 tok_getParamPhonesStr(tok->markupParams,KWXFadeBeg,valStr3,valStr,VAL_STR_LEN,& paramFound);
909 tok_getParamPhonesStr(tok->markupParams,KWXFadeEnd,valStr3,valStr2,VAL_STR_LEN,& paramFound);
910 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PLAY,
911 picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/TRUE), 0, tok->markupParams[0].paramVal);
912 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_IGNSIG, PICODATA_ITEMINFO2_CMD_START, 0, (picoos_uchar*)"");
913 } else {
914 if (tok->ignLevel <= 0) {
915 picoos_emRaiseWarning(this->common->em, PICO_EXC_CANT_OPEN_FILE, (picoos_char*)"", (picoos_char*)"file '%s' not found; synthesizing enclosed text instead", tok->markupParams[0].paramVal);
916 }
917 }
918 done = TRUE;
919 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
920 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_IGNSIG, PICODATA_ITEMINFO2_CMD_END, 0, (picoos_uchar*)"");
921 done = TRUE;
922 }
923 break;
924 case MIPhoneme:
925 i2 = 0;
926 if (isStartTag) {
927 if (tok_strEqual(tok->markupParams[0].paramId, KWAlphabet) && tok_strEqual(tok->markupParams[1].paramId, KWPH)) {
928 if (tok_strEqual(tok->markupParams[2].paramId, KWOrthMode)
929 && tok_strEqual(tok->markupParams[2].paramVal, KWIgnorePunct)) {
930 i2 = 1;
931 }
932 if (picodata_mapPAStrToPAIds(tok->transducer, this->common, tok->xsampa_parser, tok->svoxpa_parser, tok->xsampa2svoxpa_mapper, tok->markupParams[1].paramVal, tok->markupParams[0].paramVal, tok->phonemes, sizeof(tok->phonemes)-1) == PICO_OK) {
933 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME,
934 PICODATA_ITEMINFO2_CMD_START, i2, tok->phonemes);
935 done = TRUE;
936 } else {
937 PICODBG_WARN(("cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal));
938 picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE,(picoos_char*)"", (picoos_char*)"cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal);
939 done = TRUE;
940 }
941 } else if (tok_strEqual(tok->markupParams[0].paramId, KWPH)) {
942 if (tok_strEqual(tok->markupParams[1].paramId, KWOrthMode)
943 && tok_strEqual(tok->markupParams[1].paramVal, KWIgnorePunct)) {
944 i2 = 1;
945 }
946 if (picodata_mapPAStrToPAIds(tok->transducer, this->common, tok->xsampa_parser, tok->svoxpa_parser, tok->xsampa2svoxpa_mapper, tok->markupParams[0].paramVal, PICODATA_XSAMPA, tok->phonemes, sizeof(tok->phonemes)) == PICO_OK) {
947 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME,
948 PICODATA_ITEMINFO2_CMD_START, i2, tok->phonemes);
949 done = TRUE;
950 }
951 else {
952 PICODBG_WARN(("cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal));
953 picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE,(picoos_char*)"", (picoos_char*)"cannot map phonetic string '%s'; synthesizing text instead", tok->markupParams[0].paramVal);
954 done = TRUE;
955 }
956 }
957 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
958 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME,
959 PICODATA_ITEMINFO2_CMD_END, i2, (picoos_uchar*)"");
960 done = TRUE;
961 }
962 break;
963 case MIItem:
964 if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWType) &&
965 tok_strEqual(tok->markupParams[1].paramId, KWInfo1)&&
966 tok_strEqual(tok->markupParams[2].paramId, KWInfo2)&&
967 tok_strEqual(tok->markupParams[3].paramId, KWDATA)) {
968 picoos_int32 len2, n2;
969 type = picoos_atoi(tok->markupParams[0].paramVal);
970 info1 = picoos_atoi(tok->markupParams[1].paramVal);
971 info2 = picoos_atoi(tok->markupParams[2].paramVal);
972 n = 0; n2 = 0;
973 len2 = (picoos_int32)picoos_strlen(tok->markupParams[3].paramVal);
974 while (n<len2) {
975 while ((tok->markupParams[3].paramVal[n] != 0) && (tok->markupParams[3].paramVal[n] <= 32)) {
976 n++;
977 }
978 tok->markupParams[3].paramVal[n2] = tok->markupParams[3].paramVal[n];
979 n++;
980 n2++;
981 }
982 if (is_valid_itemtype(type)) {
983 done = TRUE;
984 len = 0;
985 pos = 0;
986 picoos_get_sep_part_str(tok->markupParams[3].paramVal, picoos_strlen(tok->markupParams[3].paramVal),
987 &pos, ',', part, 10, &done1);
988 while (done && done1) {
989 n = picoos_atoi(part);
990 if ((n>=0) && (n<256) && (len<256)) {
991 data[len++] = n;
992 }
993 else {
994 done = FALSE;
995 }
996 picoos_get_sep_part_str(tok->markupParams[3].paramVal, picoos_strlen(tok->markupParams[3].paramVal),
997 &pos, ',', part, 10, &done1);
998 }
999 if (done) {
1000 tok_putItem2(this, tok, type, info1, info2, len, data);
1001 }
1002 }
1003 else {
1004 done = FALSE;
1005 }
1006 } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
1007 done = TRUE;
1008 }
1009 break;
1010 default:
1011 break;
1012 }
1013 if (!done) {
1014 tok->markupTagErr = MEInterprete;
1015 }
1016 if (isStartTag) {
1017 tok->markupLevel[mId]++;
1018 } else if ((tok->markupLevel[mId] > 0)) {
1019 tok->markupLevel[mId]--;
1020 }
1021 }
1022
1023
/**
 * Tells whether a byte is acceptable inside a markup attribute name.
 *
 * @param ch     byte to classify
 * @param first  TRUE if ch would be the first character of the name
 * @return non-zero for an ASCII letter at any position, or for an ASCII
 *         digit when it is not the first character; zero otherwise
 */
static picoos_bool tok_attrChar (picoos_uchar ch, picoos_bool first)
{
    picoos_bool isUpper = (ch >= (picoos_uchar)'A') && (ch <= (picoos_uchar)'Z');
    picoos_bool isLower = (ch >= (picoos_uchar)'a') && (ch <= (picoos_uchar)'z');
    picoos_bool isDigit = (ch >= (picoos_uchar)'0') && (ch <= (picoos_uchar)'9');

    return (isUpper || isLower) || (!first && isDigit);
}
1030
1031
1032
/**
 * Tells whether a byte is acceptable inside a markup tag name.
 *
 * Everything tok_attrChar accepts is accepted here; in addition a colon is
 * allowed at any position except the first.
 *
 * @param ch     byte to classify
 * @param first  TRUE if ch would be the first character of the name
 * @return non-zero if ch is acceptable, zero otherwise
 */
static picoos_bool tok_idChar (picoos_uchar ch, picoos_bool first)
{
    picoos_bool isInnerColon = (!first) && (ch == (picoos_uchar)':');

    return tok_attrChar(ch, first) || isInnerColon;
}
1037
1038
/**
 * Records whether an attribute name is the "file" keyword (KWFile).
 *
 * @param name    attribute name to compare against KWFile
 * @param isFile  receives the result of the comparison
 */
static void tok_setIsFileAttr (picoos_uchar name[], picoos_bool * isFile)
{
    picoos_bool matchesFileKeyword = tok_strEqual(name, KWFile);

    *isFile = matchesFileKeyword;
}
1043
1044 /* *****************************************************************************/
1045
/**
 * Appends the bytes of a 0-terminated string to the simple token that is
 * currently being collected and tags the token with the given type/subtype.
 *
 * When the token buffer is about to fill up, a warning is raised and the
 * token collected so far is emitted through tok_treatSimpleToken before
 * collection continues with the remaining bytes.
 *
 * @param this     processing unit (used for warnings and item output)
 * @param tok      tokenizer sub-object holding the token buffer
 * @param str      0-terminated bytes to append; may be empty
 * @param type     token type to record for the collected token
 * @param subtype  token subtype to record for the collected token
 */
static void tok_putToSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[], pico_tokenType type, pico_tokenSubType subtype)
{
    int i;
    int len = picoos_strlen((picoos_char*)str);

    for (i = 0; i < len; i++) {
        /* Force the flush one byte before the IN_BUF_SIZE limit:
           tok_treatSimpleToken only writes the string terminator while
           tokenPos < IN_BUF_SIZE, so letting tokenPos reach IN_BUF_SIZE
           would hand an unterminated tokenStr to the item writer. */
        if (tok->tokenPos >= (IN_BUF_SIZE - 1)) {
            picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT, (picoos_char*)"", (picoos_char*)"simple token too long; forced treatment");
            tok_treatSimpleToken(this, tok);
        }
        tok->tokenStr[tok->tokenPos] = str[i];
        tok->tokenPos++;
    }
    /* set after the loop: a forced treatment above resets type and subtype */
    tok->tokenType = type;
    tok->tokenSubType = subtype;
}
1064
1065
/**
 * Puts the markup parser into the "too long" error state.
 *
 * The warning is raised only on the transition into that state and only if
 * the tag name collected so far is a known markup tag.
 */
static void tok_markupTooLong (picodata_ProcessingUnit this, tok_subobj_t * tok)
{
    if ((tok->markupState != MSErrorTooLong) && (tok_markupTagId(tok->markupTagName) != MIDummyEnd)) {
        picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT ,(picoos_char*)"", (picoos_char*)"markup tag too long");
    }
    tok->markupState = MSErrorTooLong;
}


/**
 * Stores one byte of the value of the attribute currently being parsed.
 *
 * Bytes belonging to an attribute beyond MAX_NR_MARKUP_PARAMS are consumed
 * without being stored. A value that would not fit into its buffer puts the
 * parser into the "too long" error state instead of writing past the end.
 */
static void tok_putToAttrValue (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar ch)
{
    if (tok->nrMarkupParams >= MAX_NR_MARKUP_PARAMS) {
        tok->markupState = MSInAttrValue;
    } else if (tok->strPos < ((picoos_int32)sizeof(tok->markupParams[0].paramVal) - 1)) {
        tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = ch;
        tok->strPos++;
        tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = 0;
        tok->markupState = MSInAttrValue;
    } else {
        tok_markupTooLong(this, tok);
    }
}


/**
 * Feeds the bytes of str through the markup tag state machine.
 *
 * Each accepted byte advances tok->markupState, is collected into the tag
 * name / attribute name / attribute value it belongs to, and is appended to
 * tok->markupStr. A byte that causes an error is NOT appended to markupStr:
 * tok_treatChar replays the current character itself after an error, so
 * recording it here would emit it twice.
 *
 * NOTE(review): the bounds below use sizeof on markupTagName, paramId and
 * paramVal, i.e. they assume these are fixed-size array members of the
 * tokenizer sub-object (nothing in the constructor allocates them, and the
 * lowercase calls below pass IN_BUF_SIZE as their capacity) -- confirm
 * against the struct definitions. markupStr is assumed to hold
 * MARKUP_STRING_BUF_SIZE bytes, matching the existing length check.
 *
 * @param this  processing unit (used for warnings)
 * @param tok   tokenizer sub-object holding the parser state
 * @param str   0-terminated bytes to feed (one UTF-8 character per call from
 *              tok_treatChar)
 */
static void tok_putToMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[])
{
    picoos_int32 i, len;
    picoos_uint8 ok;

    tok->markupTagErr = MENone;
    len = picoos_strlen((picoos_char*)str);
    for (i = 0; i< len; i++) {
        if (tok->markupPos >= (MARKUP_STRING_BUF_SIZE - 1)) {
            /* no room left in markupStr for this byte plus the terminator */
            tok_markupTooLong(this, tok);
        } else if ((str[i] == (picoos_uchar)' ') && ((tok->markupState == MSExpectingmarkupTagName) || (tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSGotAttrName) || (tok->markupState == MSGotEqual) || (tok->markupState == MSGotAttrValue))) {
            /* blanks between syntactic elements are skipped */
        } else if ((str[i] == (picoos_uchar)'>') && ((tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSInmarkupTagName) || (tok->markupState == MSGotAttrValue))) {
            tok->markupState = MSGotEnd;
        } else if ((str[i] == (picoos_uchar)'/') && ((tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSInmarkupTagName) || (tok->markupState == MSGotAttrValue))) {
            if (tok->markupTagType == MTEnd) {
                tok->markupTagErr = MEUnexpectedChar;
                tok->markupState = MSError;
            } else {
                tok->markupTagType = MTEmpty;
                tok->markupState = MSGotEndSlash;
            }
        } else {
            switch (tok->markupState) {
                case MSNotInMarkup:
                    if (str[i] == (picoos_uchar)'<') {
                        tok_clearMarkupParams(tok->markupParams);
                        tok->nrMarkupParams = 0;
                        tok->strPos = 0;
                        tok->markupTagType = MTStart;
                        tok->markupState = MSGotStart;
                    } else {
                        tok->markupTagErr = MEMissingStart;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotStart:
                    if (str[i] == (picoos_uchar)'/') {
                        tok->markupTagType = MTEnd;
                        tok->markupState = MSExpectingmarkupTagName;
                    } else if (str[i] == (picoos_uchar)' ') {
                        tok->markupState = MSExpectingmarkupTagName;
                    } else if (tok_idChar(str[i],TRUE)) {
                        /* strPos was reset to 0 when '<' was seen */
                        tok->markupTagType = MTStart;
                        tok->markupTagName[tok->strPos] = str[i];
                        tok->strPos++;
                        tok->markupTagName[tok->strPos] = 0;
                        tok->markupState = MSInmarkupTagName;
                    } else {
                        tok->markupTagErr = MEUnexpectedChar;
                        tok->markupState = MSError;
                    }
                    break;
                case MSInmarkupTagName: case MSExpectingmarkupTagName:
                    if (tok_idChar(str[i],tok->markupState == MSExpectingmarkupTagName)) {
                        if (tok->strPos < ((picoos_int32)sizeof(tok->markupTagName) - 1)) {
                            tok->markupTagName[tok->strPos] = str[i];
                            tok->strPos++;
                            tok->markupTagName[(tok->strPos)] = 0;
                            tok->markupState = MSInmarkupTagName;
                        } else {
                            /* tag name would overrun its buffer */
                            tok_markupTooLong(this, tok);
                        }
                    } else if ((tok->markupState == MSInmarkupTagName) && (str[i] == (picoos_uchar)' ')) {
                        tok->markupState = MSGotmarkupTagName;
                        picobase_lowercase_utf8_str(tok->markupTagName, (picoos_char*)tok->markupTagName, IN_BUF_SIZE, &ok);
                        tok->strPos = 0;
                    } else {
                        tok->markupTagErr = MEIdent;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotmarkupTagName: case MSGotAttrValue:
                    if (tok_attrChar(str[i], TRUE)) {
                        if (tok->markupTagType == MTEnd) {
                            tok->markupTagErr = MEUnexpectedChar;
                            tok->markupState = MSError;
                        } else {
                            if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                                /* strPos is 0 on entry to both of these states */
                                tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = str[i];
                                tok->strPos++;
                                tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = 0;
                            } else {
                                picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT ,(picoos_char*)"", (picoos_char*)"too many attributes in markup; ignoring");
                            }
                            tok->markupState = MSInAttrName;
                        }
                    } else {
                        tok->markupTagErr = MEUnexpectedChar;
                        tok->markupState = MSError;
                    }
                    break;
                case MSInAttrName:
                    if (tok_attrChar(str[i], FALSE)) {
                        if (tok->nrMarkupParams >= MAX_NR_MARKUP_PARAMS) {
                            /* surplus attribute: consumed but not stored */
                            tok->markupState = MSInAttrName;
                        } else if (tok->strPos < ((picoos_int32)sizeof(tok->markupParams[0].paramId) - 1)) {
                            tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = str[i];
                            tok->strPos++;
                            tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = 0;
                            tok->markupState = MSInAttrName;
                        } else {
                            /* attribute name would overrun its buffer */
                            tok_markupTooLong(this, tok);
                        }
                    } else if ((str[i] == (picoos_uchar)' ') || (str[i] == (picoos_uchar)'=')) {
                        if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                            picobase_lowercase_utf8_str(tok->markupParams[tok->nrMarkupParams].paramId, (picoos_char*)tok->markupParams[tok->nrMarkupParams].paramId, IN_BUF_SIZE, &ok);
                            tok_setIsFileAttr(tok->markupParams[tok->nrMarkupParams].paramId, & tok->isFileAttr);
                        } else {
                            /* a surplus attribute has no stored name; do not
                               index past the parameter table to inspect one */
                            tok->isFileAttr = FALSE;
                        }
                        tok->markupState = (str[i] == (picoos_uchar)'=') ? MSGotEqual : MSGotAttrName;
                    } else {
                        tok->markupTagErr = MEMissingEqual;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotAttrName:
                    if (str[i] == (picoos_uchar)'=') {
                        tok->markupState = MSGotEqual;
                    } else {
                        tok->markupTagErr = MEMissingEqual;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotEqual:
                    if ((str[i] == (picoos_uchar)'"') || (str[i] == (picoos_uchar)'\'')) {
                        tok->strDelim = str[i];
                        tok->strPos = 0;
                        tok->markupState = MSInAttrValue;
                    } else {
                        tok->markupTagErr = MEMissingQuote;
                        tok->markupState = MSError;
                    }
                    break;
                case MSInAttrValue:
                    if (!(tok->isFileAttr) && (str[i] == (picoos_uchar)'\\')) {
                        /* backslash escapes the next byte, except in file attributes */
                        tok->markupState = MSInAttrValueEscaped;
                    } else if (str[i] == tok->strDelim) {
                        if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                            tok->nrMarkupParams++;
                        }
                        tok->strPos = 0;
                        tok->markupState = MSGotAttrValue;
                    } else {
                        tok_putToAttrValue(this, tok, str[i]);
                    }
                    break;
                case MSInAttrValueEscaped:
                    tok_putToAttrValue(this, tok, str[i]);
                    break;
                case MSGotEndSlash:
                    if (str[i] == (picoos_uchar)'>') {
                        tok->markupState = MSGotEnd;
                    } else {
                        tok->markupTagErr = MEUnexpectedChar;
                        tok->markupState = MSError;
                    }
                    break;
                default:
                    tok->markupTagErr = MEUnexpectedChar;
                    tok->markupState = MSError;
                    break;
            }
        }
        /* record the byte only if it was accepted; on any error (including
           "too long") parsing is restarted at the current char by the caller */
        if ((tok->markupTagErr == MENone) && (tok->markupState != MSErrorTooLong)) {
            tok->markupStr[tok->markupPos] = str[i];
            tok->markupPos++;
        }
        tok->markupStr[tok->markupPos] = 0;
    }
    /*
    PICODBG_DEBUG(("putToMarkup %s", tok->markupStr));
    */
}
1245
1246 /* *****************************************************************************/
1247
/**
 * Re-processes the markup text collected so far as ordinary input.
 *
 * The parser leaves markup mode, the pending UTF-8 character is discarded,
 * and every byte stored in tok->markupStr is passed to tok_treatChar with
 * markup handling switched off. Afterwards the markup buffer is empty.
 *
 * @param this  processing unit
 * @param tok   tokenizer sub-object
 */
static void tok_treatMarkupAsSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok)
{
    picoos_int32 pos = 0;

    tok->markupState = MSNotInMarkup;
    tok->utflen = 0;
    tok->utfpos = 0;

    while (pos < tok->markupPos) {
        tok_treatChar(this, tok, tok->markupStr[pos], FALSE);
        pos++;
    }

    tok->strPos = 0;
    tok->markupPos = 0;
}
1261
1262
/**
 * Handles a completely parsed markup tag.
 *
 * A tag whose name is not a known markup (tok_markupTagId yields MIDummyEnd)
 * is re-processed as plain text. For a known tag without parse error, the
 * pending simple token is emitted (unless it is a space or undefined token),
 * a blank is put into the token buffer, and the tag is interpreted as start
 * tag, end tag, or both for an empty tag. If an error is set afterwards, a
 * warning is raised (unless aborted) and the tag text is re-processed as
 * plain text. In all cases the markup parser is reset at the end.
 *
 * @param this  processing unit
 * @param tok   tokenizer sub-object
 */
static void tok_treatMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok)
{
    MarkupId tagId;

    if (tok_markupTagId(tok->markupTagName) == MIDummyEnd) {
        tok_treatMarkupAsSimpleToken(this, tok);
    } else {
        if (tok->markupTagErr == MENone) {
            tok->markupState = MSNotInMarkup;
            if (!((tok->tokenType == PICODATA_ITEMINFO1_TOKTYPE_SPACE) || (tok->tokenType == PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED))) {
                tok_treatSimpleToken(this, tok);
            }
            tok_putToSimpleToken(this, tok, (picoos_uchar*)" ", PICODATA_ITEMINFO1_TOKTYPE_SPACE, -1);
            tagId = tok_markupTagId(tok->markupTagName);

            /* an empty tag is interpreted as start tag followed by end tag */
            if ((tok->markupTagType == MTEmpty) || (tok->markupTagType == MTStart)) {
                tok_interpretMarkup(this, tok, TRUE, tagId);
            }
            if ((tok->markupTagType == MTEmpty) || (tok->markupTagType == MTEnd)) {
                /* end tags are interpreted without parameters */
                tok_clearMarkupParams(tok->markupParams);
                tok->nrMarkupParams = 0;
                tok_interpretMarkup(this, tok, FALSE, tagId);
            }
        }
        /* checked separately: the error may have been set during interpretation */
        if (tok->markupTagErr != MENone) {
            if (!tok->aborted) {
                picoos_emRaiseWarning(this->common->em, PICO_ERR_INVALID_MARKUP_TAG, (picoos_char*)"", (picoos_char*)"syntax error in markup token '%s'",tok->markupStr);
            }
            tok_treatMarkupAsSimpleToken(this, tok);
        }
    }

    tok->markupState = MSNotInMarkup;
    tok->markupPos = 0;
    tok->strPos = 0;
}
1297
1298
1299
/**
 * Processes one input byte.
 *
 * A NULLC byte emits the pending simple token followed by a flush command.
 * Any other byte is added to the UTF-8 character being assembled; once the
 * character is complete it is classified via the graph table and routed
 * either to the markup parser or to the simple token being collected.
 *
 * @param this            processing unit
 * @param tok             tokenizer sub-object
 * @param ch              input byte
 * @param markupHandling  FALSE to treat markup characters as plain text;
 *                        TRUE enables markup parsing if the tokenizer's
 *                        markup handling mode is enabled as well
 */
static void tok_treatChar (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar ch, picoos_bool markupHandling)
{
    picoos_int32 k, graphId;
    picoos_uint8 rawType;
    pico_tokenType type = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
    pico_tokenSubType subtype = -1;
    picoos_uchar lastByte;
    utf8char0c replayUtf;
    picoos_int32 replayLen;

    if (ch == NULLC) {
        tok_treatSimpleToken(this, tok);
        tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
        return;
    }

    switch (tok_putToUtf(tok, ch)) {
        case UTF_CHAR_MALFORMED:
            /* drop the broken character */
            tok->utfpos = 0;
            tok->utflen = 0;
            break;
        case UTF_CHAR_INCOMPLETE:
            /* wait for the remaining bytes */
            break;
        case UTF_CHAR_COMPLETE:
            markupHandling = (markupHandling && (tok->markupHandlingMode == MARKUP_HANDLING_ENABLED));

            /* classify the character */
            graphId = picoktab_graphOffset(tok->graphTab, tok->utf);
            lastByte = tok->utf[tok->utfpos-1];
            if (graphId > 0) {
                if (picoktab_getIntPropTokenType(tok->graphTab, graphId, &rawType)) {
                    type = (pico_tokenType)rawType;
                    if (type == PICODATA_ITEMINFO1_TOKTYPE_LETTERV) {
                        type = PICODATA_ITEMINFO1_TOKTYPE_LETTER;
                    }
                }
                /* result deliberately ignored; subtype keeps -1 if not set */
                (void) picoktab_getIntPropTokenSubType(tok->graphTab, graphId, &subtype);
            } else if (lastByte <= (picoos_uchar)' ') {
                type = PICODATA_ITEMINFO1_TOKTYPE_SPACE;
                subtype = -1;
            } else {
                type = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
                subtype = -1;
            }

            /* count consecutive end-of-line characters */
            if (lastByte > (picoos_uchar)' ') {
                tok->nrEOL = 0;
            } else if (lastByte == EOL) {
                tok->nrEOL++;
            }

            if (markupHandling && (tok->markupState != MSNotInMarkup)) {
                tok_putToMarkup(this, tok, tok->utf);
                if (tok->markupState >= MSError) {
                    picoos_strlcpy(replayUtf, tok->utf, 5);
                    replayLen = tok->utfpos;
                    /* treat string up to (but not including) current char as simple
                       token and restart markup tag parsing with current char */
                    tok_treatMarkupAsSimpleToken(this, tok);
                    for (k = 0; k < replayLen; k++) {
                        tok_treatChar(this, tok, replayUtf[k], markupHandling);
                    }
                } else if (tok->markupState == MSGotEnd) {
                    tok_treatMarkup(this, tok);
                }
            } else if (markupHandling && (lastByte == (picoos_uchar)'<')) {
                /* start of a markup tag */
                tok_putToMarkup(this, tok, tok->utf);
            } else if (type == PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED) {
                tok_treatSimpleToken(this, tok);
            } else {
                if ((type != tok->tokenType) || (type == PICODATA_ITEMINFO1_TOKTYPE_CHAR) || (subtype != tok->tokenSubType)) {
                    /* token class changes: emit what has been collected */
                    tok_treatSimpleToken(this, tok);
                } else if ((lastByte == EOL) && (tok->nrEOL == 2)) {
                    /* second end-of-line in a row: insert a "." token */
                    tok_treatSimpleToken(this, tok);
                    tok_putToSimpleToken(this, tok, (picoos_uchar*)".", PICODATA_ITEMINFO1_TOKTYPE_CHAR, -1);
                    tok_treatSimpleToken(this, tok);
                }
                tok_putToSimpleToken(this, tok, tok->utf, type, subtype);
            }
            tok->utfpos = 0;
            tok->utflen = 0;
            break;
    }
}
1379
1380
/**
 * Emits the simple token collected so far and resets the token buffer.
 *
 * If a markup tag is still being parsed, a warning about the unfinished tag
 * may be raised, the markup text is re-processed as plain text and the
 * resulting token is emitted. Outside of markup, a non-empty token is written
 * as a token item unless output is being ignored; space tokens are written
 * even then.
 *
 * @param this  processing unit
 * @param tok   tokenizer sub-object
 */
static void tok_treatSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok)
{
    /* terminate the collected string if there is room for the terminator */
    if (tok->tokenPos < IN_BUF_SIZE) {
        tok->tokenStr[tok->tokenPos] = 0;
    }

    if (tok->markupState == MSNotInMarkup) {
        if ((tok->tokenPos > 0) && ((tok->ignLevel <= 0) || (tok->tokenType == PICODATA_ITEMINFO1_TOKTYPE_SPACE))) {
            tok_putItem(this, tok, PICODATA_ITEM_TOKEN, tok->tokenType, (picoos_uint8)tok->tokenSubType, 0, tok->tokenStr);
        }
    } else {
        if (!(tok->aborted) && (tok->markupState >= MSGotmarkupTagName) && (tok_markupTagId(tok->markupTagName) != MIDummyEnd)) {
            picoos_emRaiseWarning(this->common->em, PICO_ERR_INVALID_MARKUP_TAG, (picoos_char*)"", (picoos_char*)"unfinished markup tag '%s'",tok->markupStr);
        }
        tok_treatMarkupAsSimpleToken(this, tok);
        tok_treatSimpleToken(this, tok);
    }

    tok->tokenPos = 0;
    tok->tokenType = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
    tok->tokenSubType = -1;
}
1399
1400 /* *****************************************************************************/
1401
tokReset(register picodata_ProcessingUnit this,picoos_int32 resetMode)1402 static pico_status_t tokReset(register picodata_ProcessingUnit this, picoos_int32 resetMode)
1403 {
1404 tok_subobj_t * tok;
1405 MarkupId mId;
1406
1407 if (NULL == this || NULL == this->subObj) {
1408 return PICO_ERR_OTHER;
1409 }
1410 tok = (tok_subobj_t *) this->subObj;
1411
1412 tok->ignLevel = 0;
1413
1414 tok->utfpos = 0;
1415 tok->utflen = 0;
1416
1417 tok_clearMarkupParams(tok->markupParams);
1418 tok->nrMarkupParams = 0;
1419 tok->markupState = MSNotInMarkup;
1420 tok->markupPos = 0;
1421 for (mId = MIDummyStart; mId <= MIDummyEnd; mId++) {
1422 tok->markupLevel[mId] = 0;
1423 }
1424 tok->markupTagName[0] = 0;
1425 tok->markupTagType = MTNone;
1426 tok->markupTagErr = MENone;
1427
1428 tok->strPos = 0;
1429 tok->strDelim = 0;
1430 tok->isFileAttr = FALSE;
1431
1432 tok->tokenType = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
1433 tok->tokenSubType = -1;
1434 tok->tokenPos = 0;
1435
1436 tok->nrEOL = 0;
1437
1438
1439 tok->markupHandlingMode = TRUE;
1440 tok->aborted = FALSE;
1441
1442 tok->start = TRUE;
1443
1444 tok->outReadPos = 0;
1445 tok->outWritePos = 0;
1446
1447 tok->saveFile[0] = 0;
1448
1449
1450 tok->graphTab = picoktab_getGraphs(this->voice->kbArray[PICOKNOW_KBID_TAB_GRAPHS]);
1451
1452 tok->xsampa_parser = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_XSAMPA_PARSE]);
1453 PICODBG_TRACE(("got xsampa_parser @ %i",tok->xsampa_parser));
1454
1455 tok->svoxpa_parser = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_SVOXPA_PARSE]);
1456 PICODBG_TRACE(("got svoxpa_parser @ %i",tok->svoxpa_parser));
1457
1458 tok->xsampa2svoxpa_mapper = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_XSAMPA2SVOXPA]);
1459 PICODBG_TRACE(("got xsampa2svoxpa_mapper @ %i",tok->xsampa2svoxpa_mapper));
1460
1461
1462
1463 return PICO_OK;
1464 }
1465
tokInitialize(register picodata_ProcessingUnit this,picoos_int32 resetMode)1466 static pico_status_t tokInitialize(register picodata_ProcessingUnit this, picoos_int32 resetMode)
1467 {
1468 /*
1469
1470 tok_subobj_t * tok;
1471
1472 if (NULL == this || NULL == this->subObj) {
1473 return PICO_ERR_OTHER;
1474 }
1475 tok = (tok_subobj_t *) this->subObj;
1476 */
1477 return tokReset(this, resetMode);
1478 }
1479
1480
tokTerminate(register picodata_ProcessingUnit this)1481 static pico_status_t tokTerminate(register picodata_ProcessingUnit this)
1482 {
1483 return PICO_OK;
1484 }
1485
1486 static picodata_step_result_t tokStep(register picodata_ProcessingUnit this, picoos_int16 mode, picoos_uint16 * numBytesOutput);
1487
tokSubObjDeallocate(register picodata_ProcessingUnit this,picoos_MemoryManager mm)1488 static pico_status_t tokSubObjDeallocate(register picodata_ProcessingUnit this,
1489 picoos_MemoryManager mm)
1490 {
1491
1492 if (NULL != this) {
1493 picoos_deallocate(this->common->mm, (void *) &this->subObj);
1494 }
1495 mm = mm; /* avoid warning "var not used in this function"*/
1496 return PICO_OK;
1497 }
1498
picotok_newTokenizeUnit(picoos_MemoryManager mm,picoos_Common common,picodata_CharBuffer cbIn,picodata_CharBuffer cbOut,picorsrc_Voice voice)1499 picodata_ProcessingUnit picotok_newTokenizeUnit(picoos_MemoryManager mm, picoos_Common common,
1500 picodata_CharBuffer cbIn, picodata_CharBuffer cbOut,
1501 picorsrc_Voice voice)
1502 {
1503 tok_subobj_t * tok;
1504 picodata_ProcessingUnit this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice);
1505 if (this == NULL) {
1506 return NULL;
1507 }
1508 this->initialize = tokInitialize;
1509 PICODBG_DEBUG(("set this->step to tokStep"));
1510 this->step = tokStep;
1511 this->terminate = tokTerminate;
1512 this->subDeallocate = tokSubObjDeallocate;
1513 this->subObj = picoos_allocate(mm, sizeof(tok_subobj_t));
1514 if (this->subObj == NULL) {
1515 picoos_deallocate(mm, (void *)&this);
1516 return NULL;
1517 }
1518 tok = (tok_subobj_t *) this->subObj;
1519 tok->transducer = picotrns_newSimpleTransducer(mm, common, 10*(PICOTRNS_MAX_NUM_POSSYM+2));
1520 if (NULL == tok->transducer) {
1521 tokSubObjDeallocate(this,mm);
1522 picoos_deallocate(mm, (void *)&this);
1523 return NULL;
1524 }
1525 tokInitialize(this, PICO_RESET_FULL);
1526 return this;
1527 }
1528
1529 /**
1530 * fill up internal buffer, try to locate token, write token to output
1531 */
tokStep(register picodata_ProcessingUnit this,picoos_int16 mode,picoos_uint16 * numBytesOutput)1532 picodata_step_result_t tokStep(register picodata_ProcessingUnit this,
1533 picoos_int16 mode, picoos_uint16 * numBytesOutput)
1534 {
1535 register tok_subobj_t * tok;
1536
1537 if (NULL == this || NULL == this->subObj) {
1538 return PICODATA_PU_ERROR;
1539 }
1540 tok = (tok_subobj_t *) this->subObj;
1541
1542 mode = mode; /* avoid warning "var not used in this function"*/
1543
1544 *numBytesOutput = 0;
1545 while (1) { /* exit via return */
1546 picoos_int16 ch;
1547
1548 if ((tok->outWritePos - tok->outReadPos) > 0) {
1549 if (picodata_cbPutItem(this->cbOut, &tok->outBuf[tok->outReadPos], tok->outWritePos - tok->outReadPos, numBytesOutput) == PICO_OK) {
1550 PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG],
1551 (picoos_uint8 *)"tok:", &tok->outBuf[tok->outReadPos], tok->outWritePos - tok->outReadPos);
1552 tok->outReadPos += *numBytesOutput;
1553 if (tok->outWritePos == tok->outReadPos) {
1554 tok->outWritePos = 0;
1555 tok->outReadPos = 0;
1556 }
1557 }
1558 else {
1559 return PICODATA_PU_OUT_FULL;
1560 }
1561
1562 }
1563 else if (PICO_EOF != (ch = picodata_cbGetCh(this->cbIn))) {
1564 PICODBG_DEBUG(("read in %c", (picoos_char) ch));
1565 tok_treatChar(this, tok, (picoos_uchar) ch, /*markupHandling*/TRUE);
1566 }
1567 else {
1568 return PICODATA_PU_IDLE;
1569 }
1570 }
1571 }
1572
1573 #ifdef __cplusplus
1574 }
1575 #endif
1576
1577 /* end */
1578