1 /* 2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 /** 17 * @file picotok.h 18 * 19 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 20 * All rights reserved. 21 * 22 * History: 23 * - 2009-04-20 -- initial version 24 * 25 */ 26 27 28 /** @addtogroup picotok 29 itemtype, iteminfo1, iteminfo2, content -> TYPE(INFO1,INFO2)content 30 in the following 31 32 input 33 ===== 34 35 - UTF8 text 36 37 limitations: currently only german umlauts in addition to ASCII 38 39 40 minimal input size (before processing starts) 41 ================== 42 43 processing (ie. tokenization) starts when 44 - 'PICO_EOF' char received (which happens whenever the cbIn buffer is empty) 45 - tok-internal buffer is full 46 47 48 items output 49 ============ 50 51 processing the character stream can result in one of the 52 following items: 53 -> WORDGRAPH(NA,NA)graph <- mapped to lower case; incl. 1-2 digit nrs (0-99) 54 -> OTHER(NA,NA)string <- skip or spell 55 -> PUNC(PUNCtype,PUNCsubtype) 56 -> CMD(CMDtype,CMDsubtype)args 57 58 with 59 - PUNCtype %d 60 PICODATA_ITEMINFO1_PUNC_SENTEND 61 PICODATA_ITEMINFO1_PUNC_PHRASEEND 62 - PUNCsubtype %d 63 PICODATA_ITEMINFO2_PUNC_SENT_T 64 PICODATA_ITEMINFO2_PUNC_SENT_Q 65 PICODATA_ITEMINFO2_PUNC_SENT_E 66 PICODATA_ITEMINFO2_PUNC_PHRASE 67 (used later: PICODATA_ITEMINFO2_PUNC_PHRASE_FORCED) 68 - CMDtype %d 69 PICODATA_ITEMINFO1_CMD_FLUSH (no args) 70 ? PICODATA_ITEMINFO1_CMD_PLAY ? (not yet) 71 - CMDsubtype %d 72 PICODATA_ITEMINFO2_NA 73 ? PICODATA_ITEMINFO2_CMD_PLAY_G2P ? (not yet) 74 - graph, len>0, utf8 graphemes, %s 75 - string, len>0, can be any string with printable ascii characters, %s 76 77 78 other limitations 79 ================= 80 81 - item size: header plus len=256 (valid for Pico in general) 82 */ 83 84 85 #ifndef PICOTOK_H_ 86 #define PICOTOK_H_ 87 88 #include "picoos.h" 89 #include "picodata.h" 90 #include "picorsrc.h" 91 92 #ifdef __cplusplus 93 extern "C" { 94 #endif 95 #if 0 96 } 97 #endif 98 99 100 101 picodata_ProcessingUnit picotok_newTokenizeUnit( 102 picoos_MemoryManager mm, 103 picoos_Common common, 104 picodata_CharBuffer cbIn, 105 picodata_CharBuffer cbOut, 106 picorsrc_Voice voice); 107 108 #define PICOTOK_OUTBUF_SIZE 256 109 110 #ifdef __cplusplus 111 } 112 #endif 113 114 115 #endif /*PICOTOK_H_*/ 116