1 /** 2 ******************************************************************************* 3 * Copyright (C) 2006,2012-2013, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 #ifndef DICTBE_H 9 #define DICTBE_H 10 11 #include "unicode/utypes.h" 12 #include "unicode/uniset.h" 13 #include "unicode/utext.h" 14 15 #include "brkeng.h" 16 17 U_NAMESPACE_BEGIN 18 19 class DictionaryMatcher; 20 21 /******************************************************************* 22 * DictionaryBreakEngine 23 */ 24 25 /** 26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 27 * dictionary to determine language-specific breaks.</p> 28 * 29 * <p>After it is constructed a DictionaryBreakEngine may be shared between 30 * threads without synchronization.</p> 31 */ 32 class DictionaryBreakEngine : public LanguageBreakEngine { 33 private: 34 /** 35 * The set of characters handled by this engine 36 * @internal 37 */ 38 39 UnicodeSet fSet; 40 41 /** 42 * The set of break types handled by this engine 43 * @internal 44 */ 45 46 uint32_t fTypes; 47 48 /** 49 * <p>Default constructor.</p> 50 * 51 */ 52 DictionaryBreakEngine(); 53 54 public: 55 56 /** 57 * <p>Constructor setting the break types handled.</p> 58 * 59 * @param breakTypes A bitmap of types handled by the engine. 60 */ 61 DictionaryBreakEngine( uint32_t breakTypes ); 62 63 /** 64 * <p>Virtual destructor.</p> 65 */ 66 virtual ~DictionaryBreakEngine(); 67 68 /** 69 * <p>Indicate whether this engine handles a particular character for 70 * a particular kind of break.</p> 71 * 72 * @param c A character which begins a run that the engine might handle 73 * @param breakType The type of text break which the caller wants to determine 74 * @return TRUE if this engine handles the particular character and break 75 * type. 76 */ 77 virtual UBool handles( UChar32 c, int32_t breakType ) const; 78 79 /** 80 * <p>Find any breaks within a run in the supplied text.</p> 81 * 82 * @param text A UText representing the text. The iterator is left at 83 * the end of the run of characters which the engine is capable of handling 84 * that starts from the first (or last) character in the range. 85 * @param startPos The start of the run within the supplied text. 86 * @param endPos The end of the run within the supplied text. 87 * @param reverse Whether the caller is looking for breaks in a reverse 88 * direction. 89 * @param breakType The type of break desired, or -1. 90 * @param foundBreaks An allocated C array of the breaks found, if any 91 * @return The number of breaks found. 92 */ 93 virtual int32_t findBreaks( UText *text, 94 int32_t startPos, 95 int32_t endPos, 96 UBool reverse, 97 int32_t breakType, 98 UStack &foundBreaks ) const; 99 100 protected: 101 102 /** 103 * <p>Set the character set handled by this engine.</p> 104 * 105 * @param set A UnicodeSet of the set of characters handled by the engine 106 */ 107 virtual void setCharacters( const UnicodeSet &set ); 108 109 /** 110 * <p>Set the break types handled by this engine.</p> 111 * 112 * @param breakTypes A bitmap of types handled by the engine. 113 */ 114 // virtual void setBreakTypes( uint32_t breakTypes ); 115 116 /** 117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 118 * 119 * @param text A UText representing the text 120 * @param rangeStart The start of the range of dictionary characters 121 * @param rangeEnd The end of the range of dictionary characters 122 * @param foundBreaks Output of C array of int32_t break positions, or 0 123 * @return The number of breaks found 124 */ 125 virtual int32_t divideUpDictionaryRange( UText *text, 126 int32_t rangeStart, 127 int32_t rangeEnd, 128 UStack &foundBreaks ) const = 0; 129 130 }; 131 132 /******************************************************************* 133 * ThaiBreakEngine 134 */ 135 136 /** 137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 138 * dictionary and heuristics to determine Thai-specific breaks.</p> 139 * 140 * <p>After it is constructed a ThaiBreakEngine may be shared between 141 * threads without synchronization.</p> 142 */ 143 class ThaiBreakEngine : public DictionaryBreakEngine { 144 private: 145 /** 146 * The set of characters handled by this engine 147 * @internal 148 */ 149 150 UnicodeSet fThaiWordSet; 151 UnicodeSet fEndWordSet; 152 UnicodeSet fBeginWordSet; 153 UnicodeSet fSuffixSet; 154 UnicodeSet fMarkSet; 155 DictionaryMatcher *fDictionary; 156 157 public: 158 159 /** 160 * <p>Default constructor.</p> 161 * 162 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 163 * engine is deleted. 164 */ 165 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 166 167 /** 168 * <p>Virtual destructor.</p> 169 */ 170 virtual ~ThaiBreakEngine(); 171 172 protected: 173 /** 174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 175 * 176 * @param text A UText representing the text 177 * @param rangeStart The start of the range of dictionary characters 178 * @param rangeEnd The end of the range of dictionary characters 179 * @param foundBreaks Output of C array of int32_t break positions, or 0 180 * @return The number of breaks found 181 */ 182 virtual int32_t divideUpDictionaryRange( UText *text, 183 int32_t rangeStart, 184 int32_t rangeEnd, 185 UStack &foundBreaks ) const; 186 187 }; 188 189 /******************************************************************* 190 * LaoBreakEngine 191 */ 192 193 /** 194 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 195 * dictionary and heuristics to determine Lao-specific breaks.</p> 196 * 197 * <p>After it is constructed a LaoBreakEngine may be shared between 198 * threads without synchronization.</p> 199 */ 200 class LaoBreakEngine : public DictionaryBreakEngine { 201 private: 202 /** 203 * The set of characters handled by this engine 204 * @internal 205 */ 206 207 UnicodeSet fLaoWordSet; 208 UnicodeSet fEndWordSet; 209 UnicodeSet fBeginWordSet; 210 UnicodeSet fMarkSet; 211 DictionaryMatcher *fDictionary; 212 213 public: 214 215 /** 216 * <p>Default constructor.</p> 217 * 218 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 219 * engine is deleted. 220 */ 221 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 222 223 /** 224 * <p>Virtual destructor.</p> 225 */ 226 virtual ~LaoBreakEngine(); 227 228 protected: 229 /** 230 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 231 * 232 * @param text A UText representing the text 233 * @param rangeStart The start of the range of dictionary characters 234 * @param rangeEnd The end of the range of dictionary characters 235 * @param foundBreaks Output of C array of int32_t break positions, or 0 236 * @return The number of breaks found 237 */ 238 virtual int32_t divideUpDictionaryRange( UText *text, 239 int32_t rangeStart, 240 int32_t rangeEnd, 241 UStack &foundBreaks ) const; 242 243 }; 244 245 /******************************************************************* 246 * KhmerBreakEngine 247 */ 248 249 /** 250 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 251 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 252 * 253 * <p>After it is constructed a KhmerBreakEngine may be shared between 254 * threads without synchronization.</p> 255 */ 256 class KhmerBreakEngine : public DictionaryBreakEngine { 257 private: 258 /** 259 * The set of characters handled by this engine 260 * @internal 261 */ 262 263 UnicodeSet fKhmerWordSet; 264 UnicodeSet fEndWordSet; 265 UnicodeSet fBeginWordSet; 266 UnicodeSet fMarkSet; 267 DictionaryMatcher *fDictionary; 268 269 public: 270 271 /** 272 * <p>Default constructor.</p> 273 * 274 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 275 * engine is deleted. 276 */ 277 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 278 279 /** 280 * <p>Virtual destructor.</p> 281 */ 282 virtual ~KhmerBreakEngine(); 283 284 protected: 285 /** 286 * <p>Divide up a range of known dictionary characters.</p> 287 * 288 * @param text A UText representing the text 289 * @param rangeStart The start of the range of dictionary characters 290 * @param rangeEnd The end of the range of dictionary characters 291 * @param foundBreaks Output of C array of int32_t break positions, or 0 292 * @return The number of breaks found 293 */ 294 virtual int32_t divideUpDictionaryRange( UText *text, 295 int32_t rangeStart, 296 int32_t rangeEnd, 297 UStack &foundBreaks ) const; 298 299 }; 300 301 #if !UCONFIG_NO_NORMALIZATION 302 303 /******************************************************************* 304 * CjkBreakEngine 305 */ 306 307 //indicates language/script that the CjkBreakEngine will handle 308 enum LanguageType { 309 kKorean, 310 kChineseJapanese 311 }; 312 313 /** 314 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 315 * dictionary with costs associated with each word and 316 * Viterbi decoding to determine CJK-specific breaks.</p> 317 */ 318 class CjkBreakEngine : public DictionaryBreakEngine { 319 protected: 320 /** 321 * The set of characters handled by this engine 322 * @internal 323 */ 324 UnicodeSet fHangulWordSet; 325 UnicodeSet fHanWordSet; 326 UnicodeSet fKatakanaWordSet; 327 UnicodeSet fHiraganaWordSet; 328 329 DictionaryMatcher *fDictionary; 330 331 public: 332 333 /** 334 * <p>Default constructor.</p> 335 * 336 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 337 * engine is deleted. The DictionaryMatcher must contain costs for each word 338 * in order for the dictionary to work properly. 339 */ 340 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 341 342 /** 343 * <p>Virtual destructor.</p> 344 */ 345 virtual ~CjkBreakEngine(); 346 347 protected: 348 /** 349 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 350 * 351 * @param text A UText representing the text 352 * @param rangeStart The start of the range of dictionary characters 353 * @param rangeEnd The end of the range of dictionary characters 354 * @param foundBreaks Output of C array of int32_t break positions, or 0 355 * @return The number of breaks found 356 */ 357 virtual int32_t divideUpDictionaryRange( UText *text, 358 int32_t rangeStart, 359 int32_t rangeEnd, 360 UStack &foundBreaks ) const; 361 362 }; 363 364 #endif 365 366 U_NAMESPACE_END 367 368 /* DICTBE_H */ 369 #endif 370