1 /** 2 ******************************************************************************* 3 * Copyright (C) 2006-2014, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 #ifndef DICTBE_H 9 #define DICTBE_H 10 11 #include "unicode/utypes.h" 12 #include "unicode/uniset.h" 13 #include "unicode/utext.h" 14 15 #include "brkeng.h" 16 17 U_NAMESPACE_BEGIN 18 19 class DictionaryMatcher; 20 class Normalizer2; 21 22 /******************************************************************* 23 * DictionaryBreakEngine 24 */ 25 26 /** 27 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 28 * dictionary to determine language-specific breaks.</p> 29 * 30 * <p>After it is constructed a DictionaryBreakEngine may be shared between 31 * threads without synchronization.</p> 32 */ 33 class DictionaryBreakEngine : public LanguageBreakEngine { 34 private: 35 /** 36 * The set of characters handled by this engine 37 * @internal 38 */ 39 40 UnicodeSet fSet; 41 42 /** 43 * The set of break types handled by this engine 44 * @internal 45 */ 46 47 uint32_t fTypes; 48 49 /** 50 * <p>Default constructor.</p> 51 * 52 */ 53 DictionaryBreakEngine(); 54 55 public: 56 57 /** 58 * <p>Constructor setting the break types handled.</p> 59 * 60 * @param breakTypes A bitmap of types handled by the engine. 61 */ 62 DictionaryBreakEngine( uint32_t breakTypes ); 63 64 /** 65 * <p>Virtual destructor.</p> 66 */ 67 virtual ~DictionaryBreakEngine(); 68 69 /** 70 * <p>Indicate whether this engine handles a particular character for 71 * a particular kind of break.</p> 72 * 73 * @param c A character which begins a run that the engine might handle 74 * @param breakType The type of text break which the caller wants to determine 75 * @return TRUE if this engine handles the particular character and break 76 * type. 77 */ 78 virtual UBool handles( UChar32 c, int32_t breakType ) const; 79 80 /** 81 * <p>Find any breaks within a run in the supplied text.</p> 82 * 83 * @param text A UText representing the text. The iterator is left at 84 * the end of the run of characters which the engine is capable of handling 85 * that starts from the first (or last) character in the range. 86 * @param startPos The start of the run within the supplied text. 87 * @param endPos The end of the run within the supplied text. 88 * @param reverse Whether the caller is looking for breaks in a reverse 89 * direction. 90 * @param breakType The type of break desired, or -1. 91 * @param foundBreaks An allocated C array of the breaks found, if any 92 * @return The number of breaks found. 93 */ 94 virtual int32_t findBreaks( UText *text, 95 int32_t startPos, 96 int32_t endPos, 97 UBool reverse, 98 int32_t breakType, 99 UStack &foundBreaks ) const; 100 101 protected: 102 103 /** 104 * <p>Set the character set handled by this engine.</p> 105 * 106 * @param set A UnicodeSet of the set of characters handled by the engine 107 */ 108 virtual void setCharacters( const UnicodeSet &set ); 109 110 /** 111 * <p>Set the break types handled by this engine.</p> 112 * 113 * @param breakTypes A bitmap of types handled by the engine. 114 */ 115 // virtual void setBreakTypes( uint32_t breakTypes ); 116 117 /** 118 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 119 * 120 * @param text A UText representing the text 121 * @param rangeStart The start of the range of dictionary characters 122 * @param rangeEnd The end of the range of dictionary characters 123 * @param foundBreaks Output of C array of int32_t break positions, or 0 124 * @return The number of breaks found 125 */ 126 virtual int32_t divideUpDictionaryRange( UText *text, 127 int32_t rangeStart, 128 int32_t rangeEnd, 129 UStack &foundBreaks ) const = 0; 130 131 }; 132 133 /******************************************************************* 134 * ThaiBreakEngine 135 */ 136 137 /** 138 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 139 * dictionary and heuristics to determine Thai-specific breaks.</p> 140 * 141 * <p>After it is constructed a ThaiBreakEngine may be shared between 142 * threads without synchronization.</p> 143 */ 144 class ThaiBreakEngine : public DictionaryBreakEngine { 145 private: 146 /** 147 * The set of characters handled by this engine 148 * @internal 149 */ 150 151 UnicodeSet fThaiWordSet; 152 UnicodeSet fEndWordSet; 153 UnicodeSet fBeginWordSet; 154 UnicodeSet fSuffixSet; 155 UnicodeSet fMarkSet; 156 DictionaryMatcher *fDictionary; 157 158 public: 159 160 /** 161 * <p>Default constructor.</p> 162 * 163 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 164 * engine is deleted. 165 */ 166 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 167 168 /** 169 * <p>Virtual destructor.</p> 170 */ 171 virtual ~ThaiBreakEngine(); 172 173 protected: 174 /** 175 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 176 * 177 * @param text A UText representing the text 178 * @param rangeStart The start of the range of dictionary characters 179 * @param rangeEnd The end of the range of dictionary characters 180 * @param foundBreaks Output of C array of int32_t break positions, or 0 181 * @return The number of breaks found 182 */ 183 virtual int32_t divideUpDictionaryRange( UText *text, 184 int32_t rangeStart, 185 int32_t rangeEnd, 186 UStack &foundBreaks ) const; 187 188 }; 189 190 /******************************************************************* 191 * LaoBreakEngine 192 */ 193 194 /** 195 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 196 * dictionary and heuristics to determine Lao-specific breaks.</p> 197 * 198 * <p>After it is constructed a LaoBreakEngine may be shared between 199 * threads without synchronization.</p> 200 */ 201 class LaoBreakEngine : public DictionaryBreakEngine { 202 private: 203 /** 204 * The set of characters handled by this engine 205 * @internal 206 */ 207 208 UnicodeSet fLaoWordSet; 209 UnicodeSet fEndWordSet; 210 UnicodeSet fBeginWordSet; 211 UnicodeSet fMarkSet; 212 DictionaryMatcher *fDictionary; 213 214 public: 215 216 /** 217 * <p>Default constructor.</p> 218 * 219 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 220 * engine is deleted. 221 */ 222 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 223 224 /** 225 * <p>Virtual destructor.</p> 226 */ 227 virtual ~LaoBreakEngine(); 228 229 protected: 230 /** 231 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 232 * 233 * @param text A UText representing the text 234 * @param rangeStart The start of the range of dictionary characters 235 * @param rangeEnd The end of the range of dictionary characters 236 * @param foundBreaks Output of C array of int32_t break positions, or 0 237 * @return The number of breaks found 238 */ 239 virtual int32_t divideUpDictionaryRange( UText *text, 240 int32_t rangeStart, 241 int32_t rangeEnd, 242 UStack &foundBreaks ) const; 243 244 }; 245 246 /******************************************************************* 247 * BurmeseBreakEngine 248 */ 249 250 /** 251 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 252 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 253 * 254 * <p>After it is constructed a BurmeseBreakEngine may be shared between 255 * threads without synchronization.</p> 256 */ 257 class BurmeseBreakEngine : public DictionaryBreakEngine { 258 private: 259 /** 260 * The set of characters handled by this engine 261 * @internal 262 */ 263 264 UnicodeSet fBurmeseWordSet; 265 UnicodeSet fEndWordSet; 266 UnicodeSet fBeginWordSet; 267 UnicodeSet fMarkSet; 268 DictionaryMatcher *fDictionary; 269 270 public: 271 272 /** 273 * <p>Default constructor.</p> 274 * 275 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 276 * engine is deleted. 277 */ 278 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 279 280 /** 281 * <p>Virtual destructor.</p> 282 */ 283 virtual ~BurmeseBreakEngine(); 284 285 protected: 286 /** 287 * <p>Divide up a range of known dictionary characters.</p> 288 * 289 * @param text A UText representing the text 290 * @param rangeStart The start of the range of dictionary characters 291 * @param rangeEnd The end of the range of dictionary characters 292 * @param foundBreaks Output of C array of int32_t break positions, or 0 293 * @return The number of breaks found 294 */ 295 virtual int32_t divideUpDictionaryRange( UText *text, 296 int32_t rangeStart, 297 int32_t rangeEnd, 298 UStack &foundBreaks ) const; 299 300 }; 301 302 /******************************************************************* 303 * KhmerBreakEngine 304 */ 305 306 /** 307 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 308 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 309 * 310 * <p>After it is constructed a KhmerBreakEngine may be shared between 311 * threads without synchronization.</p> 312 */ 313 class KhmerBreakEngine : public DictionaryBreakEngine { 314 private: 315 /** 316 * The set of characters handled by this engine 317 * @internal 318 */ 319 320 UnicodeSet fKhmerWordSet; 321 UnicodeSet fEndWordSet; 322 UnicodeSet fBeginWordSet; 323 UnicodeSet fMarkSet; 324 DictionaryMatcher *fDictionary; 325 326 public: 327 328 /** 329 * <p>Default constructor.</p> 330 * 331 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 332 * engine is deleted. 333 */ 334 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 335 336 /** 337 * <p>Virtual destructor.</p> 338 */ 339 virtual ~KhmerBreakEngine(); 340 341 protected: 342 /** 343 * <p>Divide up a range of known dictionary characters.</p> 344 * 345 * @param text A UText representing the text 346 * @param rangeStart The start of the range of dictionary characters 347 * @param rangeEnd The end of the range of dictionary characters 348 * @param foundBreaks Output of C array of int32_t break positions, or 0 349 * @return The number of breaks found 350 */ 351 virtual int32_t divideUpDictionaryRange( UText *text, 352 int32_t rangeStart, 353 int32_t rangeEnd, 354 UStack &foundBreaks ) const; 355 356 }; 357 358 #if !UCONFIG_NO_NORMALIZATION 359 360 /******************************************************************* 361 * CjkBreakEngine 362 */ 363 364 //indicates language/script that the CjkBreakEngine will handle 365 enum LanguageType { 366 kKorean, 367 kChineseJapanese 368 }; 369 370 /** 371 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 372 * dictionary with costs associated with each word and 373 * Viterbi decoding to determine CJK-specific breaks.</p> 374 */ 375 class CjkBreakEngine : public DictionaryBreakEngine { 376 protected: 377 /** 378 * The set of characters handled by this engine 379 * @internal 380 */ 381 UnicodeSet fHangulWordSet; 382 UnicodeSet fHanWordSet; 383 UnicodeSet fKatakanaWordSet; 384 UnicodeSet fHiraganaWordSet; 385 386 DictionaryMatcher *fDictionary; 387 const Normalizer2 *nfkcNorm2; 388 389 public: 390 391 /** 392 * <p>Default constructor.</p> 393 * 394 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 395 * engine is deleted. The DictionaryMatcher must contain costs for each word 396 * in order for the dictionary to work properly. 397 */ 398 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 399 400 /** 401 * <p>Virtual destructor.</p> 402 */ 403 virtual ~CjkBreakEngine(); 404 405 protected: 406 /** 407 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 408 * 409 * @param text A UText representing the text 410 * @param rangeStart The start of the range of dictionary characters 411 * @param rangeEnd The end of the range of dictionary characters 412 * @param foundBreaks Output of C array of int32_t break positions, or 0 413 * @return The number of breaks found 414 */ 415 virtual int32_t divideUpDictionaryRange( UText *text, 416 int32_t rangeStart, 417 int32_t rangeEnd, 418 UStack &foundBreaks ) const; 419 420 }; 421 422 #endif 423 424 U_NAMESPACE_END 425 426 /* DICTBE_H */ 427 #endif 428