1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2014, International Business Machines Corporation * 6 * and others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 #ifndef DICTBE_H 11 #define DICTBE_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utext.h" 16 17 #include "brkeng.h" 18 #include "uvectr32.h" 19 20 U_NAMESPACE_BEGIN 21 22 class DictionaryMatcher; 23 class Normalizer2; 24 25 /******************************************************************* 26 * DictionaryBreakEngine 27 */ 28 29 /** 30 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 31 * dictionary to determine language-specific breaks.</p> 32 * 33 * <p>After it is constructed a DictionaryBreakEngine may be shared between 34 * threads without synchronization.</p> 35 */ 36 class DictionaryBreakEngine : public LanguageBreakEngine { 37 private: 38 /** 39 * The set of characters handled by this engine 40 * @internal 41 */ 42 43 UnicodeSet fSet; 44 45 /** 46 * The set of break types handled by this engine 47 * @internal 48 */ 49 50 uint32_t fTypes; 51 52 /** 53 * <p>Default constructor.</p> 54 * 55 */ 56 DictionaryBreakEngine(); 57 58 public: 59 60 /** 61 * <p>Constructor setting the break types handled.</p> 62 * 63 * @param breakTypes A bitmap of types handled by the engine. 64 */ 65 DictionaryBreakEngine( uint32_t breakTypes ); 66 67 /** 68 * <p>Virtual destructor.</p> 69 */ 70 virtual ~DictionaryBreakEngine(); 71 72 /** 73 * <p>Indicate whether this engine handles a particular character for 74 * a particular kind of break.</p> 75 * 76 * @param c A character which begins a run that the engine might handle 77 * @param breakType The type of text break which the caller wants to determine 78 * @return TRUE if this engine handles the particular character and break 79 * type. 80 */ 81 virtual UBool handles( UChar32 c, int32_t breakType ) const; 82 83 /** 84 * <p>Find any breaks within a run in the supplied text.</p> 85 * 86 * @param text A UText representing the text. The iterator is left at 87 * the end of the run of characters which the engine is capable of handling 88 * that starts from the first character in the range. 89 * @param startPos The start of the run within the supplied text. 90 * @param endPos The end of the run within the supplied text. 91 * @param breakType The type of break desired, or -1. 92 * @param foundBreaks vector of int32_t to receive the break positions 93 * @return The number of breaks found. 94 */ 95 virtual int32_t findBreaks( UText *text, 96 int32_t startPos, 97 int32_t endPos, 98 int32_t breakType, 99 UVector32 &foundBreaks ) const; 100 101 protected: 102 103 /** 104 * <p>Set the character set handled by this engine.</p> 105 * 106 * @param set A UnicodeSet of the set of characters handled by the engine 107 */ 108 virtual void setCharacters( const UnicodeSet &set ); 109 110 /** 111 * <p>Set the break types handled by this engine.</p> 112 * 113 * @param breakTypes A bitmap of types handled by the engine. 114 */ 115 // virtual void setBreakTypes( uint32_t breakTypes ); 116 117 /** 118 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 119 * 120 * @param text A UText representing the text 121 * @param rangeStart The start of the range of dictionary characters 122 * @param rangeEnd The end of the range of dictionary characters 123 * @param foundBreaks Output of C array of int32_t break positions, or 0 124 * @return The number of breaks found 125 */ 126 virtual int32_t divideUpDictionaryRange( UText *text, 127 int32_t rangeStart, 128 int32_t rangeEnd, 129 UVector32 &foundBreaks ) const = 0; 130 131 }; 132 133 /******************************************************************* 134 * ThaiBreakEngine 135 */ 136 137 /** 138 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 139 * dictionary and heuristics to determine Thai-specific breaks.</p> 140 * 141 * <p>After it is constructed a ThaiBreakEngine may be shared between 142 * threads without synchronization.</p> 143 */ 144 class ThaiBreakEngine : public DictionaryBreakEngine { 145 private: 146 /** 147 * The set of characters handled by this engine 148 * @internal 149 */ 150 151 UnicodeSet fThaiWordSet; 152 UnicodeSet fEndWordSet; 153 UnicodeSet fBeginWordSet; 154 UnicodeSet fSuffixSet; 155 UnicodeSet fMarkSet; 156 DictionaryMatcher *fDictionary; 157 158 public: 159 160 /** 161 * <p>Default constructor.</p> 162 * 163 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 164 * engine is deleted. 165 */ 166 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 167 168 /** 169 * <p>Virtual destructor.</p> 170 */ 171 virtual ~ThaiBreakEngine(); 172 173 protected: 174 /** 175 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 176 * 177 * @param text A UText representing the text 178 * @param rangeStart The start of the range of dictionary characters 179 * @param rangeEnd The end of the range of dictionary characters 180 * @param foundBreaks Output of C array of int32_t break positions, or 0 181 * @return The number of breaks found 182 */ 183 virtual int32_t divideUpDictionaryRange( UText *text, 184 int32_t rangeStart, 185 int32_t rangeEnd, 186 UVector32 &foundBreaks ) const; 187 188 }; 189 190 /******************************************************************* 191 * LaoBreakEngine 192 */ 193 194 /** 195 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 196 * dictionary and heuristics to determine Lao-specific breaks.</p> 197 * 198 * <p>After it is constructed a LaoBreakEngine may be shared between 199 * threads without synchronization.</p> 200 */ 201 class LaoBreakEngine : public DictionaryBreakEngine { 202 private: 203 /** 204 * The set of characters handled by this engine 205 * @internal 206 */ 207 208 UnicodeSet fLaoWordSet; 209 UnicodeSet fEndWordSet; 210 UnicodeSet fBeginWordSet; 211 UnicodeSet fMarkSet; 212 DictionaryMatcher *fDictionary; 213 214 public: 215 216 /** 217 * <p>Default constructor.</p> 218 * 219 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 220 * engine is deleted. 221 */ 222 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 223 224 /** 225 * <p>Virtual destructor.</p> 226 */ 227 virtual ~LaoBreakEngine(); 228 229 protected: 230 /** 231 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 232 * 233 * @param text A UText representing the text 234 * @param rangeStart The start of the range of dictionary characters 235 * @param rangeEnd The end of the range of dictionary characters 236 * @param foundBreaks Output of C array of int32_t break positions, or 0 237 * @return The number of breaks found 238 */ 239 virtual int32_t divideUpDictionaryRange( UText *text, 240 int32_t rangeStart, 241 int32_t rangeEnd, 242 UVector32 &foundBreaks ) const; 243 244 }; 245 246 /******************************************************************* 247 * BurmeseBreakEngine 248 */ 249 250 /** 251 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 252 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 253 * 254 * <p>After it is constructed a BurmeseBreakEngine may be shared between 255 * threads without synchronization.</p> 256 */ 257 class BurmeseBreakEngine : public DictionaryBreakEngine { 258 private: 259 /** 260 * The set of characters handled by this engine 261 * @internal 262 */ 263 264 UnicodeSet fBurmeseWordSet; 265 UnicodeSet fEndWordSet; 266 UnicodeSet fBeginWordSet; 267 UnicodeSet fMarkSet; 268 DictionaryMatcher *fDictionary; 269 270 public: 271 272 /** 273 * <p>Default constructor.</p> 274 * 275 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 276 * engine is deleted. 277 */ 278 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 279 280 /** 281 * <p>Virtual destructor.</p> 282 */ 283 virtual ~BurmeseBreakEngine(); 284 285 protected: 286 /** 287 * <p>Divide up a range of known dictionary characters.</p> 288 * 289 * @param text A UText representing the text 290 * @param rangeStart The start of the range of dictionary characters 291 * @param rangeEnd The end of the range of dictionary characters 292 * @param foundBreaks Output of C array of int32_t break positions, or 0 293 * @return The number of breaks found 294 */ 295 virtual int32_t divideUpDictionaryRange( UText *text, 296 int32_t rangeStart, 297 int32_t rangeEnd, 298 UVector32 &foundBreaks ) const; 299 300 }; 301 302 /******************************************************************* 303 * KhmerBreakEngine 304 */ 305 306 /** 307 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 308 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 309 * 310 * <p>After it is constructed a KhmerBreakEngine may be shared between 311 * threads without synchronization.</p> 312 */ 313 class KhmerBreakEngine : public DictionaryBreakEngine { 314 private: 315 /** 316 * The set of characters handled by this engine 317 * @internal 318 */ 319 320 UnicodeSet fKhmerWordSet; 321 UnicodeSet fEndWordSet; 322 UnicodeSet fBeginWordSet; 323 UnicodeSet fMarkSet; 324 DictionaryMatcher *fDictionary; 325 326 public: 327 328 /** 329 * <p>Default constructor.</p> 330 * 331 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 332 * engine is deleted. 333 */ 334 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 335 336 /** 337 * <p>Virtual destructor.</p> 338 */ 339 virtual ~KhmerBreakEngine(); 340 341 protected: 342 /** 343 * <p>Divide up a range of known dictionary characters.</p> 344 * 345 * @param text A UText representing the text 346 * @param rangeStart The start of the range of dictionary characters 347 * @param rangeEnd The end of the range of dictionary characters 348 * @param foundBreaks Output of C array of int32_t break positions, or 0 349 * @return The number of breaks found 350 */ 351 virtual int32_t divideUpDictionaryRange( UText *text, 352 int32_t rangeStart, 353 int32_t rangeEnd, 354 UVector32 &foundBreaks ) const; 355 356 }; 357 358 #if !UCONFIG_NO_NORMALIZATION 359 360 /******************************************************************* 361 * CjkBreakEngine 362 */ 363 364 //indicates language/script that the CjkBreakEngine will handle 365 enum LanguageType { 366 kKorean, 367 kChineseJapanese 368 }; 369 370 /** 371 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 372 * dictionary with costs associated with each word and 373 * Viterbi decoding to determine CJK-specific breaks.</p> 374 */ 375 class CjkBreakEngine : public DictionaryBreakEngine { 376 protected: 377 /** 378 * The set of characters handled by this engine 379 * @internal 380 */ 381 UnicodeSet fHangulWordSet; 382 UnicodeSet fHanWordSet; 383 UnicodeSet fKatakanaWordSet; 384 UnicodeSet fHiraganaWordSet; 385 386 DictionaryMatcher *fDictionary; 387 const Normalizer2 *nfkcNorm2; 388 389 public: 390 391 /** 392 * <p>Default constructor.</p> 393 * 394 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 395 * engine is deleted. The DictionaryMatcher must contain costs for each word 396 * in order for the dictionary to work properly. 397 */ 398 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 399 400 /** 401 * <p>Virtual destructor.</p> 402 */ 403 virtual ~CjkBreakEngine(); 404 405 protected: 406 /** 407 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 408 * 409 * @param text A UText representing the text 410 * @param rangeStart The start of the range of dictionary characters 411 * @param rangeEnd The end of the range of dictionary characters 412 * @param foundBreaks Output of C array of int32_t break positions, or 0 413 * @return The number of breaks found 414 */ 415 virtual int32_t divideUpDictionaryRange( UText *text, 416 int32_t rangeStart, 417 int32_t rangeEnd, 418 UVector32 &foundBreaks ) const; 419 420 }; 421 422 #endif 423 424 U_NAMESPACE_END 425 426 /* DICTBE_H */ 427 #endif 428