1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2014, International Business Machines Corporation * 6 * and others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 #ifndef DICTBE_H 11 #define DICTBE_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utext.h" 16 17 #include "brkeng.h" 18 #include "hash.h" 19 #include "mlbe.h" 20 #include "uvectr32.h" 21 22 U_NAMESPACE_BEGIN 23 24 class DictionaryMatcher; 25 class MlBreakEngine; 26 class Normalizer2; 27 28 /******************************************************************* 29 * DictionaryBreakEngine 30 */ 31 32 /** 33 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 34 * dictionary to determine language-specific breaks.</p> 35 * 36 * <p>After it is constructed a DictionaryBreakEngine may be shared between 37 * threads without synchronization.</p> 38 */ 39 class DictionaryBreakEngine : public LanguageBreakEngine { 40 private: 41 /** 42 * The set of characters handled by this engine 43 * @internal 44 */ 45 46 UnicodeSet fSet; 47 48 public: 49 50 /** 51 * <p>Constructor </p> 52 */ 53 DictionaryBreakEngine(); 54 55 /** 56 * <p>Virtual destructor.</p> 57 */ 58 virtual ~DictionaryBreakEngine(); 59 60 /** 61 * <p>Indicate whether this engine handles a particular character for 62 * a particular kind of break.</p> 63 * 64 * @param c A character which begins a run that the engine might handle 65 * @return true if this engine handles the particular character and break 66 * type. 67 */ 68 virtual UBool handles(UChar32 c) const override; 69 70 /** 71 * <p>Find any breaks within a run in the supplied text.</p> 72 * 73 * @param text A UText representing the text. The iterator is left at 74 * the end of the run of characters which the engine is capable of handling 75 * that starts from the first character in the range. 76 * @param startPos The start of the run within the supplied text. 77 * @param endPos The end of the run within the supplied text. 78 * @param foundBreaks vector of int32_t to receive the break positions 79 * @param status Information on any errors encountered. 80 * @return The number of breaks found. 81 */ 82 virtual int32_t findBreaks( UText *text, 83 int32_t startPos, 84 int32_t endPos, 85 UVector32 &foundBreaks, 86 UBool isPhraseBreaking, 87 UErrorCode& status ) const override; 88 89 protected: 90 91 /** 92 * <p>Set the character set handled by this engine.</p> 93 * 94 * @param set A UnicodeSet of the set of characters handled by the engine 95 */ 96 virtual void setCharacters( const UnicodeSet &set ); 97 98 /** 99 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 100 * 101 * @param text A UText representing the text 102 * @param rangeStart The start of the range of dictionary characters 103 * @param rangeEnd The end of the range of dictionary characters 104 * @param foundBreaks Output of C array of int32_t break positions, or 0 105 * @param status Information on any errors encountered. 106 * @return The number of breaks found 107 */ 108 virtual int32_t divideUpDictionaryRange( UText *text, 109 int32_t rangeStart, 110 int32_t rangeEnd, 111 UVector32 &foundBreaks, 112 UBool isPhraseBreaking, 113 UErrorCode& status) const = 0; 114 115 }; 116 117 /******************************************************************* 118 * ThaiBreakEngine 119 */ 120 121 /** 122 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 123 * dictionary and heuristics to determine Thai-specific breaks.</p> 124 * 125 * <p>After it is constructed a ThaiBreakEngine may be shared between 126 * threads without synchronization.</p> 127 */ 128 class ThaiBreakEngine : public DictionaryBreakEngine { 129 private: 130 /** 131 * The set of characters handled by this engine 132 * @internal 133 */ 134 135 UnicodeSet fEndWordSet; 136 UnicodeSet fBeginWordSet; 137 UnicodeSet fSuffixSet; 138 UnicodeSet fMarkSet; 139 DictionaryMatcher *fDictionary; 140 141 public: 142 143 /** 144 * <p>Default constructor.</p> 145 * 146 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 147 * engine is deleted. 148 */ 149 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 150 151 /** 152 * <p>Virtual destructor.</p> 153 */ 154 virtual ~ThaiBreakEngine(); 155 156 protected: 157 /** 158 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 159 * 160 * @param text A UText representing the text 161 * @param rangeStart The start of the range of dictionary characters 162 * @param rangeEnd The end of the range of dictionary characters 163 * @param foundBreaks Output of C array of int32_t break positions, or 0 164 * @param status Information on any errors encountered. 165 * @return The number of breaks found 166 */ 167 virtual int32_t divideUpDictionaryRange( UText *text, 168 int32_t rangeStart, 169 int32_t rangeEnd, 170 UVector32 &foundBreaks, 171 UBool isPhraseBreaking, 172 UErrorCode& status) const override; 173 174 }; 175 176 /******************************************************************* 177 * LaoBreakEngine 178 */ 179 180 /** 181 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 182 * dictionary and heuristics to determine Lao-specific breaks.</p> 183 * 184 * <p>After it is constructed a LaoBreakEngine may be shared between 185 * threads without synchronization.</p> 186 */ 187 class LaoBreakEngine : public DictionaryBreakEngine { 188 private: 189 /** 190 * The set of characters handled by this engine 191 * @internal 192 */ 193 194 UnicodeSet fEndWordSet; 195 UnicodeSet fBeginWordSet; 196 UnicodeSet fMarkSet; 197 DictionaryMatcher *fDictionary; 198 199 public: 200 201 /** 202 * <p>Default constructor.</p> 203 * 204 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 205 * engine is deleted. 206 */ 207 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 208 209 /** 210 * <p>Virtual destructor.</p> 211 */ 212 virtual ~LaoBreakEngine(); 213 214 protected: 215 /** 216 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 217 * 218 * @param text A UText representing the text 219 * @param rangeStart The start of the range of dictionary characters 220 * @param rangeEnd The end of the range of dictionary characters 221 * @param foundBreaks Output of C array of int32_t break positions, or 0 222 * @param status Information on any errors encountered. 223 * @return The number of breaks found 224 */ 225 virtual int32_t divideUpDictionaryRange( UText *text, 226 int32_t rangeStart, 227 int32_t rangeEnd, 228 UVector32 &foundBreaks, 229 UBool isPhraseBreaking, 230 UErrorCode& status) const override; 231 232 }; 233 234 /******************************************************************* 235 * BurmeseBreakEngine 236 */ 237 238 /** 239 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 240 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 241 * 242 * <p>After it is constructed a BurmeseBreakEngine may be shared between 243 * threads without synchronization.</p> 244 */ 245 class BurmeseBreakEngine : public DictionaryBreakEngine { 246 private: 247 /** 248 * The set of characters handled by this engine 249 * @internal 250 */ 251 252 UnicodeSet fEndWordSet; 253 UnicodeSet fBeginWordSet; 254 UnicodeSet fMarkSet; 255 DictionaryMatcher *fDictionary; 256 257 public: 258 259 /** 260 * <p>Default constructor.</p> 261 * 262 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 263 * engine is deleted. 264 */ 265 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 266 267 /** 268 * <p>Virtual destructor.</p> 269 */ 270 virtual ~BurmeseBreakEngine(); 271 272 protected: 273 /** 274 * <p>Divide up a range of known dictionary characters.</p> 275 * 276 * @param text A UText representing the text 277 * @param rangeStart The start of the range of dictionary characters 278 * @param rangeEnd The end of the range of dictionary characters 279 * @param foundBreaks Output of C array of int32_t break positions, or 0 280 * @param status Information on any errors encountered. 281 * @return The number of breaks found 282 */ 283 virtual int32_t divideUpDictionaryRange( UText *text, 284 int32_t rangeStart, 285 int32_t rangeEnd, 286 UVector32 &foundBreaks, 287 UBool isPhraseBreaking, 288 UErrorCode& status) const override; 289 290 }; 291 292 /******************************************************************* 293 * KhmerBreakEngine 294 */ 295 296 /** 297 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 298 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 299 * 300 * <p>After it is constructed a KhmerBreakEngine may be shared between 301 * threads without synchronization.</p> 302 */ 303 class KhmerBreakEngine : public DictionaryBreakEngine { 304 private: 305 /** 306 * The set of characters handled by this engine 307 * @internal 308 */ 309 310 UnicodeSet fEndWordSet; 311 UnicodeSet fBeginWordSet; 312 UnicodeSet fMarkSet; 313 DictionaryMatcher *fDictionary; 314 315 public: 316 317 /** 318 * <p>Default constructor.</p> 319 * 320 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 321 * engine is deleted. 322 */ 323 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 324 325 /** 326 * <p>Virtual destructor.</p> 327 */ 328 virtual ~KhmerBreakEngine(); 329 330 protected: 331 /** 332 * <p>Divide up a range of known dictionary characters.</p> 333 * 334 * @param text A UText representing the text 335 * @param rangeStart The start of the range of dictionary characters 336 * @param rangeEnd The end of the range of dictionary characters 337 * @param foundBreaks Output of C array of int32_t break positions, or 0 338 * @param status Information on any errors encountered. 339 * @return The number of breaks found 340 */ 341 virtual int32_t divideUpDictionaryRange( UText *text, 342 int32_t rangeStart, 343 int32_t rangeEnd, 344 UVector32 &foundBreaks, 345 UBool isPhraseBreaking, 346 UErrorCode& status) const override; 347 348 }; 349 350 #if !UCONFIG_NO_NORMALIZATION 351 352 /******************************************************************* 353 * CjkBreakEngine 354 */ 355 356 //indicates language/script that the CjkBreakEngine will handle 357 enum LanguageType { 358 kKorean, 359 kChineseJapanese 360 }; 361 362 /** 363 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 364 * dictionary with costs associated with each word and 365 * Viterbi decoding to determine CJK-specific breaks.</p> 366 */ 367 class CjkBreakEngine : public DictionaryBreakEngine { 368 protected: 369 /** 370 * The set of characters handled by this engine 371 * @internal 372 */ 373 UnicodeSet fHangulWordSet; 374 UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; 375 UnicodeSet fClosePunctuationSet; 376 377 DictionaryMatcher *fDictionary; 378 const Normalizer2 *nfkcNorm2; 379 MlBreakEngine *fMlBreakEngine; 380 bool isCj; 381 382 private: 383 // Load Japanese extensions. 384 void loadJapaneseExtensions(UErrorCode& error); 385 // Load Japanese Hiragana. 386 void loadHiragana(UErrorCode& error); 387 // Initialize fSkipSet by loading Japanese Hiragana and extensions. 388 void initJapanesePhraseParameter(UErrorCode& error); 389 390 Hashtable fSkipSet; 391 392 public: 393 394 /** 395 * <p>Default constructor.</p> 396 * 397 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 398 * engine is deleted. The DictionaryMatcher must contain costs for each word 399 * in order for the dictionary to work properly. 400 */ 401 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 402 403 /** 404 * <p>Virtual destructor.</p> 405 */ 406 virtual ~CjkBreakEngine(); 407 408 protected: 409 /** 410 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 411 * 412 * @param text A UText representing the text 413 * @param rangeStart The start of the range of dictionary characters 414 * @param rangeEnd The end of the range of dictionary characters 415 * @param foundBreaks Output of C array of int32_t break positions, or 0 416 * @param status Information on any errors encountered. 417 * @return The number of breaks found 418 */ 419 virtual int32_t divideUpDictionaryRange( UText *text, 420 int32_t rangeStart, 421 int32_t rangeEnd, 422 UVector32 &foundBreaks, 423 UBool isPhraseBreaking, 424 UErrorCode& status) const override; 425 426 }; 427 428 #endif 429 430 U_NAMESPACE_END 431 432 /* DICTBE_H */ 433 #endif 434