1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2014, International Business Machines Corporation * 6 * and others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 #ifndef DICTBE_H 11 #define DICTBE_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utext.h" 16 17 #include "brkeng.h" 18 #include "hash.h" 19 #include "uvectr32.h" 20 21 U_NAMESPACE_BEGIN 22 23 class DictionaryMatcher; 24 class Normalizer2; 25 26 /******************************************************************* 27 * DictionaryBreakEngine 28 */ 29 30 /** 31 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 32 * dictionary to determine language-specific breaks.</p> 33 * 34 * <p>After it is constructed a DictionaryBreakEngine may be shared between 35 * threads without synchronization.</p> 36 */ 37 class DictionaryBreakEngine : public LanguageBreakEngine { 38 private: 39 /** 40 * The set of characters handled by this engine 41 * @internal 42 */ 43 44 UnicodeSet fSet; 45 46 public: 47 48 /** 49 * <p>Constructor </p> 50 */ 51 DictionaryBreakEngine(); 52 53 /** 54 * <p>Virtual destructor.</p> 55 */ 56 virtual ~DictionaryBreakEngine(); 57 58 /** 59 * <p>Indicate whether this engine handles a particular character for 60 * a particular kind of break.</p> 61 * 62 * @param c A character which begins a run that the engine might handle 63 * @return true if this engine handles the particular character and break 64 * type. 65 */ 66 virtual UBool handles(UChar32 c) const override; 67 68 /** 69 * <p>Find any breaks within a run in the supplied text.</p> 70 * 71 * @param text A UText representing the text. The iterator is left at 72 * the end of the run of characters which the engine is capable of handling 73 * that starts from the first character in the range. 74 * @param startPos The start of the run within the supplied text. 75 * @param endPos The end of the run within the supplied text. 76 * @param foundBreaks vector of int32_t to receive the break positions 77 * @param status Information on any errors encountered. 78 * @return The number of breaks found. 79 */ 80 virtual int32_t findBreaks( UText *text, 81 int32_t startPos, 82 int32_t endPos, 83 UVector32 &foundBreaks, 84 UBool isPhraseBreaking, 85 UErrorCode& status ) const override; 86 87 protected: 88 89 /** 90 * <p>Set the character set handled by this engine.</p> 91 * 92 * @param set A UnicodeSet of the set of characters handled by the engine 93 */ 94 virtual void setCharacters( const UnicodeSet &set ); 95 96 /** 97 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 98 * 99 * @param text A UText representing the text 100 * @param rangeStart The start of the range of dictionary characters 101 * @param rangeEnd The end of the range of dictionary characters 102 * @param foundBreaks Output of C array of int32_t break positions, or 0 103 * @param status Information on any errors encountered. 104 * @return The number of breaks found 105 */ 106 virtual int32_t divideUpDictionaryRange( UText *text, 107 int32_t rangeStart, 108 int32_t rangeEnd, 109 UVector32 &foundBreaks, 110 UBool isPhraseBreaking, 111 UErrorCode& status) const = 0; 112 113 }; 114 115 /******************************************************************* 116 * ThaiBreakEngine 117 */ 118 119 /** 120 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 121 * dictionary and heuristics to determine Thai-specific breaks.</p> 122 * 123 * <p>After it is constructed a ThaiBreakEngine may be shared between 124 * threads without synchronization.</p> 125 */ 126 class ThaiBreakEngine : public DictionaryBreakEngine { 127 private: 128 /** 129 * The set of characters handled by this engine 130 * @internal 131 */ 132 133 UnicodeSet fEndWordSet; 134 UnicodeSet fBeginWordSet; 135 UnicodeSet fSuffixSet; 136 UnicodeSet fMarkSet; 137 DictionaryMatcher *fDictionary; 138 139 public: 140 141 /** 142 * <p>Default constructor.</p> 143 * 144 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 145 * engine is deleted. 146 */ 147 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 148 149 /** 150 * <p>Virtual destructor.</p> 151 */ 152 virtual ~ThaiBreakEngine(); 153 154 protected: 155 /** 156 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 157 * 158 * @param text A UText representing the text 159 * @param rangeStart The start of the range of dictionary characters 160 * @param rangeEnd The end of the range of dictionary characters 161 * @param foundBreaks Output of C array of int32_t break positions, or 0 162 * @param status Information on any errors encountered. 163 * @return The number of breaks found 164 */ 165 virtual int32_t divideUpDictionaryRange( UText *text, 166 int32_t rangeStart, 167 int32_t rangeEnd, 168 UVector32 &foundBreaks, 169 UBool isPhraseBreaking, 170 UErrorCode& status) const override; 171 172 }; 173 174 /******************************************************************* 175 * LaoBreakEngine 176 */ 177 178 /** 179 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 180 * dictionary and heuristics to determine Lao-specific breaks.</p> 181 * 182 * <p>After it is constructed a LaoBreakEngine may be shared between 183 * threads without synchronization.</p> 184 */ 185 class LaoBreakEngine : public DictionaryBreakEngine { 186 private: 187 /** 188 * The set of characters handled by this engine 189 * @internal 190 */ 191 192 UnicodeSet fEndWordSet; 193 UnicodeSet fBeginWordSet; 194 UnicodeSet fMarkSet; 195 DictionaryMatcher *fDictionary; 196 197 public: 198 199 /** 200 * <p>Default constructor.</p> 201 * 202 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 203 * engine is deleted. 204 */ 205 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 206 207 /** 208 * <p>Virtual destructor.</p> 209 */ 210 virtual ~LaoBreakEngine(); 211 212 protected: 213 /** 214 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 215 * 216 * @param text A UText representing the text 217 * @param rangeStart The start of the range of dictionary characters 218 * @param rangeEnd The end of the range of dictionary characters 219 * @param foundBreaks Output of C array of int32_t break positions, or 0 220 * @param status Information on any errors encountered. 221 * @return The number of breaks found 222 */ 223 virtual int32_t divideUpDictionaryRange( UText *text, 224 int32_t rangeStart, 225 int32_t rangeEnd, 226 UVector32 &foundBreaks, 227 UBool isPhraseBreaking, 228 UErrorCode& status) const override; 229 230 }; 231 232 /******************************************************************* 233 * BurmeseBreakEngine 234 */ 235 236 /** 237 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 238 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 239 * 240 * <p>After it is constructed a BurmeseBreakEngine may be shared between 241 * threads without synchronization.</p> 242 */ 243 class BurmeseBreakEngine : public DictionaryBreakEngine { 244 private: 245 /** 246 * The set of characters handled by this engine 247 * @internal 248 */ 249 250 UnicodeSet fEndWordSet; 251 UnicodeSet fBeginWordSet; 252 UnicodeSet fMarkSet; 253 DictionaryMatcher *fDictionary; 254 255 public: 256 257 /** 258 * <p>Default constructor.</p> 259 * 260 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 261 * engine is deleted. 262 */ 263 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 264 265 /** 266 * <p>Virtual destructor.</p> 267 */ 268 virtual ~BurmeseBreakEngine(); 269 270 protected: 271 /** 272 * <p>Divide up a range of known dictionary characters.</p> 273 * 274 * @param text A UText representing the text 275 * @param rangeStart The start of the range of dictionary characters 276 * @param rangeEnd The end of the range of dictionary characters 277 * @param foundBreaks Output of C array of int32_t break positions, or 0 278 * @param status Information on any errors encountered. 279 * @return The number of breaks found 280 */ 281 virtual int32_t divideUpDictionaryRange( UText *text, 282 int32_t rangeStart, 283 int32_t rangeEnd, 284 UVector32 &foundBreaks, 285 UBool isPhraseBreaking, 286 UErrorCode& status) const override; 287 288 }; 289 290 /******************************************************************* 291 * KhmerBreakEngine 292 */ 293 294 /** 295 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 296 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 297 * 298 * <p>After it is constructed a KhmerBreakEngine may be shared between 299 * threads without synchronization.</p> 300 */ 301 class KhmerBreakEngine : public DictionaryBreakEngine { 302 private: 303 /** 304 * The set of characters handled by this engine 305 * @internal 306 */ 307 308 UnicodeSet fEndWordSet; 309 UnicodeSet fBeginWordSet; 310 UnicodeSet fMarkSet; 311 DictionaryMatcher *fDictionary; 312 313 public: 314 315 /** 316 * <p>Default constructor.</p> 317 * 318 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 319 * engine is deleted. 320 */ 321 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 322 323 /** 324 * <p>Virtual destructor.</p> 325 */ 326 virtual ~KhmerBreakEngine(); 327 328 protected: 329 /** 330 * <p>Divide up a range of known dictionary characters.</p> 331 * 332 * @param text A UText representing the text 333 * @param rangeStart The start of the range of dictionary characters 334 * @param rangeEnd The end of the range of dictionary characters 335 * @param foundBreaks Output of C array of int32_t break positions, or 0 336 * @param status Information on any errors encountered. 337 * @return The number of breaks found 338 */ 339 virtual int32_t divideUpDictionaryRange( UText *text, 340 int32_t rangeStart, 341 int32_t rangeEnd, 342 UVector32 &foundBreaks, 343 UBool isPhraseBreaking, 344 UErrorCode& status) const override; 345 346 }; 347 348 #if !UCONFIG_NO_NORMALIZATION 349 350 /******************************************************************* 351 * CjkBreakEngine 352 */ 353 354 //indicates language/script that the CjkBreakEngine will handle 355 enum LanguageType { 356 kKorean, 357 kChineseJapanese 358 }; 359 360 /** 361 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 362 * dictionary with costs associated with each word and 363 * Viterbi decoding to determine CJK-specific breaks.</p> 364 */ 365 class CjkBreakEngine : public DictionaryBreakEngine { 366 protected: 367 /** 368 * The set of characters handled by this engine 369 * @internal 370 */ 371 UnicodeSet fHangulWordSet; 372 UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; 373 UnicodeSet fClosePunctuationSet; 374 375 DictionaryMatcher *fDictionary; 376 const Normalizer2 *nfkcNorm2; 377 378 private: 379 // Load Japanese extensions. 380 void loadJapaneseExtensions(UErrorCode& error); 381 // Load Japanese Hiragana. 382 void loadHiragana(UErrorCode& error); 383 // Initialize fSkipSet by loading Japanese Hiragana and extensions. 384 void initJapanesePhraseParameter(UErrorCode& error); 385 386 Hashtable fSkipSet; 387 388 public: 389 390 /** 391 * <p>Default constructor.</p> 392 * 393 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 394 * engine is deleted. The DictionaryMatcher must contain costs for each word 395 * in order for the dictionary to work properly. 396 */ 397 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 398 399 /** 400 * <p>Virtual destructor.</p> 401 */ 402 virtual ~CjkBreakEngine(); 403 404 protected: 405 /** 406 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 407 * 408 * @param text A UText representing the text 409 * @param rangeStart The start of the range of dictionary characters 410 * @param rangeEnd The end of the range of dictionary characters 411 * @param foundBreaks Output of C array of int32_t break positions, or 0 412 * @param status Information on any errors encountered. 413 * @return The number of breaks found 414 */ 415 virtual int32_t divideUpDictionaryRange( UText *text, 416 int32_t rangeStart, 417 int32_t rangeEnd, 418 UVector32 &foundBreaks, 419 UBool isPhraseBreaking, 420 UErrorCode& status) const override; 421 422 }; 423 424 #endif 425 426 U_NAMESPACE_END 427 428 /* DICTBE_H */ 429 #endif 430