1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2014, International Business Machines Corporation * 6 * and others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 #ifndef DICTBE_H 11 #define DICTBE_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utext.h" 16 17 #include "brkeng.h" 18 #include "uvectr32.h" 19 20 U_NAMESPACE_BEGIN 21 22 class DictionaryMatcher; 23 class Normalizer2; 24 25 /******************************************************************* 26 * DictionaryBreakEngine 27 */ 28 29 /** 30 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 31 * dictionary to determine language-specific breaks.</p> 32 * 33 * <p>After it is constructed a DictionaryBreakEngine may be shared between 34 * threads without synchronization.</p> 35 */ 36 class DictionaryBreakEngine : public LanguageBreakEngine { 37 private: 38 /** 39 * The set of characters handled by this engine 40 * @internal 41 */ 42 43 UnicodeSet fSet; 44 45 public: 46 47 /** 48 * <p>Constructor </p> 49 */ 50 DictionaryBreakEngine(); 51 52 /** 53 * <p>Virtual destructor.</p> 54 */ 55 virtual ~DictionaryBreakEngine(); 56 57 /** 58 * <p>Indicate whether this engine handles a particular character for 59 * a particular kind of break.</p> 60 * 61 * @param c A character which begins a run that the engine might handle 62 * @return true if this engine handles the particular character and break 63 * type. 64 */ 65 virtual UBool handles(UChar32 c) const override; 66 67 /** 68 * <p>Find any breaks within a run in the supplied text.</p> 69 * 70 * @param text A UText representing the text. The iterator is left at 71 * the end of the run of characters which the engine is capable of handling 72 * that starts from the first character in the range. 73 * @param startPos The start of the run within the supplied text. 74 * @param endPos The end of the run within the supplied text. 75 * @param foundBreaks vector of int32_t to receive the break positions 76 * @param status Information on any errors encountered. 77 * @return The number of breaks found. 78 */ 79 virtual int32_t findBreaks( UText *text, 80 int32_t startPos, 81 int32_t endPos, 82 UVector32 &foundBreaks, 83 UErrorCode& status ) const override; 84 85 protected: 86 87 /** 88 * <p>Set the character set handled by this engine.</p> 89 * 90 * @param set A UnicodeSet of the set of characters handled by the engine 91 */ 92 virtual void setCharacters( const UnicodeSet &set ); 93 94 /** 95 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 96 * 97 * @param text A UText representing the text 98 * @param rangeStart The start of the range of dictionary characters 99 * @param rangeEnd The end of the range of dictionary characters 100 * @param foundBreaks Output of C array of int32_t break positions, or 0 101 * @param status Information on any errors encountered. 102 * @return The number of breaks found 103 */ 104 virtual int32_t divideUpDictionaryRange( UText *text, 105 int32_t rangeStart, 106 int32_t rangeEnd, 107 UVector32 &foundBreaks, 108 UErrorCode& status) const = 0; 109 110 }; 111 112 /******************************************************************* 113 * ThaiBreakEngine 114 */ 115 116 /** 117 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 118 * dictionary and heuristics to determine Thai-specific breaks.</p> 119 * 120 * <p>After it is constructed a ThaiBreakEngine may be shared between 121 * threads without synchronization.</p> 122 */ 123 class ThaiBreakEngine : public DictionaryBreakEngine { 124 private: 125 /** 126 * The set of characters handled by this engine 127 * @internal 128 */ 129 130 UnicodeSet fThaiWordSet; 131 UnicodeSet fEndWordSet; 132 UnicodeSet fBeginWordSet; 133 UnicodeSet fSuffixSet; 134 UnicodeSet fMarkSet; 135 DictionaryMatcher *fDictionary; 136 137 public: 138 139 /** 140 * <p>Default constructor.</p> 141 * 142 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 143 * engine is deleted. 144 */ 145 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 146 147 /** 148 * <p>Virtual destructor.</p> 149 */ 150 virtual ~ThaiBreakEngine(); 151 152 protected: 153 /** 154 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 155 * 156 * @param text A UText representing the text 157 * @param rangeStart The start of the range of dictionary characters 158 * @param rangeEnd The end of the range of dictionary characters 159 * @param foundBreaks Output of C array of int32_t break positions, or 0 160 * @param status Information on any errors encountered. 161 * @return The number of breaks found 162 */ 163 virtual int32_t divideUpDictionaryRange( UText *text, 164 int32_t rangeStart, 165 int32_t rangeEnd, 166 UVector32 &foundBreaks, 167 UErrorCode& status) const override; 168 169 }; 170 171 /******************************************************************* 172 * LaoBreakEngine 173 */ 174 175 /** 176 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 177 * dictionary and heuristics to determine Lao-specific breaks.</p> 178 * 179 * <p>After it is constructed a LaoBreakEngine may be shared between 180 * threads without synchronization.</p> 181 */ 182 class LaoBreakEngine : public DictionaryBreakEngine { 183 private: 184 /** 185 * The set of characters handled by this engine 186 * @internal 187 */ 188 189 UnicodeSet fLaoWordSet; 190 UnicodeSet fEndWordSet; 191 UnicodeSet fBeginWordSet; 192 UnicodeSet fMarkSet; 193 DictionaryMatcher *fDictionary; 194 195 public: 196 197 /** 198 * <p>Default constructor.</p> 199 * 200 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 201 * engine is deleted. 202 */ 203 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 204 205 /** 206 * <p>Virtual destructor.</p> 207 */ 208 virtual ~LaoBreakEngine(); 209 210 protected: 211 /** 212 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 213 * 214 * @param text A UText representing the text 215 * @param rangeStart The start of the range of dictionary characters 216 * @param rangeEnd The end of the range of dictionary characters 217 * @param foundBreaks Output of C array of int32_t break positions, or 0 218 * @param status Information on any errors encountered. 219 * @return The number of breaks found 220 */ 221 virtual int32_t divideUpDictionaryRange( UText *text, 222 int32_t rangeStart, 223 int32_t rangeEnd, 224 UVector32 &foundBreaks, 225 UErrorCode& status) const override; 226 227 }; 228 229 /******************************************************************* 230 * BurmeseBreakEngine 231 */ 232 233 /** 234 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 235 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 236 * 237 * <p>After it is constructed a BurmeseBreakEngine may be shared between 238 * threads without synchronization.</p> 239 */ 240 class BurmeseBreakEngine : public DictionaryBreakEngine { 241 private: 242 /** 243 * The set of characters handled by this engine 244 * @internal 245 */ 246 247 UnicodeSet fBurmeseWordSet; 248 UnicodeSet fEndWordSet; 249 UnicodeSet fBeginWordSet; 250 UnicodeSet fMarkSet; 251 DictionaryMatcher *fDictionary; 252 253 public: 254 255 /** 256 * <p>Default constructor.</p> 257 * 258 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 259 * engine is deleted. 260 */ 261 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 262 263 /** 264 * <p>Virtual destructor.</p> 265 */ 266 virtual ~BurmeseBreakEngine(); 267 268 protected: 269 /** 270 * <p>Divide up a range of known dictionary characters.</p> 271 * 272 * @param text A UText representing the text 273 * @param rangeStart The start of the range of dictionary characters 274 * @param rangeEnd The end of the range of dictionary characters 275 * @param foundBreaks Output of C array of int32_t break positions, or 0 276 * @param status Information on any errors encountered. 277 * @return The number of breaks found 278 */ 279 virtual int32_t divideUpDictionaryRange( UText *text, 280 int32_t rangeStart, 281 int32_t rangeEnd, 282 UVector32 &foundBreaks, 283 UErrorCode& status) const override; 284 285 }; 286 287 /******************************************************************* 288 * KhmerBreakEngine 289 */ 290 291 /** 292 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 293 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 294 * 295 * <p>After it is constructed a KhmerBreakEngine may be shared between 296 * threads without synchronization.</p> 297 */ 298 class KhmerBreakEngine : public DictionaryBreakEngine { 299 private: 300 /** 301 * The set of characters handled by this engine 302 * @internal 303 */ 304 305 UnicodeSet fKhmerWordSet; 306 UnicodeSet fEndWordSet; 307 UnicodeSet fBeginWordSet; 308 UnicodeSet fMarkSet; 309 DictionaryMatcher *fDictionary; 310 311 public: 312 313 /** 314 * <p>Default constructor.</p> 315 * 316 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 317 * engine is deleted. 318 */ 319 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 320 321 /** 322 * <p>Virtual destructor.</p> 323 */ 324 virtual ~KhmerBreakEngine(); 325 326 protected: 327 /** 328 * <p>Divide up a range of known dictionary characters.</p> 329 * 330 * @param text A UText representing the text 331 * @param rangeStart The start of the range of dictionary characters 332 * @param rangeEnd The end of the range of dictionary characters 333 * @param foundBreaks Output of C array of int32_t break positions, or 0 334 * @param status Information on any errors encountered. 335 * @return The number of breaks found 336 */ 337 virtual int32_t divideUpDictionaryRange( UText *text, 338 int32_t rangeStart, 339 int32_t rangeEnd, 340 UVector32 &foundBreaks, 341 UErrorCode& status) const override; 342 343 }; 344 345 #if !UCONFIG_NO_NORMALIZATION 346 347 /******************************************************************* 348 * CjkBreakEngine 349 */ 350 351 //indicates language/script that the CjkBreakEngine will handle 352 enum LanguageType { 353 kKorean, 354 kChineseJapanese 355 }; 356 357 /** 358 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 359 * dictionary with costs associated with each word and 360 * Viterbi decoding to determine CJK-specific breaks.</p> 361 */ 362 class CjkBreakEngine : public DictionaryBreakEngine { 363 protected: 364 /** 365 * The set of characters handled by this engine 366 * @internal 367 */ 368 UnicodeSet fHangulWordSet; 369 UnicodeSet fHanWordSet; 370 UnicodeSet fKatakanaWordSet; 371 UnicodeSet fHiraganaWordSet; 372 373 DictionaryMatcher *fDictionary; 374 const Normalizer2 *nfkcNorm2; 375 376 public: 377 378 /** 379 * <p>Default constructor.</p> 380 * 381 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 382 * engine is deleted. The DictionaryMatcher must contain costs for each word 383 * in order for the dictionary to work properly. 384 */ 385 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 386 387 /** 388 * <p>Virtual destructor.</p> 389 */ 390 virtual ~CjkBreakEngine(); 391 392 protected: 393 /** 394 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 395 * 396 * @param text A UText representing the text 397 * @param rangeStart The start of the range of dictionary characters 398 * @param rangeEnd The end of the range of dictionary characters 399 * @param foundBreaks Output of C array of int32_t break positions, or 0 400 * @param status Information on any errors encountered. 401 * @return The number of breaks found 402 */ 403 virtual int32_t divideUpDictionaryRange( UText *text, 404 int32_t rangeStart, 405 int32_t rangeEnd, 406 UVector32 &foundBreaks, 407 UErrorCode& status) const override; 408 409 }; 410 411 #endif 412 413 U_NAMESPACE_END 414 415 /* DICTBE_H */ 416 #endif 417