1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2014, International Business Machines Corporation * 6 * and others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 #ifndef DICTBE_H 11 #define DICTBE_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utext.h" 16 17 #include "brkeng.h" 18 #include "uvectr32.h" 19 20 U_NAMESPACE_BEGIN 21 22 class DictionaryMatcher; 23 class Normalizer2; 24 25 /******************************************************************* 26 * DictionaryBreakEngine 27 */ 28 29 /** 30 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 31 * dictionary to determine language-specific breaks.</p> 32 * 33 * <p>After it is constructed a DictionaryBreakEngine may be shared between 34 * threads without synchronization.</p> 35 */ 36 class DictionaryBreakEngine : public LanguageBreakEngine { 37 private: 38 /** 39 * The set of characters handled by this engine 40 * @internal 41 */ 42 43 UnicodeSet fSet; 44 45 public: 46 47 /** 48 * <p>Constructor </p> 49 */ 50 DictionaryBreakEngine(); 51 52 /** 53 * <p>Virtual destructor.</p> 54 */ 55 virtual ~DictionaryBreakEngine(); 56 57 /** 58 * <p>Indicate whether this engine handles a particular character for 59 * a particular kind of break.</p> 60 * 61 * @param c A character which begins a run that the engine might handle 62 * @return TRUE if this engine handles the particular character and break 63 * type. 64 */ 65 virtual UBool handles(UChar32 c) const; 66 67 /** 68 * <p>Find any breaks within a run in the supplied text.</p> 69 * 70 * @param text A UText representing the text. The iterator is left at 71 * the end of the run of characters which the engine is capable of handling 72 * that starts from the first character in the range. 73 * @param startPos The start of the run within the supplied text. 74 * @param endPos The end of the run within the supplied text. 75 * @param foundBreaks vector of int32_t to receive the break positions 76 * @return The number of breaks found. 77 */ 78 virtual int32_t findBreaks( UText *text, 79 int32_t startPos, 80 int32_t endPos, 81 UVector32 &foundBreaks ) const; 82 83 protected: 84 85 /** 86 * <p>Set the character set handled by this engine.</p> 87 * 88 * @param set A UnicodeSet of the set of characters handled by the engine 89 */ 90 virtual void setCharacters( const UnicodeSet &set ); 91 92 /** 93 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 94 * 95 * @param text A UText representing the text 96 * @param rangeStart The start of the range of dictionary characters 97 * @param rangeEnd The end of the range of dictionary characters 98 * @param foundBreaks Output of C array of int32_t break positions, or 0 99 * @return The number of breaks found 100 */ 101 virtual int32_t divideUpDictionaryRange( UText *text, 102 int32_t rangeStart, 103 int32_t rangeEnd, 104 UVector32 &foundBreaks ) const = 0; 105 106 }; 107 108 /******************************************************************* 109 * ThaiBreakEngine 110 */ 111 112 /** 113 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 114 * dictionary and heuristics to determine Thai-specific breaks.</p> 115 * 116 * <p>After it is constructed a ThaiBreakEngine may be shared between 117 * threads without synchronization.</p> 118 */ 119 class ThaiBreakEngine : public DictionaryBreakEngine { 120 private: 121 /** 122 * The set of characters handled by this engine 123 * @internal 124 */ 125 126 UnicodeSet fThaiWordSet; 127 UnicodeSet fEndWordSet; 128 UnicodeSet fBeginWordSet; 129 UnicodeSet fSuffixSet; 130 UnicodeSet fMarkSet; 131 DictionaryMatcher *fDictionary; 132 133 public: 134 135 /** 136 * <p>Default constructor.</p> 137 * 138 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 139 * engine is deleted. 140 */ 141 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 142 143 /** 144 * <p>Virtual destructor.</p> 145 */ 146 virtual ~ThaiBreakEngine(); 147 148 protected: 149 /** 150 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 151 * 152 * @param text A UText representing the text 153 * @param rangeStart The start of the range of dictionary characters 154 * @param rangeEnd The end of the range of dictionary characters 155 * @param foundBreaks Output of C array of int32_t break positions, or 0 156 * @return The number of breaks found 157 */ 158 virtual int32_t divideUpDictionaryRange( UText *text, 159 int32_t rangeStart, 160 int32_t rangeEnd, 161 UVector32 &foundBreaks ) const; 162 163 }; 164 165 /******************************************************************* 166 * LaoBreakEngine 167 */ 168 169 /** 170 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 171 * dictionary and heuristics to determine Lao-specific breaks.</p> 172 * 173 * <p>After it is constructed a LaoBreakEngine may be shared between 174 * threads without synchronization.</p> 175 */ 176 class LaoBreakEngine : public DictionaryBreakEngine { 177 private: 178 /** 179 * The set of characters handled by this engine 180 * @internal 181 */ 182 183 UnicodeSet fLaoWordSet; 184 UnicodeSet fEndWordSet; 185 UnicodeSet fBeginWordSet; 186 UnicodeSet fMarkSet; 187 DictionaryMatcher *fDictionary; 188 189 public: 190 191 /** 192 * <p>Default constructor.</p> 193 * 194 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 195 * engine is deleted. 196 */ 197 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 198 199 /** 200 * <p>Virtual destructor.</p> 201 */ 202 virtual ~LaoBreakEngine(); 203 204 protected: 205 /** 206 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 207 * 208 * @param text A UText representing the text 209 * @param rangeStart The start of the range of dictionary characters 210 * @param rangeEnd The end of the range of dictionary characters 211 * @param foundBreaks Output of C array of int32_t break positions, or 0 212 * @return The number of breaks found 213 */ 214 virtual int32_t divideUpDictionaryRange( UText *text, 215 int32_t rangeStart, 216 int32_t rangeEnd, 217 UVector32 &foundBreaks ) const; 218 219 }; 220 221 /******************************************************************* 222 * BurmeseBreakEngine 223 */ 224 225 /** 226 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 227 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 228 * 229 * <p>After it is constructed a BurmeseBreakEngine may be shared between 230 * threads without synchronization.</p> 231 */ 232 class BurmeseBreakEngine : public DictionaryBreakEngine { 233 private: 234 /** 235 * The set of characters handled by this engine 236 * @internal 237 */ 238 239 UnicodeSet fBurmeseWordSet; 240 UnicodeSet fEndWordSet; 241 UnicodeSet fBeginWordSet; 242 UnicodeSet fMarkSet; 243 DictionaryMatcher *fDictionary; 244 245 public: 246 247 /** 248 * <p>Default constructor.</p> 249 * 250 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 251 * engine is deleted. 252 */ 253 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 254 255 /** 256 * <p>Virtual destructor.</p> 257 */ 258 virtual ~BurmeseBreakEngine(); 259 260 protected: 261 /** 262 * <p>Divide up a range of known dictionary characters.</p> 263 * 264 * @param text A UText representing the text 265 * @param rangeStart The start of the range of dictionary characters 266 * @param rangeEnd The end of the range of dictionary characters 267 * @param foundBreaks Output of C array of int32_t break positions, or 0 268 * @return The number of breaks found 269 */ 270 virtual int32_t divideUpDictionaryRange( UText *text, 271 int32_t rangeStart, 272 int32_t rangeEnd, 273 UVector32 &foundBreaks ) const; 274 275 }; 276 277 /******************************************************************* 278 * KhmerBreakEngine 279 */ 280 281 /** 282 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 283 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 284 * 285 * <p>After it is constructed a KhmerBreakEngine may be shared between 286 * threads without synchronization.</p> 287 */ 288 class KhmerBreakEngine : public DictionaryBreakEngine { 289 private: 290 /** 291 * The set of characters handled by this engine 292 * @internal 293 */ 294 295 UnicodeSet fKhmerWordSet; 296 UnicodeSet fEndWordSet; 297 UnicodeSet fBeginWordSet; 298 UnicodeSet fMarkSet; 299 DictionaryMatcher *fDictionary; 300 301 public: 302 303 /** 304 * <p>Default constructor.</p> 305 * 306 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 307 * engine is deleted. 308 */ 309 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 310 311 /** 312 * <p>Virtual destructor.</p> 313 */ 314 virtual ~KhmerBreakEngine(); 315 316 protected: 317 /** 318 * <p>Divide up a range of known dictionary characters.</p> 319 * 320 * @param text A UText representing the text 321 * @param rangeStart The start of the range of dictionary characters 322 * @param rangeEnd The end of the range of dictionary characters 323 * @param foundBreaks Output of C array of int32_t break positions, or 0 324 * @return The number of breaks found 325 */ 326 virtual int32_t divideUpDictionaryRange( UText *text, 327 int32_t rangeStart, 328 int32_t rangeEnd, 329 UVector32 &foundBreaks ) const; 330 331 }; 332 333 #if !UCONFIG_NO_NORMALIZATION 334 335 /******************************************************************* 336 * CjkBreakEngine 337 */ 338 339 //indicates language/script that the CjkBreakEngine will handle 340 enum LanguageType { 341 kKorean, 342 kChineseJapanese 343 }; 344 345 /** 346 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 347 * dictionary with costs associated with each word and 348 * Viterbi decoding to determine CJK-specific breaks.</p> 349 */ 350 class CjkBreakEngine : public DictionaryBreakEngine { 351 protected: 352 /** 353 * The set of characters handled by this engine 354 * @internal 355 */ 356 UnicodeSet fHangulWordSet; 357 UnicodeSet fHanWordSet; 358 UnicodeSet fKatakanaWordSet; 359 UnicodeSet fHiraganaWordSet; 360 361 DictionaryMatcher *fDictionary; 362 const Normalizer2 *nfkcNorm2; 363 364 public: 365 366 /** 367 * <p>Default constructor.</p> 368 * 369 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 370 * engine is deleted. The DictionaryMatcher must contain costs for each word 371 * in order for the dictionary to work properly. 372 */ 373 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 374 375 /** 376 * <p>Virtual destructor.</p> 377 */ 378 virtual ~CjkBreakEngine(); 379 380 protected: 381 /** 382 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 383 * 384 * @param text A UText representing the text 385 * @param rangeStart The start of the range of dictionary characters 386 * @param rangeEnd The end of the range of dictionary characters 387 * @param foundBreaks Output of C array of int32_t break positions, or 0 388 * @return The number of breaks found 389 */ 390 virtual int32_t divideUpDictionaryRange( UText *text, 391 int32_t rangeStart, 392 int32_t rangeEnd, 393 UVector32 &foundBreaks ) const; 394 395 }; 396 397 #endif 398 399 U_NAMESPACE_END 400 401 /* DICTBE_H */ 402 #endif 403