1 /** 2 ******************************************************************************* 3 * Copyright (C) 2006,2011, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 #ifndef DICTBE_H 9 #define DICTBE_H 10 11 #include "unicode/utypes.h" 12 #include "unicode/uniset.h" 13 #include "unicode/utext.h" 14 15 #include "brkeng.h" 16 17 U_NAMESPACE_BEGIN 18 19 class TrieWordDictionary; 20 21 /******************************************************************* 22 * DictionaryBreakEngine 23 */ 24 25 /** 26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 27 * dictionary to determine language-specific breaks.</p> 28 * 29 * <p>After it is constructed a DictionaryBreakEngine may be shared between 30 * threads without synchronization.</p> 31 */ 32 class DictionaryBreakEngine : public LanguageBreakEngine { 33 private: 34 /** 35 * The set of characters handled by this engine 36 * @internal 37 */ 38 39 UnicodeSet fSet; 40 41 /** 42 * The set of break types handled by this engine 43 * @internal 44 */ 45 46 uint32_t fTypes; 47 48 /** 49 * <p>Default constructor.</p> 50 * 51 */ 52 DictionaryBreakEngine(); 53 54 public: 55 56 /** 57 * <p>Constructor setting the break types handled.</p> 58 * 59 * @param breakTypes A bitmap of types handled by the engine. 60 */ 61 DictionaryBreakEngine( uint32_t breakTypes ); 62 63 /** 64 * <p>Virtual destructor.</p> 65 */ 66 virtual ~DictionaryBreakEngine(); 67 68 /** 69 * <p>Indicate whether this engine handles a particular character for 70 * a particular kind of break.</p> 71 * 72 * @param c A character which begins a run that the engine might handle 73 * @param breakType The type of text break which the caller wants to determine 74 * @return TRUE if this engine handles the particular character and break 75 * type. 76 */ 77 virtual UBool handles( UChar32 c, int32_t breakType ) const; 78 79 /** 80 * <p>Find any breaks within a run in the supplied text.</p> 81 * 82 * @param text A UText representing the text. The 83 * iterator is left at the end of the run of characters which the engine 84 * is capable of handling. 85 * @param startPos The start of the run within the supplied text. 86 * @param endPos The end of the run within the supplied text. 87 * @param reverse Whether the caller is looking for breaks in a reverse 88 * direction. 89 * @param breakType The type of break desired, or -1. 90 * @param foundBreaks An allocated C array of the breaks found, if any 91 * @return The number of breaks found. 92 */ 93 virtual int32_t findBreaks( UText *text, 94 int32_t startPos, 95 int32_t endPos, 96 UBool reverse, 97 int32_t breakType, 98 UStack &foundBreaks ) const; 99 100 protected: 101 102 /** 103 * <p>Set the character set handled by this engine.</p> 104 * 105 * @param set A UnicodeSet of the set of characters handled by the engine 106 */ 107 virtual void setCharacters( const UnicodeSet &set ); 108 109 /** 110 * <p>Set the break types handled by this engine.</p> 111 * 112 * @param breakTypes A bitmap of types handled by the engine. 113 */ 114 // virtual void setBreakTypes( uint32_t breakTypes ); 115 116 /** 117 * <p>Divide up a range of known dictionary characters.</p> 118 * 119 * @param text A UText representing the text 120 * @param rangeStart The start of the range of dictionary characters 121 * @param rangeEnd The end of the range of dictionary characters 122 * @param foundBreaks Output of C array of int32_t break positions, or 0 123 * @return The number of breaks found 124 */ 125 virtual int32_t divideUpDictionaryRange( UText *text, 126 int32_t rangeStart, 127 int32_t rangeEnd, 128 UStack &foundBreaks ) const = 0; 129 130 }; 131 132 /******************************************************************* 133 * ThaiBreakEngine 134 */ 135 136 /** 137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 138 * TrieWordDictionary and heuristics to determine Thai-specific breaks.</p> 139 * 140 * <p>After it is constructed a ThaiBreakEngine may be shared between 141 * threads without synchronization.</p> 142 */ 143 class ThaiBreakEngine : public DictionaryBreakEngine { 144 private: 145 /** 146 * The set of characters handled by this engine 147 * @internal 148 */ 149 150 UnicodeSet fThaiWordSet; 151 UnicodeSet fEndWordSet; 152 UnicodeSet fBeginWordSet; 153 UnicodeSet fSuffixSet; 154 UnicodeSet fMarkSet; 155 const TrieWordDictionary *fDictionary; 156 157 public: 158 159 /** 160 * <p>Default constructor.</p> 161 * 162 * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the 163 * engine is deleted. 164 */ 165 ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status); 166 167 /** 168 * <p>Virtual destructor.</p> 169 */ 170 virtual ~ThaiBreakEngine(); 171 172 protected: 173 /** 174 * <p>Divide up a range of known dictionary characters.</p> 175 * 176 * @param text A UText representing the text 177 * @param rangeStart The start of the range of dictionary characters 178 * @param rangeEnd The end of the range of dictionary characters 179 * @param foundBreaks Output of C array of int32_t break positions, or 0 180 * @return The number of breaks found 181 */ 182 virtual int32_t divideUpDictionaryRange( UText *text, 183 int32_t rangeStart, 184 int32_t rangeEnd, 185 UStack &foundBreaks ) const; 186 187 }; 188 189 190 /******************************************************************* 191 * KhmerBreakEngine 192 */ 193 194 /** 195 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 196 * TrieWordDictionary and heuristics to determine Khmer-specific breaks.</p> 197 * 198 * <p>After it is constructed a KhmerBreakEngine may be shared between 199 * threads without synchronization.</p> 200 */ 201 class KhmerBreakEngine : public DictionaryBreakEngine { 202 private: 203 /** 204 * The set of characters handled by this engine 205 * @internal 206 */ 207 208 UnicodeSet fKhmerWordSet; 209 UnicodeSet fEndWordSet; 210 UnicodeSet fBeginWordSet; 211 UnicodeSet fMarkSet; 212 const TrieWordDictionary *fDictionary; 213 214 public: 215 216 /** 217 * <p>Default constructor.</p> 218 * 219 * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the 220 * engine is deleted. 221 */ 222 KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status); 223 224 /** 225 * <p>Virtual destructor.</p> 226 */ 227 virtual ~KhmerBreakEngine(); 228 229 protected: 230 /** 231 * <p>Divide up a range of known dictionary characters.</p> 232 * 233 * @param text A UText representing the text 234 * @param rangeStart The start of the range of dictionary characters 235 * @param rangeEnd The end of the range of dictionary characters 236 * @param foundBreaks Output of C array of int32_t break positions, or 0 237 * @return The number of breaks found 238 */ 239 virtual int32_t divideUpDictionaryRange( UText *text, 240 int32_t rangeStart, 241 int32_t rangeEnd, 242 UStack &foundBreaks ) const; 243 244 }; 245 246 247 U_NAMESPACE_END 248 249 /* DICTBE_H */ 250 #endif 251