1 /** 2 ************************************************************************************ 3 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 4 * All Rights Reserved. * 5 ************************************************************************************ 6 */ 7 8 #ifndef BRKENG_H 9 #define BRKENG_H 10 11 #include "unicode/utypes.h" 12 #include "unicode/uobject.h" 13 #include "unicode/utext.h" 14 #include "unicode/uscript.h" 15 16 U_NAMESPACE_BEGIN 17 18 class UnicodeSet; 19 class UStack; 20 class DictionaryMatcher; 21 22 /******************************************************************* 23 * LanguageBreakEngine 24 */ 25 26 /** 27 * <p>LanguageBreakEngines implement language-specific knowledge for 28 * finding text boundaries within a run of characters belonging to a 29 * specific set. The boundaries will be of a specific kind, e.g. word, 30 * line, etc.</p> 31 * 32 * <p>LanguageBreakEngines should normally be implemented so as to 33 * be shared between threads without locking.</p> 34 */ 35 class LanguageBreakEngine : public UMemory { 36 public: 37 38 /** 39 * <p>Default constructor.</p> 40 * 41 */ 42 LanguageBreakEngine(); 43 44 /** 45 * <p>Virtual destructor.</p> 46 */ 47 virtual ~LanguageBreakEngine(); 48 49 /** 50 * <p>Indicate whether this engine handles a particular character for 51 * a particular kind of break.</p> 52 * 53 * @param c A character which begins a run that the engine might handle 54 * @param breakType The type of text break which the caller wants to determine 55 * @return TRUE if this engine handles the particular character and break 56 * type. 57 */ 58 virtual UBool handles(UChar32 c, int32_t breakType) const = 0; 59 60 /** 61 * <p>Find any breaks within a run in the supplied text.</p> 62 * 63 * @param text A UText representing the text. The 64 * iterator is left at the end of the run of characters which the engine 65 * is capable of handling. 66 * @param startPos The start of the run within the supplied text. 67 * @param endPos The end of the run within the supplied text. 68 * @param reverse Whether the caller is looking for breaks in a reverse 69 * direction. 70 * @param breakType The type of break desired, or -1. 71 * @param foundBreaks An allocated C array of the breaks found, if any 72 * @return The number of breaks found. 73 */ 74 virtual int32_t findBreaks( UText *text, 75 int32_t startPos, 76 int32_t endPos, 77 UBool reverse, 78 int32_t breakType, 79 UStack &foundBreaks ) const = 0; 80 81 }; 82 83 /******************************************************************* 84 * LanguageBreakFactory 85 */ 86 87 /** 88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 89 * that can determine breaks for characters in a specific set, if 90 * such an object can be found.</p> 91 * 92 * <p>If a LanguageBreakFactory is to be shared between threads, 93 * appropriate synchronization must be used; there is none internal 94 * to the factory.</p> 95 * 96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 97 * normally be shared between threads without synchronization, unless 98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 99 * 100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 101 * it returns when it itself is deleted, unless the specific subclass of 102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should 103 * not be deleted until the LanguageBreakEngines it has returned are no 104 * longer needed.</p> 105 */ 106 class LanguageBreakFactory : public UMemory { 107 public: 108 109 /** 110 * <p>Default constructor.</p> 111 * 112 */ 113 LanguageBreakFactory(); 114 115 /** 116 * <p>Virtual destructor.</p> 117 */ 118 virtual ~LanguageBreakFactory(); 119 120 /** 121 * <p>Find and return a LanguageBreakEngine that can find the desired 122 * kind of break for the set of characters to which the supplied 123 * character belongs. It is up to the set of available engines to 124 * determine what the sets of characters are.</p> 125 * 126 * @param c A character that begins a run for which a LanguageBreakEngine is 127 * sought. 128 * @param breakType The kind of text break for which a LanguageBreakEngine is 129 * sought. 130 * @return A LanguageBreakEngine with the desired characteristics, or 0. 131 */ 132 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; 133 134 }; 135 136 /******************************************************************* 137 * UnhandledEngine 138 */ 139 140 /** 141 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 142 * handles characters that no other LanguageBreakEngine is available to 143 * handle. It is told the character and the type of break; at its 144 * discretion it may handle more than the specified character (e.g., 145 * the entire script to which that character belongs.</p> 146 * 147 * <p>UnhandledEngines may not be shared between threads without 148 * external synchronization.</p> 149 */ 150 151 class UnhandledEngine : public LanguageBreakEngine { 152 private: 153 154 /** 155 * The sets of characters handled, for each break type 156 * @internal 157 */ 158 159 UnicodeSet *fHandled[4]; 160 161 public: 162 163 /** 164 * <p>Default constructor.</p> 165 * 166 */ 167 UnhandledEngine(UErrorCode &status); 168 169 /** 170 * <p>Virtual destructor.</p> 171 */ 172 virtual ~UnhandledEngine(); 173 174 /** 175 * <p>Indicate whether this engine handles a particular character for 176 * a particular kind of break.</p> 177 * 178 * @param c A character which begins a run that the engine might handle 179 * @param breakType The type of text break which the caller wants to determine 180 * @return TRUE if this engine handles the particular character and break 181 * type. 182 */ 183 virtual UBool handles(UChar32 c, int32_t breakType) const; 184 185 /** 186 * <p>Find any breaks within a run in the supplied text.</p> 187 * 188 * @param text A UText representing the text (TODO: UText). The 189 * iterator is left at the end of the run of characters which the engine 190 * is capable of handling. 191 * @param startPos The start of the run within the supplied text. 192 * @param endPos The end of the run within the supplied text. 193 * @param reverse Whether the caller is looking for breaks in a reverse 194 * direction. 195 * @param breakType The type of break desired, or -1. 196 * @param foundBreaks An allocated C array of the breaks found, if any 197 * @return The number of breaks found. 198 */ 199 virtual int32_t findBreaks( UText *text, 200 int32_t startPos, 201 int32_t endPos, 202 UBool reverse, 203 int32_t breakType, 204 UStack &foundBreaks ) const; 205 206 /** 207 * <p>Tell the engine to handle a particular character and break type.</p> 208 * 209 * @param c A character which the engine should handle 210 * @param breakType The type of text break for which the engine should handle c 211 */ 212 virtual void handleCharacter(UChar32 c, int32_t breakType); 213 214 }; 215 216 /******************************************************************* 217 * ICULanguageBreakFactory 218 */ 219 220 /** 221 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 222 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 223 * data in the ICU data file.</p> 224 */ 225 class ICULanguageBreakFactory : public LanguageBreakFactory { 226 private: 227 228 /** 229 * The stack of break engines created by this factory 230 * @internal 231 */ 232 233 UStack *fEngines; 234 235 public: 236 237 /** 238 * <p>Standard constructor.</p> 239 * 240 */ 241 ICULanguageBreakFactory(UErrorCode &status); 242 243 /** 244 * <p>Virtual destructor.</p> 245 */ 246 virtual ~ICULanguageBreakFactory(); 247 248 /** 249 * <p>Find and return a LanguageBreakEngine that can find the desired 250 * kind of break for the set of characters to which the supplied 251 * character belongs. It is up to the set of available engines to 252 * determine what the sets of characters are.</p> 253 * 254 * @param c A character that begins a run for which a LanguageBreakEngine is 255 * sought. 256 * @param breakType The kind of text break for which a LanguageBreakEngine is 257 * sought. 258 * @return A LanguageBreakEngine with the desired characteristics, or 0. 259 */ 260 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); 261 262 protected: 263 /** 264 * <p>Create a LanguageBreakEngine for the set of characters to which 265 * the supplied character belongs, for the specified break type.</p> 266 * 267 * @param c A character that begins a run for which a LanguageBreakEngine is 268 * sought. 269 * @param breakType The kind of text break for which a LanguageBreakEngine is 270 * sought. 271 * @return A LanguageBreakEngine with the desired characteristics, or 0. 272 */ 273 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); 274 275 /** 276 * <p>Create a DictionaryMatcher for the specified script and break type.</p> 277 * @param script An ISO 15924 script code that identifies the dictionary to be 278 * created. 279 * @param breakType The kind of text break for which a dictionary is 280 * sought. 281 * @return A DictionaryMatcher with the desired characteristics, or NULL. 282 */ 283 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); 284 }; 285 286 U_NAMESPACE_END 287 288 /* BRKENG_H */ 289 #endif 290