1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ************************************************************************************ 5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 6 * All Rights Reserved. * 7 ************************************************************************************ 8 */ 9 10 #ifndef BRKENG_H 11 #define BRKENG_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uobject.h" 15 #include "unicode/utext.h" 16 #include "unicode/uscript.h" 17 18 U_NAMESPACE_BEGIN 19 20 class UnicodeSet; 21 class UStack; 22 class UVector32; 23 class DictionaryMatcher; 24 25 /******************************************************************* 26 * LanguageBreakEngine 27 */ 28 29 /** 30 * <p>LanguageBreakEngines implement language-specific knowledge for 31 * finding text boundaries within a run of characters belonging to a 32 * specific set. The boundaries will be of a specific kind, e.g. word, 33 * line, etc.</p> 34 * 35 * <p>LanguageBreakEngines should normally be implemented so as to 36 * be shared between threads without locking.</p> 37 */ 38 class LanguageBreakEngine : public UMemory { 39 public: 40 41 /** 42 * <p>Default constructor.</p> 43 * 44 */ 45 LanguageBreakEngine(); 46 47 /** 48 * <p>Virtual destructor.</p> 49 */ 50 virtual ~LanguageBreakEngine(); 51 52 /** 53 * <p>Indicate whether this engine handles a particular character for 54 * a particular kind of break.</p> 55 * 56 * @param c A character which begins a run that the engine might handle 57 * @param breakType The type of text break which the caller wants to determine 58 * @return TRUE if this engine handles the particular character and break 59 * type. 60 */ 61 virtual UBool handles(UChar32 c, int32_t breakType) const = 0; 62 63 /** 64 * <p>Find any breaks within a run in the supplied text.</p> 65 * 66 * @param text A UText representing the text. The 67 * iterator is left at the end of the run of characters which the engine 68 * is capable of handling. 69 * @param startPos The start of the run within the supplied text. 70 * @param endPos The end of the run within the supplied text. 71 * @param breakType The type of break desired, or -1. 72 * @param foundBreaks A Vector of int32_t to receive the breaks. 73 * @return The number of breaks found. 74 */ 75 virtual int32_t findBreaks( UText *text, 76 int32_t startPos, 77 int32_t endPos, 78 int32_t breakType, 79 UVector32 &foundBreaks ) const = 0; 80 81 }; 82 83 /******************************************************************* 84 * LanguageBreakFactory 85 */ 86 87 /** 88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 89 * that can determine breaks for characters in a specific set, if 90 * such an object can be found.</p> 91 * 92 * <p>If a LanguageBreakFactory is to be shared between threads, 93 * appropriate synchronization must be used; there is none internal 94 * to the factory.</p> 95 * 96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 97 * normally be shared between threads without synchronization, unless 98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 99 * 100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 101 * it returns when it itself is deleted, unless the specific subclass of 102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should 103 * not be deleted until the LanguageBreakEngines it has returned are no 104 * longer needed.</p> 105 */ 106 class LanguageBreakFactory : public UMemory { 107 public: 108 109 /** 110 * <p>Default constructor.</p> 111 * 112 */ 113 LanguageBreakFactory(); 114 115 /** 116 * <p>Virtual destructor.</p> 117 */ 118 virtual ~LanguageBreakFactory(); 119 120 /** 121 * <p>Find and return a LanguageBreakEngine that can find the desired 122 * kind of break for the set of characters to which the supplied 123 * character belongs. It is up to the set of available engines to 124 * determine what the sets of characters are.</p> 125 * 126 * @param c A character that begins a run for which a LanguageBreakEngine is 127 * sought. 128 * @param breakType The kind of text break for which a LanguageBreakEngine is 129 * sought. 130 * @return A LanguageBreakEngine with the desired characteristics, or 0. 131 */ 132 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; 133 134 }; 135 136 /******************************************************************* 137 * UnhandledEngine 138 */ 139 140 /** 141 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 142 * handles characters that no other LanguageBreakEngine is available to 143 * handle. It is told the character and the type of break; at its 144 * discretion it may handle more than the specified character (e.g., 145 * the entire script to which that character belongs.</p> 146 * 147 * <p>UnhandledEngines may not be shared between threads without 148 * external synchronization.</p> 149 */ 150 151 class UnhandledEngine : public LanguageBreakEngine { 152 private: 153 154 /** 155 * The sets of characters handled, for each break type 156 * @internal 157 */ 158 159 UnicodeSet *fHandled[4]; 160 161 public: 162 163 /** 164 * <p>Default constructor.</p> 165 * 166 */ 167 UnhandledEngine(UErrorCode &status); 168 169 /** 170 * <p>Virtual destructor.</p> 171 */ 172 virtual ~UnhandledEngine(); 173 174 /** 175 * <p>Indicate whether this engine handles a particular character for 176 * a particular kind of break.</p> 177 * 178 * @param c A character which begins a run that the engine might handle 179 * @param breakType The type of text break which the caller wants to determine 180 * @return TRUE if this engine handles the particular character and break 181 * type. 182 */ 183 virtual UBool handles(UChar32 c, int32_t breakType) const; 184 185 /** 186 * <p>Find any breaks within a run in the supplied text.</p> 187 * 188 * @param text A UText representing the text (TODO: UText). The 189 * iterator is left at the end of the run of characters which the engine 190 * is capable of handling. 191 * @param startPos The start of the run within the supplied text. 192 * @param endPos The end of the run within the supplied text. 193 * @param breakType The type of break desired, or -1. 194 * @param foundBreaks An allocated C array of the breaks found, if any 195 * @return The number of breaks found. 196 */ 197 virtual int32_t findBreaks( UText *text, 198 int32_t startPos, 199 int32_t endPos, 200 int32_t breakType, 201 UVector32 &foundBreaks ) const; 202 203 /** 204 * <p>Tell the engine to handle a particular character and break type.</p> 205 * 206 * @param c A character which the engine should handle 207 * @param breakType The type of text break for which the engine should handle c 208 */ 209 virtual void handleCharacter(UChar32 c, int32_t breakType); 210 211 }; 212 213 /******************************************************************* 214 * ICULanguageBreakFactory 215 */ 216 217 /** 218 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 219 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 220 * data in the ICU data file.</p> 221 */ 222 class ICULanguageBreakFactory : public LanguageBreakFactory { 223 private: 224 225 /** 226 * The stack of break engines created by this factory 227 * @internal 228 */ 229 230 UStack *fEngines; 231 232 public: 233 234 /** 235 * <p>Standard constructor.</p> 236 * 237 */ 238 ICULanguageBreakFactory(UErrorCode &status); 239 240 /** 241 * <p>Virtual destructor.</p> 242 */ 243 virtual ~ICULanguageBreakFactory(); 244 245 /** 246 * <p>Find and return a LanguageBreakEngine that can find the desired 247 * kind of break for the set of characters to which the supplied 248 * character belongs. It is up to the set of available engines to 249 * determine what the sets of characters are.</p> 250 * 251 * @param c A character that begins a run for which a LanguageBreakEngine is 252 * sought. 253 * @param breakType The kind of text break for which a LanguageBreakEngine is 254 * sought. 255 * @return A LanguageBreakEngine with the desired characteristics, or 0. 256 */ 257 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); 258 259 protected: 260 /** 261 * <p>Create a LanguageBreakEngine for the set of characters to which 262 * the supplied character belongs, for the specified break type.</p> 263 * 264 * @param c A character that begins a run for which a LanguageBreakEngine is 265 * sought. 266 * @param breakType The kind of text break for which a LanguageBreakEngine is 267 * sought. 268 * @return A LanguageBreakEngine with the desired characteristics, or 0. 269 */ 270 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); 271 272 /** 273 * <p>Create a DictionaryMatcher for the specified script and break type.</p> 274 * @param script An ISO 15924 script code that identifies the dictionary to be 275 * created. 276 * @param breakType The kind of text break for which a dictionary is 277 * sought. 278 * @return A DictionaryMatcher with the desired characteristics, or NULL. 279 */ 280 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); 281 }; 282 283 U_NAMESPACE_END 284 285 /* BRKENG_H */ 286 #endif 287