1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ************************************************************************************ 5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 6 * All Rights Reserved. * 7 ************************************************************************************ 8 */ 9 10 #ifndef BRKENG_H 11 #define BRKENG_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uobject.h" 15 #include "unicode/utext.h" 16 #include "unicode/uscript.h" 17 18 U_NAMESPACE_BEGIN 19 20 class UnicodeSet; 21 class UStack; 22 class UVector32; 23 class DictionaryMatcher; 24 25 /******************************************************************* 26 * LanguageBreakEngine 27 */ 28 29 /** 30 * <p>LanguageBreakEngines implement language-specific knowledge for 31 * finding text boundaries within a run of characters belonging to a 32 * specific set. The boundaries will be of a specific kind, e.g. word, 33 * line, etc.</p> 34 * 35 * <p>LanguageBreakEngines should normally be implemented so as to 36 * be shared between threads without locking.</p> 37 */ 38 class LanguageBreakEngine : public UMemory { 39 public: 40 41 /** 42 * <p>Default constructor.</p> 43 * 44 */ 45 LanguageBreakEngine(); 46 47 /** 48 * <p>Virtual destructor.</p> 49 */ 50 virtual ~LanguageBreakEngine(); 51 52 /** 53 * <p>Indicate whether this engine handles a particular character for 54 * a particular kind of break.</p> 55 * 56 * @param c A character which begins a run that the engine might handle 57 * @return true if this engine handles the particular character and break 58 * type. 59 */ 60 virtual UBool handles(UChar32 c) const = 0; 61 62 /** 63 * <p>Find any breaks within a run in the supplied text.</p> 64 * 65 * @param text A UText representing the text. The 66 * iterator is left at the end of the run of characters which the engine 67 * is capable of handling. 68 * @param startPos The start of the run within the supplied text. 69 * @param endPos The end of the run within the supplied text. 70 * @param foundBreaks A Vector of int32_t to receive the breaks. 71 * @return The number of breaks found. 72 */ 73 virtual int32_t findBreaks( UText *text, 74 int32_t startPos, 75 int32_t endPos, 76 UVector32 &foundBreaks ) const = 0; 77 78 }; 79 80 /******************************************************************* 81 * LanguageBreakFactory 82 */ 83 84 /** 85 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 86 * that can determine breaks for characters in a specific set, if 87 * such an object can be found.</p> 88 * 89 * <p>If a LanguageBreakFactory is to be shared between threads, 90 * appropriate synchronization must be used; there is none internal 91 * to the factory.</p> 92 * 93 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 94 * normally be shared between threads without synchronization, unless 95 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 96 * 97 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 98 * it returns when it itself is deleted, unless the specific subclass of 99 * LanguageBreakFactory indicates otherwise. Naturally, the factory should 100 * not be deleted until the LanguageBreakEngines it has returned are no 101 * longer needed.</p> 102 */ 103 class LanguageBreakFactory : public UMemory { 104 public: 105 106 /** 107 * <p>Default constructor.</p> 108 * 109 */ 110 LanguageBreakFactory(); 111 112 /** 113 * <p>Virtual destructor.</p> 114 */ 115 virtual ~LanguageBreakFactory(); 116 117 /** 118 * <p>Find and return a LanguageBreakEngine that can find the desired 119 * kind of break for the set of characters to which the supplied 120 * character belongs. It is up to the set of available engines to 121 * determine what the sets of characters are.</p> 122 * 123 * @param c A character that begins a run for which a LanguageBreakEngine is 124 * sought. 125 * @return A LanguageBreakEngine with the desired characteristics, or 0. 126 */ 127 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0; 128 129 }; 130 131 /******************************************************************* 132 * UnhandledEngine 133 */ 134 135 /** 136 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 137 * handles characters that no other LanguageBreakEngine is available to 138 * handle. It is told the character and the type of break; at its 139 * discretion it may handle more than the specified character (e.g., 140 * the entire script to which that character belongs.</p> 141 * 142 * <p>UnhandledEngines may not be shared between threads without 143 * external synchronization.</p> 144 */ 145 146 class UnhandledEngine : public LanguageBreakEngine { 147 private: 148 149 /** 150 * The sets of characters handled. 151 * @internal 152 */ 153 154 UnicodeSet *fHandled; 155 156 public: 157 158 /** 159 * <p>Default constructor.</p> 160 * 161 */ 162 UnhandledEngine(UErrorCode &status); 163 164 /** 165 * <p>Virtual destructor.</p> 166 */ 167 virtual ~UnhandledEngine(); 168 169 /** 170 * <p>Indicate whether this engine handles a particular character for 171 * a particular kind of break.</p> 172 * 173 * @param c A character which begins a run that the engine might handle 174 * @return true if this engine handles the particular character and break 175 * type. 176 */ 177 virtual UBool handles(UChar32 c) const; 178 179 /** 180 * <p>Find any breaks within a run in the supplied text.</p> 181 * 182 * @param text A UText representing the text (TODO: UText). The 183 * iterator is left at the end of the run of characters which the engine 184 * is capable of handling. 185 * @param startPos The start of the run within the supplied text. 186 * @param endPos The end of the run within the supplied text. 187 * @param foundBreaks An allocated C array of the breaks found, if any 188 * @return The number of breaks found. 189 */ 190 virtual int32_t findBreaks( UText *text, 191 int32_t startPos, 192 int32_t endPos, 193 UVector32 &foundBreaks ) const; 194 195 /** 196 * <p>Tell the engine to handle a particular character and break type.</p> 197 * 198 * @param c A character which the engine should handle 199 */ 200 virtual void handleCharacter(UChar32 c); 201 202 }; 203 204 /******************************************************************* 205 * ICULanguageBreakFactory 206 */ 207 208 /** 209 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 210 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 211 * data in the ICU data file.</p> 212 */ 213 class ICULanguageBreakFactory : public LanguageBreakFactory { 214 private: 215 216 /** 217 * The stack of break engines created by this factory 218 * @internal 219 */ 220 221 UStack *fEngines; 222 223 public: 224 225 /** 226 * <p>Standard constructor.</p> 227 * 228 */ 229 ICULanguageBreakFactory(UErrorCode &status); 230 231 /** 232 * <p>Virtual destructor.</p> 233 */ 234 virtual ~ICULanguageBreakFactory(); 235 236 /** 237 * <p>Find and return a LanguageBreakEngine that can find the desired 238 * kind of break for the set of characters to which the supplied 239 * character belongs. It is up to the set of available engines to 240 * determine what the sets of characters are.</p> 241 * 242 * @param c A character that begins a run for which a LanguageBreakEngine is 243 * sought. 244 * @return A LanguageBreakEngine with the desired characteristics, or 0. 245 */ 246 virtual const LanguageBreakEngine *getEngineFor(UChar32 c); 247 248 protected: 249 /** 250 * <p>Create a LanguageBreakEngine for the set of characters to which 251 * the supplied character belongs, for the specified break type.</p> 252 * 253 * @param c A character that begins a run for which a LanguageBreakEngine is 254 * sought. 255 * @return A LanguageBreakEngine with the desired characteristics, or 0. 256 */ 257 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c); 258 259 /** 260 * <p>Create a DictionaryMatcher for the specified script and break type.</p> 261 * @param script An ISO 15924 script code that identifies the dictionary to be 262 * created. 263 * @return A DictionaryMatcher with the desired characteristics, or NULL. 264 */ 265 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script); 266 }; 267 268 U_NAMESPACE_END 269 270 /* BRKENG_H */ 271 #endif 272