1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ************************************************************************************ 5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 6 * All Rights Reserved. * 7 ************************************************************************************ 8 */ 9 10 #ifndef BRKENG_H 11 #define BRKENG_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uobject.h" 15 #include "unicode/utext.h" 16 #include "unicode/uscript.h" 17 18 U_NAMESPACE_BEGIN 19 20 class UnicodeSet; 21 class UStack; 22 class UVector32; 23 class DictionaryMatcher; 24 25 /******************************************************************* 26 * LanguageBreakEngine 27 */ 28 29 /** 30 * <p>LanguageBreakEngines implement language-specific knowledge for 31 * finding text boundaries within a run of characters belonging to a 32 * specific set. The boundaries will be of a specific kind, e.g. word, 33 * line, etc.</p> 34 * 35 * <p>LanguageBreakEngines should normally be implemented so as to 36 * be shared between threads without locking.</p> 37 */ 38 class LanguageBreakEngine : public UMemory { 39 public: 40 41 /** 42 * <p>Default constructor.</p> 43 * 44 */ 45 LanguageBreakEngine(); 46 47 /** 48 * <p>Virtual destructor.</p> 49 */ 50 virtual ~LanguageBreakEngine(); 51 52 /** 53 * <p>Indicate whether this engine handles a particular character for 54 * a particular kind of break.</p> 55 * 56 * @param c A character which begins a run that the engine might handle 57 * @return true if this engine handles the particular character and break 58 * type. 59 */ 60 virtual UBool handles(UChar32 c) const = 0; 61 62 /** 63 * <p>Find any breaks within a run in the supplied text.</p> 64 * 65 * @param text A UText representing the text. The 66 * iterator is left at the end of the run of characters which the engine 67 * is capable of handling. 68 * @param startPos The start of the run within the supplied text. 69 * @param endPos The end of the run within the supplied text. 70 * @param foundBreaks A Vector of int32_t to receive the breaks. 71 * @param status Information on any errors encountered. 72 * @return The number of breaks found. 73 */ 74 virtual int32_t findBreaks( UText *text, 75 int32_t startPos, 76 int32_t endPos, 77 UVector32 &foundBreaks, 78 UErrorCode &status) const = 0; 79 80 }; 81 82 /******************************************************************* 83 * LanguageBreakFactory 84 */ 85 86 /** 87 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 88 * that can determine breaks for characters in a specific set, if 89 * such an object can be found.</p> 90 * 91 * <p>If a LanguageBreakFactory is to be shared between threads, 92 * appropriate synchronization must be used; there is none internal 93 * to the factory.</p> 94 * 95 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 96 * normally be shared between threads without synchronization, unless 97 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 98 * 99 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 100 * it returns when it itself is deleted, unless the specific subclass of 101 * LanguageBreakFactory indicates otherwise. Naturally, the factory should 102 * not be deleted until the LanguageBreakEngines it has returned are no 103 * longer needed.</p> 104 */ 105 class LanguageBreakFactory : public UMemory { 106 public: 107 108 /** 109 * <p>Default constructor.</p> 110 * 111 */ 112 LanguageBreakFactory(); 113 114 /** 115 * <p>Virtual destructor.</p> 116 */ 117 virtual ~LanguageBreakFactory(); 118 119 /** 120 * <p>Find and return a LanguageBreakEngine that can find the desired 121 * kind of break for the set of characters to which the supplied 122 * character belongs. It is up to the set of available engines to 123 * determine what the sets of characters are.</p> 124 * 125 * @param c A character that begins a run for which a LanguageBreakEngine is 126 * sought. 127 * @return A LanguageBreakEngine with the desired characteristics, or 0. 128 */ 129 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0; 130 131 }; 132 133 /******************************************************************* 134 * UnhandledEngine 135 */ 136 137 /** 138 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 139 * handles characters that no other LanguageBreakEngine is available to 140 * handle. It is told the character and the type of break; at its 141 * discretion it may handle more than the specified character (e.g., 142 * the entire script to which that character belongs.</p> 143 * 144 * <p>UnhandledEngines may not be shared between threads without 145 * external synchronization.</p> 146 */ 147 148 class UnhandledEngine : public LanguageBreakEngine { 149 private: 150 151 /** 152 * The sets of characters handled. 153 * @internal 154 */ 155 156 UnicodeSet *fHandled; 157 158 public: 159 160 /** 161 * <p>Default constructor.</p> 162 * 163 */ 164 UnhandledEngine(UErrorCode &status); 165 166 /** 167 * <p>Virtual destructor.</p> 168 */ 169 virtual ~UnhandledEngine(); 170 171 /** 172 * <p>Indicate whether this engine handles a particular character for 173 * a particular kind of break.</p> 174 * 175 * @param c A character which begins a run that the engine might handle 176 * @return true if this engine handles the particular character and break 177 * type. 178 */ 179 virtual UBool handles(UChar32 c) const override; 180 181 /** 182 * <p>Find any breaks within a run in the supplied text.</p> 183 * 184 * @param text A UText representing the text (TODO: UText). The 185 * iterator is left at the end of the run of characters which the engine 186 * is capable of handling. 187 * @param startPos The start of the run within the supplied text. 188 * @param endPos The end of the run within the supplied text. 189 * @param foundBreaks An allocated C array of the breaks found, if any 190 * @param status Information on any errors encountered. 191 * @return The number of breaks found. 192 */ 193 virtual int32_t findBreaks( UText *text, 194 int32_t startPos, 195 int32_t endPos, 196 UVector32 &foundBreaks, 197 UErrorCode &status) const override; 198 199 /** 200 * <p>Tell the engine to handle a particular character and break type.</p> 201 * 202 * @param c A character which the engine should handle 203 */ 204 virtual void handleCharacter(UChar32 c); 205 206 }; 207 208 /******************************************************************* 209 * ICULanguageBreakFactory 210 */ 211 212 /** 213 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 214 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 215 * data in the ICU data file.</p> 216 */ 217 class ICULanguageBreakFactory : public LanguageBreakFactory { 218 private: 219 220 /** 221 * The stack of break engines created by this factory 222 * @internal 223 */ 224 225 UStack *fEngines; 226 227 public: 228 229 /** 230 * <p>Standard constructor.</p> 231 * 232 */ 233 ICULanguageBreakFactory(UErrorCode &status); 234 235 /** 236 * <p>Virtual destructor.</p> 237 */ 238 virtual ~ICULanguageBreakFactory(); 239 240 /** 241 * <p>Find and return a LanguageBreakEngine that can find the desired 242 * kind of break for the set of characters to which the supplied 243 * character belongs. It is up to the set of available engines to 244 * determine what the sets of characters are.</p> 245 * 246 * @param c A character that begins a run for which a LanguageBreakEngine is 247 * sought. 248 * @return A LanguageBreakEngine with the desired characteristics, or 0. 249 */ 250 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override; 251 252 protected: 253 /** 254 * <p>Create a LanguageBreakEngine for the set of characters to which 255 * the supplied character belongs, for the specified break type.</p> 256 * 257 * @param c A character that begins a run for which a LanguageBreakEngine is 258 * sought. 259 * @return A LanguageBreakEngine with the desired characteristics, or 0. 260 */ 261 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c); 262 263 /** 264 * <p>Create a DictionaryMatcher for the specified script and break type.</p> 265 * @param script An ISO 15924 script code that identifies the dictionary to be 266 * created. 267 * @return A DictionaryMatcher with the desired characteristics, or NULL. 268 */ 269 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script); 270 }; 271 272 U_NAMESPACE_END 273 274 /* BRKENG_H */ 275 #endif 276