1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ************************************************************************************ 5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 6 * All Rights Reserved. * 7 ************************************************************************************ 8 */ 9 10 #ifndef BRKENG_H 11 #define BRKENG_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uobject.h" 15 #include "unicode/utext.h" 16 #include "unicode/uscript.h" 17 18 U_NAMESPACE_BEGIN 19 20 class UnicodeSet; 21 class UStack; 22 class UVector32; 23 class DictionaryMatcher; 24 25 /******************************************************************* 26 * LanguageBreakEngine 27 */ 28 29 /** 30 * <p>LanguageBreakEngines implement language-specific knowledge for 31 * finding text boundaries within a run of characters belonging to a 32 * specific set. The boundaries will be of a specific kind, e.g. word, 33 * line, etc.</p> 34 * 35 * <p>LanguageBreakEngines should normally be implemented so as to 36 * be shared between threads without locking.</p> 37 */ 38 class LanguageBreakEngine : public UMemory { 39 public: 40 41 /** 42 * <p>Default constructor.</p> 43 * 44 */ 45 LanguageBreakEngine(); 46 47 /** 48 * <p>Virtual destructor.</p> 49 */ 50 virtual ~LanguageBreakEngine(); 51 52 /** 53 * <p>Indicate whether this engine handles a particular character for 54 * a particular kind of break.</p> 55 * 56 * @param c A character which begins a run that the engine might handle 57 * @return true if this engine handles the particular character and break 58 * type. 59 */ 60 virtual UBool handles(UChar32 c) const = 0; 61 62 /** 63 * <p>Find any breaks within a run in the supplied text.</p> 64 * 65 * @param text A UText representing the text. The 66 * iterator is left at the end of the run of characters which the engine 67 * is capable of handling. 68 * @param startPos The start of the run within the supplied text. 69 * @param endPos The end of the run within the supplied text. 70 * @param foundBreaks A Vector of int32_t to receive the breaks. 71 * @param status Information on any errors encountered. 72 * @return The number of breaks found. 73 */ 74 virtual int32_t findBreaks( UText *text, 75 int32_t startPos, 76 int32_t endPos, 77 UVector32 &foundBreaks, 78 UBool isPhraseBreaking, 79 UErrorCode &status) const = 0; 80 81 }; 82 83 /******************************************************************* 84 * LanguageBreakFactory 85 */ 86 87 /** 88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 89 * that can determine breaks for characters in a specific set, if 90 * such an object can be found.</p> 91 * 92 * <p>If a LanguageBreakFactory is to be shared between threads, 93 * appropriate synchronization must be used; there is none internal 94 * to the factory.</p> 95 * 96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 97 * normally be shared between threads without synchronization, unless 98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 99 * 100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 101 * it returns when it itself is deleted, unless the specific subclass of 102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should 103 * not be deleted until the LanguageBreakEngines it has returned are no 104 * longer needed.</p> 105 */ 106 class LanguageBreakFactory : public UMemory { 107 public: 108 109 /** 110 * <p>Default constructor.</p> 111 * 112 */ 113 LanguageBreakFactory(); 114 115 /** 116 * <p>Virtual destructor.</p> 117 */ 118 virtual ~LanguageBreakFactory(); 119 120 /** 121 * <p>Find and return a LanguageBreakEngine that can find the desired 122 * kind of break for the set of characters to which the supplied 123 * character belongs. It is up to the set of available engines to 124 * determine what the sets of characters are.</p> 125 * 126 * @param c A character that begins a run for which a LanguageBreakEngine is 127 * sought. 128 * @return A LanguageBreakEngine with the desired characteristics, or 0. 129 */ 130 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0; 131 132 }; 133 134 /******************************************************************* 135 * UnhandledEngine 136 */ 137 138 /** 139 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 140 * handles characters that no other LanguageBreakEngine is available to 141 * handle. It is told the character and the type of break; at its 142 * discretion it may handle more than the specified character (e.g., 143 * the entire script to which that character belongs.</p> 144 * 145 * <p>UnhandledEngines may not be shared between threads without 146 * external synchronization.</p> 147 */ 148 149 class UnhandledEngine : public LanguageBreakEngine { 150 private: 151 152 /** 153 * The sets of characters handled. 154 * @internal 155 */ 156 157 UnicodeSet *fHandled; 158 159 public: 160 161 /** 162 * <p>Default constructor.</p> 163 * 164 */ 165 UnhandledEngine(UErrorCode &status); 166 167 /** 168 * <p>Virtual destructor.</p> 169 */ 170 virtual ~UnhandledEngine(); 171 172 /** 173 * <p>Indicate whether this engine handles a particular character for 174 * a particular kind of break.</p> 175 * 176 * @param c A character which begins a run that the engine might handle 177 * @return true if this engine handles the particular character and break 178 * type. 179 */ 180 virtual UBool handles(UChar32 c) const override; 181 182 /** 183 * <p>Find any breaks within a run in the supplied text.</p> 184 * 185 * @param text A UText representing the text (TODO: UText). The 186 * iterator is left at the end of the run of characters which the engine 187 * is capable of handling. 188 * @param startPos The start of the run within the supplied text. 189 * @param endPos The end of the run within the supplied text. 190 * @param foundBreaks An allocated C array of the breaks found, if any 191 * @param status Information on any errors encountered. 192 * @return The number of breaks found. 193 */ 194 virtual int32_t findBreaks( UText *text, 195 int32_t startPos, 196 int32_t endPos, 197 UVector32 &foundBreaks, 198 UBool isPhraseBreaking, 199 UErrorCode &status) const override; 200 201 /** 202 * <p>Tell the engine to handle a particular character and break type.</p> 203 * 204 * @param c A character which the engine should handle 205 */ 206 virtual void handleCharacter(UChar32 c); 207 208 }; 209 210 /******************************************************************* 211 * ICULanguageBreakFactory 212 */ 213 214 /** 215 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 216 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 217 * data in the ICU data file.</p> 218 */ 219 class ICULanguageBreakFactory : public LanguageBreakFactory { 220 private: 221 222 /** 223 * The stack of break engines created by this factory 224 * @internal 225 */ 226 227 UStack *fEngines; 228 229 public: 230 231 /** 232 * <p>Standard constructor.</p> 233 * 234 */ 235 ICULanguageBreakFactory(UErrorCode &status); 236 237 /** 238 * <p>Virtual destructor.</p> 239 */ 240 virtual ~ICULanguageBreakFactory(); 241 242 /** 243 * <p>Find and return a LanguageBreakEngine that can find the desired 244 * kind of break for the set of characters to which the supplied 245 * character belongs. It is up to the set of available engines to 246 * determine what the sets of characters are.</p> 247 * 248 * @param c A character that begins a run for which a LanguageBreakEngine is 249 * sought. 250 * @return A LanguageBreakEngine with the desired characteristics, or 0. 251 */ 252 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override; 253 254 protected: 255 /** 256 * <p>Create a LanguageBreakEngine for the set of characters to which 257 * the supplied character belongs, for the specified break type.</p> 258 * 259 * @param c A character that begins a run for which a LanguageBreakEngine is 260 * sought. 261 * @return A LanguageBreakEngine with the desired characteristics, or 0. 262 */ 263 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c); 264 265 /** 266 * <p>Create a DictionaryMatcher for the specified script and break type.</p> 267 * @param script An ISO 15924 script code that identifies the dictionary to be 268 * created. 269 * @return A DictionaryMatcher with the desired characteristics, or nullptr. 270 */ 271 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script); 272 }; 273 274 U_NAMESPACE_END 275 276 /* BRKENG_H */ 277 #endif 278