// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**************************************************************************
*   Copyright (c) 2002-2010, International Business Machines Corporation *
*   and others.  All Rights Reserved.                                    *
**************************************************************************
*   Date        Name        Description                                  *
*   01/28/2002  aliu        Creation.                                    *
**************************************************************************
*/
#ifndef TRIDPARS_H
#define TRIDPARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class Transliterator;
class UnicodeSet;
class UVector;

/**
 * Parsing component for transliterator IDs.  This class contains only
 * static members; it cannot be instantiated.  Methods in this class
 * parse various ID formats, including the following:
 *
 * A basic ID, which contains source, target, and variant, but no
 * filter and no explicit inverse.  Examples include
 * "Latin-Greek/UNGEGN" and "Null".
 *
 * A single ID, which is a basic ID plus optional filter and optional
 * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
 * "Lower (Upper)".
 *
 * A compound ID, which is a sequence of one or more single IDs,
 * separated by semicolons, with optional forward and reverse global
 * filters.  The global filters are UnicodeSet patterns prepended or
 * appended to the IDs, separated by semicolons.  An appended filter
 * must be enclosed in parentheses and applies in the reverse
 * direction.
 *
 * @author Alan Liu
 */
class TransliteratorIDParser /* not : public UObject because all methods are static */ {

 public:

    /**
     * A structure containing the parsed data of a filtered ID, that
     * is, a basic ID optionally with a filter.
     *
     * 'source' and 'target' will always be non-null.  The 'variant'
     * will be non-null only if a non-empty variant was parsed.
     *
     * 'sawSource' is true if there was an explicit source in the
     * parsed id.  If there was no explicit source, then an implied
     * source of ANY is returned and 'sawSource' is set to false.
     *
     * 'filter' is the parsed filter pattern, or null if there was no
     * filter.
     *
     * NOTE(review): the string fields are UnicodeString values, not
     * pointers, so "null" above presumably means an empty or bogus
     * string -- confirm against the implementation.
     */
    class Specs : public UMemory {
    public:
        UnicodeString source; // not null
        UnicodeString target; // not null
        UnicodeString variant; // may be null
        UnicodeString filter; // may be null
        UBool sawSource;
        Specs(const UnicodeString& s, const UnicodeString& t,
              const UnicodeString& v, UBool sawS,
              const UnicodeString& f);

    private:

        // Copy construction and assignment are declared private to
        // forbid copying of this class.
        Specs(const Specs &other); // forbid copying of this class
        Specs &operator=(const Specs &other); // forbid copying of this class
    };

    /**
     * A structure containing the canonicalized data of a filtered ID,
     * that is, a basic ID optionally with a filter.
     *
     * 'canonID' is always non-null.  It may be the empty string "".
     * It is the id that should be assigned to the created
     * transliterator.  It _cannot_ be instantiated directly.
     *
     * 'basicID' is always non-null and non-empty.  It is always of
     * the form S-T or S-T/V.  It is designed to be fed to low-level
     * instantiation code that only understands these two formats.
     *
     * 'filter' may be null, if there is none, or non-null and
     * non-empty.
     *
     * NOTE(review): as with Specs, the fields are UnicodeString values;
     * how a "null" filter is represented is not visible in this
     * header -- confirm against the implementation.
     */
    class SingleID : public UMemory {
    public:
        UnicodeString canonID;
        UnicodeString basicID;
        UnicodeString filter;
        SingleID(const UnicodeString& c, const UnicodeString& b,
                 const UnicodeString& f);
        SingleID(const UnicodeString& c, const UnicodeString& b);
        Transliterator* createInstance();

    private:

        // Copy construction and assignment are declared private to
        // forbid copying of this class.
        SingleID(const SingleID &other); // forbid copying of this class
        SingleID &operator=(const SingleID &other); // forbid copying of this class
    };

    /**
     * Parse a filter ID, that is, an ID of the general form
     * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @return a SingleID object or null if the parse fails
     */
    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);

    /**
     * Parse a single ID, that is, an ID of the general form
     * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
     * optional, the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.  If the direction is REVERSE then the
     * SingleID is constructed for the reverse direction.
     * @param status error code parameter.  NOTE(review): the conditions
     * under which it is set are not visible in this header -- see the
     * implementation.
     * @return a SingleID object or null
     */
    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
                                  int32_t dir, UErrorCode& status);

    /**
     * Parse a global filter of the form "[f]" or "([f])", depending
     * on 'withParens'.
     * @param id the pattern to parse
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.
     * @param withParens INPUT-OUTPUT parameter.  On entry, if
     * withParens is 0, then parens are disallowed.  If it is 1,
     * then parens are required.  If it is -1, then parens are
     * optional, and the return result will be set to 0 or 1.
     * @param canonID OUTPUT parameter.  The pattern for the filter
     * added to the canonID, either at the end, if dir is FORWARD, or
     * at the start, if dir is REVERSE.  The pattern will be enclosed
     * in parentheses if appropriate, and will be suffixed with an
     * ID_DELIM character.  May be null.
     * @return a UnicodeSet object or null.  A non-null result
     * indicates a successful parse, regardless of whether the filter
     * applies to the given direction.  The caller should discard it
     * if withParens != (dir == REVERSE).
     */
    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
                                         int32_t dir,
                                         int32_t& withParens,
                                         UnicodeString* canonID);

    /**
     * Parse a compound ID, consisting of an optional forward global
     * filter, a separator, one or more single IDs delimited by
     * separators, and an optional reverse global filter.  The
     * separator is a semicolon.  The global filters are UnicodeSet
     * patterns.  The reverse global filter must be enclosed in
     * parentheses.
     * @param id the pattern to parse
     * @param dir the direction.
     * @param canonID OUTPUT parameter that receives the canonical ID,
     * consisting of canonical IDs for all elements, as returned by
     * parseSingleID(), separated by semicolons.  Previous contents
     * are discarded.
     * @param list OUTPUT parameter that receives a list of SingleID
     * objects representing the parsed IDs.  Previous contents are
     * discarded.
     * @param globalFilter OUTPUT parameter that receives a pointer to
     * a newly created global filter for this ID in this direction, or
     * null if there is none.
     * @return true if the parse succeeds, that is, if the entire
     * id is consumed without syntax error.
     */
    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
                                 UnicodeString& canonID,
                                 UVector& list,
                                 UnicodeSet*& globalFilter);

    /**
     * Convert the elements of the 'list' vector, which are SingleID
     * objects, into actual Transliterator objects.  In the course of
     * this, some (or all) entries may be removed.  If all entries
     * are removed, the Null transliterator will be added.
     *
     * Delete entries with empty basicIDs; these are generated by
     * elements like "(A)" in the forward direction, or "A()" in
     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
     * SingleID entries to actual transliterators.
     *
     * This function returns nothing; results are reported through
     * 'list' and 'ec'.
     *
     * NOTE(review): an earlier version of this comment documented a
     * return value ("new value of insertIndex", shifting when empty
     * items such as "(Lower)" precede it).  The current signature has
     * neither an insertIndex parameter nor a return value.
     *
     * @param list vector of SingleID objects.  On exit, vector
     * of one or more Transliterators.
     * @param ec Output param to receive a success or an error code.
     */
    static void instantiateList(UVector& list,
                                UErrorCode& ec);

    /**
     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
     * S-T/V, or S/V-T.  If the source is missing, return a source of
     * ANY.
     *
     * This function returns nothing; the pieces are reported through
     * the reference parameters below.
     *
     * @param id the id string, in any of several forms
     * @param source OUTPUT parameter that receives the source.  If the
     * source is not present in the id, ANY will be given as the source.
     * @param target OUTPUT parameter that receives the target.  The
     * target may be empty if the id is not well-formed.
     * @param variant OUTPUT parameter that receives the variant.  The
     * variant may be empty.
     * @param isSourcePresent OUTPUT parameter.  If true then the source
     * is present in the id.  NOTE(review): the original comment said
     * this "will be null" when the source is absent; as a UBool
     * reference it cannot be null, so presumably it is set to false --
     * confirm against the implementation.
     */
    static void IDtoSTV(const UnicodeString& id,
                        UnicodeString& source,
                        UnicodeString& target,
                        UnicodeString& variant,
                        UBool& isSourcePresent);

    /**
     * Given source, target, and variant strings, concatenate them into a
     * full ID.  If the source is empty, then "Any" will be used for the
     * source, so the ID will always be of the form s-t/v or s-t.
     * @param source the source string; may be empty
     * @param target the target string
     * @param variant the variant string
     * @param id OUTPUT parameter that receives the full ID
     */
    static void STVtoID(const UnicodeString& source,
                        const UnicodeString& target,
                        const UnicodeString& variant,
                        UnicodeString& id);

    /**
     * Register two targets as being inverses of one another.  For
     * example, calling registerSpecialInverse("NFC", "NFD", true, status)
     * causes Transliterator to form the following inverse relationships:
     *
     * <pre>NFC => NFD
     * Any-NFC => Any-NFD
     * NFD => NFC
     * Any-NFD => Any-NFC</pre>
     *
     * (Without the special inverse registration, the inverse of NFC
     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
     * that the presence or absence of "Any-" is preserved.
     *
     * <p>The relationship is symmetrical; registering (a, b) is
     * equivalent to registering (b, a).
     *
     * <p>The relevant IDs must still be registered separately as
     * factories or classes.
     *
     * <p>Only the targets are specified.  Special inverses always
     * have the form Any-Target1 <=> Any-Target2.  The target should
     * have canonical casing (the casing desired to be produced when
     * an inverse is formed) and should contain no whitespace or other
     * extraneous characters.
     *
     * @param target the target against which to register the inverse
     * @param inverseTarget the inverse of target, that is
     * Any-target.getInverse() => Any-inverseTarget
     * @param bidirectional if true, register the reverse relation
     * as well, that is, Any-inverseTarget.getInverse() => Any-target
     * @param status error code parameter.  NOTE(review): the conditions
     * under which it is set are not visible in this header -- see the
     * implementation.
     */
    static void registerSpecialInverse(const UnicodeString& target,
                                       const UnicodeString& inverseTarget,
                                       UBool bidirectional,
                                       UErrorCode &status);

    /**
     * Free static memory.
     */
    static void cleanup();

 private:
    //----------------------------------------------------------------
    // Private implementation
    //----------------------------------------------------------------

    // forbid instantiation
    TransliteratorIDParser();

    /**
     * Parse an ID into component pieces.  Take IDs of the form T,
     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
     * source of ANY.
     * @param id the id string, in any of several forms
     * @param pos INPUT-OUTPUT parameter.  On input, pos is the
     * offset of the first character to parse in id.  On output,
     * pos is the offset after the last parsed character.  If the
     * parse failed, pos will be unchanged.
     * @param allowFilter if true, a UnicodeSet pattern is allowed
     * at any location between specs or delimiters, and is returned
     * in the 'filter' field of the Specs object.
     * @return a Specs object, or null if the parse failed.  If
     * neither source nor target was seen in the parsed id, then the
     * parse fails.  If allowFilter is true, then the parsed filter
     * pattern is returned in the Specs object, otherwise the returned
     * filter is null (NOTE(review): 'filter' is a UnicodeString value;
     * how "null" is represented is not visible here).  If the parse
     * fails for any reason null is returned.
     */
    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
                                UBool allowFilter);

    /**
     * Given a Specs object, convert it to a SingleID object.  The
     * Specs object is a more unprocessed parse result.  The SingleID
     * object contains information about canonical and basic IDs.
     * @param specs the given Specs object.
     * @param dir either FORWARD or REVERSE.
     * @return a SingleID; never returns null.  Returned object always
     * has 'filter' field of null.
     */
    static SingleID* specsToID(const Specs* specs, int32_t dir);

    /**
     * Given a Specs object, return a SingleID representing the
     * special inverse of that ID.  If there is no special inverse
     * then return null.
     * @param specs the given Specs.
     * @param status error code parameter.  NOTE(review): the conditions
     * under which it is set are not visible in this header -- see the
     * implementation.
     * @return a SingleID or null.  Returned object always has
     * 'filter' field of null.
     */
    static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);

    /**
     * Glue method to get around access problems in C++.
     * @param id the id string for the transliterator, in any of several forms
     * @param canonID the given canonical ID
     * @return a Transliterator pointer.  NOTE(review): ownership and
     * failure behavior are not visible in this header -- confirm
     * against the implementation.
     */
    static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);

    /**
     * Initialize static memory.
     * @param status error code parameter; presumably reports an
     * initialization failure -- confirm against the implementation.
     */
    static void U_CALLCONV init(UErrorCode &status);

    // Grants the nested SingleID class access to the private members
    // above (presumably so SingleID::createInstance() can reach
    // createBasicInstance() -- confirm against the implementation).
    friend class SingleID;
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif