1 /* 2 ************************************************************************** 3 * Copyright (c) 2002-2010, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ************************************************************************** 6 * Date Name Description * 7 * 01/28/2002 aliu Creation. * 8 ************************************************************************** 9 */ 10 #ifndef TRIDPARS_H 11 #define TRIDPARS_H 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/uobject.h" 18 #include "unicode/unistr.h" 19 20 U_NAMESPACE_BEGIN 21 22 class Transliterator; 23 class UnicodeSet; 24 class UVector; 25 26 /** 27 * Parsing component for transliterator IDs. This class contains only 28 * static members; it cannot be instantiated. Methods in this class 29 * parse various ID formats, including the following: 30 * 31 * A basic ID, which contains source, target, and variant, but no 32 * filter and no explicit inverse. Examples include 33 * "Latin-Greek/UNGEGN" and "Null". 34 * 35 * A single ID, which is a basic ID plus optional filter and optional 36 * explicit inverse. Examples include "[a-zA-Z] Latin-Greek" and 37 * "Lower (Upper)". 38 * 39 * A compound ID, which is a sequence of one or more single IDs, 40 * separated by semicolons, with optional forward and reverse global 41 * filters. The global filters are UnicodeSet patterns prepended or 42 * appended to the IDs, separated by semicolons. An appended filter 43 * must be enclosed in parentheses and applies in the reverse 44 * direction. 45 * 46 * @author Alan Liu 47 */ 48 class TransliteratorIDParser /* not : public UObject because all methods are static */ { 49 50 public: 51 52 /** 53 * A structure containing the parsed data of a filtered ID, that 54 * is, a basic ID optionally with a filter. 55 * 56 * 'source' and 'target' will always be non-null. The 'variant' 57 * will be non-null only if a non-empty variant was parsed. 58 * 59 * 'sawSource' is true if there was an explicit source in the 60 * parsed id. If there was no explicit source, then an implied 61 * source of ANY is returned and 'sawSource' is set to false. 62 * 63 * 'filter' is the parsed filter pattern, or null if there was no 64 * filter. 65 */ 66 class Specs : public UMemory { 67 public: 68 UnicodeString source; // not null 69 UnicodeString target; // not null 70 UnicodeString variant; // may be null 71 UnicodeString filter; // may be null 72 UBool sawSource; 73 Specs(const UnicodeString& s, const UnicodeString& t, 74 const UnicodeString& v, UBool sawS, 75 const UnicodeString& f); 76 77 private: 78 79 Specs(const Specs &other); // forbid copying of this class 80 Specs &operator=(const Specs &other); // forbid copying of this class 81 }; 82 83 /** 84 * A structure containing the canonicalized data of a filtered ID, 85 * that is, a basic ID optionally with a filter. 86 * 87 * 'canonID' is always non-null. It may be the empty string "". 88 * It is the id that should be assigned to the created 89 * transliterator. It _cannot_ be instantiated directly. 90 * 91 * 'basicID' is always non-null and non-empty. It is always of 92 * the form S-T or S-T/V. It is designed to be fed to low-level 93 * instantiation code that only understands these two formats. 94 * 95 * 'filter' may be null, if there is none, or non-null and 96 * non-empty. 97 */ 98 class SingleID : public UMemory { 99 public: 100 UnicodeString canonID; 101 UnicodeString basicID; 102 UnicodeString filter; 103 SingleID(const UnicodeString& c, const UnicodeString& b, 104 const UnicodeString& f); 105 SingleID(const UnicodeString& c, const UnicodeString& b); 106 Transliterator* createInstance(); 107 108 private: 109 110 SingleID(const SingleID &other); // forbid copying of this class 111 SingleID &operator=(const SingleID &other); // forbid copying of this class 112 }; 113 114 /** 115 * Parse a filter ID, that is, an ID of the general form 116 * "[f1] s1-t1/v1", with the filters optional, and the variants optional. 117 * @param id the id to be parsed 118 * @param pos INPUT-OUTPUT parameter. On input, the position of 119 * the first character to parse. On output, the position after 120 * the last character parsed. 121 * @return a SingleID object or null if the parse fails 122 */ 123 static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos); 124 125 /** 126 * Parse a single ID, that is, an ID of the general form 127 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element 128 * optional, the filters optional, and the variants optional. 129 * @param id the id to be parsed 130 * @param pos INPUT-OUTPUT parameter. On input, the position of 131 * the first character to parse. On output, the position after 132 * the last character parsed. 133 * @param dir the direction. If the direction is REVERSE then the 134 * SingleID is constructed for the reverse direction. 135 * @return a SingleID object or null 136 */ 137 static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos, 138 int32_t dir, UErrorCode& status); 139 140 /** 141 * Parse a global filter of the form "[f]" or "([f])", depending 142 * on 'withParens'. 143 * @param id the pattern the parse 144 * @param pos INPUT-OUTPUT parameter. On input, the position of 145 * the first character to parse. On output, the position after 146 * the last character parsed. 147 * @param dir the direction. 148 * @param withParens INPUT-OUTPUT parameter. On entry, if 149 * withParens[0] is 0, then parens are disallowed. If it is 1, 150 * then parens are required. If it is -1, then parens are 151 * optional, and the return result will be set to 0 or 1. 152 * @param canonID OUTPUT parameter. The pattern for the filter 153 * added to the canonID, either at the end, if dir is FORWARD, or 154 * at the start, if dir is REVERSE. The pattern will be enclosed 155 * in parentheses if appropriate, and will be suffixed with an 156 * ID_DELIM character. May be null. 157 * @return a UnicodeSet object or null. A non-null results 158 * indicates a successful parse, regardless of whether the filter 159 * applies to the given direction. The caller should discard it 160 * if withParens != (dir == REVERSE). 161 */ 162 static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos, 163 int32_t dir, 164 int32_t& withParens, 165 UnicodeString* canonID); 166 167 /** 168 * Parse a compound ID, consisting of an optional forward global 169 * filter, a separator, one or more single IDs delimited by 170 * separators, an an optional reverse global filter. The 171 * separator is a semicolon. The global filters are UnicodeSet 172 * patterns. The reverse global filter must be enclosed in 173 * parentheses. 174 * @param id the pattern the parse 175 * @param dir the direction. 176 * @param canonID OUTPUT parameter that receives the canonical ID, 177 * consisting of canonical IDs for all elements, as returned by 178 * parseSingleID(), separated by semicolons. Previous contents 179 * are discarded. 180 * @param list OUTPUT parameter that receives a list of SingleID 181 * objects representing the parsed IDs. Previous contents are 182 * discarded. 183 * @param globalFilter OUTPUT parameter that receives a pointer to 184 * a newly created global filter for this ID in this direction, or 185 * null if there is none. 186 * @return true if the parse succeeds, that is, if the entire 187 * id is consumed without syntax error. 188 */ 189 static UBool parseCompoundID(const UnicodeString& id, int32_t dir, 190 UnicodeString& canonID, 191 UVector& list, 192 UnicodeSet*& globalFilter); 193 194 /** 195 * Convert the elements of the 'list' vector, which are SingleID 196 * objects, into actual Transliterator objects. In the course of 197 * this, some (or all) entries may be removed. If all entries 198 * are removed, the Null transliterator will be added. 199 * 200 * Delete entries with empty basicIDs; these are generated by 201 * elements like "(A)" in the forward direction, or "A()" in 202 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert 203 * SingleID entries to actual transliterators. 204 * 205 * @param list vector of SingleID objects. On exit, vector 206 * of one or more Transliterators. 207 * @param ec Output param to receive a success or an error code. 208 * @return new value of insertIndex. The index will shift if 209 * there are empty items, like "(Lower)", with indices less than 210 * insertIndex. 211 */ 212 static void instantiateList(UVector& list, 213 UErrorCode& ec); 214 215 /** 216 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, 217 * S-T/V, or S/V-T. If the source is missing, return a source of 218 * ANY. 219 * @param id the id string, in any of several forms 220 * @param source the given source. 221 * @param target the given target. 222 * @param variant the given variant 223 * @param isSourcePresent If TRUE then the source is present. 224 * If the source is not present, ANY will be 225 * given as the source, and isSourcePresent will be null 226 * @return an array of 4 strings: source, target, variant, and 227 * isSourcePresent. If the source is not present, ANY will be 228 * given as the source, and isSourcePresent will be null. Otherwise 229 * isSourcePresent will be non-null. The target may be empty if the 230 * id is not well-formed. The variant may be empty. 231 */ 232 static void IDtoSTV(const UnicodeString& id, 233 UnicodeString& source, 234 UnicodeString& target, 235 UnicodeString& variant, 236 UBool& isSourcePresent); 237 238 /** 239 * Given source, target, and variant strings, concatenate them into a 240 * full ID. If the source is empty, then "Any" will be used for the 241 * source, so the ID will always be of the form s-t/v or s-t. 242 */ 243 static void STVtoID(const UnicodeString& source, 244 const UnicodeString& target, 245 const UnicodeString& variant, 246 UnicodeString& id); 247 248 /** 249 * Register two targets as being inverses of one another. For 250 * example, calling registerSpecialInverse("NFC", "NFD", true) causes 251 * Transliterator to form the following inverse relationships: 252 * 253 * <pre>NFC => NFD 254 * Any-NFC => Any-NFD 255 * NFD => NFC 256 * Any-NFD => Any-NFC</pre> 257 * 258 * (Without the special inverse registration, the inverse of NFC 259 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 260 * that the presence or absence of "Any-" is preserved. 261 * 262 * <p>The relationship is symmetrical; registering (a, b) is 263 * equivalent to registering (b, a). 264 * 265 * <p>The relevant IDs must still be registered separately as 266 * factories or classes. 267 * 268 * <p>Only the targets are specified. Special inverses always 269 * have the form Any-Target1 <=> Any-Target2. The target should 270 * have canonical casing (the casing desired to be produced when 271 * an inverse is formed) and should contain no whitespace or other 272 * extraneous characters. 273 * 274 * @param target the target against which to register the inverse 275 * @param inverseTarget the inverse of target, that is 276 * Any-target.getInverse() => Any-inverseTarget 277 * @param bidirectional if true, register the reverse relation 278 * as well, that is, Any-inverseTarget.getInverse() => Any-target 279 */ 280 static void registerSpecialInverse(const UnicodeString& target, 281 const UnicodeString& inverseTarget, 282 UBool bidirectional, 283 UErrorCode &status); 284 285 /** 286 * Free static memory. 287 */ 288 static void cleanup(); 289 290 private: 291 //---------------------------------------------------------------- 292 // Private implementation 293 //---------------------------------------------------------------- 294 295 // forbid instantiation 296 TransliteratorIDParser(); 297 298 /** 299 * Parse an ID into component pieces. Take IDs of the form T, 300 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a 301 * source of ANY. 302 * @param id the id string, in any of several forms 303 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 304 * offset of the first character to parse in id. On output, 305 * pos[0] is the offset after the last parsed character. If the 306 * parse failed, pos[0] will be unchanged. 307 * @param allowFilter if true, a UnicodeSet pattern is allowed 308 * at any location between specs or delimiters, and is returned 309 * as the fifth string in the array. 310 * @return a Specs object, or null if the parse failed. If 311 * neither source nor target was seen in the parsed id, then the 312 * parse fails. If allowFilter is true, then the parsed filter 313 * pattern is returned in the Specs object, otherwise the returned 314 * filter reference is null. If the parse fails for any reason 315 * null is returned. 316 */ 317 static Specs* parseFilterID(const UnicodeString& id, int32_t& pos, 318 UBool allowFilter); 319 320 /** 321 * Givens a Specs object, convert it to a SingleID object. The 322 * Spec object is a more unprocessed parse result. The SingleID 323 * object contains information about canonical and basic IDs. 324 * @param specs the given Specs object. 325 * @param dir either FORWARD or REVERSE. 326 * @return a SingleID; never returns null. Returned object always 327 * has 'filter' field of null. 328 */ 329 static SingleID* specsToID(const Specs* specs, int32_t dir); 330 331 /** 332 * Given a Specs object, return a SingleID representing the 333 * special inverse of that ID. If there is no special inverse 334 * then return null. 335 * @param specs the given Specs. 336 * @return a SingleID or null. Returned object always has 337 * 'filter' field of null. 338 */ 339 static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status); 340 341 /** 342 * Glue method to get around access problems in C++. 343 * @param id the id string for the transliterator, in any of several forms 344 * @param canonID the given canonical ID 345 */ 346 static Transliterator* createBasicInstance(const UnicodeString& id, 347 const UnicodeString* canonID); 348 349 /** 350 * Initialize static memory. 351 */ 352 static void init(UErrorCode &status); 353 354 friend class SingleID; 355 }; 356 357 U_NAMESPACE_END 358 359 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 360 361 #endif 362