1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2011, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/17/99 aliu Creation. 10 ********************************************************************** 11 */ 12 #ifndef RBT_PARS_H 13 #define RBT_PARS_H 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_TRANSLITERATION 18 #ifdef __cplusplus 19 20 #include "unicode/uobject.h" 21 #include "unicode/parseerr.h" 22 #include "unicode/unorm.h" 23 #include "rbt.h" 24 #include "hash.h" 25 #include "uvector.h" 26 27 U_NAMESPACE_BEGIN 28 29 class TransliterationRuleData; 30 class UnicodeFunctor; 31 class ParseData; 32 class RuleHalf; 33 class ParsePosition; 34 class StringMatcher; 35 36 class TransliteratorParser : public UMemory { 37 38 public: 39 40 /** 41 * A Vector of TransliterationRuleData objects, one for each discrete group 42 * of rules in the rule set 43 */ 44 UVector dataVector; 45 46 /** 47 * PUBLIC data member. 48 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set 49 */ 50 UVector idBlockVector; 51 52 /** 53 * PUBLIC data member containing the parsed compound filter, if any. 54 */ 55 UnicodeSet* compoundFilter; 56 57 private: 58 59 /** 60 * The current data object for which we are parsing rules 61 */ 62 TransliterationRuleData* curData; 63 64 UTransDirection direction; 65 66 /** 67 * Parse error information. 68 */ 69 UParseError parseError; 70 71 /** 72 * Temporary symbol table used during parsing. 73 */ 74 ParseData* parseData; 75 76 /** 77 * Temporary vector of matcher variables. When parsing is complete, this 78 * is copied into the array data.variables. As with data.variables, 79 * element 0 corresponds to character data.variablesBase. 80 */ 81 UVector variablesVector; 82 83 /** 84 * Temporary table of variable names. When parsing is complete, this is 85 * copied into data.variableNames. 86 */ 87 Hashtable variableNames; 88 89 /** 90 * String of standins for segments. Used during the parsing of a single 91 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds 92 * to StringMatcher object segmentObjects.elementAt(0), etc. 93 */ 94 UnicodeString segmentStandins; 95 96 /** 97 * Vector of StringMatcher objects for segments. Used during the 98 * parsing of a single rule. 99 * segmentStandins.charAt(0) is the standin for "$1" and corresponds 100 * to StringMatcher object segmentObjects.elementAt(0), etc. 101 */ 102 UVector segmentObjects; 103 104 /** 105 * The next available stand-in for variables. This starts at some point in 106 * the private use area (discovered dynamically) and increments up toward 107 * <code>variableLimit</code>. At any point during parsing, available 108 * variables are <code>variableNext..variableLimit-1</code>. 109 */ 110 UChar variableNext; 111 112 /** 113 * The last available stand-in for variables. This is discovered 114 * dynamically. At any point during parsing, available variables are 115 * <code>variableNext..variableLimit-1</code>. 116 */ 117 UChar variableLimit; 118 119 /** 120 * When we encounter an undefined variable, we do not immediately signal 121 * an error, in case we are defining this variable, e.g., "$a = [a-z];". 122 * Instead, we save the name of the undefined variable, and substitute 123 * in the placeholder char variableLimit - 1, and decrement 124 * variableLimit. 125 */ 126 UnicodeString undefinedVariableName; 127 128 /** 129 * The stand-in character for the 'dot' set, represented by '.' in 130 * patterns. This is allocated the first time it is needed, and 131 * reused thereafter. 132 */ 133 UChar dotStandIn; 134 135 public: 136 137 /** 138 * Constructor. 139 */ 140 TransliteratorParser(UErrorCode &statusReturn); 141 142 /** 143 * Destructor. 144 */ 145 ~TransliteratorParser(); 146 147 /** 148 * Parse the given string as a sequence of rules, separated by newline 149 * characters ('\n'), and cause this object to implement those rules. Any 150 * previous rules are discarded. Typically this method is called exactly 151 * once after construction. 152 * 153 * Parse the given rules, in the given direction. After this call 154 * returns, query the public data members for results. The caller 155 * owns the 'data' and 'compoundFilter' data members after this 156 * call returns. 157 * @param rules rules, separated by ';' 158 * @param direction either FORWARD or REVERSE. 159 * @param pe Struct to recieve information on position 160 * of error if an error is encountered 161 * @param ec Output param set to success/failure code. 162 */ 163 void parse(const UnicodeString& rules, 164 UTransDirection direction, 165 UParseError& pe, 166 UErrorCode& ec); 167 168 /** 169 * Return the compound filter parsed by parse(). Caller owns result. 170 * @return the compound filter parsed by parse(). 171 */ 172 UnicodeSet* orphanCompoundFilter(); 173 174 private: 175 176 /** 177 * Return a representation of this transliterator as source rules. 178 * @param rules Output param to receive the rules. 179 * @param direction either FORWARD or REVERSE. 180 */ 181 void parseRules(const UnicodeString& rules, 182 UTransDirection direction, 183 UErrorCode& status); 184 185 /** 186 * MAIN PARSER. Parse the next rule in the given rule string, starting 187 * at pos. Return the index after the last character parsed. Do not 188 * parse characters at or after limit. 189 * 190 * Important: The character at pos must be a non-whitespace character 191 * that is not the comment character. 192 * 193 * This method handles quoting, escaping, and whitespace removal. It 194 * parses the end-of-rule character. It recognizes context and cursor 195 * indicators. Once it does a lexical breakdown of the rule at pos, it 196 * creates a rule object and adds it to our rule list. 197 * @param rules Output param to receive the rules. 198 * @param pos the starting position. 199 * @param limit pointer past the last character of the rule. 200 * @return the index after the last character parsed. 201 */ 202 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 203 204 /** 205 * Set the variable range to [start, end] (inclusive). 206 * @param start the start value of the range. 207 * @param end the end value of the range. 208 */ 209 void setVariableRange(int32_t start, int32_t end, UErrorCode& status); 210 211 /** 212 * Assert that the given character is NOT within the variable range. 213 * If it is, return false. This is necessary to ensure that the 214 * variable range does not overlap characters used in a rule. 215 * @param ch the given character. 216 * @return True, if the given character is NOT within the variable range. 217 */ 218 UBool checkVariableRange(UChar32 ch) const; 219 220 /** 221 * Set the maximum backup to 'backup', in response to a pragma 222 * statement. 223 * @param backup the new value to be set. 224 */ 225 void pragmaMaximumBackup(int32_t backup); 226 227 /** 228 * Begin normalizing all rules using the given mode, in response 229 * to a pragma statement. 230 * @param mode the given mode. 231 */ 232 void pragmaNormalizeRules(UNormalizationMode mode); 233 234 /** 235 * Return true if the given rule looks like a pragma. 236 * @param pos offset to the first non-whitespace character 237 * of the rule. 238 * @param limit pointer past the last character of the rule. 239 * @return true if the given rule looks like a pragma. 240 */ 241 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); 242 243 /** 244 * Parse a pragma. This method assumes resemblesPragma() has 245 * already returned true. 246 * @param pos offset to the first non-whitespace character 247 * of the rule. 248 * @param limit pointer past the last character of the rule. 249 * @return the position index after the final ';' of the pragma, 250 * or -1 on failure. 251 */ 252 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 253 254 /** 255 * Called by main parser upon syntax error. Search the rule string 256 * for the probable end of the rule. Of course, if the error is that 257 * the end of rule marker is missing, then the rule end will not be found. 258 * In any case the rule start will be correctly reported. 259 * @param parseErrorCode error code. 260 * @param msg error description. 261 * @param start position of first character of current rule. 262 * @return start position of first character of current rule. 263 */ 264 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, 265 UErrorCode& status); 266 267 /** 268 * Parse a UnicodeSet out, store it, and return the stand-in character 269 * used to represent it. 270 * 271 * @param rule the rule for UnicodeSet. 272 * @param pos the position in pattern at which to start parsing. 273 * @return the stand-in character used to represent it. 274 */ 275 UChar parseSet(const UnicodeString& rule, 276 ParsePosition& pos, 277 UErrorCode& status); 278 279 /** 280 * Generate and return a stand-in for a new UnicodeFunctor. Store 281 * the matcher (adopt it). 282 * @param adopted the UnicodeFunctor to be adopted. 283 * @return a stand-in for a new UnicodeFunctor. 284 */ 285 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); 286 287 /** 288 * Return the standin for segment seg (1-based). 289 * @param seg the given segment. 290 * @return the standIn character for the given segment. 291 */ 292 UChar getSegmentStandin(int32_t seg, UErrorCode& status); 293 294 /** 295 * Set the object for segment seg (1-based). 296 * @param seg the given segment. 297 * @param adopted the StringMatcher to be adopted. 298 */ 299 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); 300 301 /** 302 * Return the stand-in for the dot set. It is allocated the first 303 * time and reused thereafter. 304 * @return the stand-in for the dot set. 305 */ 306 UChar getDotStandIn(UErrorCode& status); 307 308 /** 309 * Append the value of the given variable name to the given 310 * UnicodeString. 311 * @param name the variable name to be appended. 312 * @param buf the given UnicodeString to append to. 313 */ 314 void appendVariableDef(const UnicodeString& name, 315 UnicodeString& buf, 316 UErrorCode& status); 317 318 /** 319 * Glue method to get around access restrictions in C++. 320 */ 321 /*static Transliterator* createBasicInstance(const UnicodeString& id, 322 const UnicodeString* canonID);*/ 323 324 friend class RuleHalf; 325 326 // Disallowed methods; no impl. 327 /** 328 * Copy constructor 329 */ 330 TransliteratorParser(const TransliteratorParser&); 331 332 /** 333 * Assignment operator 334 */ 335 TransliteratorParser& operator=(const TransliteratorParser&); 336 }; 337 338 U_NAMESPACE_END 339 340 #endif /* #ifdef __cplusplus */ 341 342 /** 343 * Strip/convert the following from the transliterator rules: 344 * comments 345 * newlines 346 * white space at the beginning and end of a line 347 * unescape \u notation 348 * 349 * The target must be equal in size as the source. 350 * @internal 351 */ 352 U_CAPI int32_t 353 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); 354 355 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 356 357 #endif 358