1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2011, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/17/99 aliu Creation. 8 ********************************************************************** 9 */ 10 #ifndef RBT_PARS_H 11 #define RBT_PARS_H 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 #ifdef __cplusplus 17 18 #include "unicode/uobject.h" 19 #include "unicode/parseerr.h" 20 #include "unicode/unorm.h" 21 #include "rbt.h" 22 #include "hash.h" 23 #include "uvector.h" 24 25 U_NAMESPACE_BEGIN 26 27 class TransliterationRuleData; 28 class UnicodeFunctor; 29 class ParseData; 30 class RuleHalf; 31 class ParsePosition; 32 class StringMatcher; 33 34 class TransliteratorParser : public UMemory { 35 36 public: 37 38 /** 39 * A Vector of TransliterationRuleData objects, one for each discrete group 40 * of rules in the rule set 41 */ 42 UVector dataVector; 43 44 /** 45 * PUBLIC data member. 46 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set 47 */ 48 UVector idBlockVector; 49 50 /** 51 * PUBLIC data member containing the parsed compound filter, if any. 52 */ 53 UnicodeSet* compoundFilter; 54 55 private: 56 57 /** 58 * The current data object for which we are parsing rules 59 */ 60 TransliterationRuleData* curData; 61 62 UTransDirection direction; 63 64 /** 65 * Parse error information. 66 */ 67 UParseError parseError; 68 69 /** 70 * Temporary symbol table used during parsing. 71 */ 72 ParseData* parseData; 73 74 /** 75 * Temporary vector of matcher variables. When parsing is complete, this 76 * is copied into the array data.variables. As with data.variables, 77 * element 0 corresponds to character data.variablesBase. 78 */ 79 UVector variablesVector; 80 81 /** 82 * Temporary table of variable names. When parsing is complete, this is 83 * copied into data.variableNames. 84 */ 85 Hashtable variableNames; 86 87 /** 88 * String of standins for segments. Used during the parsing of a single 89 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds 90 * to StringMatcher object segmentObjects.elementAt(0), etc. 91 */ 92 UnicodeString segmentStandins; 93 94 /** 95 * Vector of StringMatcher objects for segments. Used during the 96 * parsing of a single rule. 97 * segmentStandins.charAt(0) is the standin for "$1" and corresponds 98 * to StringMatcher object segmentObjects.elementAt(0), etc. 99 */ 100 UVector segmentObjects; 101 102 /** 103 * The next available stand-in for variables. This starts at some point in 104 * the private use area (discovered dynamically) and increments up toward 105 * <code>variableLimit</code>. At any point during parsing, available 106 * variables are <code>variableNext..variableLimit-1</code>. 107 */ 108 UChar variableNext; 109 110 /** 111 * The last available stand-in for variables. This is discovered 112 * dynamically. At any point during parsing, available variables are 113 * <code>variableNext..variableLimit-1</code>. 114 */ 115 UChar variableLimit; 116 117 /** 118 * When we encounter an undefined variable, we do not immediately signal 119 * an error, in case we are defining this variable, e.g., "$a = [a-z];". 120 * Instead, we save the name of the undefined variable, and substitute 121 * in the placeholder char variableLimit - 1, and decrement 122 * variableLimit. 123 */ 124 UnicodeString undefinedVariableName; 125 126 /** 127 * The stand-in character for the 'dot' set, represented by '.' in 128 * patterns. This is allocated the first time it is needed, and 129 * reused thereafter. 130 */ 131 UChar dotStandIn; 132 133 public: 134 135 /** 136 * Constructor. 137 */ 138 TransliteratorParser(UErrorCode &statusReturn); 139 140 /** 141 * Destructor. 142 */ 143 ~TransliteratorParser(); 144 145 /** 146 * Parse the given string as a sequence of rules, separated by newline 147 * characters ('\n'), and cause this object to implement those rules. Any 148 * previous rules are discarded. Typically this method is called exactly 149 * once after construction. 150 * 151 * Parse the given rules, in the given direction. After this call 152 * returns, query the public data members for results. The caller 153 * owns the 'data' and 'compoundFilter' data members after this 154 * call returns. 155 * @param rules rules, separated by ';' 156 * @param direction either FORWARD or REVERSE. 157 * @param pe Struct to recieve information on position 158 * of error if an error is encountered 159 * @param ec Output param set to success/failure code. 160 */ 161 void parse(const UnicodeString& rules, 162 UTransDirection direction, 163 UParseError& pe, 164 UErrorCode& ec); 165 166 /** 167 * Return the compound filter parsed by parse(). Caller owns result. 168 * @return the compound filter parsed by parse(). 169 */ 170 UnicodeSet* orphanCompoundFilter(); 171 172 private: 173 174 /** 175 * Return a representation of this transliterator as source rules. 176 * @param rules Output param to receive the rules. 177 * @param direction either FORWARD or REVERSE. 178 */ 179 void parseRules(const UnicodeString& rules, 180 UTransDirection direction, 181 UErrorCode& status); 182 183 /** 184 * MAIN PARSER. Parse the next rule in the given rule string, starting 185 * at pos. Return the index after the last character parsed. Do not 186 * parse characters at or after limit. 187 * 188 * Important: The character at pos must be a non-whitespace character 189 * that is not the comment character. 190 * 191 * This method handles quoting, escaping, and whitespace removal. It 192 * parses the end-of-rule character. It recognizes context and cursor 193 * indicators. Once it does a lexical breakdown of the rule at pos, it 194 * creates a rule object and adds it to our rule list. 195 * @param rules Output param to receive the rules. 196 * @param pos the starting position. 197 * @param limit pointer past the last character of the rule. 198 * @return the index after the last character parsed. 199 */ 200 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 201 202 /** 203 * Set the variable range to [start, end] (inclusive). 204 * @param start the start value of the range. 205 * @param end the end value of the range. 206 */ 207 void setVariableRange(int32_t start, int32_t end, UErrorCode& status); 208 209 /** 210 * Assert that the given character is NOT within the variable range. 211 * If it is, return FALSE. This is neccesary to ensure that the 212 * variable range does not overlap characters used in a rule. 213 * @param ch the given character. 214 * @return True, if the given character is NOT within the variable range. 215 */ 216 UBool checkVariableRange(UChar32 ch) const; 217 218 /** 219 * Set the maximum backup to 'backup', in response to a pragma 220 * statement. 221 * @param backup the new value to be set. 222 */ 223 void pragmaMaximumBackup(int32_t backup); 224 225 /** 226 * Begin normalizing all rules using the given mode, in response 227 * to a pragma statement. 228 * @param mode the given mode. 229 */ 230 void pragmaNormalizeRules(UNormalizationMode mode); 231 232 /** 233 * Return true if the given rule looks like a pragma. 234 * @param pos offset to the first non-whitespace character 235 * of the rule. 236 * @param limit pointer past the last character of the rule. 237 * @return true if the given rule looks like a pragma. 238 */ 239 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); 240 241 /** 242 * Parse a pragma. This method assumes resemblesPragma() has 243 * already returned true. 244 * @param pos offset to the first non-whitespace character 245 * of the rule. 246 * @param limit pointer past the last character of the rule. 247 * @return the position index after the final ';' of the pragma, 248 * or -1 on failure. 249 */ 250 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 251 252 /** 253 * Called by main parser upon syntax error. Search the rule string 254 * for the probable end of the rule. Of course, if the error is that 255 * the end of rule marker is missing, then the rule end will not be found. 256 * In any case the rule start will be correctly reported. 257 * @param parseErrorCode error code. 258 * @param msg error description. 259 * @param start position of first character of current rule. 260 * @return start position of first character of current rule. 261 */ 262 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, 263 UErrorCode& status); 264 265 /** 266 * Parse a UnicodeSet out, store it, and return the stand-in character 267 * used to represent it. 268 * 269 * @param rule the rule for UnicodeSet. 270 * @param pos the position in pattern at which to start parsing. 271 * @return the stand-in character used to represent it. 272 */ 273 UChar parseSet(const UnicodeString& rule, 274 ParsePosition& pos, 275 UErrorCode& status); 276 277 /** 278 * Generate and return a stand-in for a new UnicodeFunctor. Store 279 * the matcher (adopt it). 280 * @param adopted the UnicodeFunctor to be adopted. 281 * @return a stand-in for a new UnicodeFunctor. 282 */ 283 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); 284 285 /** 286 * Return the standin for segment seg (1-based). 287 * @param seg the given segment. 288 * @return the standIn character for the given segment. 289 */ 290 UChar getSegmentStandin(int32_t seg, UErrorCode& status); 291 292 /** 293 * Set the object for segment seg (1-based). 294 * @param seg the given segment. 295 * @param adopted the StringMatcher to be adopted. 296 */ 297 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); 298 299 /** 300 * Return the stand-in for the dot set. It is allocated the first 301 * time and reused thereafter. 302 * @return the stand-in for the dot set. 303 */ 304 UChar getDotStandIn(UErrorCode& status); 305 306 /** 307 * Append the value of the given variable name to the given 308 * UnicodeString. 309 * @param name the variable name to be appended. 310 * @param buf the given UnicodeString to append to. 311 */ 312 void appendVariableDef(const UnicodeString& name, 313 UnicodeString& buf, 314 UErrorCode& status); 315 316 /** 317 * Glue method to get around access restrictions in C++. 318 */ 319 /*static Transliterator* createBasicInstance(const UnicodeString& id, 320 const UnicodeString* canonID);*/ 321 322 friend class RuleHalf; 323 324 // Disallowed methods; no impl. 325 /** 326 * Copy constructor 327 */ 328 TransliteratorParser(const TransliteratorParser&); 329 330 /** 331 * Assignment operator 332 */ 333 TransliteratorParser& operator=(const TransliteratorParser&); 334 }; 335 336 U_NAMESPACE_END 337 338 #endif /* #ifdef __cplusplus */ 339 340 /** 341 * Strip/convert the following from the transliterator rules: 342 * comments 343 * newlines 344 * white space at the beginning and end of a line 345 * unescape \u notation 346 * 347 * The target must be equal in size as the source. 348 * @internal 349 */ 350 U_CAPI int32_t 351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); 352 353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 354 355 #endif 356