1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2007, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/17/99 aliu Creation. 10 ********************************************************************** 11 */ 12 #ifndef RBT_H 13 #define RBT_H 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_TRANSLITERATION 18 19 #include "unicode/translit.h" 20 #include "unicode/utypes.h" 21 #include "unicode/parseerr.h" 22 #include "unicode/udata.h" 23 24 #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit" 25 26 U_NAMESPACE_BEGIN 27 28 class TransliterationRuleData; 29 30 /** 31 * <code>RuleBasedTransliterator</code> is a transliterator 32 * that reads a set of rules in order to determine how to perform 33 * translations. Rule sets are stored in resource bundles indexed by 34 * name. Rules within a rule set are separated by semicolons (';'). 35 * To include a literal semicolon, prefix it with a backslash ('\'). 36 * Whitespace, as defined by <code>Character.isWhitespace()</code>, 37 * is ignored. If the first non-blank character on a line is '#', 38 * the entire line is ignored as a comment. </p> 39 * 40 * <p>Each set of rules consists of two groups, one forward, and one 41 * reverse. This is a convention that is not enforced; rules for one 42 * direction may be omitted, with the result that translations in 43 * that direction will not modify the source text. In addition, 44 * bidirectional forward-reverse rules may be specified for 45 * symmetrical transformations.</p> 46 * 47 * <p><b>Rule syntax</b> </p> 48 * 49 * <p>Rule statements take one of the following forms: </p> 50 * 51 * <dl> 52 * <dt><code>$alefmadda=\u0622;</code></dt> 53 * <dd><strong>Variable definition.</strong> The name on the 54 * left is assigned the text on the right. In this example, 55 * after this statement, instances of the left hand name, 56 * "<code>$alefmadda</code>", will be replaced by 57 * the Unicode character U+0622. Variable names must begin 58 * with a letter and consist only of letters, digits, and 59 * underscores. Case is significant. Duplicate names cause 60 * an exception to be thrown, that is, variables cannot be 61 * redefined. The right hand side may contain well-formed 62 * text of any length, including no text at all ("<code>$empty=;</code>"). 63 * The right hand side may contain embedded <code>UnicodeSet</code> 64 * patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd> 65 * <dd> </dd> 66 * <dt><code>ai>$alefmadda;</code></dt> 67 * <dd><strong>Forward translation rule.</strong> This rule 68 * states that the string on the left will be changed to the 69 * string on the right when performing forward 70 * transliteration.</dd> 71 * <dt> </dt> 72 * <dt><code>ai<$alefmadda;</code></dt> 73 * <dd><strong>Reverse translation rule.</strong> This rule 74 * states that the string on the right will be changed to 75 * the string on the left when performing reverse 76 * transliteration.</dd> 77 * </dl> 78 * 79 * <dl> 80 * <dt><code>ai<>$alefmadda;</code></dt> 81 * <dd><strong>Bidirectional translation rule.</strong> This 82 * rule states that the string on the right will be changed 83 * to the string on the left when performing forward 84 * transliteration, and vice versa when performing reverse 85 * transliteration.</dd> 86 * </dl> 87 * 88 * <p>Translation rules consist of a <em>match pattern</em> and an <em>output 89 * string</em>. The match pattern consists of literal characters, 90 * optionally preceded by context, and optionally followed by 91 * context. Context characters, like literal pattern characters, 92 * must be matched in the text being transliterated. However, unlike 93 * literal pattern characters, they are not replaced by the output 94 * text. For example, the pattern "<code>abc{def}</code>" 95 * indicates the characters "<code>def</code>" must be 96 * preceded by "<code>abc</code>" for a successful match. 97 * If there is a successful match, "<code>def</code>" will 98 * be replaced, but not "<code>abc</code>". The final '<code>}</code>' 99 * is optional, so "<code>abc{def</code>" is equivalent to 100 * "<code>abc{def}</code>". Another example is "<code>{123}456</code>" 101 * (or "<code>123}456</code>") in which the literal 102 * pattern "<code>123</code>" must be followed by "<code>456</code>". 103 * </p> 104 * 105 * <p>The output string of a forward or reverse rule consists of 106 * characters to replace the literal pattern characters. If the 107 * output string contains the character '<code>|</code>', this is 108 * taken to indicate the location of the <em>cursor</em> after 109 * replacement. The cursor is the point in the text at which the 110 * next replacement, if any, will be applied. The cursor is usually 111 * placed within the replacement text; however, it can actually be 112 * placed into the precending or following context by using the 113 * special character '<code>@</code>'. Examples:</p> 114 * 115 * <blockquote> 116 * <p><code>a {foo} z > | @ bar; # foo -> bar, move cursor 117 * before a<br> 118 * {foo} xyz > bar @@|; # foo -> bar, cursor between 119 * y and z</code></p> 120 * </blockquote> 121 * 122 * <p><b>UnicodeSet</b></p> 123 * 124 * <p><code>UnicodeSet</code> patterns may appear anywhere that 125 * makes sense. They may appear in variable definitions. 126 * Contrariwise, <code>UnicodeSet</code> patterns may themselves 127 * contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>", 128 * or "<code>$range=a-z;$ll=[$range]</code>".</p> 129 * 130 * <p><code>UnicodeSet</code> patterns may also be embedded directly 131 * into rule strings. Thus, the following two rules are equivalent:</p> 132 * 133 * <blockquote> 134 * <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br> 135 * [aeiou]>'*'; 136 * # 137 * Another way</code></p> 138 * </blockquote> 139 * 140 * <p>See {@link UnicodeSet} for more documentation and examples.</p> 141 * 142 * <p><b>Segments</b></p> 143 * 144 * <p>Segments of the input string can be matched and copied to the 145 * output string. This makes certain sets of rules simpler and more 146 * general, and makes reordering possible. For example:</p> 147 * 148 * <blockquote> 149 * <p><code>([a-z]) > $1 $1; 150 * # 151 * double lowercase letters<br> 152 * ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p> 153 * </blockquote> 154 * 155 * <p>The segment of the input string to be copied is delimited by 156 * "<code>(</code>" and "<code>)</code>". Up to 157 * nine segments may be defined. Segments may not overlap. In the 158 * output string, "<code>$1</code>" through "<code>$9</code>" 159 * represent the input string segments, in left-to-right order of 160 * definition.</p> 161 * 162 * <p><b>Anchors</b></p> 163 * 164 * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the 165 * special characters '<code>^</code>' and '<code>$</code>'. For example:</p> 166 * 167 * <blockquote> 168 * <p><code>^ a > 'BEG_A'; # match 'a' at start of text<br> 169 * a > 'A'; # match other instances 170 * of 'a'<br> 171 * z $ > 'END_Z'; # match 'z' at end of text<br> 172 * z > 'Z'; # match other instances 173 * of 'z'</code></p> 174 * </blockquote> 175 * 176 * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>. 177 * This is done by including a virtual anchor character '<code>$</code>' at the end of the 178 * set pattern. Although this is usually the match chafacter for the end anchor, the set will 179 * match either the beginning or the end of the text, depending on its placement. For 180 * example:</p> 181 * 182 * <blockquote> 183 * <p><code>$x = [a-z$]; # match 'a' through 'z' OR anchor<br> 184 * $x 1 > 2; # match '1' after a-z or at the start<br> 185 * 3 $x > 4; # match '3' before a-z or at the end</code></p> 186 * </blockquote> 187 * 188 * <p><b>Example</b> </p> 189 * 190 * <p>The following example rules illustrate many of the features of 191 * the rule language. </p> 192 * 193 * <table border="0" cellpadding="4"> 194 * <tr> 195 * <td valign="top">Rule 1.</td> 196 * <td valign="top" nowrap><code>abc{def}>x|y</code></td> 197 * </tr> 198 * <tr> 199 * <td valign="top">Rule 2.</td> 200 * <td valign="top" nowrap><code>xyz>r</code></td> 201 * </tr> 202 * <tr> 203 * <td valign="top">Rule 3.</td> 204 * <td valign="top" nowrap><code>yz>q</code></td> 205 * </tr> 206 * </table> 207 * 208 * <p>Applying these rules to the string "<code>adefabcdefz</code>" 209 * yields the following results: </p> 210 * 211 * <table border="0" cellpadding="4"> 212 * <tr> 213 * <td valign="top" nowrap><code>|adefabcdefz</code></td> 214 * <td valign="top">Initial state, no rules match. Advance 215 * cursor.</td> 216 * </tr> 217 * <tr> 218 * <td valign="top" nowrap><code>a|defabcdefz</code></td> 219 * <td valign="top">Still no match. Rule 1 does not match 220 * because the preceding context is not present.</td> 221 * </tr> 222 * <tr> 223 * <td valign="top" nowrap><code>ad|efabcdefz</code></td> 224 * <td valign="top">Still no match. Keep advancing until 225 * there is a match...</td> 226 * </tr> 227 * <tr> 228 * <td valign="top" nowrap><code>ade|fabcdefz</code></td> 229 * <td valign="top">...</td> 230 * </tr> 231 * <tr> 232 * <td valign="top" nowrap><code>adef|abcdefz</code></td> 233 * <td valign="top">...</td> 234 * </tr> 235 * <tr> 236 * <td valign="top" nowrap><code>adefa|bcdefz</code></td> 237 * <td valign="top">...</td> 238 * </tr> 239 * <tr> 240 * <td valign="top" nowrap><code>adefab|cdefz</code></td> 241 * <td valign="top">...</td> 242 * </tr> 243 * <tr> 244 * <td valign="top" nowrap><code>adefabc|defz</code></td> 245 * <td valign="top">Rule 1 matches; replace "<code>def</code>" 246 * with "<code>xy</code>" and back up the cursor 247 * to before the '<code>y</code>'.</td> 248 * </tr> 249 * <tr> 250 * <td valign="top" nowrap><code>adefabcx|yz</code></td> 251 * <td valign="top">Although "<code>xyz</code>" is 252 * present, rule 2 does not match because the cursor is 253 * before the '<code>y</code>', not before the '<code>x</code>'. 254 * Rule 3 does match. Replace "<code>yz</code>" 255 * with "<code>q</code>".</td> 256 * </tr> 257 * <tr> 258 * <td valign="top" nowrap><code>adefabcxq|</code></td> 259 * <td valign="top">The cursor is at the end; 260 * transliteration is complete.</td> 261 * </tr> 262 * </table> 263 * 264 * <p>The order of rules is significant. If multiple rules may match 265 * at some point, the first matching rule is applied. </p> 266 * 267 * <p>Forward and reverse rules may have an empty output string. 268 * Otherwise, an empty left or right hand side of any statement is a 269 * syntax error. </p> 270 * 271 * <p>Single quotes are used to quote any character other than a 272 * digit or letter. To specify a single quote itself, inside or 273 * outside of quotes, use two single quotes in a row. For example, 274 * the rule "<code>'>'>o''clock</code>" changes the 275 * string "<code>></code>" to the string "<code>o'clock</code>". 276 * </p> 277 * 278 * <p><b>Notes</b> </p> 279 * 280 * <p>While a RuleBasedTransliterator is being built, it checks that 281 * the rules are added in proper order. For example, if the rule 282 * "a>x" is followed by the rule "ab>y", 283 * then the second rule will throw an exception. The reason is that 284 * the second rule can never be triggered, since the first rule 285 * always matches anything it matches. In other words, the first 286 * rule <em>masks</em> the second rule. </p> 287 * 288 * @author Alan Liu 289 * @internal Use transliterator factory methods instead since this class will be removed in that release. 290 */ 291 class RuleBasedTransliterator : public Transliterator { 292 private: 293 /** 294 * The data object is immutable, so we can freely share it with 295 * other instances of RBT, as long as we do NOT own this object. 296 * TODO: data is no longer immutable. See bugs #1866, 2155 297 */ 298 TransliterationRuleData* fData; 299 300 /** 301 * If true, we own the data object and must delete it. 302 */ 303 UBool isDataOwned; 304 305 public: 306 307 /** 308 * Constructs a new transliterator from the given rules. 309 * @param rules rules, separated by ';' 310 * @param direction either FORWARD or REVERSE. 311 * @exception IllegalArgumentException if rules are malformed. 312 * @internal Use transliterator factory methods instead since this class will be removed in that release. 313 */ 314 RuleBasedTransliterator(const UnicodeString& id, 315 const UnicodeString& rules, 316 UTransDirection direction, 317 UnicodeFilter* adoptedFilter, 318 UParseError& parseError, 319 UErrorCode& status); 320 321 /** 322 * Constructs a new transliterator from the given rules. 323 * @param rules rules, separated by ';' 324 * @param direction either FORWARD or REVERSE. 325 * @exception IllegalArgumentException if rules are malformed. 326 * @internal Use transliterator factory methods instead since this class will be removed in that release. 327 */ 328 /*RuleBasedTransliterator(const UnicodeString& id, 329 const UnicodeString& rules, 330 UTransDirection direction, 331 UnicodeFilter* adoptedFilter, 332 UErrorCode& status);*/ 333 334 /** 335 * Covenience constructor with no filter. 336 * @internal Use transliterator factory methods instead since this class will be removed in that release. 337 */ 338 /*RuleBasedTransliterator(const UnicodeString& id, 339 const UnicodeString& rules, 340 UTransDirection direction, 341 UErrorCode& status);*/ 342 343 /** 344 * Covenience constructor with no filter and FORWARD direction. 345 * @internal Use transliterator factory methods instead since this class will be removed in that release. 346 */ 347 /*RuleBasedTransliterator(const UnicodeString& id, 348 const UnicodeString& rules, 349 UErrorCode& status);*/ 350 351 /** 352 * Covenience constructor with FORWARD direction. 353 * @internal Use transliterator factory methods instead since this class will be removed in that release. 354 */ 355 /*RuleBasedTransliterator(const UnicodeString& id, 356 const UnicodeString& rules, 357 UnicodeFilter* adoptedFilter, 358 UErrorCode& status);*/ 359 private: 360 361 friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor 362 /** 363 * Covenience constructor. 364 * @param id the id for the transliterator. 365 * @param theData the rule data for the transliterator. 366 * @param adoptedFilter the filter for the transliterator 367 */ 368 RuleBasedTransliterator(const UnicodeString& id, 369 const TransliterationRuleData* theData, 370 UnicodeFilter* adoptedFilter = 0); 371 372 373 friend class Transliterator; // to access following ct 374 375 /** 376 * Internal constructor. 377 * @param id the id for the transliterator. 378 * @param theData the rule data for the transliterator. 379 * @param isDataAdopted determine who will own the 'data' object. True, the caller should not delete 'data'. 380 */ 381 RuleBasedTransliterator(const UnicodeString& id, 382 TransliterationRuleData* data, 383 UBool isDataAdopted); 384 385 public: 386 387 /** 388 * Copy constructor. 389 * @internal Use transliterator factory methods instead since this class will be removed in that release. 390 */ 391 RuleBasedTransliterator(const RuleBasedTransliterator&); 392 393 virtual ~RuleBasedTransliterator(); 394 395 /** 396 * Implement Transliterator API. 397 * @internal Use transliterator factory methods instead since this class will be removed in that release. 398 */ 399 virtual Transliterator* clone(void) const; 400 401 protected: 402 /** 403 * Implements {@link Transliterator#handleTransliterate}. 404 * @internal Use transliterator factory methods instead since this class will be removed in that release. 405 */ 406 virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets, 407 UBool isIncremental) const; 408 409 public: 410 /** 411 * Return a representation of this transliterator as source rules. 412 * These rules will produce an equivalent transliterator if used 413 * to construct a new transliterator. 414 * @param result the string to receive the rules. Previous 415 * contents will be deleted. 416 * @param escapeUnprintable if TRUE then convert unprintable 417 * character to their hex escape representations, \uxxxx or 418 * \Uxxxxxxxx. Unprintable characters are those other than 419 * U+000A, U+0020..U+007E. 420 * @internal Use transliterator factory methods instead since this class will be removed in that release. 421 */ 422 virtual UnicodeString& toRules(UnicodeString& result, 423 UBool escapeUnprintable) const; 424 425 protected: 426 /** 427 * Implement Transliterator framework 428 */ 429 virtual void handleGetSourceSet(UnicodeSet& result) const; 430 431 public: 432 /** 433 * Override Transliterator framework 434 */ 435 virtual UnicodeSet& getTargetSet(UnicodeSet& result) const; 436 437 /** 438 * Return the class ID for this class. This is useful only for 439 * comparing to a return value from getDynamicClassID(). For example: 440 * <pre> 441 * . Base* polymorphic_pointer = createPolymorphicObject(); 442 * . if (polymorphic_pointer->getDynamicClassID() == 443 * . Derived::getStaticClassID()) ... 444 * </pre> 445 * @return The class ID for all objects of this class. 446 * @internal Use transliterator factory methods instead since this class will be removed in that release. 447 */ 448 U_I18N_API static UClassID U_EXPORT2 getStaticClassID(void); 449 450 /** 451 * Returns a unique class ID <b>polymorphically</b>. This method 452 * is to implement a simple version of RTTI, since not all C++ 453 * compilers support genuine RTTI. Polymorphic operator==() and 454 * clone() methods call this method. 455 * 456 * @return The class ID for this object. All objects of a given 457 * class have the same class ID. Objects of other classes have 458 * different class IDs. 459 */ 460 virtual UClassID getDynamicClassID(void) const; 461 462 private: 463 464 void _construct(const UnicodeString& rules, 465 UTransDirection direction, 466 UParseError& parseError, 467 UErrorCode& status); 468 }; 469 470 471 U_NAMESPACE_END 472 473 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 474 475 #endif 476