1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.text.MessageFormat; 12 import java.util.ArrayList; 13 import java.util.Collections; 14 import java.util.Enumeration; 15 import java.util.HashMap; 16 import java.util.List; 17 import java.util.Locale; 18 import java.util.Map; 19 import java.util.MissingResourceException; 20 import java.util.Objects; 21 22 import com.ibm.icu.impl.ICUData; 23 import com.ibm.icu.impl.ICUResourceBundle; 24 import com.ibm.icu.impl.Utility; 25 import com.ibm.icu.impl.UtilityExtensions; 26 import com.ibm.icu.text.RuleBasedTransliterator.Data; 27 import com.ibm.icu.text.TransliteratorIDParser.SingleID; 28 import com.ibm.icu.util.CaseInsensitiveString; 29 import com.ibm.icu.util.ULocale; 30 import com.ibm.icu.util.ULocale.Category; 31 import com.ibm.icu.util.UResourceBundle; 32 33 /** 34 * <code>Transliterator</code> is an abstract class that transliterates text from one format to another. The most common 35 * kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator 36 * changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not 37 * <em>translate</em> Russian to English! Transliteration, unlike translation, operates on characters, without reference 38 * to the meanings of words and sentences. 39 * 40 * <p> 41 * Although script conversion is its most common use, a transliterator can actually perform a more general class of 42 * tasks. 
In fact, <code>Transliterator</code> defines a very general API which specifies only that a segment of the 43 * input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of 44 * <code>Transliterator</code>. 45 * 46 * <p> 47 * <b>Transliterators are stateless</b> 48 * 49 * <p> 50 * <code>Transliterator</code> objects are <em>stateless</em>; they retain no information between calls to 51 * <code>transliterate()</code>. As a result, threads may share transliterators without synchronizing them. This might 52 * seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex 53 * transliterations by delaying the replacement of text until it is known that no other replacements are possible. In 54 * other words, although the <code>Transliterator</code> objects are stateless, the source text itself embodies all the 55 * needed information, and delayed operation allows arbitrary complexity. 56 * 57 * <p> 58 * <b>Batch transliteration</b> 59 * 60 * <p> 61 * The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as 62 * <em>batch</em> transliteration. For example, given a string <code>input</code> and a transliterator <code>t</code>, 63 * the call 64 * 65 * <blockquote><code>String result = t.transliterate(input); 66 * </code></blockquote> 67 * 68 * will transliterate it and return the result. Other methods allow the client to specify a substring to be 69 * transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band 70 * information (such as text styles). 71 * 72 * <p> 73 * <b>Keyboard transliteration</b> 74 * 75 * <p> 76 * Somewhat more involved is <em>keyboard</em>, or incremental transliteration. This is the transliteration of text that 77 * is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal 78 * fashion. 
79 * 80 * <p> 81 * In keyboard transliteration, a <code>Replaceable</code> buffer stores the text. As text is inserted, as much as 82 * possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being 83 * modified as each new character arrives. 84 * 85 * <p> 86 * Consider the simple rule-based Transliterator: 87 * 88 * <blockquote><code> 89 * th>{theta}<br> 90 * t>{tau} 91 * </code></blockquote> 92 * 93 * When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is 94 * 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string: 95 * 96 * <blockquote><code> 97 * t>|{tau}<br> 98 * {tau}h>{theta} 99 * </code></blockquote> 100 * 101 * Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is 102 * accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across 103 * calls to <code>transliterate()</code>. Typically, the cursor will be coincident with the insertion point, but in a 104 * case like the one above, it will precede the insertion point. 105 * 106 * <p> 107 * Keyboard transliteration methods maintain a set of three indices that are updated with each call to 108 * <code>transliterate()</code>, including the cursor, start, and limit. These indices are changed by the method, and 109 * they are passed in and out via a Position object. The <code>start</code> index marks the beginning of the substring 110 * that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index; 111 * that's the <code>cursor</code>). The <code>cursor</code> index, described above, marks the point at which the 112 * transliterator last stopped, either because it reached the end, or because it required more characters to 113 * disambiguate between possible inputs. 
The <code>cursor</code> can also be explicitly set by rules. 114 * Any characters before the <code>cursor</code> index are frozen; future keyboard 115 * transliteration calls within this input sequence will not change them. New text is inserted at the <code>limit</code> 116 * index, which marks the end of the substring that the transliterator looks at. 117 * 118 * <p> 119 * Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It 120 * only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the 121 * client code knows that no more characters are forthcoming, perhaps because the user has performed some input 122 * termination operation, then it should call <code>finishTransliteration()</code> to complete any pending 123 * transliterations. 124 * 125 * <p> 126 * <b>Inverses</b> 127 * 128 * <p> 129 * Pairs of transliterators may be inverses of one another. For example, if transliterator <b>A</b> transliterates 130 * characters by incrementing their Unicode value (so "abc" -> "def"), and transliterator <b>B</b> decrements character 131 * values, then <b>A</b> is an inverse of <b>B</b> and vice versa. If we compose <b>A</b> with <b>B</b> in a compound 132 * transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input 133 * text. 134 * 135 * The <code>Transliterator</code> method <code>getInverse()</code> returns a transliterator's inverse, if one exists, 136 * or <code>null</code> otherwise. However, the result of <code>getInverse()</code> usually will <em>not</em> be a true 137 * mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider 138 * two transliterators: <b>AB</b>, which transliterates the character 'A' to 'B', and <b>BA</b>, which transliterates 139 * 'B' to 'A'. 
It might seem that these are exact inverses, since 140 * 141 * <blockquote>"A" x <b>AB</b> -> "B"<br> 142 * "B" x <b>BA</b> -> "A"</blockquote> 143 * 144 * where 'x' represents transliteration. However, 145 * 146 * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br> 147 * "BBCD" x <b>BA</b> -> "AACD"</blockquote> 148 * 149 * so <b>AB</b> composed with <b>BA</b> is not the identity. Nonetheless, <b>BA</b> may be usefully considered to be 150 * <b>AB</b>'s inverse, and it is on this basis that <b>AB</b><code>.getInverse()</code> could legitimately return 151 * <b>BA</b>. 152 * 153 * <p> 154 * <b>Filtering</b> 155 * <p>Each transliterator has a filter, which restricts changes to those characters selected by the filter. The 156 * filter affects just the characters that are changed -- the characters outside of the filter are still part of the 157 * context for the filter. For example, in the following even though 'x' is filtered out, and doesn't convert to y, it does affect the conversion of 'a'. 158 * 159 * <pre> 160 * String rules = "x > y; x{a} > b; "; 161 * Transliterator tempTrans = Transliterator.createFromRules("temp", rules, Transliterator.FORWARD); 162 * tempTrans.setFilter(new UnicodeSet("[a]")); 163 * String tempResult = tempTrans.transform("xa"); 164 * // results in "xb" 165 *</pre> 166 * <p> 167 * <b>IDs and display names</b> 168 * 169 * <p> 170 * A transliterator is designated by a short identifier string or <em>ID</em>. IDs follow the format 171 * <em>source-destination</em>, where <em>source</em> describes the entity being replaced, and <em>destination</em> 172 * describes the entity replacing <em>source</em>. The entities may be the names of scripts, particular sequences of 173 * characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from 174 * Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1 175 * characters might be named "KeyboardEscape-Latin1". 
By convention, system entity names are in English, with the 176 * initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes. 177 * 178 * <p> 179 * In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces, 180 * returned by {@link #getDisplayName}. 181 * 182 * <p> 183 * <b>Factory methods and registration</b> 184 * 185 * <p> 186 * In general, client code should use the factory method <code>getInstance()</code> to obtain an instance of a 187 * transliterator given its ID. Valid IDs may be enumerated using <code>getAvailableIDs()</code>. Since transliterators 188 * are stateless, multiple calls to <code>getInstance()</code> with the same ID will return the same object. 189 * 190 * <p> 191 * In addition to the system transliterators registered at startup, user transliterators may be registered by calling 192 * <code>registerInstance()</code> at run time. To register a transliterator subclass without instantiating it (until it 193 * is needed), users may call <code>registerClass()</code>. 194 * 195 * <p> 196 * <b>Composed transliterators</b> 197 * 198 * <p> 199 * In addition to built-in system transliterators like "Latin-Greek", there are also built-in <em>composed</em> 200 * transliterators. These are implemented by composing two or more component transliterators. For example, if we have 201 * scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12 202 * transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an 203 * intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M", 204 * "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2<em>n</em> vs. <em>n</em> 205 * <sup>2</sup> - <em>n</em>, so as <em>n</em> gets larger the gain becomes significant. 
With 9 scripts, it's 18 vs. 72 206 * rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that 207 * the given transliterator is intended to be composed with others, rather than be used as is. 208 * 209 * <p> 210 * Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati" 211 * is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this 212 * transliterator is instantiated, it appears externally to be a standard transliterator (e.g., getID() returns 213 * "Devanagari-Gujarati"). 214 * 215 * <p> 216 * <b>Subclassing</b> 217 * 218 * <p> 219 * Subclasses must implement the abstract method <code>handleTransliterate()</code>. 220 * <p> 221 * Subclasses should override the <code>transliterate()</code> method taking a <code>Replaceable</code> and the 222 * <code>transliterate()</code> method taking a <code>String</code> and <code>StringBuffer</code> if the performance of 223 * these methods can be improved over the performance obtained by the default implementations in this class. 224 * 225 * <p><b>Rule syntax</b> 226 * 227 * <p>A set of rules determines how to perform translations. 228 * Rules within a rule set are separated by semicolons (';'). 229 * To include a literal semicolon, prefix it with a backslash ('\'). 230 * Unicode Pattern_White_Space is ignored. 231 * If the first non-blank character on a line is '#', 232 * the entire line is ignored as a comment. 233 * 234 * <p>Each set of rules consists of two groups, one forward, and one 235 * reverse. This is a convention that is not enforced; rules for one 236 * direction may be omitted, with the result that translations in 237 * that direction will not modify the source text. In addition, 238 * bidirectional forward-reverse rules may be specified for 239 * symmetrical transformations. 
240 * 241 * <p>Note: Another description of the Transliterator rule syntax is available in 242 * <a href="https://www.unicode.org/reports/tr35/tr35-general.html#Transform_Rules_Syntax">section 243 * Transform Rules Syntax of UTS #35: Unicode LDML</a>. 244 * The rules are shown there using arrow symbols ← and → and ↔. 245 * ICU supports both those and the equivalent ASCII symbols < and > and <>. 246 * 247 * <p>Rule statements take one of the following forms: 248 * 249 * <dl> 250 * <dt><code>$alefmadda=\\u0622;</code></dt> 251 * <dd><strong>Variable definition.</strong> The name on the 252 * left is assigned the text on the right. In this example, 253 * after this statement, instances of the left hand name, 254 * "<code>$alefmadda</code>", will be replaced by 255 * the Unicode character U+0622. Variable names must begin 256 * with a letter and consist only of letters, digits, and 257 * underscores. Case is significant. Duplicate names cause 258 * an exception to be thrown, that is, variables cannot be 259 * redefined. The right hand side may contain well-formed 260 * text of any length, including no text at all ("<code>$empty=;</code>"). 
261 * The right hand side may contain embedded <code>UnicodeSet</code> 262 * patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd> 263 * <dt><code>ai>$alefmadda;</code></dt> 264 * <dd><strong>Forward translation rule.</strong> This rule 265 * states that the string on the left will be changed to the 266 * string on the right when performing forward 267 * transliteration.</dd> 268 * <dt><code>ai<$alefmadda;</code></dt> 269 * <dd><strong>Reverse translation rule.</strong> This rule 270 * states that the string on the right will be changed to 271 * the string on the left when performing reverse 272 * transliteration.</dd> 273 * </dl> 274 * 275 * <dl> 276 * <dt><code>ai<>$alefmadda;</code></dt> 277 * <dd><strong>Bidirectional translation rule.</strong> This 278 * rule states that the string on the left will be changed 279 * to the string on the right when performing forward 280 * transliteration, and vice versa when performing reverse 281 * transliteration.</dd> 282 * </dl> 283 * 284 * <p>Translation rules consist of a <em>match pattern</em> and an <em>output 285 * string</em>. The match pattern consists of literal characters, 286 * optionally preceded by context, and optionally followed by 287 * context. Context characters, like literal pattern characters, 288 * must be matched in the text being transliterated. However, unlike 289 * literal pattern characters, they are not replaced by the output 290 * text. For example, the pattern "<code>abc{def}</code>" 291 * indicates the characters "<code>def</code>" must be 292 * preceded by "<code>abc</code>" for a successful match. 293 * If there is a successful match, "<code>def</code>" will 294 * be replaced, but not "<code>abc</code>". The final '<code>}</code>' 295 * is optional, so "<code>abc{def</code>" is equivalent to 296 * "<code>abc{def}</code>". 
Another example is "<code>{123}456</code>" 297 * (or "<code>123}456</code>") in which the literal 298 * pattern "<code>123</code>" must be followed by "<code>456</code>". 299 * 300 * <p>The output string of a forward or reverse rule consists of 301 * characters to replace the literal pattern characters. If the 302 * output string contains the character '<code>|</code>', this is 303 * taken to indicate the location of the <em>cursor</em> after 304 * replacement. The cursor is the point in the text at which the 305 * next replacement, if any, will be applied. The cursor is usually 306 * placed within the replacement text; however, it can actually be 307 * placed into the preceding or following context by using the 308 * special character '@'. Examples: 309 * 310 * <pre> 311 * a {foo} z > | @ bar; # foo -> bar, move cursor before a 312 * {foo} xyz > bar @@|; # foo -> bar, cursor between y and z 313 * </pre> 314 * 315 * <p><b>UnicodeSet</b> 316 * 317 * <p><code>UnicodeSet</code> patterns may appear anywhere that 318 * makes sense. They may appear in variable definitions. 319 * Contrariwise, <code>UnicodeSet</code> patterns may themselves 320 * contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>", 321 * or "<code>$range=a-z;$ll=[$range]</code>". 322 * 323 * <p><code>UnicodeSet</code> patterns may also be embedded directly 324 * into rule strings. Thus, the following two rules are equivalent: 325 * 326 * <pre> 327 * $vowel=[aeiou]; $vowel>'*'; # One way to do this 328 * [aeiou]>'*'; # Another way 329 * </pre> 330 * 331 * <p>See {@link UnicodeSet} for more documentation and examples. 332 * 333 * <p><b>Segments</b> 334 * 335 * <p>Segments of the input string can be matched and copied to the 336 * output string. This makes certain sets of rules simpler and more 337 * general, and makes reordering possible. 
For example: 338 * 339 * <pre> 340 * ([a-z]) > $1 $1; # double lowercase letters 341 * ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs 342 * </pre> 343 * 344 * <p>The segment of the input string to be copied is delimited by 345 * "<code>(</code>" and "<code>)</code>". Up to 346 * nine segments may be defined. Segments may not overlap. In the 347 * output string, "<code>$1</code>" through "<code>$9</code>" 348 * represent the input string segments, in left-to-right order of 349 * definition. 350 * 351 * <p><b>Anchors</b> 352 * 353 * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the 354 * special characters '<code>^</code>' and '<code>$</code>'. For example: 355 * 356 * <pre> 357 * ^ a > 'BEG_A'; # match 'a' at start of text 358 * a > 'A'; # match other instances of 'a' 359 * z $ > 'END_Z'; # match 'z' at end of text 360 * z > 'Z'; # match other instances of 'z' 361 * </pre> 362 * 363 * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>. 364 * This is done by including a virtual anchor character '<code>$</code>' at the end of the 365 * set pattern. Although this is usually the match character for the end anchor, the set will 366 * match either the beginning or the end of the text, depending on its placement. For 367 * example: 368 * 369 * <pre> 370 * $x = [a-z$]; # match 'a' through 'z' OR anchor 371 * $x 1 > 2; # match '1' after a-z or at the start 372 * 3 $x > 4; # match '3' before a-z or at the end 373 * </pre> 374 * 375 * <p><b>Example</b> 376 * 377 * <p>The following example rules illustrate many of the features of 378 * the rule language. 
379 * 380 * <table border="0" cellpadding="4"> 381 * <tr> 382 * <td style="vertical-align: top;">Rule 1.</td> 383 * <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}>x|y</code></td> 384 * </tr> 385 * <tr> 386 * <td style="vertical-align: top;">Rule 2.</td> 387 * <td style="vertical-align: top; white-space: nowrap;"><code>xyz>r</code></td> 388 * </tr> 389 * <tr> 390 * <td style="vertical-align: top;">Rule 3.</td> 391 * <td style="vertical-align: top; white-space: nowrap;"><code>yz>q</code></td> 392 * </tr> 393 * </table> 394 * 395 * <p>Applying these rules to the string "<code>adefabcdefz</code>" 396 * yields the following results: 397 * 398 * <table border="0" cellpadding="4"> 399 * <tr> 400 * <td style="vertical-align: top; white-space: nowrap;"><code>|adefabcdefz</code></td> 401 * <td style="vertical-align: top;">Initial state, no rules match. Advance 402 * cursor.</td> 403 * </tr> 404 * <tr> 405 * <td style="vertical-align: top; white-space: nowrap;"><code>a|defabcdefz</code></td> 406 * <td style="vertical-align: top;">Still no match. Rule 1 does not match 407 * because the preceding context is not present.</td> 408 * </tr> 409 * <tr> 410 * <td style="vertical-align: top; white-space: nowrap;"><code>ad|efabcdefz</code></td> 411 * <td style="vertical-align: top;">Still no match. 
Keep advancing until 412 * there is a match...</td> 413 * </tr> 414 * <tr> 415 * <td style="vertical-align: top; white-space: nowrap;"><code>ade|fabcdefz</code></td> 416 * <td style="vertical-align: top;">...</td> 417 * </tr> 418 * <tr> 419 * <td style="vertical-align: top; white-space: nowrap;"><code>adef|abcdefz</code></td> 420 * <td style="vertical-align: top;">...</td> 421 * </tr> 422 * <tr> 423 * <td style="vertical-align: top; white-space: nowrap;"><code>adefa|bcdefz</code></td> 424 * <td style="vertical-align: top;">...</td> 425 * </tr> 426 * <tr> 427 * <td style="vertical-align: top; white-space: nowrap;"><code>adefab|cdefz</code></td> 428 * <td style="vertical-align: top;">...</td> 429 * </tr> 430 * <tr> 431 * <td style="vertical-align: top; white-space: nowrap;"><code>adefabc|defz</code></td> 432 * <td style="vertical-align: top;">Rule 1 matches; replace "<code>def</code>" 433 * with "<code>xy</code>" and back up the cursor 434 * to before the '<code>y</code>'.</td> 435 * </tr> 436 * <tr> 437 * <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx|yz</code></td> 438 * <td style="vertical-align: top;">Although "<code>xyz</code>" is 439 * present, rule 2 does not match because the cursor is 440 * before the '<code>y</code>', not before the '<code>x</code>'. 441 * Rule 3 does match. Replace "<code>yz</code>" 442 * with "<code>q</code>".</td> 443 * </tr> 444 * <tr> 445 * <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq|</code></td> 446 * <td style="vertical-align: top;">The cursor is at the end; 447 * transliteration is complete.</td> 448 * </tr> 449 * </table> 450 * 451 * <p>The order of rules is significant. If multiple rules may match 452 * at some point, the first matching rule is applied. 453 * 454 * <p>Forward and reverse rules may have an empty output string. 455 * Otherwise, an empty left or right hand side of any statement is a 456 * syntax error. 
457 * 458 * <p>Single quotes are used to quote any character other than a 459 * digit or letter. To specify a single quote itself, inside or 460 * outside of quotes, use two single quotes in a row. For example, 461 * the rule "<code>'>'>o''clock</code>" changes the 462 * string "<code>></code>" to the string "<code>o'clock</code>". 463 * 464 * <p><b>Notes</b> 465 * 466 * <p>While a Transliterator is being built from rules, it checks that 467 * the rules are added in proper order. For example, if the rule 468 * "a>x" is followed by the rule "ab>y", 469 * then the second rule will throw an exception. The reason is that 470 * the second rule can never be triggered, since the first rule 471 * always matches anything it matches. In other words, the first 472 * rule <em>masks</em> the second rule. 473 * 474 * @author Alan Liu 475 * @stable ICU 2.0 476 */ 477 public abstract class Transliterator implements StringTransform { 478 /** 479 * Direction constant indicating the forward direction in a transliterator, 480 * e.g., the forward rules of a rule-based Transliterator. An "A-B" 481 * transliterator transliterates A to B when operating in the forward 482 * direction, and B to A when operating in the reverse direction. 483 * @stable ICU 2.0 484 */ 485 public static final int FORWARD = 0; 486 487 /** 488 * Direction constant indicating the reverse direction in a transliterator, 489 * e.g., the reverse rules of a rule-based Transliterator. An "A-B" 490 * transliterator transliterates A to B when operating in the forward 491 * direction, and B to A when operating in the reverse direction. 492 * @stable ICU 2.0 493 */ 494 public static final int REVERSE = 1; 495 496 /** 497 * Position structure for incremental transliteration. This data 498 * structure defines two substrings of the text being 499 * transliterated. The first region, [contextStart, 500 * contextLimit), defines what characters the transliterator will 501 * read as context. 
The second region, [start, limit), defines 502 * what characters will actually be transliterated. The second 503 * region should be a subset of the first. 504 * 505 * <p>After a transliteration operation, some of the indices in this 506 * structure will be modified. See the field descriptions for 507 * details. 508 * 509 * <p>contextStart <= start <= limit <= contextLimit 510 * 511 * <p>Note: All index values in this structure must be at code point 512 * boundaries. That is, none of them may occur between two code units 513 * of a surrogate pair. If any index does split a surrogate pair, 514 * results are unspecified. 515 * @stable ICU 2.0 516 */ 517 public static class Position { 518 519 /** 520 * Beginning index, inclusive, of the context to be considered for 521 * a transliteration operation. The transliterator will ignore 522 * anything before this index. INPUT/OUTPUT parameter: This parameter 523 * is updated by a transliteration operation to reflect the maximum 524 * amount of antecontext needed by a transliterator. 525 * @stable ICU 2.0 526 */ 527 public int contextStart; 528 529 /** 530 * Ending index, exclusive, of the context to be considered for a 531 * transliteration operation. The transliterator will ignore 532 * anything at or after this index. INPUT/OUTPUT parameter: This 533 * parameter is updated to reflect changes in the length of the 534 * text, but points to the same logical position in the text. 535 * @stable ICU 2.0 536 */ 537 public int contextLimit; 538 539 /** 540 * Beginning index, inclusive, of the text to be transliterated. 541 * INPUT/OUTPUT parameter: This parameter is advanced past 542 * characters that have already been transliterated by a 543 * transliteration operation. 544 * @stable ICU 2.0 545 */ 546 public int start; 547 548 /** 549 * Ending index, exclusive, of the text to be transliterated. 
550 * INPUT/OUTPUT parameter: This parameter is updated to reflect 551 * changes in the length of the text, but points to the same 552 * logical position in the text. 553 * @stable ICU 2.0 554 */ 555 public int limit; 556 557 /** 558 * Constructs a Position object with start, limit, 559 * contextStart, and contextLimit all equal to zero. 560 * @stable ICU 2.0 561 */ Position()562 public Position() { 563 this(0, 0, 0, 0); 564 } 565 566 /** 567 * Constructs a Position object with the given start, 568 * contextStart, and contextLimit. The limit is set to the 569 * contextLimit. 570 * @stable ICU 2.0 571 */ Position(int contextStart, int contextLimit, int start)572 public Position(int contextStart, int contextLimit, int start) { 573 this(contextStart, contextLimit, start, contextLimit); 574 } 575 576 /** 577 * Constructs a Position object with the given start, limit, 578 * contextStart, and contextLimit. 579 * @stable ICU 2.0 580 */ Position(int contextStart, int contextLimit, int start, int limit)581 public Position(int contextStart, int contextLimit, 582 int start, int limit) { 583 this.contextStart = contextStart; 584 this.contextLimit = contextLimit; 585 this.start = start; 586 this.limit = limit; 587 } 588 589 /** 590 * Constructs a Position object that is a copy of another, copying all four indices via {@link #set(Position)}. 591 * @stable ICU 2.6 592 */ Position(Position pos)593 public Position(Position pos) { 594 set(pos); 595 } 596 597 /** 598 * Copies the indices of this position from another. 599 * @stable ICU 2.6 600 */ set(Position pos)601 public void set(Position pos) { 602 contextStart = pos.contextStart; 603 contextLimit = pos.contextLimit; 604 start = pos.start; 605 limit = pos.limit; 606 } 607 608 /** 609 * Returns true if this Position is equal to the given object, that is, if the object is also a Position and its contextStart, contextLimit, start, and limit all equal this one's. Any other object yields false. 
610 * @stable ICU 2.0 611 */ 612 @Override equals(Object obj)613 public boolean equals(Object obj) { 614 if (obj instanceof Position) { 615 Position pos = (Position) obj; 616 return contextStart == pos.contextStart && 617 contextLimit == pos.contextLimit && 618 start == pos.start && 619 limit == pos.limit; 620 } 621 return false; 622 } 623 624 /** 625 * {@inheritDoc} 626 * @stable ICU 2.0 627 */ 628 @Override hashCode()629 public int hashCode() { 630 return Objects.hash(contextStart, contextLimit, start, limit); 631 } 632 633 /** 634 * Returns a string representation of this Position. 635 * @return a string representation of the object. 636 * @stable ICU 2.0 637 */ 638 @Override toString()639 public String toString() { 640 return "[cs=" + contextStart 641 + ", s=" + start 642 + ", l=" + limit 643 + ", cl=" + contextLimit 644 + "]"; 645 } 646 647 /** 648 * Check all bounds, which must satisfy 0 <= contextStart <= start <= limit <= contextLimit <= length. If they are invalid, throw an exception. 649 * @param length the length of the string this object applies to 650 * @exception IllegalArgumentException if any indices are out 651 * of bounds 652 * @stable ICU 2.0 653 */ validate(int length)654 public final void validate(int length) { 655 if (contextStart < 0 || 656 start < contextStart || 657 limit < start || 658 contextLimit < limit || 659 length < contextLimit) { 660 throw new IllegalArgumentException("Invalid Position {cs=" + 661 contextStart + ", s=" + 662 start + ", l=" + 663 limit + ", cl=" + 664 contextLimit + "}, len=" + 665 length); 666 } 667 } 668 } 669 670 /** 671 * Programmatic name, e.g., "Latin-Arabic". 672 */ 673 private String ID; 674 675 /** 676 * This transliterator's filter. Any character for which 677 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be 678 * altered by this transliterator. If <tt>filter</tt> is 679 * <tt>null</tt> then no filtering is applied. 
680 */ 681 private UnicodeSet filter; 682 683 private int maximumContextLength = 0; 684 685 /** 686 * System transliterator registry. 
687 */ 688 private static TransliteratorRegistry registry; 689 690 private static Map<CaseInsensitiveString, String> displayNameCache; 691 692 /** 693 * Prefix for resource bundle key for the display name for a 694 * transliterator. The ID is appended to this to form the key. 695 * The resource bundle value should be a String. 696 */ 697 private static final String RB_DISPLAY_NAME_PREFIX = "%Translit%%"; 698 699 /** 700 * Prefix for resource bundle key for the display name for a 701 * transliterator SCRIPT. The ID is appended to this to form the key. 702 * The resource bundle value should be a String. 703 */ 704 private static final String RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%"; 705 706 /** 707 * Resource bundle key for display name pattern. 708 * The resource bundle value should be a String forming a 709 * MessageFormat pattern, e.g.: 710 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}". 711 */ 712 private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern"; 713 714 /** 715 * Delimiter between elements in a compound ID. 716 */ 717 static final char ID_DELIM = ';'; 718 719 /** 720 * Delimiter before target in an ID. 721 */ 722 static final char ID_SEP = '-'; 723 724 /** 725 * Delimiter before variant in an ID. 726 */ 727 static final char VARIANT_SEP = '/'; 728 729 /** 730 * To enable debugging output in the Transliterator component, set 731 * DEBUG to true. 732 * 733 * N.B. Make sure to recompile all of the com.ibm.icu.text package 734 * after changing this. Easiest way to do this is 'ant clean 735 * core' ('ant' will NOT pick up the dependency automatically). 736 * 737 * <<This generates a lot of output.>> 738 */ 739 static final boolean DEBUG = false; 740 741 /** 742 * Default constructor. 743 * @param ID the string identifier for this transliterator 744 * @param filter the filter. Any character for which 745 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be 746 * altered by this transliterator. 
If <tt>filter</tt> is 747 * <tt>null</tt> then no filtering is applied. * @throws NullPointerException if <code>ID</code> is <code>null</code> 748 * @stable ICU 2.0 749 */ Transliterator(String ID, UnicodeFilter filter)750 protected Transliterator(String ID, UnicodeFilter filter) { 751 if (ID == null) { 752 throw new NullPointerException(); 753 } 754 this.ID = ID; 755 setFilter(filter); 756 } 757 758 /** 759 * Transliterates a segment of a string, with optional filtering. 760 * 761 * @param text the string to be transliterated 762 * @param start the beginning index, inclusive; <code>0 <= start 763 * <= limit</code>. 764 * @param limit the ending index, exclusive; <code>start <= limit 765 * <= text.length()</code>. 766 * @return The new limit index. The text previously occupying <code>[start, 767 * limit)</code> has been transliterated, possibly to a string of a different 768 * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where 769 * <em>new-limit</em> is the return value. If the input offsets are out of bounds, 770 * the returned value is -1 and the input string remains unchanged. 771 * @stable ICU 2.0 772 */ transliterate(Replaceable text, int start, int limit)773 public final int transliterate(Replaceable text, int start, int limit) { 774 if (start < 0 || 775 limit < start || 776 text.length() < limit) { 777 return -1; 778 } 779 780 Position pos = new Position(start, limit, start); 781 filteredTransliterate(text, pos, false, true); 782 return pos.limit; 783 } 784 785 /** 786 * Transliterates an entire string in place. Convenience method that delegates to <code>transliterate(text, 0, text.length())</code> and discards the returned limit. 787 * @param text the string to be transliterated 788 * @stable ICU 2.0 789 */ transliterate(Replaceable text)790 public final void transliterate(Replaceable text) { 791 transliterate(text, 0, text.length()); 792 } 793 794 /** 795 * Transliterates an entire string and returns the result. Convenience method. 
796 * 797 * @param text the string to be transliterated 798 * @return The transliterated text 799 * @stable ICU 2.0 800 */ transliterate(String text)801 public final String transliterate(String text) { 802 ReplaceableString result = new ReplaceableString(text); 803 transliterate(result); 804 return result.toString(); 805 } 806 807 /** 808 * Transliterates the portion of the text buffer that can be 809 * transliterated unambiguosly after new text has been inserted, 810 * typically as a result of a keyboard event. The new text in 811 * <code>insertion</code> will be inserted into <code>text</code> 812 * at <code>index.contextLimit</code>, advancing 813 * <code>index.contextLimit</code> by <code>insertion.length()</code>. 814 * Then the transliterator will try to transliterate characters of 815 * <code>text</code> between <code>index.start</code> and 816 * <code>index.contextLimit</code>. Characters before 817 * <code>index.start</code> will not be changed. 818 * 819 * <p>Upon return, values in <code>index</code> will be updated. 820 * <code>index.contextStart</code> will be advanced to the first 821 * character that future calls to this method will read. 822 * <code>index.start</code> and <code>index.contextLimit</code> will 823 * be adjusted to delimit the range of text that future calls to 824 * this method may change. 825 * 826 * <p>Typical usage of this method begins with an initial call 827 * with <code>index.contextStart</code> and <code>index.contextLimit</code> 828 * set to indicate the portion of <code>text</code> to be 829 * transliterated, and <code>index.start == index.contextStart</code>. 830 * Thereafter, <code>index</code> can be used without 831 * modification in future calls, provided that all changes to 832 * <code>text</code> are made via this method. 833 * 834 * <p>This method assumes that future calls may be made that will 835 * insert new text into the buffer. As a result, it only performs 836 * unambiguous transliterations. 
After the last call to this 837 * method, there may be untransliterated text that is waiting for 838 * more input to resolve an ambiguity. In order to perform these 839 * pending transliterations, clients should call {@link 840 * #finishTransliteration} after the last call to this 841 * method has been made. 842 * 843 * @param text the buffer holding transliterated and untransliterated text 844 * @param index the start and limit of the text, the position 845 * of the cursor, and the start and limit of transliteration. 846 * @param insertion text to be inserted and possibly 847 * transliterated into the translation buffer at 848 * <code>index.contextLimit</code>. If <code>null</code> then no text 849 * is inserted. 850 * @see #handleTransliterate 851 * @exception IllegalArgumentException if <code>index</code> 852 * is invalid 853 * @stable ICU 2.0 854 */ transliterate(Replaceable text, Position index, String insertion)855 public final void transliterate(Replaceable text, Position index, 856 String insertion) { 857 index.validate(text.length()); 858 859 // int originalStart = index.contextStart; 860 if (insertion != null) { 861 text.replace(index.limit, index.limit, insertion); 862 index.limit += insertion.length(); 863 index.contextLimit += insertion.length(); 864 } 865 866 if (index.limit > 0 && 867 UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) { 868 // Oops, there is a dangling lead surrogate in the buffer. 869 // This will break most transliterators, since they will 870 // assume it is part of a pair. Don't transliterate until 871 // more text comes in. 872 return; 873 } 874 875 filteredTransliterate(text, index, true, true); 876 877 // TODO 878 // This doesn't work once we add quantifier support. Need to rewrite 879 // this code to support quantifiers and 'use maximum backup <n>;'. 
880 // 881 // index.contextStart = Math.max(index.start - getMaximumContextLength(), 882 // originalStart); 883 } 884 885 /** 886 * Transliterates the portion of the text buffer that can be 887 * transliterated unambiguosly after a new character has been 888 * inserted, typically as a result of a keyboard event. This is a 889 * convenience method; see {@link #transliterate(Replaceable, 890 * Transliterator.Position, String)} for details. 891 * @param text the buffer holding transliterated and 892 * untransliterated text 893 * @param index the start and limit of the text, the position 894 * of the cursor, and the start and limit of transliteration. 895 * @param insertion text to be inserted and possibly 896 * transliterated into the translation buffer at 897 * <code>index.contextLimit</code>. 898 * @see #transliterate(Replaceable, Transliterator.Position, String) 899 * @stable ICU 2.0 900 */ transliterate(Replaceable text, Position index, int insertion)901 public final void transliterate(Replaceable text, Position index, 902 int insertion) { 903 transliterate(text, index, UTF16.valueOf(insertion)); 904 } 905 906 /** 907 * Transliterates the portion of the text buffer that can be 908 * transliterated unambiguosly. This is a convenience method; see 909 * {@link #transliterate(Replaceable, Transliterator.Position, 910 * String)} for details. 911 * @param text the buffer holding transliterated and 912 * untransliterated text 913 * @param index the start and limit of the text, the position 914 * of the cursor, and the start and limit of transliteration. 915 * @see #transliterate(Replaceable, Transliterator.Position, String) 916 * @stable ICU 2.0 917 */ transliterate(Replaceable text, Position index)918 public final void transliterate(Replaceable text, Position index) { 919 transliterate(text, index, null); 920 } 921 922 /** 923 * Finishes any pending transliterations that were waiting for 924 * more characters. 
Clients should call this method as the last
     * call after a sequence of one or more calls to
     * <code>transliterate()</code>.
     * @param text the buffer holding transliterated and
     * untransliterated text.
     * @param index the position object previously passed to {@link
     * #transliterate}
     * @exception IllegalArgumentException if <code>index</code>
     * is invalid for <code>text</code>
     * @stable ICU 2.0
     */
    public final void finishTransliteration(Replaceable text,
                                            Position index) {
        index.validate(text.length());
        // Non-incremental call: no further input is expected.
        filteredTransliterate(text, index, false, true);
    }

    /**
     * Abstract method that concrete subclasses define to implement
     * their transliteration algorithm.  This method handles both
     * incremental and non-incremental transliteration.  Let
     * <code>originalStart</code> refer to the value of
     * <code>pos.start</code> upon entry.
     *
     * <ul>
     *  <li>If <code>incremental</code> is false, then this method
     *  should transliterate all characters between
     *  <code>pos.start</code> and <code>pos.limit</code>. Upon return
     *  <code>pos.start</code> must == <code> pos.limit</code>.</li>
     *
     *  <li>If <code>incremental</code> is true, then this method
     *  should transliterate all characters between
     *  <code>pos.start</code> and <code>pos.limit</code> that can be
     *  unambiguously transliterated, regardless of future insertions
     *  of text at <code>pos.limit</code>.  Upon return,
     *  <code>pos.start</code> should be in the range
     *  [<code>originalStart</code>, <code>pos.limit</code>).
959 * <code>pos.start</code> should be positioned such that 960 * characters [<code>originalStart</code>, <code> 961 * pos.start</code>) will not be changed in the future by this 962 * transliterator and characters [<code>pos.start</code>, 963 * <code>pos.limit</code>) are unchanged.</li> 964 * </ul> 965 * 966 * <p>Implementations of this method should also obey the 967 * following invariants:</p> 968 * 969 * <ul> 970 * <li> <code>pos.limit</code> and <code>pos.contextLimit</code> 971 * should be updated to reflect changes in length of the text 972 * between <code>pos.start</code> and <code>pos.limit</code>. The 973 * difference <code> pos.contextLimit - pos.limit</code> should 974 * not change.</li> 975 * 976 * <li><code>pos.contextStart</code> should not change.</li> 977 * 978 * <li>Upon return, neither <code>pos.start</code> nor 979 * <code>pos.limit</code> should be less than 980 * <code>originalStart</code>.</li> 981 * 982 * <li>Text before <code>originalStart</code> and text after 983 * <code>pos.limit</code> should not change.</li> 984 * 985 * <li>Text before <code>pos.contextStart</code> and text after 986 * <code> pos.contextLimit</code> should be ignored.</li> 987 * </ul> 988 * 989 * <p>Subclasses may safely assume that all characters in 990 * [<code>pos.start</code>, <code>pos.limit</code>) are filtered. 991 * In other words, the filter has already been applied by the time 992 * this method is called. See 993 * <code>filteredTransliterate()</code>. 994 * 995 * <p>This method is <b>not</b> for public consumption. Calling 996 * this method directly will transliterate 997 * [<code>pos.start</code>, <code>pos.limit</code>) without 998 * applying the filter. End user code should call <code> 999 * transliterate()</code> instead of this method. 
Subclass code
     * should call <code>filteredTransliterate()</code> instead of
     * this method.
     *
     * @param text the buffer holding transliterated and
     * untransliterated text
     *
     * @param pos the indices indicating the start, limit, context
     * start, and context limit of the text.
     *
     * @param incremental if true, assume more text may be inserted at
     * <code>pos.limit</code> and act accordingly.  Otherwise,
     * transliterate all text between <code>pos.start</code> and
     * <code>pos.limit</code> and move <code>pos.start</code> up to
     * <code>pos.limit</code>.
     *
     * @see #transliterate
     * @stable ICU 2.0
     */
    protected abstract void handleTransliterate(Replaceable text,
                                                Position pos, boolean incremental);

    /**
     * Top-level transliteration method, handling filtering, incremental and
     * non-incremental transliteration, and rollback.  All transliteration
     * public API methods eventually call this method with a rollback argument
     * of TRUE.  Other entities may call this method but rollback should be
     * FALSE.
     *
     * <p>If this transliterator has a filter, break up the input text into runs
     * of unfiltered characters.  Pass each run to
     * &lt;subclass&gt;.handleTransliterate().
     *
     * <p>In incremental mode, if rollback is TRUE, perform a special
     * incremental procedure in which several passes are made over the input
     * text, adding one character at a time, and committing successful
     * transliterations as they occur.  Unsuccessful transliterations are rolled
     * back and retried with additional characters to give correct results.
*
     * @param text the text to be transliterated
     * @param index the position indices
     * @param incremental if TRUE, then assume more characters may be inserted
     * at index.limit, and postpone processing to accommodate future incoming
     * characters
     * @param rollback if TRUE and if incremental is TRUE, then perform special
     * incremental processing, as described above, and undo partial
     * transliterations where necessary.  If incremental is FALSE no rollback
     * passes are made; the flag then only decides whether the no-filter
     * short circuit at the top of the method is taken.
     */
    private void filteredTransliterate(Replaceable text,
                                       Position index,
                                       boolean incremental,
                                       boolean rollback) {
        // Short circuit path for transliterators with no filter when
        // rollback was not requested: hand the whole range straight to
        // the subclass, passing the incremental flag through unchanged.
        if (filter == null && !rollback) {
            handleTransliterate(text, index, incremental);
            return;
        }

        //----------------------------------------------------------------------
        // This method processes text in two groupings:
        //
        // RUNS -- A run is a contiguous group of characters which are contained
        // in the filter for this transliterator (filter.contains(ch) == true).
        // Text outside of runs may appear as context but it is not modified.
        // The start and limit Position values are narrowed to each run.
        //
        // PASSES (incremental only) -- To make incremental mode work correctly,
        // each run is broken up into n passes, where n is the length (in code
        // points) of the run.  Each pass contains the first n characters.  If a
        // pass is completely transliterated, it is committed, and further passes
        // include characters after the committed text.  If a pass is blocked,
        // and does not transliterate completely, then this method rolls back
        // the changes made during the pass, extends the pass by one code point,
        // and tries again.
        //----------------------------------------------------------------------

        // globalLimit is the limit value for the entire operation.  We
        // set index.limit to the end of each unfiltered run before
        // calling handleTransliterate(), so we need to maintain the real
        // value of index.limit here.  After each transliteration, we
        // update globalLimit for insertions or deletions that have
        // happened.
        int globalLimit = index.limit;

        // If there is a non-null filter, then break the input text up.  Say the
        // input text has the form:
        //   xxxabcxxdefxx
        // where 'x' represents a filtered character (filter.contains('x') ==
        // false).  Then we break this up into:
        //   xxxabc xxdef xx
        // Each pass through the loop consumes a run of filtered
        // characters (which are ignored) and a subsequent run of
        // unfiltered characters (which are transliterated).

        // Debug buffer; only allocated (and only dereferenced) when DEBUG is true.
        StringBuffer log = null;
        if (DEBUG) {
            log = new StringBuffer();
        }

        for (;;) {

            if (filter != null) {
                // Narrow the range to be transliterated to the first run
                // of unfiltered characters at or after index.start.

                // Advance past filtered chars
                int c;
                while (index.start < globalLimit &&
                       !filter.contains(c=text.char32At(index.start))) {
                    index.start += UTF16.getCharCount(c);
                }

                // Find the end of this run of unfiltered chars
                index.limit = index.start;
                while (index.limit < globalLimit &&
                       filter.contains(c=text.char32At(index.limit))) {
                    index.limit += UTF16.getCharCount(c);
                }
            }

            // Check to see if the unfiltered run is empty.  This only
            // happens at the end of the string when all the remaining
            // characters are filtered.
            if (index.start == index.limit) {
                break;
            }

            // Is this run incremental?  If there is additional
            // filtered text (if limit < globalLimit) then we pass in
            // an incremental value of FALSE to force the subclass to
            // complete the transliteration for this run.
            boolean isIncrementalRun =
                (index.limit < globalLimit ? false : incremental);

            int delta;

            // Implement rollback.  To understand the need for rollback,
            // consider the following transliterator:
            //
            //  "t" is "a > A;"
            //  "u" is "A > b;"
            //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
            //
            // Now apply "v" to the input text "a".  The result is "b".  But if
            // the transliteration is done incrementally, then the NFD holds
            // things up after "t" has already transformed "a" to "A".  When
            // finishTransliteration() is called, "A" is _not_ processed because
            // it gets excluded by the [:Ll:] filter, and the end result is "A"
            // -- incorrect.  The problem is that the filter is applied to a
            // partially-transliterated result, when we only want it to apply to
            // input text.  Although this example describes a compound
            // transliterator containing NFD and a specific filter, it can
            // happen with any transliterator which does a partial
            // transformation in incremental mode into characters outside its
            // filter.
            //
            // To handle this, when in incremental mode we supply characters to
            // handleTransliterate() in several passes.  Each pass adds one more
            // input character to the input text.  That is, for input "ABCD", we
            // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
            // any point we block (upon return, start < limit) then we roll
            // back.  If at any point we complete the run (upon return start ==
            // limit) then we commit that run.

            if (rollback && isIncrementalRun) {

                if (DEBUG) {
                    log.setLength(0);
                    System.out.println("filteredTransliterate{"+getID()+"}i: IN=" +
                                       UtilityExtensions.formatInput(text, index));
                }

                int runStart = index.start;
                int runLimit = index.limit;
                int runLength = runLimit - runStart;

                // Make a rollback copy at the end of the string
                int rollbackOrigin = text.length();
                text.copy(runStart, runLimit, rollbackOrigin);

                // Variables reflecting the commitment of completely
                // transliterated text.  passStart is the runStart, advanced
                // past committed text.  rollbackStart is the rollbackOrigin,
                // advanced past rollback text that corresponds to committed
                // text.
                int passStart = runStart;
                int rollbackStart = rollbackOrigin;

                // The limit for each pass; we advance by one code point with
                // each iteration.
                int passLimit = index.start;

                // Total length, in 16-bit code units, of uncommitted text.
                // This is the length to be rolled back.
                int uncommittedLength = 0;

                // Total delta (change in length) for all passes
                int totalDelta = 0;

                // PASS MAIN LOOP -- Start with a single character, and extend
                // the text by one character at a time.  Roll back partial
                // transliterations and commit complete transliterations.
                for (;;) {
                    // Length of additional code point, either one or two
                    int charLength =
                        UTF16.getCharCount(text.char32At(passLimit));
                    passLimit += charLength;
                    // Extending the pass beyond the run ends the pass loop.
                    if (passLimit > runLimit) {
                        break;
                    }
                    uncommittedLength += charLength;

                    index.limit = passLimit;

                    if (DEBUG) {
                        log.setLength(0);
                        log.append("filteredTransliterate{"+getID()+"}i: ");
                        UtilityExtensions.formatInput(log, text, index);
                    }

                    // Delegate to subclass for actual transliteration.  Upon
                    // return, start will be updated to point after the
                    // transliterated text, and limit and contextLimit will be
                    // adjusted for length changes.
                    handleTransliterate(text, index, true);

                    if (DEBUG) {
                        log.append(" => ");
                        UtilityExtensions.formatInput(log, text, index);
                    }

                    delta = index.limit - passLimit; // change in length

                    // We failed to completely transliterate this pass.
                    // Roll back the text.  Indices remain unchanged; reset
                    // them where necessary.
                    if (index.start != index.limit) {
                        // Find the rollbackStart, adjusted for length changes
                        // and the deletion of partially transliterated text.
                        int rs = rollbackStart + delta - (index.limit - passStart);

                        // Delete the partially transliterated text
                        text.replace(passStart, index.limit, "");

                        // Copy the rollback text back
                        text.copy(rs, rs + uncommittedLength, passStart);

                        // Restore indices to their original values
                        index.start = passStart;
                        index.limit = passLimit;
                        index.contextLimit -= delta;

                        if (DEBUG) {
                            log.append(" (ROLLBACK)");
                        }
                    }

                    // We did completely transliterate this pass.  Update the
                    // commit indices to record how far we got.  Adjust indices
                    // for length change.
                    else {
                        // Move the pass indices past the committed text.
                        passStart = passLimit = index.start;

                        // Adjust the rollbackStart for length changes and move
                        // it past the committed text.  All characters we've
                        // processed to this point are committed now, so zero
                        // out the uncommittedLength.
                        rollbackStart += delta + uncommittedLength;
                        uncommittedLength = 0;

                        // Adjust indices for length changes.
                        runLimit += delta;
                        totalDelta += delta;
                    }

                    if (DEBUG) {
                        System.out.println(Utility.escape(log.toString()));
                    }
                }

                // Adjust overall limit and rollbackOrigin for insertions and
                // deletions.  Don't need to worry about contextLimit because
                // handleTransliterate() maintains that.
                rollbackOrigin += totalDelta;
                globalLimit += totalDelta;

                // Delete the rollback copy
                text.replace(rollbackOrigin, rollbackOrigin + runLength, "");

                // Move start past committed text
                index.start = passStart;
            }

            else {
                // Delegate to subclass for actual transliteration.
                if (DEBUG) {
                    log.setLength(0);
                    log.append("filteredTransliterate{"+getID()+"}: ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                int limit = index.limit;
                handleTransliterate(text, index, isIncrementalRun);
                delta = index.limit - limit; // change in length

                if (DEBUG) {
                    log.append(" => ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                // In a properly written transliterator, start == limit after
                // handleTransliterate() returns when incremental is false.
                // Catch cases where the subclass doesn't do this, and throw
                // an exception.  (Just pinning start to limit is a bad idea,
                // because what's probably happening is that the subclass
                // isn't transliterating all the way to the end, and it should
                // in non-incremental mode.)
                if (!isIncrementalRun && index.start != index.limit) {
                    throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + getID());
                }

                // Adjust overall limit for insertions/deletions.  Don't need
                // to worry about contextLimit because handleTransliterate()
                // maintains that.
                globalLimit += delta;

                if (DEBUG) {
                    System.out.println(Utility.escape(log.toString()));
                }
            }

            // With no filter there is only one run; an incremental run is
            // by construction the last run (it reaches globalLimit).
            if (filter == null || isIncrementalRun) {
                break;
            }

            // If we did completely transliterate this
            // run, then repeat with the next unfiltered run.
        }

        // Start is valid where it is.  Limit needs to be put back where
        // it was, modulo adjustments for deletions/insertions.
        index.limit = globalLimit;

        if (DEBUG) {
            System.out.println("filteredTransliterate{"+getID()+"}: OUT=" +
                               UtilityExtensions.formatInput(text, index));
        }
    }

    /**
     * Transliterate a substring of text, as specified by index, taking filters
     * into account.  This method is for subclasses that need to delegate to
     * another transliterator.  It runs the filtering logic with rollback
     * turned off.
     * @param text the text to be transliterated
     * @param index the position indices
     * @param incremental if TRUE, then assume more characters may be inserted
     * at index.limit, and postpone processing to accommodate future incoming
     * characters
     * @stable ICU 2.0
     */
    public void filteredTransliterate(Replaceable text,
                                      Position index,
                                      boolean incremental) {
        filteredTransliterate(text, index, incremental, false);
    }

    /**
     * Returns the length of the longest context required by this transliterator.
     * This is <em>preceding</em> context.  The default value is zero, but
     * subclasses can change this by calling <code>setMaximumContextLength()</code>.
     * For example, if a transliterator translates "ddd" (where
     * d is any digit) to "555" when preceded by "(ddd)", then the preceding
     * context length is 5, the length of "(ddd)".
1375 * 1376 * @return The maximum number of preceding context characters this 1377 * transliterator needs to examine 1378 * @stable ICU 2.0 1379 */ getMaximumContextLength()1380 public final int getMaximumContextLength() { 1381 return maximumContextLength; 1382 } 1383 1384 /** 1385 * Method for subclasses to use to set the maximum context length. 1386 * @see #getMaximumContextLength 1387 * @stable ICU 2.0 1388 */ setMaximumContextLength(int a)1389 protected void setMaximumContextLength(int a) { 1390 if (a < 0) { 1391 throw new IllegalArgumentException("Invalid context length " + a); 1392 } 1393 maximumContextLength = a; 1394 } 1395 1396 /** 1397 * Returns a programmatic identifier for this transliterator. 1398 * If this identifier is passed to <code>getInstance()</code>, it 1399 * will return this object, if it has been registered. 1400 * @see #registerClass 1401 * @see #getAvailableIDs 1402 * @stable ICU 2.0 1403 */ getID()1404 public final String getID() { 1405 return ID; 1406 } 1407 1408 /** 1409 * Set the programmatic identifier for this transliterator. Only 1410 * for use by subclasses. 1411 * @stable ICU 2.0 1412 */ setID(String id)1413 protected final void setID(String id) { 1414 ID = id; 1415 } 1416 1417 /** 1418 * Returns a name for this transliterator that is appropriate for 1419 * display to the user in the default <code>DISPLAY</code> locale. See {@link 1420 * #getDisplayName(String,Locale)} for details. 1421 * @see com.ibm.icu.util.ULocale.Category#DISPLAY 1422 * @stable ICU 2.0 1423 */ getDisplayName(String ID)1424 public final static String getDisplayName(String ID) { 1425 return getDisplayName(ID, ULocale.getDefault(Category.DISPLAY)); 1426 } 1427 1428 /** 1429 * Returns a name for this transliterator that is appropriate for 1430 * display to the user in the given locale. This name is taken 1431 * from the locale resource data in the standard manner of the 1432 * <code>java.text</code> package. 
1433 * 1434 * <p>If no localized names exist in the system resource bundles, 1435 * a name is synthesized using a localized 1436 * <code>MessageFormat</code> pattern from the resource data. The 1437 * arguments to this pattern are an integer followed by one or two 1438 * strings. The integer is the number of strings, either 1 or 2. 1439 * The strings are formed by splitting the ID for this 1440 * transliterator at the first '-'. If there is no '-', then the 1441 * entire ID forms the only string. 1442 * @param inLocale the Locale in which the display name should be 1443 * localized. 1444 * @see java.text.MessageFormat 1445 * @stable ICU 2.0 1446 */ getDisplayName(String id, Locale inLocale)1447 public static String getDisplayName(String id, Locale inLocale) { 1448 return getDisplayName(id, ULocale.forLocale(inLocale)); 1449 } 1450 1451 /** 1452 * Returns a name for this transliterator that is appropriate for 1453 * display to the user in the given locale. This name is taken 1454 * from the locale resource data in the standard manner of the 1455 * <code>java.text</code> package. 1456 * 1457 * <p>If no localized names exist in the system resource bundles, 1458 * a name is synthesized using a localized 1459 * <code>MessageFormat</code> pattern from the resource data. The 1460 * arguments to this pattern are an integer followed by one or two 1461 * strings. The integer is the number of strings, either 1 or 2. 1462 * The strings are formed by splitting the ID for this 1463 * transliterator at the first '-'. If there is no '-', then the 1464 * entire ID forms the only string. 1465 * @param inLocale the ULocale in which the display name should be 1466 * localized. 1467 * @see java.text.MessageFormat 1468 * @stable ICU 3.2 1469 */ getDisplayName(String id, ULocale inLocale)1470 public static String getDisplayName(String id, ULocale inLocale) { 1471 1472 // Resource bundle containing display name keys and the 1473 // RB_RULE_BASED_IDS array. 
1474 // 1475 //If we ever integrate this with the Sun JDK, the resource bundle 1476 // root will change to sun.text.resources.LocaleElements 1477 1478 ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle. 1479 getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, inLocale); 1480 1481 // Normalize the ID 1482 String stv[] = TransliteratorIDParser.IDtoSTV(id); 1483 if (stv == null) { 1484 // No target; malformed id 1485 return ""; 1486 } 1487 String ID = stv[0] + '-' + stv[1]; 1488 if (stv[2] != null && stv[2].length() > 0) { 1489 ID = ID + '/' + stv[2]; 1490 } 1491 1492 // Use the registered display name, if any 1493 String n = displayNameCache.get(new CaseInsensitiveString(ID)); 1494 if (n != null) { 1495 return n; 1496 } 1497 1498 // Use display name for the entire transliterator, if it 1499 // exists. 1500 try { 1501 return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID); 1502 } catch (MissingResourceException e) {} 1503 1504 try { 1505 // Construct the formatter first; if getString() fails 1506 // we'll exit the try block 1507 MessageFormat format = new MessageFormat( 1508 bundle.getString(RB_DISPLAY_NAME_PATTERN)); 1509 // Construct the argument array 1510 Object[] args = new Object[] { Integer.valueOf(2), stv[0], stv[1] }; 1511 1512 // Use display names for the scripts, if they exist 1513 for (int j=1; j<=2; ++j) { 1514 try { 1515 args[j] = bundle.getString(RB_SCRIPT_DISPLAY_NAME_PREFIX + 1516 (String) args[j]); 1517 } catch (MissingResourceException e) {} 1518 } 1519 1520 // Format it using the pattern in the resource 1521 return (stv[2].length() > 0) ? 1522 (format.format(args) + '/' + stv[2]) : 1523 format.format(args); 1524 } catch (MissingResourceException e2) {} 1525 1526 // We should not reach this point unless there is something 1527 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has 1528 // been deleted from the root RB_LOCALE_ELEMENTS resource. 
1529 throw new RuntimeException(); 1530 } 1531 1532 /** 1533 * Returns the filter used by this transliterator, or <tt>null</tt> 1534 * if this transliterator uses no filter. 1535 * @stable ICU 2.0 1536 */ getFilter()1537 public final UnicodeFilter getFilter() { 1538 return filter; 1539 } 1540 1541 /** 1542 * Changes the filter used by this transliterator. If the filter 1543 * is set to <tt>null</tt> then no filtering will occur. 1544 * 1545 * <p>Callers must take care if a transliterator is in use by 1546 * multiple threads. The filter should not be changed by one 1547 * thread while another thread may be transliterating. 1548 * @stable ICU 2.0 1549 */ setFilter(UnicodeFilter filter)1550 public void setFilter(UnicodeFilter filter) { 1551 if (filter == null) { 1552 this.filter = null; 1553 } else { 1554 try { 1555 // fast high-runner case 1556 this.filter = new UnicodeSet((UnicodeSet)filter).freeze(); 1557 } catch (Exception e) { 1558 this.filter = new UnicodeSet(); 1559 filter.addMatchSetTo(this.filter); 1560 this.filter.freeze(); 1561 } 1562 } 1563 } 1564 1565 /** 1566 * Returns a <code>Transliterator</code> object given its ID. 1567 * The ID must be either a system transliterator ID or a ID registered 1568 * using <code>registerClass()</code>. 1569 * 1570 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code> 1571 * @return A <code>Transliterator</code> object with the given ID 1572 * @exception IllegalArgumentException if the given ID is invalid. 1573 * @stable ICU 2.0 1574 */ getInstance(String ID)1575 public static final Transliterator getInstance(String ID) { 1576 return getInstance(ID, FORWARD); 1577 } 1578 1579 /** 1580 * Returns a <code>Transliterator</code> object given its ID. 1581 * The ID must be either a system transliterator ID or a ID registered 1582 * using <code>registerClass()</code>. 1583 * 1584 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code> 1585 * @param dir either FORWARD or REVERSE. 
If REVERSE then the 1586 * inverse of the given ID is instantiated. 1587 * @return A <code>Transliterator</code> object with the given ID 1588 * @exception IllegalArgumentException if the given ID is invalid. 1589 * @see #registerClass 1590 * @see #getAvailableIDs 1591 * @see #getID 1592 * @stable ICU 2.0 1593 */ getInstance(String ID, int dir)1594 public static Transliterator getInstance(String ID, 1595 int dir) { 1596 StringBuffer canonID = new StringBuffer(); 1597 List<SingleID> list = new ArrayList<>(); 1598 UnicodeSet[] globalFilter = new UnicodeSet[1]; 1599 if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID, list, globalFilter)) { 1600 throw new IllegalArgumentException("Invalid ID " + ID); 1601 } 1602 1603 List<Transliterator> translits = TransliteratorIDParser.instantiateList(list); 1604 1605 // assert(list.size() > 0); 1606 Transliterator t = null; 1607 if (list.size() > 1 || canonID.indexOf(";") >= 0) { 1608 // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only 1609 // has one child transliterator. This is so that toRules() will return the right thing 1610 // (without any inactive ID), but our main ID still comes out correct. That is, if we 1611 // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;" 1612 // even though the ID is "(Lower);Latin-Greek;". 1613 t = new CompoundTransliterator(translits); 1614 } 1615 else { 1616 t = translits.get(0); 1617 } 1618 1619 t.setID(canonID.toString()); 1620 if (globalFilter[0] != null) { 1621 t.setFilter(globalFilter[0]); 1622 } 1623 return t; 1624 } 1625 1626 /** 1627 * Create a transliterator from a basic ID. This is an ID 1628 * containing only the forward direction source, target, and 1629 * variant. 1630 * @param id a basic ID of the form S-T or S-T/V. 1631 * @param canonID canonical ID to apply to the result, or 1632 * null to leave the ID unchanged 1633 * @return a newly created Transliterator or null if the ID is 1634 * invalid. 
1635 */ getBasicInstance(String id, String canonID)1636 static Transliterator getBasicInstance(String id, String canonID) { 1637 StringBuffer s = new StringBuffer(); 1638 Transliterator t = registry.get(id, s); 1639 if (s.length() != 0) { 1640 // assert(t==0); 1641 // Instantiate an alias 1642 t = getInstance(s.toString(), FORWARD); 1643 } 1644 if (t != null && canonID != null) { 1645 t.setID(canonID); 1646 } 1647 return t; 1648 } 1649 1650 /** 1651 * Returns a <code>Transliterator</code> object constructed from 1652 * the given rule string. This will be a rule-based Transliterator, 1653 * if the rule string contains only rules, or a 1654 * compound Transliterator, if it contains ID blocks, or a 1655 * null Transliterator, if it contains ID blocks which parse as 1656 * empty for the given direction. 1657 * 1658 * @param ID the id for the transliterator. 1659 * @param rules rules, separated by ';' 1660 * @param dir either FORWARD or REVERSE. 1661 * @return a newly created Transliterator 1662 * @throws IllegalArgumentException if there is a problem with the ID or the rules 1663 * @stable ICU 2.0 1664 */ createFromRules(String ID, String rules, int dir)1665 public static final Transliterator createFromRules(String ID, String rules, int dir) { 1666 Transliterator t = null; 1667 1668 TransliteratorParser parser = new TransliteratorParser(); 1669 parser.parse(rules, dir); 1670 1671 // NOTE: The logic here matches that in TransliteratorRegistry. 1672 if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) { 1673 t = new NullTransliterator(); 1674 } 1675 else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) { 1676 t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter); 1677 } 1678 else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) { 1679 // idBlock, no data -- this is an alias. 
The ID has 1680 // been munged from reverse into forward mode, if 1681 // necessary, so instantiate the ID in the forward 1682 // direction. 1683 if (parser.compoundFilter != null) { 1684 t = getInstance(parser.compoundFilter.toPattern(false) + ";" 1685 + parser.idBlockVector.get(0)); 1686 } else { 1687 t = getInstance(parser.idBlockVector.get(0)); 1688 } 1689 1690 if (t != null) { 1691 t.setID(ID); 1692 } 1693 } 1694 else { 1695 List<Transliterator> transliterators = new ArrayList<>(); 1696 int passNumber = 1; 1697 1698 int limit = Math.max(parser.idBlockVector.size(), parser.dataVector.size()); 1699 for (int i = 0; i < limit; i++) { 1700 if (i < parser.idBlockVector.size()) { 1701 String idBlock = parser.idBlockVector.get(i); 1702 if (idBlock.length() > 0) { 1703 Transliterator temp = getInstance(idBlock); 1704 if (!(temp instanceof NullTransliterator)) 1705 transliterators.add(getInstance(idBlock)); 1706 } 1707 } 1708 if (i < parser.dataVector.size()) { 1709 Data data = parser.dataVector.get(i); 1710 transliterators.add(new RuleBasedTransliterator("%Pass" + passNumber++, data, null)); 1711 } 1712 } 1713 1714 t = new CompoundTransliterator(transliterators, passNumber - 1); 1715 t.setID(ID); 1716 if (parser.compoundFilter != null) { 1717 t.setFilter(parser.compoundFilter); 1718 } 1719 } 1720 1721 return t; 1722 } 1723 1724 /** 1725 * Returns a rule string for this transliterator. 1726 * @param escapeUnprintable if true, then unprintable characters 1727 * will be converted to escape form backslash-'u' or 1728 * backslash-'U'. 1729 * @stable ICU 2.0 1730 */ toRules(boolean escapeUnprintable)1731 public String toRules(boolean escapeUnprintable) { 1732 return baseToRules(escapeUnprintable); 1733 } 1734 1735 /** 1736 * Returns a rule string for this transliterator. This is 1737 * a non-overrideable base class implementation that subclasses 1738 * may call. It simply munges the ID into the correct format, 1739 * that is, "foo" => "::foo". 
1740 * @param escapeUnprintable if true, then unprintable characters 1741 * will be converted to escape form backslash-'u' or 1742 * backslash-'U'. 1743 * @stable ICU 2.0 1744 */ baseToRules(boolean escapeUnprintable)1745 protected final String baseToRules(boolean escapeUnprintable) { 1746 // The base class implementation of toRules munges the ID into 1747 // the correct format. That is: foo => ::foo 1748 // KEEP in sync with rbt_pars 1749 if (escapeUnprintable) { 1750 StringBuffer rulesSource = new StringBuffer(); 1751 String id = getID(); 1752 for (int i=0; i<id.length();) { 1753 int c = UTF16.charAt(id, i); 1754 if (!Utility.escapeUnprintable(rulesSource, c)) { 1755 UTF16.append(rulesSource, c); 1756 } 1757 i += UTF16.getCharCount(c); 1758 } 1759 rulesSource.insert(0, "::"); 1760 rulesSource.append(ID_DELIM); 1761 return rulesSource.toString(); 1762 } 1763 return "::" + getID() + ID_DELIM; 1764 } 1765 1766 /** 1767 * Return the elements that make up this transliterator. For 1768 * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek" 1769 * were created, the return value of this method would be an array 1770 * of the three transliterator objects that make up that 1771 * transliterator: [NFD, Jamo-Latin, Latin-Greek]. 1772 * 1773 * <p>If this transliterator is not composed of other 1774 * transliterators, then this method will return an array of 1775 * length one containing a reference to this transliterator. 
1776 * @return an array of one or more transliterators that make up 1777 * this transliterator 1778 * @stable ICU 3.0 1779 */ getElements()1780 public Transliterator[] getElements() { 1781 Transliterator result[]; 1782 if (this instanceof CompoundTransliterator) { 1783 CompoundTransliterator cpd = (CompoundTransliterator) this; 1784 result = new Transliterator[cpd.getCount()]; 1785 for (int i=0; i<result.length; ++i) { 1786 result[i] = cpd.getTransliterator(i); 1787 } 1788 } else { 1789 result = new Transliterator[] { this }; 1790 } 1791 return result; 1792 } 1793 1794 /** 1795 * Returns the set of all characters that may be modified in the 1796 * input text by this Transliterator. This incorporates this 1797 * object's current filter; if the filter is changed, the return 1798 * value of this function will change. The default implementation 1799 * returns an empty set. Some subclasses may override {@link 1800 * #handleGetSourceSet} to return a more precise result. The 1801 * return result is approximate in any case and is intended for 1802 * use by tests, tools, or utilities. 1803 * @see #getTargetSet 1804 * @see #handleGetSourceSet 1805 * @stable ICU 2.2 1806 */ getSourceSet()1807 public final UnicodeSet getSourceSet() { 1808 UnicodeSet result = new UnicodeSet(); 1809 addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), result, new UnicodeSet()); 1810 return result; 1811 } 1812 1813 /** 1814 * Framework method that returns the set of all characters that 1815 * may be modified in the input text by this Transliterator, 1816 * ignoring the effect of this object's filter. The base class 1817 * implementation returns the empty set. Subclasses that wish to 1818 * implement this should override this method. 1819 * @return the set of characters that this transliterator may 1820 * modify. The set may be modified, so subclasses should return a 1821 * newly-created object. 
1822 * @see #getSourceSet 1823 * @see #getTargetSet 1824 * @stable ICU 2.2 1825 */ handleGetSourceSet()1826 protected UnicodeSet handleGetSourceSet() { 1827 return new UnicodeSet(); 1828 } 1829 1830 /** 1831 * Returns the set of all characters that may be generated as 1832 * replacement text by this transliterator. The default 1833 * implementation returns the empty set. Some subclasses may 1834 * override this method to return a more precise result. The 1835 * return result is approximate in any case and is intended for 1836 * use by tests, tools, or utilities requiring such 1837 * meta-information. 1838 * <p>Warning. You might expect an empty filter to always produce an empty target. 1839 * However, consider the following: 1840 * <pre> 1841 * [Pp]{}[\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB] > \'; 1842 * </pre> 1843 * With a filter of [], you still get some elements in the target set, because this rule will still match. It could 1844 * be recast to the following if it were important. 1845 * <pre> 1846 * [Pp]{([\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB])} > \' | $1; 1847 * </pre> 1848 * @see #getTargetSet 1849 * @stable ICU 2.2 1850 */ getTargetSet()1851 public UnicodeSet getTargetSet() { 1852 UnicodeSet result = new UnicodeSet(); 1853 addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), new UnicodeSet(), result); 1854 return result; 1855 } 1856 1857 /** 1858 * Returns the set of all characters that may be generated as 1859 * replacement text by this transliterator, filtered by BOTH the input filter, and the current getFilter(). 1860 * <p>SHOULD BE OVERRIDEN BY SUBCLASSES. 1861 * It is probably an error for any transliterator to NOT override this, but we can't force them to 1862 * for backwards compatibility. 1863 * <p>Other methods vector through this. 1864 * <p>When gathering the information on source and target, the compound transliterator makes things complicated. 
1865 * For example, suppose we have: 1866 * <pre> 1867 * Global FILTER = [ax] 1868 * a > b; 1869 * :: NULL; 1870 * b > c; 1871 * x > d; 1872 * </pre> 1873 * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and target sets 1874 * cannot be gathered independently. What we have to do is filter the sources for the first transliterator according to 1875 * the global filter, intersect that transliterator's filter. Based on that we get the target. 1876 * The next transliterator gets as a global filter (global + last target). And so on. 1877 * <p>There is another complication: 1878 * <pre> 1879 * Global FILTER = [ax] 1880 * a >|b; 1881 * b >c; 1882 * </pre> 1883 * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input. So ideally we will 1884 * change the global filter as we go. 1885 * @param targetSet TODO 1886 * @see #getTargetSet 1887 * @internal 1888 * @deprecated This API is ICU internal only. 1889 */ 1890 @Deprecated addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)1891 public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { 1892 UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); 1893 UnicodeSet temp = new UnicodeSet(handleGetSourceSet()).retainAll(myFilter); 1894 // use old method, if we don't have anything better 1895 sourceSet.addAll(temp); 1896 // clumsy guess with target 1897 for (String s : temp) { 1898 String t = transliterate(s); 1899 if (!s.equals(t)) { 1900 targetSet.addAll(t); 1901 } 1902 } 1903 } 1904 1905 /** 1906 * Returns the intersectionof this instance's filter intersected with an external filter. 1907 * The externalFilter must be frozen (it is frozen if not). 1908 * The result may be frozen, so don't attempt to modify. 1909 * @internal 1910 * @deprecated This API is ICU internal only. 
1911 */ 1912 @Deprecated 1913 // TODO change to getMergedFilter getFilterAsUnicodeSet(UnicodeSet externalFilter)1914 public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) { 1915 if (filter == null) { 1916 return externalFilter; 1917 } 1918 UnicodeSet filterSet = new UnicodeSet(externalFilter); 1919 // Most, but not all filters will be UnicodeSets. Optimize for 1920 // the high-runner case. 1921 UnicodeSet temp; 1922 try { 1923 temp = filter; 1924 } catch (ClassCastException e) { 1925 filter.addMatchSetTo(temp = new UnicodeSet()); 1926 } 1927 return filterSet.retainAll(temp).freeze(); 1928 } 1929 1930 /** 1931 * Returns this transliterator's inverse. See the class 1932 * documentation for details. This implementation simply inverts 1933 * the two entities in the ID and attempts to retrieve the 1934 * resulting transliterator. That is, if <code>getID()</code> 1935 * returns "A-B", then this method will return the result of 1936 * <code>getInstance("B-A")</code>, or <code>null</code> if that 1937 * call fails. 1938 * 1939 * <p>Subclasses with knowledge of their inverse may wish to 1940 * override this method. 1941 * 1942 * @return a transliterator that is an inverse, not necessarily 1943 * exact, of this transliterator, or <code>null</code> if no such 1944 * transliterator is registered. 1945 * @see #registerClass 1946 * @stable ICU 2.0 1947 */ getInverse()1948 public final Transliterator getInverse() { 1949 return getInstance(ID, REVERSE); 1950 } 1951 1952 /** 1953 * Registers a subclass of <code>Transliterator</code> with the 1954 * system. This subclass must have a public constructor taking no 1955 * arguments. When that constructor is called, the resulting 1956 * object must return the <code>ID</code> passed to this method if 1957 * its <code>getID()</code> method is called. 
1958 * 1959 * @param ID the result of <code>getID()</code> for this 1960 * transliterator 1961 * @param transClass a subclass of <code>Transliterator</code> 1962 * @see #unregister 1963 * @stable ICU 2.0 1964 */ registerClass(String ID, Class<? extends Transliterator> transClass, String displayName)1965 public static void registerClass(String ID, Class<? extends Transliterator> transClass, String displayName) { 1966 registry.put(ID, transClass, true); 1967 if (displayName != null) { 1968 displayNameCache.put(new CaseInsensitiveString(ID), displayName); 1969 } 1970 } 1971 1972 /** 1973 * Register a factory object with the given ID. The factory 1974 * method should return a new instance of the given transliterator. 1975 * 1976 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1977 * be called at application startup, prior to any calls to 1978 * Transliterator.getInstance to avoid undefined behavior. 1979 * 1980 * @param ID the ID of this transliterator 1981 * @param factory the factory object 1982 * @stable ICU 2.0 1983 */ registerFactory(String ID, Factory factory)1984 public static void registerFactory(String ID, Factory factory) { 1985 registry.put(ID, factory, true); 1986 } 1987 1988 /** 1989 * Register a Transliterator object with the given ID. 1990 * 1991 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1992 * be called at application startup, prior to any calls to 1993 * Transliterator.getInstance to avoid undefined behavior. 1994 * 1995 * @param trans the Transliterator object 1996 * @stable ICU 2.2 1997 */ registerInstance(Transliterator trans)1998 public static void registerInstance(Transliterator trans) { 1999 registry.put(trans.getID(), trans, true); 2000 } 2001 2002 /** 2003 * Register a Transliterator object. 
2004 * 2005 * <p>Because ICU may choose to cache Transliterator objects internally, this must 2006 * be called at application startup, prior to any calls to 2007 * Transliterator.getInstance to avoid undefined behavior. 2008 * 2009 * @param trans the Transliterator object 2010 */ registerInstance(Transliterator trans, boolean visible)2011 static void registerInstance(Transliterator trans, boolean visible) { 2012 registry.put(trans.getID(), trans, visible); 2013 } 2014 2015 /** 2016 * Register an ID as an alias of another ID. Instantiating 2017 * alias ID produces the same result as instantiating the original ID. 2018 * This is generally used to create short aliases of compound IDs. 2019 * 2020 * <p>Because ICU may choose to cache Transliterator objects internally, this must 2021 * be called at application startup, prior to any calls to 2022 * Transliterator.getInstance to avoid undefined behavior. 2023 * 2024 * @param aliasID The new ID being registered. 2025 * @param realID The existing ID that the new ID should be an alias of. 2026 * @stable ICU 3.6 2027 */ registerAlias(String aliasID, String realID)2028 public static void registerAlias(String aliasID, String realID) { 2029 registry.put(aliasID, realID, true); 2030 } 2031 2032 /** 2033 * Register two targets as being inverses of one another. For 2034 * example, calling registerSpecialInverse("NFC", "NFD", true) causes 2035 * Transliterator to form the following inverse relationships: 2036 * 2037 * <pre>NFC => NFD 2038 * Any-NFC => Any-NFD 2039 * NFD => NFC 2040 * Any-NFD => Any-NFC</pre> 2041 * 2042 * (Without the special inverse registration, the inverse of NFC 2043 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 2044 * that the presence or absence of "Any-" is preserved. 2045 * 2046 * <p>The relationship is symmetrical; registering (a, b) is 2047 * equivalent to registering (b, a). 2048 * 2049 * <p>The relevant IDs must still be registered separately as 2050 * factories or classes. 
2051 * 2052 * <p>Only the targets are specified. Special inverses always 2053 * have the form Any-Target1 <=> Any-Target2. The target should 2054 * have canonical casing (the casing desired to be produced when 2055 * an inverse is formed) and should contain no whitespace or other 2056 * extraneous characters. 2057 * 2058 * @param target the target against which to register the inverse 2059 * @param inverseTarget the inverse of target, that is 2060 * Any-target.getInverse() => Any-inverseTarget 2061 * @param bidirectional if true, register the reverse relation 2062 * as well, that is, Any-inverseTarget.getInverse() => Any-target 2063 */ registerSpecialInverse(String target, String inverseTarget, boolean bidirectional)2064 static void registerSpecialInverse(String target, 2065 String inverseTarget, 2066 boolean bidirectional) { 2067 TransliteratorIDParser.registerSpecialInverse(target, inverseTarget, bidirectional); 2068 } 2069 2070 /** 2071 * Unregisters a transliterator or class. This may be either 2072 * a system transliterator or a user transliterator or class. 2073 * 2074 * @param ID the ID of the transliterator or class 2075 * @see #registerClass 2076 * @stable ICU 2.0 2077 */ unregister(String ID)2078 public static void unregister(String ID) { 2079 displayNameCache.remove(new CaseInsensitiveString(ID)); 2080 registry.remove(ID); 2081 } 2082 2083 /** 2084 * Returns an enumeration over the programmatic names of registered 2085 * <code>Transliterator</code> objects. This includes both system 2086 * transliterators and user transliterators registered using 2087 * <code>registerClass()</code>. The enumerated names may be 2088 * passed to <code>getInstance()</code>. 
2089 * 2090 * @return An <code>Enumeration</code> over <code>String</code> objects 2091 * @see #getInstance 2092 * @see #registerClass 2093 * @stable ICU 2.0 2094 */ getAvailableIDs()2095 public static final Enumeration<String> getAvailableIDs() { 2096 return registry.getAvailableIDs(); 2097 } 2098 2099 /** 2100 * Returns an enumeration over the source names of registered 2101 * transliterators. Source names may be passed to 2102 * getAvailableTargets() to obtain available targets for each 2103 * source. 2104 * @stable ICU 2.0 2105 */ getAvailableSources()2106 public static final Enumeration<String> getAvailableSources() { 2107 return registry.getAvailableSources(); 2108 } 2109 2110 /** 2111 * Returns an enumeration over the target names of registered 2112 * transliterators having a given source name. Target names may 2113 * be passed to getAvailableVariants() to obtain available 2114 * variants for each source and target pair. 2115 * @stable ICU 2.0 2116 */ getAvailableTargets(String source)2117 public static final Enumeration<String> getAvailableTargets(String source) { 2118 return registry.getAvailableTargets(source); 2119 } 2120 2121 /** 2122 * Returns an enumeration over the variant names of registered 2123 * transliterators having a given source name and target name. 2124 * @stable ICU 2.0 2125 */ getAvailableVariants(String source, String target)2126 public static final Enumeration<String> getAvailableVariants(String source, 2127 String target) { 2128 return registry.getAvailableVariants(source, target); 2129 } 2130 private static final String ROOT = "root", 2131 RB_RULE_BASED_IDS ="RuleBasedTransliteratorIDs"; 2132 static { 2133 registry = new TransliteratorRegistry(); 2134 2135 // The display name cache starts out empty 2136 displayNameCache = Collections.synchronizedMap(new HashMap<CaseInsensitiveString, String>()); 2137 /* The following code parses the index table located in 2138 * icu/data/translit/root.txt. 
* The index is an n x 4 table
         * that follows this format:
         *  <id>{
         *      file{
         *          resource{"<resource>"}
         *          direction{"<direction>"}
         *      }
         *  }
         *  <id>{
         *      internal{
         *          resource{"<resource>"}
         *          direction{"<direction>"}
         *      }
         *  }
         *  <id>{
         *      alias{"<getInstanceArg>"}
         *  }
         * <id> is the ID of the system transliterator being defined.  These
         * are public IDs enumerated by Transliterator.getAvailableIDs(),
         * unless the second field is "internal".
         *
         * <resource> is a ResourceReader resource name.  Currently these refer
         * to file names under com/ibm/text/resources.  This string is passed
         * directly to ResourceReader, together with <encoding>.
         *
         * <direction> is either "FORWARD" or "REVERSE".
         *
         * <getInstanceArg> is a string to be passed directly to
         * Transliterator.getInstance().  The returned Transliterator object
         * then has its ID changed to <id> and is returned.
         *
         * The extra blank field on "alias" lines is to make the array square.
2170 */ 2171 UResourceBundle bundle, transIDs, colBund; 2172 bundle = UResourceBundle.getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, ROOT); 2173 transIDs = bundle.get(RB_RULE_BASED_IDS); 2174 2175 int row, maxRows; 2176 maxRows = transIDs.getSize(); 2177 for (row = 0; row < maxRows; row++) { 2178 colBund = transIDs.get(row); 2179 String ID = colBund.getKey(); 2180 if (ID.indexOf("-t-") >= 0) { 2181 continue; 2182 } 2183 UResourceBundle res = colBund.get(0); 2184 String type = res.getKey(); 2185 if (type.equals("file") || type.equals("internal")) { 2186 // Rest of line is <resource>:<encoding>:<direction> 2187 // pos colon c2 2188 String resString = res.getString("resource"); 2189 int dir; 2190 String direction = res.getString("direction"); 2191 switch (direction.charAt(0)) { 2192 case 'F': 2193 dir = FORWARD; 2194 break; 2195 case 'R': 2196 dir = REVERSE; 2197 break; 2198 default: 2199 throw new RuntimeException("Can't parse direction: " + direction); 2200 } registry.put(ID, resString, dir, !type.equals("internal"))2201 registry.put(ID, 2202 resString, // resource 2203 dir, 2204 !type.equals("internal")); 2205 } else if (type.equals("alias")) { 2206 //'alias'; row[2]=createInstance argument 2207 String resString = res.getString(); registry.put(ID, resString, true)2208 registry.put(ID, resString, true); 2209 } else { 2210 // Unknown type 2211 throw new RuntimeException("Unknow type: " + type); 2212 } 2213 } 2214 registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false)2215 registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false); 2216 2217 // Register non-rule-based transliterators registerClass(NullTransliterator._ID, NullTransliterator.class, null)2218 registerClass(NullTransliterator._ID, 2219 NullTransliterator.class, null); RemoveTransliterator.register()2220 RemoveTransliterator.register(); EscapeTransliterator.register()2221 EscapeTransliterator.register(); UnescapeTransliterator.register()2222 
UnescapeTransliterator.register(); LowercaseTransliterator.register()2223 LowercaseTransliterator.register(); UppercaseTransliterator.register()2224 UppercaseTransliterator.register(); TitlecaseTransliterator.register()2225 TitlecaseTransliterator.register(); CaseFoldTransliterator.register()2226 CaseFoldTransliterator.register(); UnicodeNameTransliterator.register()2227 UnicodeNameTransliterator.register(); NameUnicodeTransliterator.register()2228 NameUnicodeTransliterator.register(); NormalizationTransliterator.register()2229 NormalizationTransliterator.register(); BreakTransliterator.register()2230 BreakTransliterator.register(); AnyTransliterator.register()2231 AnyTransliterator.register(); // do this last! 2232 } 2233 2234 /** 2235 * Register the script-based "Any" transliterators: Any-Latin, Any-Greek 2236 * @internal 2237 * @deprecated This API is ICU internal only. 2238 */ 2239 @Deprecated registerAny()2240 public static void registerAny() { 2241 AnyTransliterator.register(); 2242 } 2243 2244 /** 2245 * The factory interface for transliterators. Transliterator 2246 * subclasses can register factory objects for IDs using the 2247 * registerFactory() method of Transliterator. When invoked, the 2248 * factory object will be passed the ID being instantiated. This 2249 * makes it possible to register one factory method to more than 2250 * one ID, or for a factory method to parameterize its result 2251 * based on the variant. 2252 * @stable ICU 2.0 2253 */ 2254 public static interface Factory { 2255 /** 2256 * Return a transliterator for the given ID. 2257 * @stable ICU 2.0 2258 */ getInstance(String ID)2259 Transliterator getInstance(String ID); 2260 } 2261 2262 /** 2263 * Implements StringTransform via this method. 2264 * @param source text to be transformed (eg lowercased) 2265 * @return result 2266 * @stable ICU 3.8 2267 */ 2268 @Override transform(String source)2269 public String transform(String source) { 2270 return transliterate(source); 2271 } 2272 } 2273