1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 12 import java.text.MessageFormat; 13 import java.util.ArrayList; 14 import java.util.Collections; 15 import java.util.Enumeration; 16 import java.util.HashMap; 17 import java.util.List; 18 import java.util.Locale; 19 import java.util.Map; 20 import java.util.MissingResourceException; 21 import java.util.Objects; 22 23 import ohos.global.icu.impl.ICUData; 24 import ohos.global.icu.impl.ICUResourceBundle; 25 import ohos.global.icu.impl.Utility; 26 import ohos.global.icu.impl.UtilityExtensions; 27 import ohos.global.icu.text.RuleBasedTransliterator.Data; 28 import ohos.global.icu.text.TransliteratorIDParser.SingleID; 29 import ohos.global.icu.util.CaseInsensitiveString; 30 import ohos.global.icu.util.ULocale; 31 import ohos.global.icu.util.ULocale.Category; 32 import ohos.global.icu.util.UResourceBundle; 33 34 /** 35 * <code>Transliterator</code> is an abstract class that transliterates text from one format to another. The most common 36 * kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator 37 * changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not 38 * <em>translate</em> Russian to English! Transliteration, unlike translation, operates on characters, without reference 39 * to the meanings of words and sentences. 40 * 41 * <p> 42 * Although script conversion is its most common use, a transliterator can actually perform a more general class of 43 * tasks. 
In fact, <code>Transliterator</code> defines a very general API which specifies only that a segment of the 44 * input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of 45 * <code>Transliterator</code>. 46 * 47 * <p> 48 * <b>Transliterators are stateless</b> 49 * 50 * <p> 51 * <code>Transliterator</code> objects are <em>stateless</em>; they retain no information between calls to 52 * <code>transliterate()</code>. As a result, threads may share transliterators without synchronizing them. This might 53 * seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex 54 * transliterations by delaying the replacement of text until it is known that no other replacements are possible. In 55 * other words, although the <code>Transliterator</code> objects are stateless, the source text itself embodies all the 56 * needed information, and delayed operation allows arbitrary complexity. 57 * 58 * <p> 59 * <b>Batch transliteration</b> 60 * 61 * <p> 62 * The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as 63 * <em>batch</em> transliteration. For example, given a string <code>input</code> and a transliterator <code>t</code>, 64 * the call 65 * 66 * <blockquote><code>String result = t.transliterate(input); 67 * </code></blockquote> 68 * 69 * will transliterate it and return the result. Other methods allow the client to specify a substring to be 70 * transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band 71 * information (such as text styles). 72 * 73 * <p> 74 * <b>Keyboard transliteration</b> 75 * 76 * <p> 77 * Somewhat more involved is <em>keyboard</em>, or incremental transliteration. This is the transliteration of text that 78 * is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal 79 * fashion. 
80 * 81 * <p> 82 * In keyboard transliteration, a <code>Replaceable</code> buffer stores the text. As text is inserted, as much as 83 * possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being 84 * modified as each new character arrives. 85 * 86 * <p> 87 * Consider the simple rule-based Transliterator: 88 * 89 * <blockquote><code> 90 * th>{theta}<br> 91 * t>{tau} 92 * </code></blockquote> 93 * 94 * When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is 95 * 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string: 96 * 97 * <blockquote><code> 98 * t>|{tau}<br> 99 * {tau}h>{theta} 100 * </code></blockquote> 101 * 102 * Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is 103 * accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across 104 * calls to <code>transliterate()</code>. Typically, the cursor will be coincident with the insertion point, but in a 105 * case like the one above, it will precede the insertion point. 106 * 107 * <p> 108 * Keyboard transliteration methods maintain a set of three indices that are updated with each call to 109 * <code>transliterate()</code>, including the cursor, start, and limit. These indices are changed by the method, and 110 * they are passed in and out via a Position object. The <code>start</code> index marks the beginning of the substring 111 * that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index; 112 * that's the <code>cursor</code>). The <code>cursor</code> index, described above, marks the point at which the 113 * transliterator last stopped, either because it reached the end, or because it required more characters to 114 * disambiguate between possible inputs. 
The <code>cursor</code> can also be explicitly set by rules.
 * Any characters before the <code>cursor</code> index are frozen; future keyboard
 * transliteration calls within this input sequence will not change them. New text is inserted at the <code>limit</code>
 * index, which marks the end of the substring that the transliterator looks at.
 *
 * <p>
 * Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It
 * only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the
 * client code knows that no more characters are forthcoming, perhaps because the user has performed some input
 * termination operation, then it should call <code>finishTransliteration()</code> to complete any pending
 * transliterations.
 *
 * <p>
 * <b>Inverses</b>
 *
 * <p>
 * Pairs of transliterators may be inverses of one another. For example, if transliterator <b>A</b> transliterates
 * characters by incrementing their Unicode value (so "abc" -&gt; "def"), and transliterator <b>B</b> decrements character
 * values, then <b>A</b> is an inverse of <b>B</b> and vice versa. If we compose <b>A</b> with <b>B</b> in a compound
 * transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input
 * text.
 *
 * The <code>Transliterator</code> method <code>getInverse()</code> returns a transliterator's inverse, if one exists,
 * or <code>null</code> otherwise. However, the result of <code>getInverse()</code> usually will <em>not</em> be a true
 * mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider
 * two transliterators: <b>AB</b>, which transliterates the character 'A' to 'B', and <b>BA</b>, which transliterates
 * 'B' to 'A'.
It might seem that these are exact inverses, since 141 * 142 * <blockquote>"A" x <b>AB</b> -> "B"<br> 143 * "B" x <b>BA</b> -> "A"</blockquote> 144 * 145 * where 'x' represents transliteration. However, 146 * 147 * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br> 148 * "BBCD" x <b>BA</b> -> "AACD"</blockquote> 149 * 150 * so <b>AB</b> composed with <b>BA</b> is not the identity. Nonetheless, <b>BA</b> may be usefully considered to be 151 * <b>AB</b>'s inverse, and it is on this basis that <b>AB</b><code>.getInverse()</code> could legitimately return 152 * <b>BA</b>. 153 * 154 * <p> 155 * <b>Filtering</b> 156 * <p>Each transliterator has a filter, which restricts changes to those characters selected by the filter. The 157 * filter affects just the characters that are changed -- the characters outside of the filter are still part of the 158 * context for the filter. For example, in the following even though 'x' is filtered out, and doesn't convert to y, it does affect the conversion of 'a'. 159 * 160 * <pre> 161 * String rules = "x > y; x{a} > b; "; 162 * Transliterator tempTrans = Transliterator.createFromRules("temp", rules, Transliterator.FORWARD); 163 * tempTrans.setFilter(new UnicodeSet("[a]")); 164 * String tempResult = tempTrans.transform("xa"); 165 * // results in "xb" 166 *</pre> 167 * <p> 168 * <b>IDs and display names</b> 169 * 170 * <p> 171 * A transliterator is designated by a short identifier string or <em>ID</em>. IDs follow the format 172 * <em>source-destination</em>, where <em>source</em> describes the entity being replaced, and <em>destination</em> 173 * describes the entity replacing <em>source</em>. The entities may be the names of scripts, particular sequences of 174 * characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from 175 * Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1 176 * characters might be named "KeyboardEscape-Latin1". 
By convention, system entity names are in English, with the 177 * initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes. 178 * 179 * <p> 180 * In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces, 181 * returned by {@link #getDisplayName}. 182 * 183 * <p> 184 * <b>Factory methods and registration</b> 185 * 186 * <p> 187 * In general, client code should use the factory method <code>getInstance()</code> to obtain an instance of a 188 * transliterator given its ID. Valid IDs may be enumerated using <code>getAvailableIDs()</code>. Since transliterators 189 * are stateless, multiple calls to <code>getInstance()</code> with the same ID will return the same object. 190 * 191 * <p> 192 * In addition to the system transliterators registered at startup, user transliterators may be registered by calling 193 * <code>registerInstance()</code> at run time. To register a transliterator subclass without instantiating it (until it 194 * is needed), users may call <code>registerClass()</code>. 195 * 196 * <p> 197 * <b>Composed transliterators</b> 198 * 199 * <p> 200 * In addition to built-in system transliterators like "Latin-Greek", there are also built-in <em>composed</em> 201 * transliterators. These are implemented by composing two or more component transliterators. For example, if we have 202 * scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12 203 * transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an 204 * intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M", 205 * "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2<em>n</em> vs. <em>n</em> 206 * <sup>2</sup> - <em>n</em>, so as <em>n</em> gets larger the gain becomes significant. 
With 9 scripts, it's 18 vs. 72 207 * rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that 208 * the given transliterator is intended to be composed with others, rather than be used as is. 209 * 210 * <p> 211 * Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati" 212 * is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this 213 * transliterator is instantiated, it appears externally to be a standard transliterator (e.g., getID() returns 214 * "Devanagari-Gujarati"). 215 * 216 * <p> 217 * <b>Subclassing</b> 218 * 219 * <p> 220 * Subclasses must implement the abstract method <code>handleTransliterate()</code>. 221 * <p> 222 * Subclasses should override the <code>transliterate()</code> method taking a <code>Replaceable</code> and the 223 * <code>transliterate()</code> method taking a <code>String</code> and <code>StringBuffer</code> if the performance of 224 * these methods can be improved over the performance obtained by the default implementations in this class. 225 * 226 * <p><b>Rule syntax</b> 227 * 228 * <p>A set of rules determines how to perform translations. 229 * Rules within a rule set are separated by semicolons (';'). 230 * To include a literal semicolon, prefix it with a backslash ('\'). 231 * Unicode Pattern_White_Space is ignored. 232 * If the first non-blank character on a line is '#', 233 * the entire line is ignored as a comment. 234 * 235 * <p>Each set of rules consists of two groups, one forward, and one 236 * reverse. This is a convention that is not enforced; rules for one 237 * direction may be omitted, with the result that translations in 238 * that direction will not modify the source text. In addition, 239 * bidirectional forward-reverse rules may be specified for 240 * symmetrical transformations. 
241 * 242 * <p>Note: Another description of the Transliterator rule syntax is available in 243 * <a href="https://www.unicode.org/reports/tr35/tr35-general.html#Transform_Rules_Syntax">section 244 * Transform Rules Syntax of UTS #35: Unicode LDML</a>. 245 * The rules are shown there using arrow symbols ← and → and ↔. 246 * ICU supports both those and the equivalent ASCII symbols < and > and <>. 247 * 248 * <p>Rule statements take one of the following forms: 249 * 250 * <dl> 251 * <dt><code>$alefmadda=\\u0622;</code></dt> 252 * <dd><strong>Variable definition.</strong> The name on the 253 * left is assigned the text on the right. In this example, 254 * after this statement, instances of the left hand name, 255 * "<code>$alefmadda</code>", will be replaced by 256 * the Unicode character U+0622. Variable names must begin 257 * with a letter and consist only of letters, digits, and 258 * underscores. Case is significant. Duplicate names cause 259 * an exception to be thrown, that is, variables cannot be 260 * redefined. The right hand side may contain well-formed 261 * text of any length, including no text at all ("<code>$empty=;</code>"). 
262 * The right hand side may contain embedded <code>UnicodeSet</code> 263 * patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd> 264 * <dt><code>ai>$alefmadda;</code></dt> 265 * <dd><strong>Forward translation rule.</strong> This rule 266 * states that the string on the left will be changed to the 267 * string on the right when performing forward 268 * transliteration.</dd> 269 * <dt><code>ai<$alefmadda;</code></dt> 270 * <dd><strong>Reverse translation rule.</strong> This rule 271 * states that the string on the right will be changed to 272 * the string on the left when performing reverse 273 * transliteration.</dd> 274 * </dl> 275 * 276 * <dl> 277 * <dt><code>ai<>$alefmadda;</code></dt> 278 * <dd><strong>Bidirectional translation rule.</strong> This 279 * rule states that the string on the right will be changed 280 * to the string on the left when performing forward 281 * transliteration, and vice versa when performing reverse 282 * transliteration.</dd> 283 * </dl> 284 * 285 * <p>Translation rules consist of a <em>match pattern</em> and an <em>output 286 * string</em>. The match pattern consists of literal characters, 287 * optionally preceded by context, and optionally followed by 288 * context. Context characters, like literal pattern characters, 289 * must be matched in the text being transliterated. However, unlike 290 * literal pattern characters, they are not replaced by the output 291 * text. For example, the pattern "<code>abc{def}</code>" 292 * indicates the characters "<code>def</code>" must be 293 * preceded by "<code>abc</code>" for a successful match. 294 * If there is a successful match, "<code>def</code>" will 295 * be replaced, but not "<code>abc</code>". The final '<code>}</code>' 296 * is optional, so "<code>abc{def</code>" is equivalent to 297 * "<code>abc{def}</code>". 
Another example is "<code>{123}456</code>"
 * (or "<code>123}456</code>") in which the literal
 * pattern "<code>123</code>" must be followed by "<code>456</code>".
 *
 * <p>The output string of a forward or reverse rule consists of
 * characters to replace the literal pattern characters. If the
 * output string contains the character '<code>|</code>', this is
 * taken to indicate the location of the <em>cursor</em> after
 * replacement. The cursor is the point in the text at which the
 * next replacement, if any, will be applied. The cursor is usually
 * placed within the replacement text; however, it can actually be
 * placed into the preceding or following context by using the
 * special character '@'. Examples:
 *
 * <pre>
 *     a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor before a
 *     {foo} xyz &gt; bar @@|; # foo -&gt; bar, cursor between y and z
 * </pre>
 *
 * <p><b>UnicodeSet</b>
 *
 * <p><code>UnicodeSet</code> patterns may appear anywhere that
 * makes sense. They may appear in variable definitions.
 * Contrariwise, <code>UnicodeSet</code> patterns may themselves
 * contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>",
 * or "<code>$range=a-z;$ll=[$range]</code>".
 *
 * <p><code>UnicodeSet</code> patterns may also be embedded directly
 * into rule strings. Thus, the following two rules are equivalent:
 *
 * <pre>
 *     $vowel=[aeiou]; $vowel&gt;'*'; # One way to do this
 *     [aeiou]&gt;'*'; # Another way
 * </pre>
 *
 * <p>See {@link UnicodeSet} for more documentation and examples.
 *
 * <p><b>Segments</b>
 *
 * <p>Segments of the input string can be matched and copied to the
 * output string. This makes certain sets of rules simpler and more
 * general, and makes reordering possible.
For example:
 *
 * <pre>
 *     ([a-z]) &gt; $1 $1; # double lowercase letters
 *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs
 * </pre>
 *
 * <p>The segment of the input string to be copied is delimited by
 * "<code>(</code>" and "<code>)</code>". Up to
 * nine segments may be defined. Segments may not overlap. In the
 * output string, "<code>$1</code>" through "<code>$9</code>"
 * represent the input string segments, in left-to-right order of
 * definition.
 *
 * <p><b>Anchors</b>
 *
 * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
 * special characters '<code>^</code>' and '<code>$</code>'. For example:
 *
 * <pre>
 *     ^ a &gt; 'BEG_A'; # match 'a' at start of text
 *     a &gt; 'A'; # match other instances of 'a'
 *     z $ &gt; 'END_Z'; # match 'z' at end of text
 *     z &gt; 'Z'; # match other instances of 'z'
 * </pre>
 *
 * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
 * This is done by including a virtual anchor character '<code>$</code>' at the end of the
 * set pattern. Although this is usually the match character for the end anchor, the set will
 * match either the beginning or the end of the text, depending on its placement. For
 * example:
 *
 * <pre>
 *     $x = [a-z$]; # match 'a' through 'z' OR anchor
 *     $x 1 &gt; 2; # match '1' after a-z or at the start
 *     3 $x &gt; 4; # match '3' before a-z or at the end
 * </pre>
 *
 * <p><b>Example</b>
 *
 * <p>The following example rules illustrate many of the features of
 * the rule language.
 *
 * <table border="0" cellpadding="4">
 *     <tr>
 *         <td style="vertical-align: top;">Rule 1.</td>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}&gt;x|y</code></td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top;">Rule 2.</td>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>xyz&gt;r</code></td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top;">Rule 3.</td>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>yz&gt;q</code></td>
 *     </tr>
 * </table>
 *
 * <p>Applying these rules to the string "<code>adefabcdefz</code>"
 * yields the following results:
 *
 * <table border="0" cellpadding="4">
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>|adefabcdefz</code></td>
 *         <td style="vertical-align: top;">Initial state, no rules match. Advance
 *         cursor.</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>a|defabcdefz</code></td>
 *         <td style="vertical-align: top;">Still no match. Rule 1 does not match
 *         because the preceding context is not present.</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>ad|efabcdefz</code></td>
 *         <td style="vertical-align: top;">Still no match. Keep advancing until
 *         there is a match...</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>ade|fabcdefz</code></td>
 *         <td style="vertical-align: top;">...</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adef|abcdefz</code></td>
 *         <td style="vertical-align: top;">...</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefa|bcdefz</code></td>
 *         <td style="vertical-align: top;">...</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefab|cdefz</code></td>
 *         <td style="vertical-align: top;">...</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabc|defz</code></td>
 *         <td style="vertical-align: top;">Rule 1 matches; replace "<code>def</code>"
 *         with "<code>xy</code>" and back up the cursor
 *         to before the '<code>y</code>'.</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx|yz</code></td>
 *         <td style="vertical-align: top;">Although "<code>xyz</code>" is
 *         present, rule 2 does not match because the cursor is
 *         before the '<code>y</code>', not before the '<code>x</code>'.
 *         Rule 3 does match. Replace "<code>yz</code>"
 *         with "<code>q</code>".</td>
 *     </tr>
 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq|</code></td>
 *         <td style="vertical-align: top;">The cursor is at the end;
 *         transliteration is complete.</td>
 *     </tr>
 * </table>
 *
 * <p>The order of rules is significant. If multiple rules may match
 * at some point, the first matching rule is applied.
 *
 * <p>Forward and reverse rules may have an empty output string.
 * Otherwise, an empty left or right hand side of any statement is a
 * syntax error.
458 * 459 * <p>Single quotes are used to quote any character other than a 460 * digit or letter. To specify a single quote itself, inside or 461 * outside of quotes, use two single quotes in a row. For example, 462 * the rule "<code>'>'>o''clock</code>" changes the 463 * string "<code>></code>" to the string "<code>o'clock</code>". 464 * 465 * <p><b>Notes</b> 466 * 467 * <p>While a Transliterator is being built from rules, it checks that 468 * the rules are added in proper order. For example, if the rule 469 * "a>x" is followed by the rule "ab>y", 470 * then the second rule will throw an exception. The reason is that 471 * the second rule can never be triggered, since the first rule 472 * always matches anything it matches. In other words, the first 473 * rule <em>masks</em> the second rule. 474 * 475 * @author Alan Liu 476 */ 477 public abstract class Transliterator implements StringTransform { 478 /** 479 * Direction constant indicating the forward direction in a transliterator, 480 * e.g., the forward rules of a rule-based Transliterator. An "A-B" 481 * transliterator transliterates A to B when operating in the forward 482 * direction, and B to A when operating in the reverse direction. 483 */ 484 public static final int FORWARD = 0; 485 486 /** 487 * Direction constant indicating the reverse direction in a transliterator, 488 * e.g., the reverse rules of a rule-based Transliterator. An "A-B" 489 * transliterator transliterates A to B when operating in the forward 490 * direction, and B to A when operating in the reverse direction. 491 */ 492 public static final int REVERSE = 1; 493 494 /** 495 * Position structure for incremental transliteration. This data 496 * structure defines two substrings of the text being 497 * transliterated. The first region, [contextStart, 498 * contextLimit), defines what characters the transliterator will 499 * read as context. The second region, [start, limit), defines 500 * what characters will actually be transliterated. 
The second 501 * region should be a subset of the first. 502 * 503 * <p>After a transliteration operation, some of the indices in this 504 * structure will be modified. See the field descriptions for 505 * details. 506 * 507 * <p>contextStart <= start <= limit <= contextLimit 508 * 509 * <p>Note: All index values in this structure must be at code point 510 * boundaries. That is, none of them may occur between two code units 511 * of a surrogate pair. If any index does split a surrogate pair, 512 * results are unspecified. 513 */ 514 public static class Position { 515 516 /** 517 * Beginning index, inclusive, of the context to be considered for 518 * a transliteration operation. The transliterator will ignore 519 * anything before this index. INPUT/OUTPUT parameter: This parameter 520 * is updated by a transliteration operation to reflect the maximum 521 * amount of antecontext needed by a transliterator. 522 */ 523 public int contextStart; 524 525 /** 526 * Ending index, exclusive, of the context to be considered for a 527 * transliteration operation. The transliterator will ignore 528 * anything at or after this index. INPUT/OUTPUT parameter: This 529 * parameter is updated to reflect changes in the length of the 530 * text, but points to the same logical position in the text. 531 */ 532 public int contextLimit; 533 534 /** 535 * Beginning index, inclusive, of the text to be transliteratd. 536 * INPUT/OUTPUT parameter: This parameter is advanced past 537 * characters that have already been transliterated by a 538 * transliteration operation. 539 */ 540 public int start; 541 542 /** 543 * Ending index, exclusive, of the text to be transliteratd. 544 * INPUT/OUTPUT parameter: This parameter is updated to reflect 545 * changes in the length of the text, but points to the same 546 * logical position in the text. 547 */ 548 public int limit; 549 550 /** 551 * Constructs a Position object with start, limit, 552 * contextStart, and contextLimit all equal to zero. 
553 */ Position()554 public Position() { 555 this(0, 0, 0, 0); 556 } 557 558 /** 559 * Constructs a Position object with the given start, 560 * contextStart, and contextLimit. The limit is set to the 561 * contextLimit. 562 */ Position(int contextStart, int contextLimit, int start)563 public Position(int contextStart, int contextLimit, int start) { 564 this(contextStart, contextLimit, start, contextLimit); 565 } 566 567 /** 568 * Constructs a Position object with the given start, limit, 569 * contextStart, and contextLimit. 570 */ Position(int contextStart, int contextLimit, int start, int limit)571 public Position(int contextStart, int contextLimit, 572 int start, int limit) { 573 this.contextStart = contextStart; 574 this.contextLimit = contextLimit; 575 this.start = start; 576 this.limit = limit; 577 } 578 579 /** 580 * Constructs a Position object that is a copy of another. 581 */ Position(Position pos)582 public Position(Position pos) { 583 set(pos); 584 } 585 586 /** 587 * Copies the indices of this position from another. 588 */ set(Position pos)589 public void set(Position pos) { 590 contextStart = pos.contextStart; 591 contextLimit = pos.contextLimit; 592 start = pos.start; 593 limit = pos.limit; 594 } 595 596 /** 597 * Returns true if this Position is equal to the given object. 598 */ 599 @Override equals(Object obj)600 public boolean equals(Object obj) { 601 if (obj instanceof Position) { 602 Position pos = (Position) obj; 603 return contextStart == pos.contextStart && 604 contextLimit == pos.contextLimit && 605 start == pos.start && 606 limit == pos.limit; 607 } 608 return false; 609 } 610 611 /** 612 * {@inheritDoc} 613 */ 614 @Override hashCode()615 public int hashCode() { 616 return Objects.hash(contextStart, contextLimit, start, limit); 617 } 618 619 /** 620 * Returns a string representation of this Position. 621 * @return a string representation of the object. 
622 */ 623 @Override toString()624 public String toString() { 625 return "[cs=" + contextStart 626 + ", s=" + start 627 + ", l=" + limit 628 + ", cl=" + contextLimit 629 + "]"; 630 } 631 632 /** 633 * Check all bounds. If they are invalid, throw an exception. 634 * @param length the length of the string this object applies to 635 * @exception IllegalArgumentException if any indices are out 636 * of bounds 637 */ validate(int length)638 public final void validate(int length) { 639 if (contextStart < 0 || 640 start < contextStart || 641 limit < start || 642 contextLimit < limit || 643 length < contextLimit) { 644 throw new IllegalArgumentException("Invalid Position {cs=" + 645 contextStart + ", s=" + 646 start + ", l=" + 647 limit + ", cl=" + 648 contextLimit + "}, len=" + 649 length); 650 } 651 } 652 } 653 654 /** 655 * Programmatic name, e.g., "Latin-Arabic". 656 */ 657 private String ID; 658 659 /** 660 * This transliterator's filter. Any character for which 661 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be 662 * altered by this transliterator. If <tt>filter</tt> is 663 * <tt>null</tt> then no filtering is applied. 664 */ 665 private UnicodeSet filter; 666 667 private int maximumContextLength = 0; 668 669 /** 670 * System transliterator registry. 671 */ 672 private static TransliteratorRegistry registry; 673 674 private static Map<CaseInsensitiveString, String> displayNameCache; 675 676 /** 677 * Prefix for resource bundle key for the display name for a 678 * transliterator. The ID is appended to this to form the key. 679 * The resource bundle value should be a String. 680 */ 681 private static final String RB_DISPLAY_NAME_PREFIX = "%Translit%%"; 682 683 /** 684 * Prefix for resource bundle key for the display name for a 685 * transliterator SCRIPT. The ID is appended to this to form the key. 686 * The resource bundle value should be a String. 
687 */ 688 private static final String RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%"; 689 690 /** 691 * Resource bundle key for display name pattern. 692 * The resource bundle value should be a String forming a 693 * MessageFormat pattern, e.g.: 694 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}". 695 */ 696 private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern"; 697 698 /** 699 * Delimiter between elements in a compound ID. 700 */ 701 static final char ID_DELIM = ';'; 702 703 /** 704 * Delimiter before target in an ID. 705 */ 706 static final char ID_SEP = '-'; 707 708 /** 709 * Delimiter before variant in an ID. 710 */ 711 static final char VARIANT_SEP = '/'; 712 713 /** 714 * To enable debugging output in the Transliterator component, set 715 * DEBUG to true. 716 * 717 * N.B. Make sure to recompile all of the ohos.global.icu.text package 718 * after changing this. Easiest way to do this is 'ant clean 719 * core' ('ant' will NOT pick up the dependency automatically). 720 * 721 * <<This generates a lot of output.>> 722 */ 723 static final boolean DEBUG = false; 724 725 /** 726 * Default constructor. 727 * @param ID the string identifier for this transliterator 728 * @param filter the filter. Any character for which 729 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be 730 * altered by this transliterator. If <tt>filter</tt> is 731 * <tt>null</tt> then no filtering is applied. 732 * @hide unsupported on OHOS 733 */ Transliterator(String ID, UnicodeFilter filter)734 protected Transliterator(String ID, UnicodeFilter filter) { 735 if (ID == null) { 736 throw new NullPointerException(); 737 } 738 this.ID = ID; 739 setFilter(filter); 740 } 741 742 /** 743 * Transliterates a segment of a string, with optional filtering. 744 * 745 * @param text the string to be transliterated 746 * @param start the beginning index, inclusive; <code>0 <= start 747 * <= limit</code>. 
748 * @param limit the ending index, exclusive; <code>start <= limit 749 * <= text.length()</code>. 750 * @return The new limit index. The text previously occupying <code>[start, 751 * limit)</code> has been transliterated, possibly to a string of a different 752 * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where 753 * <em>new-limit</em> is the return value. If the input offsets are out of bounds, 754 * the returned value is -1 and the input string remains unchanged. 755 */ transliterate(Replaceable text, int start, int limit)756 public final int transliterate(Replaceable text, int start, int limit) { 757 if (start < 0 || 758 limit < start || 759 text.length() < limit) { 760 return -1; 761 } 762 763 Position pos = new Position(start, limit, start); 764 filteredTransliterate(text, pos, false, true); 765 return pos.limit; 766 } 767 768 /** 769 * Transliterates an entire string in place. Convenience method. 770 * @param text the string to be transliterated 771 */ transliterate(Replaceable text)772 public final void transliterate(Replaceable text) { 773 transliterate(text, 0, text.length()); 774 } 775 776 /** 777 * Transliterate an entire string and returns the result. Convenience method. 778 * 779 * @param text the string to be transliterated 780 * @return The transliterated text 781 */ transliterate(String text)782 public final String transliterate(String text) { 783 ReplaceableString result = new ReplaceableString(text); 784 transliterate(result); 785 return result.toString(); 786 } 787 788 /** 789 * Transliterates the portion of the text buffer that can be 790 * transliterated unambiguosly after new text has been inserted, 791 * typically as a result of a keyboard event. The new text in 792 * <code>insertion</code> will be inserted into <code>text</code> 793 * at <code>index.contextLimit</code>, advancing 794 * <code>index.contextLimit</code> by <code>insertion.length()</code>. 
* Then the transliterator will try to transliterate characters of
 * <code>text</code> between <code>index.start</code> and
 * <code>index.limit</code>. Characters before
 * <code>index.start</code> will not be changed.
 *
 * <p>Upon return, values in <code>index</code> will be updated.
 * <code>index.contextStart</code> will be advanced to the first
 * character that future calls to this method will read.
 * <code>index.start</code> and <code>index.contextLimit</code> will
 * be adjusted to delimit the range of text that future calls to
 * this method may change.
 *
 * <p>Typical usage of this method begins with an initial call
 * with <code>index.contextStart</code> and <code>index.contextLimit</code>
 * set to indicate the portion of <code>text</code> to be
 * transliterated, and <code>index.start == index.contextStart</code>.
 * Thereafter, <code>index</code> can be used without
 * modification in future calls, provided that all changes to
 * <code>text</code> are made via this method.
 *
 * <p>This method assumes that future calls may be made that will
 * insert new text into the buffer. As a result, it only performs
 * unambiguous transliterations. After the last call to this
 * method, there may be untransliterated text that is waiting for
 * more input to resolve an ambiguity. In order to perform these
 * pending transliterations, clients should call {@link
 * #finishTransliteration} after the last call to this
 * method has been made.
 *
 * @param text the buffer holding transliterated and untransliterated text
 * @param index the start and limit of the text, the position
 * of the cursor, and the start and limit of transliteration.
 * @param insertion text to be inserted and possibly
 * transliterated into the translation buffer at
 * <code>index.limit</code>. If <code>null</code> then no text
 * is inserted.
831 * @see #handleTransliterate 832 * @exception IllegalArgumentException if <code>index</code> 833 * is invalid 834 */ transliterate(Replaceable text, Position index, String insertion)835 public final void transliterate(Replaceable text, Position index, 836 String insertion) { 837 index.validate(text.length()); 838 839 // int originalStart = index.contextStart; 840 if (insertion != null) { 841 text.replace(index.limit, index.limit, insertion); 842 index.limit += insertion.length(); 843 index.contextLimit += insertion.length(); 844 } 845 846 if (index.limit > 0 && 847 UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) { 848 // Oops, there is a dangling lead surrogate in the buffer. 849 // This will break most transliterators, since they will 850 // assume it is part of a pair. Don't transliterate until 851 // more text comes in. 852 return; 853 } 854 855 filteredTransliterate(text, index, true, true); 856 857 // TODO 858 // This doesn't work once we add quantifier support. Need to rewrite 859 // this code to support quantifiers and 'use maximum backup <n>;'. 860 // 861 // index.contextStart = Math.max(index.start - getMaximumContextLength(), 862 // originalStart); 863 } 864 865 /** 866 * Transliterates the portion of the text buffer that can be 867 * transliterated unambiguosly after a new character has been 868 * inserted, typically as a result of a keyboard event. This is a 869 * convenience method; see {@link #transliterate(Replaceable, 870 * Transliterator.Position, String)} for details. 871 * @param text the buffer holding transliterated and 872 * untransliterated text 873 * @param index the start and limit of the text, the position 874 * of the cursor, and the start and limit of transliteration. 875 * @param insertion text to be inserted and possibly 876 * transliterated into the translation buffer at 877 * <code>index.contextLimit</code>. 
878 * @see #transliterate(Replaceable, Transliterator.Position, String) 879 */ transliterate(Replaceable text, Position index, int insertion)880 public final void transliterate(Replaceable text, Position index, 881 int insertion) { 882 transliterate(text, index, UTF16.valueOf(insertion)); 883 } 884 885 /** 886 * Transliterates the portion of the text buffer that can be 887 * transliterated unambiguosly. This is a convenience method; see 888 * {@link #transliterate(Replaceable, Transliterator.Position, 889 * String)} for details. 890 * @param text the buffer holding transliterated and 891 * untransliterated text 892 * @param index the start and limit of the text, the position 893 * of the cursor, and the start and limit of transliteration. 894 * @see #transliterate(Replaceable, Transliterator.Position, String) 895 */ transliterate(Replaceable text, Position index)896 public final void transliterate(Replaceable text, Position index) { 897 transliterate(text, index, null); 898 } 899 900 /** 901 * Finishes any pending transliterations that were waiting for 902 * more characters. Clients should call this method as the last 903 * call after a sequence of one or more calls to 904 * <code>transliterate()</code>. 905 * @param text the buffer holding transliterated and 906 * untransliterated text. 907 * @param index the array of indices previously passed to {@link 908 * #transliterate} 909 */ finishTransliteration(Replaceable text, Position index)910 public final void finishTransliteration(Replaceable text, 911 Position index) { 912 index.validate(text.length()); 913 filteredTransliterate(text, index, false, true); 914 } 915 916 /** 917 * Abstract method that concrete subclasses define to implement 918 * their transliteration algorithm. This method handles both 919 * incremental and non-incremental transliteration. Let 920 * <code>originalStart</code> refer to the value of 921 * <code>pos.start</code> upon entry. 
922 * 923 * <ul> 924 * <li>If <code>incremental</code> is false, then this method 925 * should transliterate all characters between 926 * <code>pos.start</code> and <code>pos.limit</code>. Upon return 927 * <code>pos.start</code> must == <code> pos.limit</code>.</li> 928 * 929 * <li>If <code>incremental</code> is true, then this method 930 * should transliterate all characters between 931 * <code>pos.start</code> and <code>pos.limit</code> that can be 932 * unambiguously transliterated, regardless of future insertions 933 * of text at <code>pos.limit</code>. Upon return, 934 * <code>pos.start</code> should be in the range 935 * [<code>originalStart</code>, <code>pos.limit</code>). 936 * <code>pos.start</code> should be positioned such that 937 * characters [<code>originalStart</code>, <code> 938 * pos.start</code>) will not be changed in the future by this 939 * transliterator and characters [<code>pos.start</code>, 940 * <code>pos.limit</code>) are unchanged.</li> 941 * </ul> 942 * 943 * <p>Implementations of this method should also obey the 944 * following invariants:</p> 945 * 946 * <ul> 947 * <li> <code>pos.limit</code> and <code>pos.contextLimit</code> 948 * should be updated to reflect changes in length of the text 949 * between <code>pos.start</code> and <code>pos.limit</code>. 
The 950 * difference <code> pos.contextLimit - pos.limit</code> should 951 * not change.</li> 952 * 953 * <li><code>pos.contextStart</code> should not change.</li> 954 * 955 * <li>Upon return, neither <code>pos.start</code> nor 956 * <code>pos.limit</code> should be less than 957 * <code>originalStart</code>.</li> 958 * 959 * <li>Text before <code>originalStart</code> and text after 960 * <code>pos.limit</code> should not change.</li> 961 * 962 * <li>Text before <code>pos.contextStart</code> and text after 963 * <code> pos.contextLimit</code> should be ignored.</li> 964 * </ul> 965 * 966 * <p>Subclasses may safely assume that all characters in 967 * [<code>pos.start</code>, <code>pos.limit</code>) are filtered. 968 * In other words, the filter has already been applied by the time 969 * this method is called. See 970 * <code>filteredTransliterate()</code>. 971 * 972 * <p>This method is <b>not</b> for public consumption. Calling 973 * this method directly will transliterate 974 * [<code>pos.start</code>, <code>pos.limit</code>) without 975 * applying the filter. End user code should call <code> 976 * transliterate()</code> instead of this method. Subclass code 977 * should call <code>filteredTransliterate()</code> instead of 978 * this method.<p> 979 * 980 * @param text the buffer holding transliterated and 981 * untransliterated text 982 * 983 * @param pos the indices indicating the start, limit, context 984 * start, and context limit of the text. 985 * 986 * @param incremental if true, assume more text may be inserted at 987 * <code>pos.limit</code> and act accordingly. Otherwise, 988 * transliterate all text between <code>pos.start</code> and 989 * <code>pos.limit</code> and move <code>pos.start</code> up to 990 * <code>pos.limit</code>. 
*
 * @see #transliterate
 * @hide unsupported on OHOS
 */
protected abstract void handleTransliterate(Replaceable text,
                                            Position pos, boolean incremental);

/**
 * Top-level transliteration method, handling filtering, incremental and
 * non-incremental transliteration, and rollback. All transliteration
 * public API methods eventually call this method with a rollback argument
 * of TRUE. Other entities may call this method but rollback should be
 * FALSE.
 *
 * <p>If this transliterator has a filter, break up the input text into runs
 * of unfiltered characters. Pass each run to
 * &lt;subclass&gt;.handleTransliterate().
 *
 * <p>In incremental mode, if rollback is TRUE, perform a special
 * incremental procedure in which several passes are made over the input
 * text, adding one character at a time, and committing successful
 * transliterations as they occur. Unsuccessful transliterations are rolled
 * back and retried with additional characters to give correct results.
 *
 * <p>On return, <code>index.limit</code> is restored to the limit of the
 * whole operation, adjusted for any change in text length.
 *
 * @param text the text to be transliterated
 * @param index the position indices
 * @param incremental if TRUE, then assume more characters may be inserted
 * at index.limit, and postpone processing to accommodate future incoming
 * characters
 * @param rollback if TRUE and if incremental is TRUE, then perform special
 * incremental processing, as described above, and undo partial
 * transliterations where necessary. If incremental is FALSE then this
 * parameter is ignored.
 * @exception RuntimeException if a non-incremental run is handed to the
 * subclass and it returns with <code>index.start != index.limit</code>
 */
private void filteredTransliterate(Replaceable text,
                                   Position index,
                                   boolean incremental,
                                   boolean rollback) {
    // Short circuit path for transliterators with no filter when
    // rollback processing is not requested: delegate straight to the
    // subclass with the caller's incremental flag.
    if (filter == null && !rollback) {
        handleTransliterate(text, index, incremental);
        return;
    }

    //----------------------------------------------------------------------
    // This method processes text in two groupings:
    //
    // RUNS -- A run is a contiguous group of characters which are contained
    // in the filter for this transliterator (filter.contains(ch) == true).
    // Text outside of runs may appear as context but it is not modified.
    // The start and limit Position values are narrowed to each run.
    //
    // PASSES (incremental only) -- To make incremental mode work correctly,
    // each run is broken up into n passes, where n is the length (in code
    // points) of the run.  Each pass contains the first n characters.  If a
    // pass is completely transliterated, it is committed, and further passes
    // include characters after the committed text.  If a pass is blocked,
    // and does not transliterate completely, then this method rolls back
    // the changes made during the pass, extends the pass by one code point,
    // and tries again.
    //----------------------------------------------------------------------

    // globalLimit is the limit value for the entire operation.  We
    // set index.limit to the end of each unfiltered run before
    // calling handleTransliterate(), so we need to maintain the real
    // value of index.limit here.  After each transliteration, we
    // update globalLimit for insertions or deletions that have
    // happened.
    int globalLimit = index.limit;

    // If there is a non-null filter, then break the input text up.  Say the
    // input text has the form:
    //   xxxabcxxdefxx
    // where 'x' represents a filtered character (filter.contains('x') ==
    // false).  Then we break this up into:
    //   xxxabc xxdef xx
    // Each pass through the loop consumes a run of filtered
    // characters (which are ignored) and a subsequent run of
    // unfiltered characters (which are transliterated).

    // Debug trace buffer; only allocated (and only dereferenced) when
    // DEBUG is true.
    StringBuffer log = null;
    if (DEBUG) {
        log = new StringBuffer();
    }

    for (;;) {

        if (filter != null) {
            // Narrow the range to be transliterated to the first run
            // of unfiltered characters at or after index.start.

            // Advance past filtered chars
            int c;
            while (index.start < globalLimit &&
                   !filter.contains(c=text.char32At(index.start))) {
                index.start += UTF16.getCharCount(c);
            }

            // Find the end of this run of unfiltered chars
            index.limit = index.start;
            while (index.limit < globalLimit &&
                   filter.contains(c=text.char32At(index.limit))) {
                index.limit += UTF16.getCharCount(c);
            }
        }

        // Check to see if the unfiltered run is empty.  This only
        // happens at the end of the string when all the remaining
        // characters are filtered.
        if (index.start == index.limit) {
            break;
        }

        // Is this run incremental?  If there is additional
        // filtered text (if limit < globalLimit) then we pass in
        // an incremental value of FALSE to force the subclass to
        // complete the transliteration for this run.
        boolean isIncrementalRun =
            (index.limit < globalLimit ? false : incremental);

        int delta;

        // Implement rollback.  To understand the need for rollback,
        // consider the following transliterator:
        //
        //  "t" is "a > A;"
        //  "u" is "A > b;"
        //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
        //
        // Now apply "v" to the input text "a".  The result is "b".  But if
        // the transliteration is done incrementally, then the NFD holds
        // things up after "t" has already transformed "a" to "A".  When
        // finishTransliteration() is called, "A" is _not_ processed because
        // it gets excluded by the [:Ll:] filter, and the end result is "A"
        // -- incorrect.  The problem is that the filter is applied to a
        // partially-transliterated result, when we only want it to apply to
        // input text.  Although this example describes a compound
        // transliterator containing NFD and a specific filter, it can
        // happen with any transliterator which does a partial
        // transformation in incremental mode into characters outside its
        // filter.
        //
        // To handle this, when in incremental mode we supply characters to
        // handleTransliterate() in several passes.  Each pass adds one more
        // input character to the input text.  That is, for input "ABCD", we
        // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
        // any point we block (upon return, start < limit) then we roll
        // back.  If at any point we complete the run (upon return start ==
        // limit) then we commit that run.

        if (rollback && isIncrementalRun) {

            if (DEBUG) {
                log.setLength(0);
                System.out.println("filteredTransliterate{"+getID()+"}i: IN=" +
                                   UtilityExtensions.formatInput(text, index));
            }

            int runStart = index.start;
            int runLimit = index.limit;
            int runLength =  runLimit - runStart;

            // Make a rollback copy at the end of the string
            int rollbackOrigin = text.length();
            text.copy(runStart, runLimit, rollbackOrigin);

            // Variables reflecting the commitment of completely
            // transliterated text.  passStart is the runStart, advanced
            // past committed text.  rollbackStart is the rollbackOrigin,
            // advanced past rollback text that corresponds to committed
            // text.
            int passStart = runStart;
            int rollbackStart = rollbackOrigin;

            // The limit for each pass; we advance by one code point with
            // each iteration.
            int passLimit = index.start;

            // Total length, in 16-bit code units, of uncommitted text.
            // This is the length to be rolled back.
            int uncommittedLength = 0;

            // Total delta (change in length) for all passes
            int totalDelta = 0;

            // PASS MAIN LOOP -- Start with a single character, and extend
            // the text by one character at a time.  Roll back partial
            // transliterations and commit complete transliterations.
            for (;;) {
                // Length of additional code point, either one or two
                int charLength =
                    UTF16.getCharCount(text.char32At(passLimit));
                passLimit += charLength;
                // Extending the pass beyond the run ends the pass loop.
                if (passLimit > runLimit) {
                    break;
                }
                uncommittedLength += charLength;

                index.limit = passLimit;

                if (DEBUG) {
                    log.setLength(0);
                    log.append("filteredTransliterate{"+getID()+"}i: ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                // Delegate to subclass for actual transliteration.  Upon
                // return, start will be updated to point after the
                // transliterated text, and limit and contextLimit will be
                // adjusted for length changes.
                handleTransliterate(text, index, true);

                if (DEBUG) {
                    log.append(" => ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                delta = index.limit - passLimit; // change in length

                // We failed to completely transliterate this pass.
                // Roll back the text.  Indices remain unchanged; reset
                // them where necessary.
                if (index.start != index.limit) {
                    // Find the rollbackStart, adjusted for length changes
                    // and the deletion of partially transliterated text.
                    int rs = rollbackStart + delta - (index.limit - passStart);

                    // Delete the partially transliterated text
                    text.replace(passStart, index.limit, "");

                    // Copy the rollback text back
                    text.copy(rs, rs + uncommittedLength, passStart);

                    // Restore indices to their original values
                    index.start = passStart;
                    index.limit = passLimit;
                    index.contextLimit -= delta;

                    if (DEBUG) {
                        log.append(" (ROLLBACK)");
                    }
                }

                // We did completely transliterate this pass.  Update the
                // commit indices to record how far we got.  Adjust indices
                // for length change.
                else {
                    // Move the pass indices past the committed text.
                    passStart = passLimit = index.start;

                    // Adjust the rollbackStart for length changes and move
                    // it past the committed text.  All characters we've
                    // processed to this point are committed now, so zero
                    // out the uncommittedLength.
                    rollbackStart += delta + uncommittedLength;
                    uncommittedLength = 0;

                    // Adjust indices for length changes.
                    runLimit += delta;
                    totalDelta += delta;
                }

                if (DEBUG) {
                    System.out.println(Utility.escape(log.toString()));
                }
            }

            // Adjust overall limit and rollbackOrigin for insertions and
            // deletions.  Don't need to worry about contextLimit because
            // handleTransliterate() maintains that.
            rollbackOrigin += totalDelta;
            globalLimit += totalDelta;

            // Delete the rollback copy
            text.replace(rollbackOrigin, rollbackOrigin + runLength, "");

            // Move start past committed text
            index.start = passStart;
        }

        else {
            // Delegate to subclass for actual transliteration.
            if (DEBUG) {
                log.setLength(0);
                log.append("filteredTransliterate{"+getID()+"}: ");
                UtilityExtensions.formatInput(log, text, index);
            }

            int limit = index.limit;
            handleTransliterate(text, index, isIncrementalRun);
            delta = index.limit - limit; // change in length

            if (DEBUG) {
                log.append(" => ");
                UtilityExtensions.formatInput(log, text, index);
            }

            // In a properly written transliterator, start == limit after
            // handleTransliterate() returns when incremental is false.
            // Catch cases where the subclass doesn't do this, and throw
            // an exception.  (Just pinning start to limit is a bad idea,
            // because what's probably happening is that the subclass
            // isn't transliterating all the way to the end, and it should
            // in non-incremental mode.)
            if (!isIncrementalRun && index.start != index.limit) {
                throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + getID());
            }

            // Adjust overall limit for insertions/deletions.  Don't need
            // to worry about contextLimit because handleTransliterate()
            // maintains that.
            globalLimit += delta;

            if (DEBUG) {
                System.out.println(Utility.escape(log.toString()));
            }
        }

        // Without a filter the whole range was a single run, and an
        // incremental run is by construction the last one (it ends at
        // globalLimit), so in both cases there is nothing left to do.
        if (filter == null || isIncrementalRun) {
            break;
        }

        // If we did completely transliterate this
        // run, then repeat with the next unfiltered run.
    }

    // Start is valid where it is.  Limit needs to be put back where
    // it was, modulo adjustments for deletions/insertions.
    index.limit = globalLimit;

    if (DEBUG) {
        System.out.println("filteredTransliterate{"+getID()+"}: OUT=" +
                           UtilityExtensions.formatInput(text, index));
    }
}

/**
 * Transliterate a substring of text, as specified by index, taking filters
 * into account.
This method is for subclasses that need to delegate to 1331 * another transliterator. 1332 * @param text the text to be transliterated 1333 * @param index the position indices 1334 * @param incremental if TRUE, then assume more characters may be inserted 1335 * at index.limit, and postpone processing to accomodate future incoming 1336 * characters 1337 */ filteredTransliterate(Replaceable text, Position index, boolean incremental)1338 public void filteredTransliterate(Replaceable text, 1339 Position index, 1340 boolean incremental) { 1341 filteredTransliterate(text, index, incremental, false); 1342 } 1343 1344 /** 1345 * Returns the length of the longest context required by this transliterator. 1346 * This is <em>preceding</em> context. The default value is zero, but 1347 * subclasses can change this by calling <code>setMaximumContextLength()</code>. 1348 * For example, if a transliterator translates "ddd" (where 1349 * d is any digit) to "555" when preceded by "(ddd)", then the preceding 1350 * context length is 5, the length of "(ddd)". 1351 * 1352 * @return The maximum number of preceding context characters this 1353 * transliterator needs to examine 1354 */ getMaximumContextLength()1355 public final int getMaximumContextLength() { 1356 return maximumContextLength; 1357 } 1358 1359 /** 1360 * Method for subclasses to use to set the maximum context length. 1361 * @see #getMaximumContextLength 1362 * @hide unsupported on OHOS 1363 */ setMaximumContextLength(int a)1364 protected void setMaximumContextLength(int a) { 1365 if (a < 0) { 1366 throw new IllegalArgumentException("Invalid context length " + a); 1367 } 1368 maximumContextLength = a; 1369 } 1370 1371 /** 1372 * Returns a programmatic identifier for this transliterator. 1373 * If this identifier is passed to <code>getInstance()</code>, it 1374 * will return this object, if it has been registered. 
1375 * @see #registerClass 1376 * @see #getAvailableIDs 1377 */ getID()1378 public final String getID() { 1379 return ID; 1380 } 1381 1382 /** 1383 * Set the programmatic identifier for this transliterator. Only 1384 * for use by subclasses. 1385 * @hide unsupported on OHOS 1386 */ setID(String id)1387 protected final void setID(String id) { 1388 ID = id; 1389 } 1390 1391 /** 1392 * Returns a name for this transliterator that is appropriate for 1393 * display to the user in the default <code>DISPLAY</code> locale. See {@link 1394 * #getDisplayName(String,Locale)} for details. 1395 * @see ohos.global.icu.util.ULocale.Category#DISPLAY 1396 */ getDisplayName(String ID)1397 public final static String getDisplayName(String ID) { 1398 return getDisplayName(ID, ULocale.getDefault(Category.DISPLAY)); 1399 } 1400 1401 /** 1402 * Returns a name for this transliterator that is appropriate for 1403 * display to the user in the given locale. This name is taken 1404 * from the locale resource data in the standard manner of the 1405 * <code>java.text</code> package. 1406 * 1407 * <p>If no localized names exist in the system resource bundles, 1408 * a name is synthesized using a localized 1409 * <code>MessageFormat</code> pattern from the resource data. The 1410 * arguments to this pattern are an integer followed by one or two 1411 * strings. The integer is the number of strings, either 1 or 2. 1412 * The strings are formed by splitting the ID for this 1413 * transliterator at the first '-'. If there is no '-', then the 1414 * entire ID forms the only string. 1415 * @param inLocale the Locale in which the display name should be 1416 * localized. 
1417 * @see java.text.MessageFormat 1418 */ getDisplayName(String id, Locale inLocale)1419 public static String getDisplayName(String id, Locale inLocale) { 1420 return getDisplayName(id, ULocale.forLocale(inLocale)); 1421 } 1422 1423 /** 1424 * Returns a name for this transliterator that is appropriate for 1425 * display to the user in the given locale. This name is taken 1426 * from the locale resource data in the standard manner of the 1427 * <code>java.text</code> package. 1428 * 1429 * <p>If no localized names exist in the system resource bundles, 1430 * a name is synthesized using a localized 1431 * <code>MessageFormat</code> pattern from the resource data. The 1432 * arguments to this pattern are an integer followed by one or two 1433 * strings. The integer is the number of strings, either 1 or 2. 1434 * The strings are formed by splitting the ID for this 1435 * transliterator at the first '-'. If there is no '-', then the 1436 * entire ID forms the only string. 1437 * @param inLocale the ULocale in which the display name should be 1438 * localized. 1439 * @see java.text.MessageFormat 1440 */ getDisplayName(String id, ULocale inLocale)1441 public static String getDisplayName(String id, ULocale inLocale) { 1442 1443 // Resource bundle containing display name keys and the 1444 // RB_RULE_BASED_IDS array. 1445 // 1446 //If we ever integrate this with the Sun JDK, the resource bundle 1447 // root will change to sun.text.resources.LocaleElements 1448 1449 ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle. 
1450 getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, inLocale); 1451 1452 // Normalize the ID 1453 String stv[] = TransliteratorIDParser.IDtoSTV(id); 1454 if (stv == null) { 1455 // No target; malformed id 1456 return ""; 1457 } 1458 String ID = stv[0] + '-' + stv[1]; 1459 if (stv[2] != null && stv[2].length() > 0) { 1460 ID = ID + '/' + stv[2]; 1461 } 1462 1463 // Use the registered display name, if any 1464 String n = displayNameCache.get(new CaseInsensitiveString(ID)); 1465 if (n != null) { 1466 return n; 1467 } 1468 1469 // Use display name for the entire transliterator, if it 1470 // exists. 1471 try { 1472 return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID); 1473 } catch (MissingResourceException e) {} 1474 1475 try { 1476 // Construct the formatter first; if getString() fails 1477 // we'll exit the try block 1478 MessageFormat format = new MessageFormat( 1479 bundle.getString(RB_DISPLAY_NAME_PATTERN)); 1480 // Construct the argument array 1481 Object[] args = new Object[] { Integer.valueOf(2), stv[0], stv[1] }; 1482 1483 // Use display names for the scripts, if they exist 1484 for (int j=1; j<=2; ++j) { 1485 try { 1486 args[j] = bundle.getString(RB_SCRIPT_DISPLAY_NAME_PREFIX + 1487 (String) args[j]); 1488 } catch (MissingResourceException e) {} 1489 } 1490 1491 // Format it using the pattern in the resource 1492 return (stv[2].length() > 0) ? 1493 (format.format(args) + '/' + stv[2]) : 1494 format.format(args); 1495 } catch (MissingResourceException e2) {} 1496 1497 // We should not reach this point unless there is something 1498 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has 1499 // been deleted from the root RB_LOCALE_ELEMENTS resource. 1500 throw new RuntimeException(); 1501 } 1502 1503 /** 1504 * Returns the filter used by this transliterator, or <tt>null</tt> 1505 * if this transliterator uses no filter. 
1506 */ getFilter()1507 public final UnicodeFilter getFilter() { 1508 return filter; 1509 } 1510 1511 /** 1512 * Changes the filter used by this transliterator. If the filter 1513 * is set to <tt>null</tt> then no filtering will occur. 1514 * 1515 * <p>Callers must take care if a transliterator is in use by 1516 * multiple threads. The filter should not be changed by one 1517 * thread while another thread may be transliterating. 1518 */ setFilter(UnicodeFilter filter)1519 public void setFilter(UnicodeFilter filter) { 1520 if (filter == null) { 1521 this.filter = null; 1522 } else { 1523 try { 1524 // fast high-runner case 1525 this.filter = new UnicodeSet((UnicodeSet)filter).freeze(); 1526 } catch (Exception e) { 1527 this.filter = new UnicodeSet(); 1528 filter.addMatchSetTo(this.filter); 1529 this.filter.freeze(); 1530 } 1531 } 1532 } 1533 1534 /** 1535 * Returns a <code>Transliterator</code> object given its ID. 1536 * The ID must be either a system transliterator ID or a ID registered 1537 * using <code>registerClass()</code>. 1538 * 1539 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code> 1540 * @return A <code>Transliterator</code> object with the given ID 1541 * @exception IllegalArgumentException if the given ID is invalid. 1542 */ getInstance(String ID)1543 public static final Transliterator getInstance(String ID) { 1544 return getInstance(ID, FORWARD); 1545 } 1546 1547 /** 1548 * Returns a <code>Transliterator</code> object given its ID. 1549 * The ID must be either a system transliterator ID or a ID registered 1550 * using <code>registerClass()</code>. 1551 * 1552 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code> 1553 * @param dir either FORWARD or REVERSE. If REVERSE then the 1554 * inverse of the given ID is instantiated. 1555 * @return A <code>Transliterator</code> object with the given ID 1556 * @exception IllegalArgumentException if the given ID is invalid. 
1557 * @see #registerClass 1558 * @see #getAvailableIDs 1559 * @see #getID 1560 */ getInstance(String ID, int dir)1561 public static Transliterator getInstance(String ID, 1562 int dir) { 1563 StringBuffer canonID = new StringBuffer(); 1564 List<SingleID> list = new ArrayList<>(); 1565 UnicodeSet[] globalFilter = new UnicodeSet[1]; 1566 if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID, list, globalFilter)) { 1567 throw new IllegalArgumentException("Invalid ID " + ID); 1568 } 1569 1570 List<Transliterator> translits = TransliteratorIDParser.instantiateList(list); 1571 1572 // assert(list.size() > 0); 1573 Transliterator t = null; 1574 if (list.size() > 1 || canonID.indexOf(";") >= 0) { 1575 // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only 1576 // has one child transliterator. This is so that toRules() will return the right thing 1577 // (without any inactive ID), but our main ID still comes out correct. That is, if we 1578 // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;" 1579 // even though the ID is "(Lower);Latin-Greek;". 1580 t = new CompoundTransliterator(translits); 1581 } 1582 else { 1583 t = translits.get(0); 1584 } 1585 1586 t.setID(canonID.toString()); 1587 if (globalFilter[0] != null) { 1588 t.setFilter(globalFilter[0]); 1589 } 1590 return t; 1591 } 1592 1593 /** 1594 * Create a transliterator from a basic ID. This is an ID 1595 * containing only the forward direction source, target, and 1596 * variant. 1597 * @param id a basic ID of the form S-T or S-T/V. 1598 * @param canonID canonical ID to apply to the result, or 1599 * null to leave the ID unchanged 1600 * @return a newly created Transliterator or null if the ID is 1601 * invalid. 
1602 */ getBasicInstance(String id, String canonID)1603 static Transliterator getBasicInstance(String id, String canonID) { 1604 StringBuffer s = new StringBuffer(); 1605 Transliterator t = registry.get(id, s); 1606 if (s.length() != 0) { 1607 // assert(t==0); 1608 // Instantiate an alias 1609 t = getInstance(s.toString(), FORWARD); 1610 } 1611 if (t != null && canonID != null) { 1612 t.setID(canonID); 1613 } 1614 return t; 1615 } 1616 1617 /** 1618 * Returns a <code>Transliterator</code> object constructed from 1619 * the given rule string. This will be a rule-based Transliterator, 1620 * if the rule string contains only rules, or a 1621 * compound Transliterator, if it contains ID blocks, or a 1622 * null Transliterator, if it contains ID blocks which parse as 1623 * empty for the given direction. 1624 * 1625 * @param ID the id for the transliterator. 1626 * @param rules rules, separated by ';' 1627 * @param dir either FORWARD or REVERSE. 1628 * @return a newly created Transliterator 1629 * @throws IllegalArgumentException if there is a problem with the ID or the rules 1630 */ createFromRules(String ID, String rules, int dir)1631 public static final Transliterator createFromRules(String ID, String rules, int dir) { 1632 Transliterator t = null; 1633 1634 TransliteratorParser parser = new TransliteratorParser(); 1635 parser.parse(rules, dir); 1636 1637 // NOTE: The logic here matches that in TransliteratorRegistry. 1638 if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) { 1639 t = new NullTransliterator(); 1640 } 1641 else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) { 1642 t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter); 1643 } 1644 else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) { 1645 // idBlock, no data -- this is an alias. 
The ID has 1646 // been munged from reverse into forward mode, if 1647 // necessary, so instantiate the ID in the forward 1648 // direction. 1649 if (parser.compoundFilter != null) { 1650 t = getInstance(parser.compoundFilter.toPattern(false) + ";" 1651 + parser.idBlockVector.get(0)); 1652 } else { 1653 t = getInstance(parser.idBlockVector.get(0)); 1654 } 1655 1656 if (t != null) { 1657 t.setID(ID); 1658 } 1659 } 1660 else { 1661 List<Transliterator> transliterators = new ArrayList<>(); 1662 int passNumber = 1; 1663 1664 int limit = Math.max(parser.idBlockVector.size(), parser.dataVector.size()); 1665 for (int i = 0; i < limit; i++) { 1666 if (i < parser.idBlockVector.size()) { 1667 String idBlock = parser.idBlockVector.get(i); 1668 if (idBlock.length() > 0) { 1669 Transliterator temp = getInstance(idBlock); 1670 if (!(temp instanceof NullTransliterator)) 1671 transliterators.add(getInstance(idBlock)); 1672 } 1673 } 1674 if (i < parser.dataVector.size()) { 1675 Data data = parser.dataVector.get(i); 1676 transliterators.add(new RuleBasedTransliterator("%Pass" + passNumber++, data, null)); 1677 } 1678 } 1679 1680 t = new CompoundTransliterator(transliterators, passNumber - 1); 1681 t.setID(ID); 1682 if (parser.compoundFilter != null) { 1683 t.setFilter(parser.compoundFilter); 1684 } 1685 } 1686 1687 return t; 1688 } 1689 1690 /** 1691 * Returns a rule string for this transliterator. 1692 * @param escapeUnprintable if true, then unprintable characters 1693 * will be converted to escape form backslash-'u' or 1694 * backslash-'U'. 1695 */ toRules(boolean escapeUnprintable)1696 public String toRules(boolean escapeUnprintable) { 1697 return baseToRules(escapeUnprintable); 1698 } 1699 1700 /** 1701 * Returns a rule string for this transliterator. This is 1702 * a non-overrideable base class implementation that subclasses 1703 * may call. It simply munges the ID into the correct format, 1704 * that is, "foo" => "::foo". 
1705 * @param escapeUnprintable if true, then unprintable characters 1706 * will be converted to escape form backslash-'u' or 1707 * backslash-'U'. 1708 * @hide unsupported on OHOS 1709 */ baseToRules(boolean escapeUnprintable)1710 protected final String baseToRules(boolean escapeUnprintable) { 1711 // The base class implementation of toRules munges the ID into 1712 // the correct format. That is: foo => ::foo 1713 // KEEP in sync with rbt_pars 1714 if (escapeUnprintable) { 1715 StringBuffer rulesSource = new StringBuffer(); 1716 String id = getID(); 1717 for (int i=0; i<id.length();) { 1718 int c = UTF16.charAt(id, i); 1719 if (!Utility.escapeUnprintable(rulesSource, c)) { 1720 UTF16.append(rulesSource, c); 1721 } 1722 i += UTF16.getCharCount(c); 1723 } 1724 rulesSource.insert(0, "::"); 1725 rulesSource.append(ID_DELIM); 1726 return rulesSource.toString(); 1727 } 1728 return "::" + getID() + ID_DELIM; 1729 } 1730 1731 /** 1732 * Return the elements that make up this transliterator. For 1733 * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek" 1734 * were created, the return value of this method would be an array 1735 * of the three transliterator objects that make up that 1736 * transliterator: [NFD, Jamo-Latin, Latin-Greek]. 1737 * 1738 * <p>If this transliterator is not composed of other 1739 * transliterators, then this method will return an array of 1740 * length one containing a reference to this transliterator. 
1741 * @return an array of one or more transliterators that make up 1742 * this transliterator 1743 */ getElements()1744 public Transliterator[] getElements() { 1745 Transliterator result[]; 1746 if (this instanceof CompoundTransliterator) { 1747 CompoundTransliterator cpd = (CompoundTransliterator) this; 1748 result = new Transliterator[cpd.getCount()]; 1749 for (int i=0; i<result.length; ++i) { 1750 result[i] = cpd.getTransliterator(i); 1751 } 1752 } else { 1753 result = new Transliterator[] { this }; 1754 } 1755 return result; 1756 } 1757 1758 /** 1759 * Returns the set of all characters that may be modified in the 1760 * input text by this Transliterator. This incorporates this 1761 * object's current filter; if the filter is changed, the return 1762 * value of this function will change. The default implementation 1763 * returns an empty set. Some subclasses may override {@link 1764 * #handleGetSourceSet} to return a more precise result. The 1765 * return result is approximate in any case and is intended for 1766 * use by tests, tools, or utilities. 1767 * @see #getTargetSet 1768 * @see #handleGetSourceSet 1769 */ getSourceSet()1770 public final UnicodeSet getSourceSet() { 1771 UnicodeSet result = new UnicodeSet(); 1772 addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), result, new UnicodeSet()); 1773 return result; 1774 } 1775 1776 /** 1777 * Framework method that returns the set of all characters that 1778 * may be modified in the input text by this Transliterator, 1779 * ignoring the effect of this object's filter. The base class 1780 * implementation returns the empty set. Subclasses that wish to 1781 * implement this should override this method. 1782 * @return the set of characters that this transliterator may 1783 * modify. The set may be modified, so subclasses should return a 1784 * newly-created object. 
1785 * @see #getSourceSet 1786 * @see #getTargetSet 1787 * @hide unsupported on OHOS 1788 */ handleGetSourceSet()1789 protected UnicodeSet handleGetSourceSet() { 1790 return new UnicodeSet(); 1791 } 1792 1793 /** 1794 * Returns the set of all characters that may be generated as 1795 * replacement text by this transliterator. The default 1796 * implementation returns the empty set. Some subclasses may 1797 * override this method to return a more precise result. The 1798 * return result is approximate in any case and is intended for 1799 * use by tests, tools, or utilities requiring such 1800 * meta-information. 1801 * <p>Warning. You might expect an empty filter to always produce an empty target. 1802 * However, consider the following: 1803 * <pre> 1804 * [Pp]{}[\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB] > \'; 1805 * </pre> 1806 * With a filter of [], you still get some elements in the target set, because this rule will still match. It could 1807 * be recast to the following if it were important. 1808 * <pre> 1809 * [Pp]{([\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB])} > \' | $1; 1810 * </pre> 1811 * @see #getTargetSet 1812 */ getTargetSet()1813 public UnicodeSet getTargetSet() { 1814 UnicodeSet result = new UnicodeSet(); 1815 addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), new UnicodeSet(), result); 1816 return result; 1817 } 1818 1819 /** 1820 * Returns the set of all characters that may be generated as 1821 * replacement text by this transliterator, filtered by BOTH the input filter, and the current getFilter(). 1822 * <p>SHOULD BE OVERRIDEN BY SUBCLASSES. 1823 * It is probably an error for any transliterator to NOT override this, but we can't force them to 1824 * for backwards compatibility. 1825 * <p>Other methods vector through this. 1826 * <p>When gathering the information on source and target, the compound transliterator makes things complicated. 
1827 * For example, suppose we have: 1828 * <pre> 1829 * Global FILTER = [ax] 1830 * a > b; 1831 * :: NULL; 1832 * b > c; 1833 * x > d; 1834 * </pre> 1835 * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and target sets 1836 * cannot be gathered independently. What we have to do is filter the sources for the first transliterator according to 1837 * the global filter, intersect that transliterator's filter. Based on that we get the target. 1838 * The next transliterator gets as a global filter (global + last target). And so on. 1839 * <p>There is another complication: 1840 * <pre> 1841 * Global FILTER = [ax] 1842 * a >|b; 1843 * b >c; 1844 * </pre> 1845 * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input. So ideally we will 1846 * change the global filter as we go. 1847 * @param targetSet TODO 1848 * @see #getTargetSet 1849 * @deprecated This API is ICU internal only. 1850 * @hide deprecated on icu4j-org 1851 * @hide draft / provisional / internal are hidden on OHOS 1852 */ 1853 @Deprecated addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)1854 public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { 1855 UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); 1856 UnicodeSet temp = new UnicodeSet(handleGetSourceSet()).retainAll(myFilter); 1857 // use old method, if we don't have anything better 1858 sourceSet.addAll(temp); 1859 // clumsy guess with target 1860 for (String s : temp) { 1861 String t = transliterate(s); 1862 if (!s.equals(t)) { 1863 targetSet.addAll(t); 1864 } 1865 } 1866 } 1867 1868 /** 1869 * Returns the intersectionof this instance's filter intersected with an external filter. 1870 * The externalFilter must be frozen (it is frozen if not). 1871 * The result may be frozen, so don't attempt to modify. 1872 * @deprecated This API is ICU internal only. 
1873 * @hide deprecated on icu4j-org 1874 * @hide draft / provisional / internal are hidden on OHOS 1875 */ 1876 @Deprecated 1877 // TODO change to getMergedFilter getFilterAsUnicodeSet(UnicodeSet externalFilter)1878 public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) { 1879 if (filter == null) { 1880 return externalFilter; 1881 } 1882 UnicodeSet filterSet = new UnicodeSet(externalFilter); 1883 // Most, but not all filters will be UnicodeSets. Optimize for 1884 // the high-runner case. 1885 UnicodeSet temp; 1886 try { 1887 temp = filter; 1888 } catch (ClassCastException e) { 1889 filter.addMatchSetTo(temp = new UnicodeSet()); 1890 } 1891 return filterSet.retainAll(temp).freeze(); 1892 } 1893 1894 /** 1895 * Returns this transliterator's inverse. See the class 1896 * documentation for details. This implementation simply inverts 1897 * the two entities in the ID and attempts to retrieve the 1898 * resulting transliterator. That is, if <code>getID()</code> 1899 * returns "A-B", then this method will return the result of 1900 * <code>getInstance("B-A")</code>, or <code>null</code> if that 1901 * call fails. 1902 * 1903 * <p>Subclasses with knowledge of their inverse may wish to 1904 * override this method. 1905 * 1906 * @return a transliterator that is an inverse, not necessarily 1907 * exact, of this transliterator, or <code>null</code> if no such 1908 * transliterator is registered. 1909 * @see #registerClass 1910 */ getInverse()1911 public final Transliterator getInverse() { 1912 return getInstance(ID, REVERSE); 1913 } 1914 1915 /** 1916 * Registers a subclass of <code>Transliterator</code> with the 1917 * system. This subclass must have a public constructor taking no 1918 * arguments. When that constructor is called, the resulting 1919 * object must return the <code>ID</code> passed to this method if 1920 * its <code>getID()</code> method is called. 
1921 * 1922 * @param ID the result of <code>getID()</code> for this 1923 * transliterator 1924 * @param transClass a subclass of <code>Transliterator</code> 1925 * @see #unregister 1926 * @hide unsupported on OHOS 1927 */ registerClass(String ID, Class<? extends Transliterator> transClass, String displayName)1928 public static void registerClass(String ID, Class<? extends Transliterator> transClass, String displayName) { 1929 registry.put(ID, transClass, true); 1930 if (displayName != null) { 1931 displayNameCache.put(new CaseInsensitiveString(ID), displayName); 1932 } 1933 } 1934 1935 /** 1936 * Register a factory object with the given ID. The factory 1937 * method should return a new instance of the given transliterator. 1938 * 1939 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1940 * be called at application startup, prior to any calls to 1941 * Transliterator.getInstance to avoid undefined behavior. 1942 * 1943 * @param ID the ID of this transliterator 1944 * @param factory the factory object 1945 * @hide unsupported on OHOS 1946 */ registerFactory(String ID, Factory factory)1947 public static void registerFactory(String ID, Factory factory) { 1948 registry.put(ID, factory, true); 1949 } 1950 1951 /** 1952 * Register a Transliterator object with the given ID. 1953 * 1954 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1955 * be called at application startup, prior to any calls to 1956 * Transliterator.getInstance to avoid undefined behavior. 1957 * 1958 * @param trans the Transliterator object 1959 * @hide unsupported on OHOS 1960 */ registerInstance(Transliterator trans)1961 public static void registerInstance(Transliterator trans) { 1962 registry.put(trans.getID(), trans, true); 1963 } 1964 1965 /** 1966 * Register a Transliterator object. 
1967 * 1968 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1969 * be called at application startup, prior to any calls to 1970 * Transliterator.getInstance to avoid undefined behavior. 1971 * 1972 * @param trans the Transliterator object 1973 */ registerInstance(Transliterator trans, boolean visible)1974 static void registerInstance(Transliterator trans, boolean visible) { 1975 registry.put(trans.getID(), trans, visible); 1976 } 1977 1978 /** 1979 * Register an ID as an alias of another ID. Instantiating 1980 * alias ID produces the same result as instantiating the original ID. 1981 * This is generally used to create short aliases of compound IDs. 1982 * 1983 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1984 * be called at application startup, prior to any calls to 1985 * Transliterator.getInstance to avoid undefined behavior. 1986 * 1987 * @param aliasID The new ID being registered. 1988 * @param realID The existing ID that the new ID should be an alias of. 1989 * @hide unsupported on OHOS 1990 */ registerAlias(String aliasID, String realID)1991 public static void registerAlias(String aliasID, String realID) { 1992 registry.put(aliasID, realID, true); 1993 } 1994 1995 /** 1996 * Register two targets as being inverses of one another. For 1997 * example, calling registerSpecialInverse("NFC", "NFD", true) causes 1998 * Transliterator to form the following inverse relationships: 1999 * 2000 * <pre>NFC => NFD 2001 * Any-NFC => Any-NFD 2002 * NFD => NFC 2003 * Any-NFD => Any-NFC</pre> 2004 * 2005 * (Without the special inverse registration, the inverse of NFC 2006 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 2007 * that the presence or absence of "Any-" is preserved. 2008 * 2009 * <p>The relationship is symmetrical; registering (a, b) is 2010 * equivalent to registering (b, a). 2011 * 2012 * <p>The relevant IDs must still be registered separately as 2013 * factories or classes. 
2014 * 2015 * <p>Only the targets are specified. Special inverses always 2016 * have the form Any-Target1 <=> Any-Target2. The target should 2017 * have canonical casing (the casing desired to be produced when 2018 * an inverse is formed) and should contain no whitespace or other 2019 * extraneous characters. 2020 * 2021 * @param target the target against which to register the inverse 2022 * @param inverseTarget the inverse of target, that is 2023 * Any-target.getInverse() => Any-inverseTarget 2024 * @param bidirectional if true, register the reverse relation 2025 * as well, that is, Any-inverseTarget.getInverse() => Any-target 2026 */ registerSpecialInverse(String target, String inverseTarget, boolean bidirectional)2027 static void registerSpecialInverse(String target, 2028 String inverseTarget, 2029 boolean bidirectional) { 2030 TransliteratorIDParser.registerSpecialInverse(target, inverseTarget, bidirectional); 2031 } 2032 2033 /** 2034 * Unregisters a transliterator or class. This may be either 2035 * a system transliterator or a user transliterator or class. 2036 * 2037 * @param ID the ID of the transliterator or class 2038 * @see #registerClass 2039 * @hide unsupported on OHOS 2040 */ unregister(String ID)2041 public static void unregister(String ID) { 2042 displayNameCache.remove(new CaseInsensitiveString(ID)); 2043 registry.remove(ID); 2044 } 2045 2046 /** 2047 * Returns an enumeration over the programmatic names of registered 2048 * <code>Transliterator</code> objects. This includes both system 2049 * transliterators and user transliterators registered using 2050 * <code>registerClass()</code>. The enumerated names may be 2051 * passed to <code>getInstance()</code>. 
2052 * 2053 * @return An <code>Enumeration</code> over <code>String</code> objects 2054 * @see #getInstance 2055 * @see #registerClass 2056 */ getAvailableIDs()2057 public static final Enumeration<String> getAvailableIDs() { 2058 return registry.getAvailableIDs(); 2059 } 2060 2061 /** 2062 * Returns an enumeration over the source names of registered 2063 * transliterators. Source names may be passed to 2064 * getAvailableTargets() to obtain available targets for each 2065 * source. 2066 */ getAvailableSources()2067 public static final Enumeration<String> getAvailableSources() { 2068 return registry.getAvailableSources(); 2069 } 2070 2071 /** 2072 * Returns an enumeration over the target names of registered 2073 * transliterators having a given source name. Target names may 2074 * be passed to getAvailableVariants() to obtain available 2075 * variants for each source and target pair. 2076 */ getAvailableTargets(String source)2077 public static final Enumeration<String> getAvailableTargets(String source) { 2078 return registry.getAvailableTargets(source); 2079 } 2080 2081 /** 2082 * Returns an enumeration over the variant names of registered 2083 * transliterators having a given source name and target name. 2084 */ getAvailableVariants(String source, String target)2085 public static final Enumeration<String> getAvailableVariants(String source, 2086 String target) { 2087 return registry.getAvailableVariants(source, target); 2088 } 2089 private static final String ROOT = "root", 2090 RB_RULE_BASED_IDS ="RuleBasedTransliteratorIDs"; 2091 static { 2092 registry = new TransliteratorRegistry(); 2093 2094 // The display name cache starts out empty 2095 displayNameCache = Collections.synchronizedMap(new HashMap<CaseInsensitiveString, String>()); 2096 /* The following code parses the index table located in 2097 * icu/data/translit/root.txt. 
The index is an n x 4 table 2098 * that follows this format: 2099 * <id>{ 2100 * file{ 2101 * resource{"<resource>"} 2102 * direction{"<direction>"} 2103 * } 2104 * } 2105 * <id>{ 2106 * internal{ 2107 * resource{"<resource>"} 2108 * direction{"<direction"} 2109 * } 2110 * } 2111 * <id>{ 2112 * alias{"<getInstanceArg"} 2113 * } 2114 * <id> is the ID of the system transliterator being defined. These 2115 * are public IDs enumerated by Transliterator.getAvailableIDs(), 2116 * unless the second field is "internal". 2117 * 2118 * <resource> is a ResourceReader resource name. Currently these refer 2119 * to file names under com/ibm/text/resources. This string is passed 2120 * directly to ResourceReader, together with <encoding>. 2121 * 2122 * <direction> is either "FORWARD" or "REVERSE". 2123 * 2124 * <getInstanceArg> is a string to be passed directly to 2125 * Transliterator.getInstance(). The returned Transliterator object 2126 * then has its ID changed to <id> and is returned. 2127 * 2128 * The extra blank field on "alias" lines is to make the array square. 
2129 */ 2130 UResourceBundle bundle, transIDs, colBund; 2131 bundle = UResourceBundle.getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, ROOT); 2132 transIDs = bundle.get(RB_RULE_BASED_IDS); 2133 2134 int row, maxRows; 2135 maxRows = transIDs.getSize(); 2136 for (row = 0; row < maxRows; row++) { 2137 colBund = transIDs.get(row); 2138 String ID = colBund.getKey(); 2139 if (ID.indexOf("-t-") >= 0) { 2140 continue; 2141 } 2142 UResourceBundle res = colBund.get(0); 2143 String type = res.getKey(); 2144 if (type.equals("file") || type.equals("internal")) { 2145 // Rest of line is <resource>:<encoding>:<direction> 2146 // pos colon c2 2147 String resString = res.getString("resource"); 2148 int dir; 2149 String direction = res.getString("direction"); 2150 switch (direction.charAt(0)) { 2151 case 'F': 2152 dir = FORWARD; 2153 break; 2154 case 'R': 2155 dir = REVERSE; 2156 break; 2157 default: 2158 throw new RuntimeException("Can't parse direction: " + direction); 2159 } registry.put(ID, resString, dir, !type.equals("internal"))2160 registry.put(ID, 2161 resString, // resource 2162 dir, 2163 !type.equals("internal")); 2164 } else if (type.equals("alias")) { 2165 //'alias'; row[2]=createInstance argument 2166 String resString = res.getString(); registry.put(ID, resString, true)2167 registry.put(ID, resString, true); 2168 } else { 2169 // Unknown type 2170 throw new RuntimeException("Unknow type: " + type); 2171 } 2172 } 2173 registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false)2174 registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false); 2175 2176 // Register non-rule-based transliterators registerClass(NullTransliterator._ID, NullTransliterator.class, null)2177 registerClass(NullTransliterator._ID, 2178 NullTransliterator.class, null); RemoveTransliterator.register()2179 RemoveTransliterator.register(); EscapeTransliterator.register()2180 EscapeTransliterator.register(); UnescapeTransliterator.register()2181 
UnescapeTransliterator.register(); LowercaseTransliterator.register()2182 LowercaseTransliterator.register(); UppercaseTransliterator.register()2183 UppercaseTransliterator.register(); TitlecaseTransliterator.register()2184 TitlecaseTransliterator.register(); CaseFoldTransliterator.register()2185 CaseFoldTransliterator.register(); UnicodeNameTransliterator.register()2186 UnicodeNameTransliterator.register(); NameUnicodeTransliterator.register()2187 NameUnicodeTransliterator.register(); NormalizationTransliterator.register()2188 NormalizationTransliterator.register(); BreakTransliterator.register()2189 BreakTransliterator.register(); AnyTransliterator.register()2190 AnyTransliterator.register(); // do this last! 2191 } 2192 2193 /** 2194 * Register the script-based "Any" transliterators: Any-Latin, Any-Greek 2195 * @deprecated This API is ICU internal only. 2196 * @hide deprecated on icu4j-org 2197 * @hide draft / provisional / internal are hidden on OHOS 2198 */ 2199 @Deprecated registerAny()2200 public static void registerAny() { 2201 AnyTransliterator.register(); 2202 } 2203 2204 /** 2205 * The factory interface for transliterators. Transliterator 2206 * subclasses can register factory objects for IDs using the 2207 * registerFactory() method of Transliterator. When invoked, the 2208 * factory object will be passed the ID being instantiated. This 2209 * makes it possible to register one factory method to more than 2210 * one ID, or for a factory method to parameterize its result 2211 * based on the variant. 2212 * @hide exposed on OHOS 2213 */ 2214 public static interface Factory { 2215 /** 2216 * Return a transliterator for the given ID. 2217 */ getInstance(String ID)2218 Transliterator getInstance(String ID); 2219 } 2220 2221 /** 2222 * Implements StringTransform via this method. 
* @param source text to be transformed (eg lowercased)
     * @return result of passing <code>source</code> to <code>transliterate()</code>
     * @hide unsupported on OHOS
     */
    @Override
    public String transform(String source) {
        String transformed = transliterate(source);
        return transformed;
    }
}