• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 1996-2016, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 package ohos.global.icu.text;
11 
12 import java.text.MessageFormat;
13 import java.util.ArrayList;
14 import java.util.Collections;
15 import java.util.Enumeration;
16 import java.util.HashMap;
17 import java.util.List;
18 import java.util.Locale;
19 import java.util.Map;
20 import java.util.MissingResourceException;
21 import java.util.Objects;
22 
23 import ohos.global.icu.impl.ICUData;
24 import ohos.global.icu.impl.ICUResourceBundle;
25 import ohos.global.icu.impl.Utility;
26 import ohos.global.icu.impl.UtilityExtensions;
27 import ohos.global.icu.text.RuleBasedTransliterator.Data;
28 import ohos.global.icu.text.TransliteratorIDParser.SingleID;
29 import ohos.global.icu.util.CaseInsensitiveString;
30 import ohos.global.icu.util.ULocale;
31 import ohos.global.icu.util.ULocale.Category;
32 import ohos.global.icu.util.UResourceBundle;
33 
34 /**
35  * <code>Transliterator</code> is an abstract class that transliterates text from one format to another. The most common
36  * kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator
37  * changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not
38  * <em>translate</em> Russian to English! Transliteration, unlike translation, operates on characters, without reference
39  * to the meanings of words and sentences.
40  *
41  * <p>
42  * Although script conversion is its most common use, a transliterator can actually perform a more general class of
43  * tasks. In fact, <code>Transliterator</code> defines a very general API which specifies only that a segment of the
44  * input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of
45  * <code>Transliterator</code>.
46  *
47  * <p>
48  * <b>Transliterators are stateless</b>
49  *
50  * <p>
51  * <code>Transliterator</code> objects are <em>stateless</em>; they retain no information between calls to
52  * <code>transliterate()</code>. As a result, threads may share transliterators without synchronizing them. This might
53  * seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex
54  * transliterations by delaying the replacement of text until it is known that no other replacements are possible. In
55  * other words, although the <code>Transliterator</code> objects are stateless, the source text itself embodies all the
56  * needed information, and delayed operation allows arbitrary complexity.
57  *
58  * <p>
59  * <b>Batch transliteration</b>
60  *
61  * <p>
62  * The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as
63  * <em>batch</em> transliteration. For example, given a string <code>input</code> and a transliterator <code>t</code>,
64  * the call
65  *
66  * <blockquote><code>String result = t.transliterate(input);
67  * </code></blockquote>
68  *
69  * will transliterate it and return the result. Other methods allow the client to specify a substring to be
70  * transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band
71  * information (such as text styles).
72  *
73  * <p>
74  * <b>Keyboard transliteration</b>
75  *
76  * <p>
77  * Somewhat more involved is <em>keyboard</em>, or incremental transliteration. This is the transliteration of text that
78  * is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal
79  * fashion.
80  *
81  * <p>
82  * In keyboard transliteration, a <code>Replaceable</code> buffer stores the text. As text is inserted, as much as
83  * possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being
84  * modified as each new character arrives.
85  *
86  * <p>
87  * Consider the simple rule-based Transliterator:
88  *
89  * <blockquote><code>
90  * th&gt;{theta}<br>
91  * t&gt;{tau}
92  * </code></blockquote>
93  *
94  * When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is
95  * 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string:
96  *
97  * <blockquote><code>
98  * t&gt;|{tau}<br>
99  * {tau}h&gt;{theta}
100  * </code></blockquote>
101  *
102  * Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is
103  * accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across
104  * calls to <code>transliterate()</code>. Typically, the cursor will be coincident with the insertion point, but in a
105  * case like the one above, it will precede the insertion point.
106  *
107  * <p>
108  * Keyboard transliteration methods maintain a set of three indices that are updated with each call to
109  * <code>transliterate()</code>, including the cursor, start, and limit. These indices are changed by the method, and
110  * they are passed in and out via a Position object. The <code>start</code> index marks the beginning of the substring
111  * that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index;
112  * that's the <code>cursor</code>). The <code>cursor</code> index, described above, marks the point at which the
113  * transliterator last stopped, either because it reached the end, or because it required more characters to
114  * disambiguate between possible inputs. The <code>cursor</code> can also be explicitly set by rules.
115  * Any characters before the <code>cursor</code> index are frozen; future keyboard
116  * transliteration calls within this input sequence will not change them. New text is inserted at the <code>limit</code>
117  * index, which marks the end of the substring that the transliterator looks at.
118  *
119  * <p>
120  * Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It
121  * only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the
122  * client code knows that no more characters are forthcoming, perhaps because the user has performed some input
123  * termination operation, then it should call <code>finishTransliteration()</code> to complete any pending
124  * transliterations.
125  *
126  * <p>
127  * <b>Inverses</b>
128  *
129  * <p>
130  * Pairs of transliterators may be inverses of one another. For example, if transliterator <b>A</b> transliterates
131  * characters by incrementing their Unicode value (so "abc" -&gt; "def"), and transliterator <b>B</b> decrements character
132  * values, then <b>A</b> is an inverse of <b>B</b> and vice versa. If we compose <b>A</b> with <b>B</b> in a compound
133  * transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input
134  * text.
135  *
136  * The <code>Transliterator</code> method <code>getInverse()</code> returns a transliterator's inverse, if one exists,
137  * or <code>null</code> otherwise. However, the result of <code>getInverse()</code> usually will <em>not</em> be a true
138  * mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider
139  * two transliterators: <b>AB</b>, which transliterates the character 'A' to 'B', and <b>BA</b>, which transliterates
140  * 'B' to 'A'. It might seem that these are exact inverses, since
141  *
142  * <blockquote>"A" x <b>AB</b> -&gt; "B"<br>
143  * "B" x <b>BA</b> -&gt; "A"</blockquote>
144  *
145  * where 'x' represents transliteration. However,
146  *
147  * <blockquote>"ABCD" x <b>AB</b> -&gt; "BBCD"<br>
148  * "BBCD" x <b>BA</b> -&gt; "AACD"</blockquote>
149  *
150  * so <b>AB</b> composed with <b>BA</b> is not the identity. Nonetheless, <b>BA</b> may be usefully considered to be
151  * <b>AB</b>'s inverse, and it is on this basis that <b>AB</b><code>.getInverse()</code> could legitimately return
152  * <b>BA</b>.
153  *
154  * <p>
155  * <b>Filtering</b>
156  * <p>Each transliterator has a filter, which restricts changes to those characters selected by the filter. The
157  * filter affects just the characters that are changed -- the characters outside of the filter are still part of the
158  * context for the filter. For example, in the following even though 'x' is filtered out, and doesn't convert to y, it does affect the conversion of 'a'.
159  *
160  * <pre>
161  * String rules = &quot;x &gt; y; x{a} &gt; b; &quot;;
162  * Transliterator tempTrans = Transliterator.createFromRules(&quot;temp&quot;, rules, Transliterator.FORWARD);
163  * tempTrans.setFilter(new UnicodeSet(&quot;[a]&quot;));
164  * String tempResult = tempTrans.transform(&quot;xa&quot;);
165  * // results in &quot;xb&quot;
166  *</pre>
167  * <p>
168  * <b>IDs and display names</b>
169  *
170  * <p>
171  * A transliterator is designated by a short identifier string or <em>ID</em>. IDs follow the format
172  * <em>source-destination</em>, where <em>source</em> describes the entity being replaced, and <em>destination</em>
173  * describes the entity replacing <em>source</em>. The entities may be the names of scripts, particular sequences of
174  * characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from
175  * Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1
176  * characters might be named "KeyboardEscape-Latin1". By convention, system entity names are in English, with the
177  * initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes.
178  *
179  * <p>
180  * In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces,
181  * returned by {@link #getDisplayName}.
182  *
183  * <p>
184  * <b>Factory methods and registration</b>
185  *
186  * <p>
187  * In general, client code should use the factory method <code>getInstance()</code> to obtain an instance of a
188  * transliterator given its ID. Valid IDs may be enumerated using <code>getAvailableIDs()</code>. Since transliterators
189  * are stateless, multiple calls to <code>getInstance()</code> with the same ID will return the same object.
190  *
191  * <p>
192  * In addition to the system transliterators registered at startup, user transliterators may be registered by calling
193  * <code>registerInstance()</code> at run time. To register a transliterator subclass without instantiating it (until it
194  * is needed), users may call <code>registerClass()</code>.
195  *
196  * <p>
197  * <b>Composed transliterators</b>
198  *
199  * <p>
200  * In addition to built-in system transliterators like "Latin-Greek", there are also built-in <em>composed</em>
201  * transliterators. These are implemented by composing two or more component transliterators. For example, if we have
202  * scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12
203  * transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an
204  * intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M",
205  * "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2<em>n</em> vs. <em>n</em>
206  * <sup>2</sup> - <em>n</em>, so as <em>n</em> gets larger the gain becomes significant. With 9 scripts, it's 18 vs. 72
207  * rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that
208  * the given transliterator is intended to be composed with others, rather than be used as is.
209  *
210  * <p>
211  * Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati"
212  * is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this
213  * transliterator is instantiated, it appears externally to be a standard transliterator (e.g., getID() returns
214  * "Devanagari-Gujarati").
215  *
216  * <p>
217  * <b>Subclassing</b>
218  *
219  * <p>
220  * Subclasses must implement the abstract method <code>handleTransliterate()</code>.
221  * <p>
222  * Subclasses should override the <code>transliterate()</code> method taking a <code>Replaceable</code> and the
223  * <code>transliterate()</code> method taking a <code>String</code> and <code>StringBuffer</code> if the performance of
224  * these methods can be improved over the performance obtained by the default implementations in this class.
225  *
226  * <p><b>Rule syntax</b>
227  *
228  * <p>A set of rules determines how to perform translations.
229  * Rules within a rule set are separated by semicolons (';').
230  * To include a literal semicolon, prefix it with a backslash ('\').
231  * Unicode Pattern_White_Space is ignored.
232  * If the first non-blank character on a line is '#',
233  * the entire line is ignored as a comment.
234  *
235  * <p>Each set of rules consists of two groups, one forward, and one
236  * reverse. This is a convention that is not enforced; rules for one
237  * direction may be omitted, with the result that translations in
238  * that direction will not modify the source text. In addition,
239  * bidirectional forward-reverse rules may be specified for
240  * symmetrical transformations.
241  *
242  * <p>Note: Another description of the Transliterator rule syntax is available in
243  * <a href="https://www.unicode.org/reports/tr35/tr35-general.html#Transform_Rules_Syntax">section
244  * Transform Rules Syntax of UTS #35: Unicode LDML</a>.
245  * The rules are shown there using arrow symbols ← and → and ↔.
246  * ICU supports both those and the equivalent ASCII symbols &lt; and &gt; and &lt;&gt;.
247  *
248  * <p>Rule statements take one of the following forms:
249  *
250  * <dl>
251  *     <dt><code>$alefmadda=\\u0622;</code></dt>
252  *     <dd><strong>Variable definition.</strong> The name on the
253  *         left is assigned the text on the right. In this example,
254  *         after this statement, instances of the left hand name,
255  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
256  *         the Unicode character U+0622. Variable names must begin
257  *         with a letter and consist only of letters, digits, and
258  *         underscores. Case is significant. Duplicate names cause
259  *         an exception to be thrown, that is, variables cannot be
260  *         redefined. The right hand side may contain well-formed
261  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
262  *         The right hand side may contain embedded <code>UnicodeSet</code>
263  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
264  *     <dt><code>ai&gt;$alefmadda;</code></dt>
265  *     <dd><strong>Forward translation rule.</strong> This rule
266  *         states that the string on the left will be changed to the
267  *         string on the right when performing forward
268  *         transliteration.</dd>
269  *     <dt><code>ai&lt;$alefmadda;</code></dt>
270  *     <dd><strong>Reverse translation rule.</strong> This rule
271  *         states that the string on the right will be changed to
272  *         the string on the left when performing reverse
273  *         transliteration.</dd>
274  * </dl>
275  *
276  * <dl>
277  *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
278  *     <dd><strong>Bidirectional translation rule.</strong> This
279  *         rule states that the string on the left will be changed
280  *         to the string on the right when performing forward
281  *         transliteration, and vice versa when performing reverse
282  *         transliteration.</dd>
283  * </dl>
284  *
285  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
286  * string</em>. The match pattern consists of literal characters,
287  * optionally preceded by context, and optionally followed by
288  * context. Context characters, like literal pattern characters,
289  * must be matched in the text being transliterated. However, unlike
290  * literal pattern characters, they are not replaced by the output
291  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
292  * indicates the characters &quot;<code>def</code>&quot; must be
293  * preceded by &quot;<code>abc</code>&quot; for a successful match.
294  * If there is a successful match, &quot;<code>def</code>&quot; will
295  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
296  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
297  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
298  * (or &quot;<code>123}456</code>&quot;) in which the literal
299  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
300  *
301  * <p>The output string of a forward or reverse rule consists of
302  * characters to replace the literal pattern characters. If the
303  * output string contains the character '<code>|</code>', this is
304  * taken to indicate the location of the <em>cursor</em> after
305  * replacement. The cursor is the point in the text at which the
306  * next replacement, if any, will be applied. The cursor is usually
307  * placed within the replacement text; however, it can actually be
308  * placed into the preceding or following context by using the
309  * special character '@'. Examples:
310  *
311  * <pre>
312  *     a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor before a
313  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between y and z
314  * </pre>
315  *
316  * <p><b>UnicodeSet</b>
317  *
318  * <p><code>UnicodeSet</code> patterns may appear anywhere that
319  * makes sense. They may appear in variable definitions.
320  * Contrariwise, <code>UnicodeSet</code> patterns may themselves
321  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
322  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.
323  *
324  * <p><code>UnicodeSet</code> patterns may also be embedded directly
325  * into rule strings. Thus, the following two rules are equivalent:
326  *
327  * <pre>
328  *     $vowel=[aeiou]; $vowel&gt;'*'; # One way to do this
329  *     [aeiou]&gt;'*'; # Another way
330  * </pre>
331  *
332  * <p>See {@link UnicodeSet} for more documentation and examples.
333  *
334  * <p><b>Segments</b>
335  *
336  * <p>Segments of the input string can be matched and copied to the
337  * output string. This makes certain sets of rules simpler and more
338  * general, and makes reordering possible. For example:
339  *
340  * <pre>
341  *     ([a-z]) &gt; $1 $1; # double lowercase letters
342  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs
343  * </pre>
344  *
345  * <p>The segment of the input string to be copied is delimited by
346  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
347  * nine segments may be defined. Segments may not overlap. In the
348  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
349  * represent the input string segments, in left-to-right order of
350  * definition.
351  *
352  * <p><b>Anchors</b>
353  *
354  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
355  * special characters '<code>^</code>' and '<code>$</code>'. For example:
356  *
357  * <pre>
358  *   ^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text
359  *   &nbsp; a&nbsp;&nbsp; &gt; 'A'; # match other instances of 'a'
360  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text
361  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances of 'z'
362  * </pre>
363  *
364  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
365  * This is done by including a virtual anchor character '<code>$</code>' at the end of the
366  * set pattern. Although this is usually the match character for the end anchor, the set will
367  * match either the beginning or the end of the text, depending on its placement. For
368  * example:
369  *
370  * <pre>
371  *   $x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor
372  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start
373  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end
374  * </pre>
375  *
376  * <p><b>Example</b>
377  *
378  * <p>The following example rules illustrate many of the features of
379  * the rule language.
380  *
381  * <table border="0" cellpadding="4">
382  *     <tr>
383  *         <td style="vertical-align: top;">Rule 1.</td>
384  *         <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}&gt;x|y</code></td>
385  *     </tr>
386  *     <tr>
387  *         <td style="vertical-align: top;">Rule 2.</td>
388  *         <td style="vertical-align: top; white-space: nowrap;"><code>xyz&gt;r</code></td>
389  *     </tr>
390  *     <tr>
391  *         <td style="vertical-align: top;">Rule 3.</td>
392  *         <td style="vertical-align: top; white-space: nowrap;"><code>yz&gt;q</code></td>
393  *     </tr>
394  * </table>
395  *
396  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
397  * yields the following results:
398  *
399  * <table border="0" cellpadding="4">
400  *     <tr>
401  *         <td style="vertical-align: top; white-space: nowrap;"><code>|adefabcdefz</code></td>
402  *         <td style="vertical-align: top;">Initial state, no rules match. Advance
403  *         cursor.</td>
404  *     </tr>
405  *     <tr>
406  *         <td style="vertical-align: top; white-space: nowrap;"><code>a|defabcdefz</code></td>
407  *         <td style="vertical-align: top;">Still no match. Rule 1 does not match
408  *         because the preceding context is not present.</td>
409  *     </tr>
410  *     <tr>
411  *         <td style="vertical-align: top; white-space: nowrap;"><code>ad|efabcdefz</code></td>
412  *         <td style="vertical-align: top;">Still no match. Keep advancing until
413  *         there is a match...</td>
414  *     </tr>
415  *     <tr>
416  *         <td style="vertical-align: top; white-space: nowrap;"><code>ade|fabcdefz</code></td>
417  *         <td style="vertical-align: top;">...</td>
418  *     </tr>
419  *     <tr>
420  *         <td style="vertical-align: top; white-space: nowrap;"><code>adef|abcdefz</code></td>
421  *         <td style="vertical-align: top;">...</td>
422  *     </tr>
423  *     <tr>
424  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefa|bcdefz</code></td>
425  *         <td style="vertical-align: top;">...</td>
426  *     </tr>
427  *     <tr>
428  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefab|cdefz</code></td>
429  *         <td style="vertical-align: top;">...</td>
430  *     </tr>
431  *     <tr>
432  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabc|defz</code></td>
433  *         <td style="vertical-align: top;">Rule 1 matches; replace &quot;<code>def</code>&quot;
434  *         with &quot;<code>xy</code>&quot; and back up the cursor
435  *         to before the '<code>y</code>'.</td>
436  *     </tr>
437  *     <tr>
438  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx|yz</code></td>
439  *         <td style="vertical-align: top;">Although &quot;<code>xyz</code>&quot; is
440  *         present, rule 2 does not match because the cursor is
441  *         before the '<code>y</code>', not before the '<code>x</code>'.
442  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;
443  *         with &quot;<code>q</code>&quot;.</td>
444  *     </tr>
445  *     <tr>
446  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq|</code></td>
447  *         <td style="vertical-align: top;">The cursor is at the end;
448  *         transliteration is complete.</td>
449  *     </tr>
450  * </table>
451  *
452  * <p>The order of rules is significant. If multiple rules may match
453  * at some point, the first matching rule is applied.
454  *
455  * <p>Forward and reverse rules may have an empty output string.
456  * Otherwise, an empty left or right hand side of any statement is a
457  * syntax error.
458  *
459  * <p>Single quotes are used to quote any character other than a
460  * digit or letter. To specify a single quote itself, inside or
461  * outside of quotes, use two single quotes in a row. For example,
462  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
463  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
464  *
465  * <p><b>Notes</b>
466  *
467  * <p>While a Transliterator is being built from rules, it checks that
468  * the rules are added in proper order. For example, if the rule
469  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
470  * then the second rule will throw an exception. The reason is that
471  * the second rule can never be triggered, since the first rule
472  * always matches anything it matches. In other words, the first
473  * rule <em>masks</em> the second rule.
474  *
475  * @author Alan Liu
476  */
477 public abstract class Transliterator implements StringTransform  {
478     /**
479      * Direction constant indicating the forward direction in a transliterator,
480      * e.g., the forward rules of a rule-based Transliterator.  An "A-B"
481      * transliterator transliterates A to B when operating in the forward
482      * direction, and B to A when operating in the reverse direction.
483      */
484     public static final int FORWARD = 0;
485 
486     /**
487      * Direction constant indicating the reverse direction in a transliterator,
488      * e.g., the reverse rules of a rule-based Transliterator.  An "A-B"
489      * transliterator transliterates A to B when operating in the forward
490      * direction, and B to A when operating in the reverse direction.
491      */
492     public static final int REVERSE = 1;
493 
494     /**
495      * Position structure for incremental transliteration.  This data
496      * structure defines two substrings of the text being
497      * transliterated.  The first region, [contextStart,
498      * contextLimit), defines what characters the transliterator will
499      * read as context.  The second region, [start, limit), defines
500      * what characters will actually be transliterated.  The second
501      * region should be a subset of the first.
502      *
503      * <p>After a transliteration operation, some of the indices in this
504      * structure will be modified.  See the field descriptions for
505      * details.
506      *
507      * <p>contextStart &lt;= start &lt;= limit &lt;= contextLimit
508      *
509      * <p>Note: All index values in this structure must be at code point
510      * boundaries.  That is, none of them may occur between two code units
511      * of a surrogate pair.  If any index does split a surrogate pair,
512      * results are unspecified.
513      */
514     public static class Position {
515 
516         /**
517          * Beginning index, inclusive, of the context to be considered for
518          * a transliteration operation.  The transliterator will ignore
519          * anything before this index.  INPUT/OUTPUT parameter: This parameter
520          * is updated by a transliteration operation to reflect the maximum
521          * amount of antecontext needed by a transliterator.
522          */
523         public int contextStart;
524 
525         /**
526          * Ending index, exclusive, of the context to be considered for a
527          * transliteration operation.  The transliterator will ignore
528          * anything at or after this index.  INPUT/OUTPUT parameter: This
529          * parameter is updated to reflect changes in the length of the
530          * text, but points to the same logical position in the text.
531          */
532         public int contextLimit;
533 
534         /**
535          * Beginning index, inclusive, of the text to be transliteratd.
536          * INPUT/OUTPUT parameter: This parameter is advanced past
537          * characters that have already been transliterated by a
538          * transliteration operation.
539          */
540         public int start;
541 
542         /**
543          * Ending index, exclusive, of the text to be transliteratd.
544          * INPUT/OUTPUT parameter: This parameter is updated to reflect
545          * changes in the length of the text, but points to the same
546          * logical position in the text.
547          */
548         public int limit;
549 
550         /**
551          * Constructs a Position object with start, limit,
552          * contextStart, and contextLimit all equal to zero.
553          */
Position()554         public Position() {
555             this(0, 0, 0, 0);
556         }
557 
558         /**
559          * Constructs a Position object with the given start,
560          * contextStart, and contextLimit.  The limit is set to the
561          * contextLimit.
562          */
Position(int contextStart, int contextLimit, int start)563         public Position(int contextStart, int contextLimit, int start) {
564             this(contextStart, contextLimit, start, contextLimit);
565         }
566 
567         /**
568          * Constructs a Position object with the given start, limit,
569          * contextStart, and contextLimit.
570          */
Position(int contextStart, int contextLimit, int start, int limit)571         public Position(int contextStart, int contextLimit,
572                         int start, int limit) {
573             this.contextStart = contextStart;
574             this.contextLimit = contextLimit;
575             this.start = start;
576             this.limit = limit;
577         }
578 
579         /**
580          * Constructs a Position object that is a copy of another.
581          */
Position(Position pos)582         public Position(Position pos) {
583             set(pos);
584         }
585 
586         /**
587          * Copies the indices of this position from another.
588          */
set(Position pos)589         public void set(Position pos) {
590             contextStart = pos.contextStart;
591             contextLimit = pos.contextLimit;
592             start = pos.start;
593             limit = pos.limit;
594         }
595 
596         /**
597          * Returns true if this Position is equal to the given object.
598          */
599         @Override
equals(Object obj)600         public boolean equals(Object obj) {
601             if (obj instanceof Position) {
602                 Position pos = (Position) obj;
603                 return contextStart == pos.contextStart &&
604                     contextLimit == pos.contextLimit &&
605                     start == pos.start &&
606                     limit == pos.limit;
607             }
608             return false;
609         }
610 
611         /**
612          * {@inheritDoc}
613          */
614         @Override
hashCode()615         public int hashCode() {
616             return Objects.hash(contextStart, contextLimit, start, limit);
617         }
618 
619         /**
620          * Returns a string representation of this Position.
621          * @return a string representation of the object.
622          */
623         @Override
toString()624         public String toString() {
625             return "[cs=" + contextStart
626                 + ", s=" + start
627                 + ", l=" + limit
628                 + ", cl=" + contextLimit
629                 + "]";
630         }
631 
632         /**
633          * Check all bounds.  If they are invalid, throw an exception.
634          * @param length the length of the string this object applies to
635          * @exception IllegalArgumentException if any indices are out
636          * of bounds
637          */
validate(int length)638         public final void validate(int length) {
639             if (contextStart < 0 ||
640                 start < contextStart ||
641                 limit < start ||
642                 contextLimit < limit ||
643                 length < contextLimit) {
644                 throw new IllegalArgumentException("Invalid Position {cs=" +
645                                                    contextStart + ", s=" +
646                                                    start + ", l=" +
647                                                    limit + ", cl=" +
648                                                    contextLimit + "}, len=" +
649                                                    length);
650             }
651         }
652     }
653 
    /**
     * Programmatic name, e.g., "Latin-Arabic".  Assigned by the
     * constructor, which rejects <code>null</code>.
     */
    private String ID;

    /**
     * This transliterator's filter.  Any character for which
     * <code>filter.contains()</code> returns <code>false</code> will not be
     * altered by this transliterator.  If <code>filter</code> is
     * <code>null</code> then no filtering is applied.
     */
    private UnicodeSet filter;

    // NOTE(review): presumably the largest amount of context this
    // transliterator examines; starts at 0.  Confirm against the accessor
    // and the subclasses that set it.
    private int maximumContextLength = 0;

    /**
     * System transliterator registry.
     */
    private static TransliteratorRegistry registry;

    // NOTE(review): presumably caches display names keyed by
    // case-insensitive ID -- confirm against the code that fills it.
    private static Map<CaseInsensitiveString, String> displayNameCache;
675 
    /**
     * Prefix for resource bundle key for the display name for a
     * transliterator.  The ID is appended to this to form the key.
     * The resource bundle value should be a String.
     */
    private static final String RB_DISPLAY_NAME_PREFIX = "%Translit%%";

    /**
     * Prefix for resource bundle key for the display name for a
     * transliterator SCRIPT.  The ID is appended to this to form the key.
     * The resource bundle value should be a String.
     */
    private static final String RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%";

    /**
     * Resource bundle key for display name pattern.
     * The resource bundle value should be a String forming a
     * MessageFormat pattern, e.g.:
     * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
     */
    private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";

    /**
     * Delimiter between elements in a compound ID.
     */
    static final char ID_DELIM = ';';

    /**
     * Delimiter before target in an ID.
     */
    static final char ID_SEP = '-';

    /**
     * Delimiter before variant in an ID.
     */
    static final char VARIANT_SEP = '/';

    /**
     * To enable debugging output in the Transliterator component, set
     * DEBUG to true.  When set, the transliteration code prints its
     * input and output for each run and pass to <code>System.out</code>.
     *
     * N.B. Make sure to recompile all of the ohos.global.icu.text package
     * after changing this.  Easiest way to do this is 'ant clean
     * core' ('ant' will NOT pick up the dependency automatically).
     *
     * Warning: this generates a lot of output.
     */
    static final boolean DEBUG = false;
724 
725     /**
726      * Default constructor.
727      * @param ID the string identifier for this transliterator
728      * @param filter the filter.  Any character for which
729      * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
730      * altered by this transliterator.  If <tt>filter</tt> is
731      * <tt>null</tt> then no filtering is applied.
732      * @hide unsupported on OHOS
733      */
Transliterator(String ID, UnicodeFilter filter)734     protected Transliterator(String ID, UnicodeFilter filter) {
735         if (ID == null) {
736             throw new NullPointerException();
737         }
738         this.ID = ID;
739         setFilter(filter);
740     }
741 
742     /**
743      * Transliterates a segment of a string, with optional filtering.
744      *
745      * @param text the string to be transliterated
746      * @param start the beginning index, inclusive; <code>0 &lt;= start
747      * &lt;= limit</code>.
748      * @param limit the ending index, exclusive; <code>start &lt;= limit
749      * &lt;= text.length()</code>.
750      * @return The new limit index.  The text previously occupying <code>[start,
751      * limit)</code> has been transliterated, possibly to a string of a different
752      * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
753      * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
754      * the returned value is -1 and the input string remains unchanged.
755      */
transliterate(Replaceable text, int start, int limit)756     public final int transliterate(Replaceable text, int start, int limit) {
757         if (start < 0 ||
758             limit < start ||
759             text.length() < limit) {
760             return -1;
761         }
762 
763         Position pos = new Position(start, limit, start);
764         filteredTransliterate(text, pos, false, true);
765         return pos.limit;
766     }
767 
768     /**
769      * Transliterates an entire string in place. Convenience method.
770      * @param text the string to be transliterated
771      */
transliterate(Replaceable text)772     public final void transliterate(Replaceable text) {
773         transliterate(text, 0, text.length());
774     }
775 
776     /**
777      * Transliterate an entire string and returns the result. Convenience method.
778      *
779      * @param text the string to be transliterated
780      * @return The transliterated text
781      */
transliterate(String text)782     public final String transliterate(String text) {
783         ReplaceableString result = new ReplaceableString(text);
784         transliterate(result);
785         return result.toString();
786     }
787 
788     /**
789      * Transliterates the portion of the text buffer that can be
790      * transliterated unambiguosly after new text has been inserted,
791      * typically as a result of a keyboard event.  The new text in
792      * <code>insertion</code> will be inserted into <code>text</code>
793      * at <code>index.contextLimit</code>, advancing
794      * <code>index.contextLimit</code> by <code>insertion.length()</code>.
795      * Then the transliterator will try to transliterate characters of
796      * <code>text</code> between <code>index.start</code> and
797      * <code>index.contextLimit</code>.  Characters before
798      * <code>index.start</code> will not be changed.
799      *
800      * <p>Upon return, values in <code>index</code> will be updated.
801      * <code>index.contextStart</code> will be advanced to the first
802      * character that future calls to this method will read.
803      * <code>index.start</code> and <code>index.contextLimit</code> will
804      * be adjusted to delimit the range of text that future calls to
805      * this method may change.
806      *
807      * <p>Typical usage of this method begins with an initial call
808      * with <code>index.contextStart</code> and <code>index.contextLimit</code>
809      * set to indicate the portion of <code>text</code> to be
810      * transliterated, and <code>index.start == index.contextStart</code>.
811      * Thereafter, <code>index</code> can be used without
812      * modification in future calls, provided that all changes to
813      * <code>text</code> are made via this method.
814      *
815      * <p>This method assumes that future calls may be made that will
816      * insert new text into the buffer.  As a result, it only performs
817      * unambiguous transliterations.  After the last call to this
818      * method, there may be untransliterated text that is waiting for
819      * more input to resolve an ambiguity.  In order to perform these
820      * pending transliterations, clients should call {@link
821      * #finishTransliteration} after the last call to this
822      * method has been made.
823      *
824      * @param text the buffer holding transliterated and untransliterated text
825      * @param index the start and limit of the text, the position
826      * of the cursor, and the start and limit of transliteration.
827      * @param insertion text to be inserted and possibly
828      * transliterated into the translation buffer at
829      * <code>index.contextLimit</code>.  If <code>null</code> then no text
830      * is inserted.
831      * @see #handleTransliterate
832      * @exception IllegalArgumentException if <code>index</code>
833      * is invalid
834      */
transliterate(Replaceable text, Position index, String insertion)835     public final void transliterate(Replaceable text, Position index,
836                                     String insertion) {
837         index.validate(text.length());
838 
839 //        int originalStart = index.contextStart;
840         if (insertion != null) {
841             text.replace(index.limit, index.limit, insertion);
842             index.limit += insertion.length();
843             index.contextLimit += insertion.length();
844         }
845 
846         if (index.limit > 0 &&
847             UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) {
848             // Oops, there is a dangling lead surrogate in the buffer.
849             // This will break most transliterators, since they will
850             // assume it is part of a pair.  Don't transliterate until
851             // more text comes in.
852             return;
853         }
854 
855         filteredTransliterate(text, index, true, true);
856 
857 // TODO
858 // This doesn't work once we add quantifier support.  Need to rewrite
859 // this code to support quantifiers and 'use maximum backup <n>;'.
860 //
861 //        index.contextStart = Math.max(index.start - getMaximumContextLength(),
862 //                                      originalStart);
863     }
864 
865     /**
866      * Transliterates the portion of the text buffer that can be
867      * transliterated unambiguosly after a new character has been
868      * inserted, typically as a result of a keyboard event.  This is a
869      * convenience method; see {@link #transliterate(Replaceable,
870      * Transliterator.Position, String)} for details.
871      * @param text the buffer holding transliterated and
872      * untransliterated text
873      * @param index the start and limit of the text, the position
874      * of the cursor, and the start and limit of transliteration.
875      * @param insertion text to be inserted and possibly
876      * transliterated into the translation buffer at
877      * <code>index.contextLimit</code>.
878      * @see #transliterate(Replaceable, Transliterator.Position, String)
879      */
transliterate(Replaceable text, Position index, int insertion)880     public final void transliterate(Replaceable text, Position index,
881                                     int insertion) {
882         transliterate(text, index, UTF16.valueOf(insertion));
883     }
884 
885     /**
886      * Transliterates the portion of the text buffer that can be
887      * transliterated unambiguosly.  This is a convenience method; see
888      * {@link #transliterate(Replaceable, Transliterator.Position,
889      * String)} for details.
890      * @param text the buffer holding transliterated and
891      * untransliterated text
892      * @param index the start and limit of the text, the position
893      * of the cursor, and the start and limit of transliteration.
894      * @see #transliterate(Replaceable, Transliterator.Position, String)
895      */
transliterate(Replaceable text, Position index)896     public final void transliterate(Replaceable text, Position index) {
897         transliterate(text, index, null);
898     }
899 
900     /**
901      * Finishes any pending transliterations that were waiting for
902      * more characters.  Clients should call this method as the last
903      * call after a sequence of one or more calls to
904      * <code>transliterate()</code>.
905      * @param text the buffer holding transliterated and
906      * untransliterated text.
907      * @param index the array of indices previously passed to {@link
908      * #transliterate}
909      */
finishTransliteration(Replaceable text, Position index)910     public final void finishTransliteration(Replaceable text,
911                                             Position index) {
912         index.validate(text.length());
913         filteredTransliterate(text, index, false, true);
914     }
915 
    /**
     * Abstract method that concrete subclasses define to implement
     * their transliteration algorithm.  This method handles both
     * incremental and non-incremental transliteration.  Let
     * <code>originalStart</code> refer to the value of
     * <code>pos.start</code> upon entry.
     *
     * <ul>
     *  <li>If <code>incremental</code> is false, then this method
     *  should transliterate all characters between
     *  <code>pos.start</code> and <code>pos.limit</code>. Upon return
     *  <code>pos.start</code> must equal <code>pos.limit</code>;
     *  <code>filteredTransliterate()</code> throws a
     *  <code>RuntimeException</code> when a non-incremental run returns
     *  with the two still different.</li>
     *
     *  <li>If <code>incremental</code> is true, then this method
     *  should transliterate all characters between
     *  <code>pos.start</code> and <code>pos.limit</code> that can be
     *  unambiguously transliterated, regardless of future insertions
     *  of text at <code>pos.limit</code>.  Upon return,
     *  <code>pos.start</code> should be in the range
     *  [<code>originalStart</code>, <code>pos.limit</code>).
     *  <code>pos.start</code> should be positioned such that
     *  characters [<code>originalStart</code>, <code>
     *  pos.start</code>) will not be changed in the future by this
     *  transliterator and characters [<code>pos.start</code>,
     *  <code>pos.limit</code>) are unchanged.</li>
     * </ul>
     *
     * <p>Implementations of this method should also obey the
     * following invariants:</p>
     *
     * <ul>
     *  <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
     *  should be updated to reflect changes in length of the text
     *  between <code>pos.start</code> and <code>pos.limit</code>. The
     *  difference <code> pos.contextLimit - pos.limit</code> should
     *  not change.</li>
     *
     *  <li><code>pos.contextStart</code> should not change.</li>
     *
     *  <li>Upon return, neither <code>pos.start</code> nor
     *  <code>pos.limit</code> should be less than
     *  <code>originalStart</code>.</li>
     *
     *  <li>Text before <code>originalStart</code> and text after
     *  <code>pos.limit</code> should not change.</li>
     *
     *  <li>Text before <code>pos.contextStart</code> and text after
     *  <code> pos.contextLimit</code> should be ignored.</li>
     * </ul>
     *
     * <p>Subclasses may safely assume that all characters in
     * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
     * In other words, the filter has already been applied by the time
     * this method is called.  See
     * <code>filteredTransliterate()</code>.
     *
     * <p>This method is <b>not</b> for public consumption.  Calling
     * this method directly will transliterate
     * [<code>pos.start</code>, <code>pos.limit</code>) without
     * applying the filter. End user code should call <code>
     * transliterate()</code> instead of this method. Subclass code
     * should call <code>filteredTransliterate()</code> instead of
     * this method.
     *
     * @param text the buffer holding transliterated and
     * untransliterated text
     *
     * @param pos the indices indicating the start, limit, context
     * start, and context limit of the text.
     *
     * @param incremental if true, assume more text may be inserted at
     * <code>pos.limit</code> and act accordingly.  Otherwise,
     * transliterate all text between <code>pos.start</code> and
     * <code>pos.limit</code> and move <code>pos.start</code> up to
     * <code>pos.limit</code>.
     *
     * @see #transliterate
     * @hide unsupported on OHOS
     */
    protected abstract void handleTransliterate(Replaceable text,
                                                Position pos, boolean incremental);
997 
998     /**
999      * Top-level transliteration method, handling filtering, incremental and
1000      * non-incremental transliteration, and rollback.  All transliteration
1001      * public API methods eventually call this method with a rollback argument
1002      * of TRUE.  Other entities may call this method but rollback should be
1003      * FALSE.
1004      *
1005      * <p>If this transliterator has a filter, break up the input text into runs
1006      * of unfiltered characters.  Pass each run to
     * the subclass's <code>handleTransliterate()</code>.
1008      *
1009      * <p>In incremental mode, if rollback is TRUE, perform a special
1010      * incremental procedure in which several passes are made over the input
1011      * text, adding one character at a time, and committing successful
1012      * transliterations as they occur.  Unsuccessful transliterations are rolled
1013      * back and retried with additional characters to give correct results.
1014      *
1015      * @param text the text to be transliterated
1016      * @param index the position indices
1017      * @param incremental if TRUE, then assume more characters may be inserted
     * at index.limit, and postpone processing to accommodate future incoming
1019      * characters
1020      * @param rollback if TRUE and if incremental is TRUE, then perform special
1021      * incremental processing, as described above, and undo partial
1022      * transliterations where necessary.  If incremental is FALSE then this
1023      * parameter is ignored.
1024      */
filteredTransliterate(Replaceable text, Position index, boolean incremental, boolean rollback)1025     private void filteredTransliterate(Replaceable text,
1026                                        Position index,
1027                                        boolean incremental,
1028                                        boolean rollback) {
1029         // Short circuit path for transliterators with no filter in
1030         // non-incremental mode.
1031         if (filter == null && !rollback) {
1032             handleTransliterate(text, index, incremental);
1033             return;
1034         }
1035 
1036         //----------------------------------------------------------------------
1037         // This method processes text in two groupings:
1038         //
1039         // RUNS -- A run is a contiguous group of characters which are contained
1040         // in the filter for this transliterator (filter.contains(ch) == true).
1041         // Text outside of runs may appear as context but it is not modified.
1042         // The start and limit Position values are narrowed to each run.
1043         //
1044         // PASSES (incremental only) -- To make incremental mode work correctly,
1045         // each run is broken up into n passes, where n is the length (in code
1046         // points) of the run.  Each pass contains the first n characters.  If a
1047         // pass is completely transliterated, it is committed, and further passes
1048         // include characters after the committed text.  If a pass is blocked,
1049         // and does not transliterate completely, then this method rolls back
1050         // the changes made during the pass, extends the pass by one code point,
1051         // and tries again.
1052         //----------------------------------------------------------------------
1053 
1054         // globalLimit is the limit value for the entire operation.  We
1055         // set index.limit to the end of each unfiltered run before
1056         // calling handleTransliterate(), so we need to maintain the real
1057         // value of index.limit here.  After each transliteration, we
1058         // update globalLimit for insertions or deletions that have
1059         // happened.
1060         int globalLimit = index.limit;
1061 
1062         // If there is a non-null filter, then break the input text up.  Say the
1063         // input text has the form:
1064         //   xxxabcxxdefxx
1065         // where 'x' represents a filtered character (filter.contains('x') ==
1066         // false).  Then we break this up into:
1067         //   xxxabc xxdef xx
1068         // Each pass through the loop consumes a run of filtered
1069         // characters (which are ignored) and a subsequent run of
1070         // unfiltered characters (which are transliterated).
1071 
1072         StringBuffer log = null;
1073         if (DEBUG) {
1074             log = new StringBuffer();
1075         }
1076 
1077         for (;;) {
1078 
1079             if (filter != null) {
1080                 // Narrow the range to be transliterated to the first run
1081                 // of unfiltered characters at or after index.start.
1082 
1083                 // Advance past filtered chars
1084                 int c;
1085                 while (index.start < globalLimit &&
1086                        !filter.contains(c=text.char32At(index.start))) {
1087                     index.start += UTF16.getCharCount(c);
1088                 }
1089 
1090                 // Find the end of this run of unfiltered chars
1091                 index.limit = index.start;
1092                 while (index.limit < globalLimit &&
1093                        filter.contains(c=text.char32At(index.limit))) {
1094                     index.limit += UTF16.getCharCount(c);
1095                 }
1096             }
1097 
1098             // Check to see if the unfiltered run is empty.  This only
1099             // happens at the end of the string when all the remaining
1100             // characters are filtered.
1101             if (index.start == index.limit) {
1102                 break;
1103             }
1104 
1105             // Is this run incremental?  If there is additional
1106             // filtered text (if limit < globalLimit) then we pass in
1107             // an incremental value of FALSE to force the subclass to
1108             // complete the transliteration for this run.
1109             boolean isIncrementalRun =
1110                 (index.limit < globalLimit ? false : incremental);
1111 
1112             int delta;
1113 
1114             // Implement rollback.  To understand the need for rollback,
1115             // consider the following transliterator:
1116             //
1117             //  "t" is "a > A;"
1118             //  "u" is "A > b;"
1119             //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
1120             //
1121             // Now apply "v" to the input text "a".  The result is "b".  But if
1122             // the transliteration is done incrementally, then the NFD holds
1123             // things up after "t" has already transformed "a" to "A".  When
1124             // finishTransliterate() is called, "A" is _not_ processed because
1125             // it gets excluded by the [:Ll:] filter, and the end result is "A"
1126             // -- incorrect.  The problem is that the filter is applied to a
1127             // partially-transliterated result, when we only want it to apply to
1128             // input text.  Although this example describes a compound
1129             // transliterator containing NFD and a specific filter, it can
1130             // happen with any transliterator which does a partial
1131             // transformation in incremental mode into characters outside its
1132             // filter.
1133             //
1134             // To handle this, when in incremental mode we supply characters to
1135             // handleTransliterate() in several passes.  Each pass adds one more
1136             // input character to the input text.  That is, for input "ABCD", we
1137             // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
1138             // any point we block (upon return, start < limit) then we roll
1139             // back.  If at any point we complete the run (upon return start ==
1140             // limit) then we commit that run.
1141 
1142             if (rollback && isIncrementalRun) {
1143 
1144                 if (DEBUG) {
1145                     log.setLength(0);
1146                     System.out.println("filteredTransliterate{"+getID()+"}i: IN=" +
1147                                        UtilityExtensions.formatInput(text, index));
1148                 }
1149 
1150                 int runStart = index.start;
1151                 int runLimit = index.limit;
1152                 int runLength =  runLimit - runStart;
1153 
1154                 // Make a rollback copy at the end of the string
1155                 int rollbackOrigin = text.length();
1156                 text.copy(runStart, runLimit, rollbackOrigin);
1157 
1158                 // Variables reflecting the commitment of completely
1159                 // transliterated text.  passStart is the runStart, advanced
1160                 // past committed text.  rollbackStart is the rollbackOrigin,
1161                 // advanced past rollback text that corresponds to committed
1162                 // text.
1163                 int passStart = runStart;
1164                 int rollbackStart = rollbackOrigin;
1165 
1166                 // The limit for each pass; we advance by one code point with
1167                 // each iteration.
1168                 int passLimit = index.start;
1169 
1170                 // Total length, in 16-bit code units, of uncommitted text.
1171                 // This is the length to be rolled back.
1172                 int uncommittedLength = 0;
1173 
1174                 // Total delta (change in length) for all passes
1175                 int totalDelta = 0;
1176 
1177                 // PASS MAIN LOOP -- Start with a single character, and extend
1178                 // the text by one character at a time.  Roll back partial
1179                 // transliterations and commit complete transliterations.
1180                 for (;;) {
1181                     // Length of additional code point, either one or two
1182                     int charLength =
1183                         UTF16.getCharCount(text.char32At(passLimit));
1184                     passLimit += charLength;
1185                     if (passLimit > runLimit) {
1186                         break;
1187                     }
1188                     uncommittedLength += charLength;
1189 
1190                     index.limit = passLimit;
1191 
1192                     if (DEBUG) {
1193                         log.setLength(0);
1194                         log.append("filteredTransliterate{"+getID()+"}i: ");
1195                         UtilityExtensions.formatInput(log, text, index);
1196                     }
1197 
1198                     // Delegate to subclass for actual transliteration.  Upon
1199                     // return, start will be updated to point after the
1200                     // transliterated text, and limit and contextLimit will be
1201                     // adjusted for length changes.
1202                     handleTransliterate(text, index, true);
1203 
1204                     if (DEBUG) {
1205                         log.append(" => ");
1206                         UtilityExtensions.formatInput(log, text, index);
1207                     }
1208 
1209                     delta = index.limit - passLimit; // change in length
1210 
1211                     // We failed to completely transliterate this pass.
1212                     // Roll back the text.  Indices remain unchanged; reset
1213                     // them where necessary.
1214                     if (index.start != index.limit) {
1215                         // Find the rollbackStart, adjusted for length changes
1216                         // and the deletion of partially transliterated text.
1217                         int rs = rollbackStart + delta - (index.limit - passStart);
1218 
1219                         // Delete the partially transliterated text
1220                         text.replace(passStart, index.limit, "");
1221 
1222                         // Copy the rollback text back
1223                         text.copy(rs, rs + uncommittedLength, passStart);
1224 
1225                         // Restore indices to their original values
1226                         index.start = passStart;
1227                         index.limit = passLimit;
1228                         index.contextLimit -= delta;
1229 
1230                         if (DEBUG) {
1231                             log.append(" (ROLLBACK)");
1232                         }
1233                     }
1234 
1235                     // We did completely transliterate this pass.  Update the
1236                     // commit indices to record how far we got.  Adjust indices
1237                     // for length change.
1238                     else {
1239                         // Move the pass indices past the committed text.
1240                         passStart = passLimit = index.start;
1241 
1242                         // Adjust the rollbackStart for length changes and move
1243                         // it past the committed text.  All characters we've
1244                         // processed to this point are committed now, so zero
1245                         // out the uncommittedLength.
1246                         rollbackStart += delta + uncommittedLength;
1247                         uncommittedLength = 0;
1248 
1249                         // Adjust indices for length changes.
1250                         runLimit += delta;
1251                         totalDelta += delta;
1252                     }
1253 
1254                     if (DEBUG) {
1255                         System.out.println(Utility.escape(log.toString()));
1256                     }
1257                 }
1258 
1259                 // Adjust overall limit and rollbackOrigin for insertions and
1260                 // deletions.  Don't need to worry about contextLimit because
1261                 // handleTransliterate() maintains that.
1262                 rollbackOrigin += totalDelta;
1263                 globalLimit += totalDelta;
1264 
1265                 // Delete the rollback copy
1266                 text.replace(rollbackOrigin, rollbackOrigin + runLength, "");
1267 
1268                 // Move start past committed text
1269                 index.start = passStart;
1270             }
1271 
1272             else {
1273                 // Delegate to subclass for actual transliteration.
1274                 if (DEBUG) {
1275                     log.setLength(0);
1276                     log.append("filteredTransliterate{"+getID()+"}: ");
1277                     UtilityExtensions.formatInput(log, text, index);
1278                 }
1279 
1280                 int limit = index.limit;
1281                 handleTransliterate(text, index, isIncrementalRun);
1282                 delta = index.limit - limit; // change in length
1283 
1284                 if (DEBUG) {
1285                     log.append(" => ");
1286                     UtilityExtensions.formatInput(log, text, index);
1287                 }
1288 
1289                 // In a properly written transliterator, start == limit after
1290                 // handleTransliterate() returns when incremental is false.
1291                 // Catch cases where the subclass doesn't do this, and throw
1292                 // an exception.  (Just pinning start to limit is a bad idea,
1293                 // because what's probably happening is that the subclass
1294                 // isn't transliterating all the way to the end, and it should
1295                 // in non-incremental mode.)
1296                 if (!isIncrementalRun && index.start != index.limit) {
1297                     throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + getID());
1298                 }
1299 
1300                 // Adjust overall limit for insertions/deletions.  Don't need
1301                 // to worry about contextLimit because handleTransliterate()
1302                 // maintains that.
1303                 globalLimit += delta;
1304 
1305                 if (DEBUG) {
1306                     System.out.println(Utility.escape(log.toString()));
1307                 }
1308             }
1309 
1310             if (filter == null || isIncrementalRun) {
1311                 break;
1312             }
1313 
1314             // If we did completely transliterate this
1315             // run, then repeat with the next unfiltered run.
1316         }
1317 
1318         // Start is valid where it is.  Limit needs to be put back where
1319         // it was, modulo adjustments for deletions/insertions.
1320         index.limit = globalLimit;
1321 
1322         if (DEBUG) {
1323             System.out.println("filteredTransliterate{"+getID()+"}: OUT=" +
1324                                UtilityExtensions.formatInput(text, index));
1325         }
1326     }
1327 
1328     /**
1329      * Transliterate a substring of text, as specified by index, taking filters
1330      * into account.  This method is for subclasses that need to delegate to
1331      * another transliterator.
1332      * @param text the text to be transliterated
1333      * @param index the position indices
1334      * @param incremental if TRUE, then assume more characters may be inserted
1335      * at index.limit, and postpone processing to accomodate future incoming
1336      * characters
1337      */
filteredTransliterate(Replaceable text, Position index, boolean incremental)1338     public void filteredTransliterate(Replaceable text,
1339                                          Position index,
1340                                          boolean incremental) {
1341         filteredTransliterate(text, index, incremental, false);
1342     }
1343 
1344     /**
1345      * Returns the length of the longest context required by this transliterator.
1346      * This is <em>preceding</em> context.  The default value is zero, but
1347      * subclasses can change this by calling <code>setMaximumContextLength()</code>.
1348      * For example, if a transliterator translates "ddd" (where
1349      * d is any digit) to "555" when preceded by "(ddd)", then the preceding
1350      * context length is 5, the length of "(ddd)".
1351      *
1352      * @return The maximum number of preceding context characters this
1353      * transliterator needs to examine
1354      */
getMaximumContextLength()1355     public final int getMaximumContextLength() {
1356         return maximumContextLength;
1357     }
1358 
1359     /**
1360      * Method for subclasses to use to set the maximum context length.
1361      * @see #getMaximumContextLength
1362      * @hide unsupported on OHOS
1363      */
setMaximumContextLength(int a)1364     protected void setMaximumContextLength(int a) {
1365         if (a < 0) {
1366             throw new IllegalArgumentException("Invalid context length " + a);
1367         }
1368         maximumContextLength = a;
1369     }
1370 
1371     /**
1372      * Returns a programmatic identifier for this transliterator.
1373      * If this identifier is passed to <code>getInstance()</code>, it
1374      * will return this object, if it has been registered.
1375      * @see #registerClass
1376      * @see #getAvailableIDs
1377      */
getID()1378     public final String getID() {
1379         return ID;
1380     }
1381 
1382     /**
1383      * Set the programmatic identifier for this transliterator.  Only
1384      * for use by subclasses.
1385      * @hide unsupported on OHOS
1386      */
setID(String id)1387     protected final void setID(String id) {
1388         ID = id;
1389     }
1390 
1391     /**
1392      * Returns a name for this transliterator that is appropriate for
1393      * display to the user in the default <code>DISPLAY</code> locale.  See {@link
1394      * #getDisplayName(String,Locale)} for details.
1395      * @see ohos.global.icu.util.ULocale.Category#DISPLAY
1396      */
getDisplayName(String ID)1397     public final static String getDisplayName(String ID) {
1398         return getDisplayName(ID, ULocale.getDefault(Category.DISPLAY));
1399     }
1400 
1401     /**
1402      * Returns a name for this transliterator that is appropriate for
1403      * display to the user in the given locale.  This name is taken
1404      * from the locale resource data in the standard manner of the
1405      * <code>java.text</code> package.
1406      *
1407      * <p>If no localized names exist in the system resource bundles,
1408      * a name is synthesized using a localized
1409      * <code>MessageFormat</code> pattern from the resource data.  The
1410      * arguments to this pattern are an integer followed by one or two
1411      * strings.  The integer is the number of strings, either 1 or 2.
1412      * The strings are formed by splitting the ID for this
1413      * transliterator at the first '-'.  If there is no '-', then the
1414      * entire ID forms the only string.
1415      * @param inLocale the Locale in which the display name should be
1416      * localized.
1417      * @see java.text.MessageFormat
1418      */
getDisplayName(String id, Locale inLocale)1419     public static String getDisplayName(String id, Locale inLocale) {
1420         return getDisplayName(id, ULocale.forLocale(inLocale));
1421     }
1422 
1423     /**
1424      * Returns a name for this transliterator that is appropriate for
1425      * display to the user in the given locale.  This name is taken
1426      * from the locale resource data in the standard manner of the
1427      * <code>java.text</code> package.
1428      *
1429      * <p>If no localized names exist in the system resource bundles,
1430      * a name is synthesized using a localized
1431      * <code>MessageFormat</code> pattern from the resource data.  The
1432      * arguments to this pattern are an integer followed by one or two
1433      * strings.  The integer is the number of strings, either 1 or 2.
1434      * The strings are formed by splitting the ID for this
1435      * transliterator at the first '-'.  If there is no '-', then the
1436      * entire ID forms the only string.
1437      * @param inLocale the ULocale in which the display name should be
1438      * localized.
1439      * @see java.text.MessageFormat
1440      */
getDisplayName(String id, ULocale inLocale)1441     public static String getDisplayName(String id, ULocale inLocale) {
1442 
1443         // Resource bundle containing display name keys and the
1444         // RB_RULE_BASED_IDS array.
1445         //
1446         //If we ever integrate this with the Sun JDK, the resource bundle
1447         // root will change to sun.text.resources.LocaleElements
1448 
1449         ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle.
1450             getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, inLocale);
1451 
1452         // Normalize the ID
1453         String stv[] = TransliteratorIDParser.IDtoSTV(id);
1454         if (stv == null) {
1455             // No target; malformed id
1456             return "";
1457         }
1458         String ID = stv[0] + '-' + stv[1];
1459         if (stv[2] != null && stv[2].length() > 0) {
1460             ID = ID + '/' + stv[2];
1461         }
1462 
1463         // Use the registered display name, if any
1464         String n = displayNameCache.get(new CaseInsensitiveString(ID));
1465         if (n != null) {
1466             return n;
1467         }
1468 
1469         // Use display name for the entire transliterator, if it
1470         // exists.
1471         try {
1472             return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
1473         } catch (MissingResourceException e) {}
1474 
1475         try {
1476             // Construct the formatter first; if getString() fails
1477             // we'll exit the try block
1478             MessageFormat format = new MessageFormat(
1479                     bundle.getString(RB_DISPLAY_NAME_PATTERN));
1480             // Construct the argument array
1481             Object[] args = new Object[] { Integer.valueOf(2), stv[0], stv[1] };
1482 
1483             // Use display names for the scripts, if they exist
1484             for (int j=1; j<=2; ++j) {
1485                 try {
1486                     args[j] = bundle.getString(RB_SCRIPT_DISPLAY_NAME_PREFIX +
1487                                                (String) args[j]);
1488                 } catch (MissingResourceException e) {}
1489             }
1490 
1491             // Format it using the pattern in the resource
1492             return (stv[2].length() > 0) ?
1493                 (format.format(args) + '/' + stv[2]) :
1494                 format.format(args);
1495         } catch (MissingResourceException e2) {}
1496 
1497         // We should not reach this point unless there is something
1498         // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
1499         // been deleted from the root RB_LOCALE_ELEMENTS resource.
1500         throw new RuntimeException();
1501     }
1502 
1503     /**
1504      * Returns the filter used by this transliterator, or <tt>null</tt>
1505      * if this transliterator uses no filter.
1506      */
getFilter()1507     public final UnicodeFilter getFilter() {
1508         return filter;
1509     }
1510 
1511     /**
1512      * Changes the filter used by this transliterator.  If the filter
1513      * is set to <tt>null</tt> then no filtering will occur.
1514      *
1515      * <p>Callers must take care if a transliterator is in use by
1516      * multiple threads.  The filter should not be changed by one
1517      * thread while another thread may be transliterating.
1518      */
setFilter(UnicodeFilter filter)1519     public void setFilter(UnicodeFilter filter) {
1520         if (filter == null) {
1521             this.filter = null;
1522         } else {
1523             try {
1524                 // fast high-runner case
1525                 this.filter = new UnicodeSet((UnicodeSet)filter).freeze();
1526             } catch (Exception e) {
1527                 this.filter = new UnicodeSet();
1528                 filter.addMatchSetTo(this.filter);
1529                 this.filter.freeze();
1530             }
1531         }
1532     }
1533 
1534     /**
1535      * Returns a <code>Transliterator</code> object given its ID.
1536      * The ID must be either a system transliterator ID or a ID registered
1537      * using <code>registerClass()</code>.
1538      *
1539      * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1540      * @return A <code>Transliterator</code> object with the given ID
1541      * @exception IllegalArgumentException if the given ID is invalid.
1542      */
getInstance(String ID)1543     public static final Transliterator getInstance(String ID) {
1544         return getInstance(ID, FORWARD);
1545     }
1546 
1547     /**
1548      * Returns a <code>Transliterator</code> object given its ID.
1549      * The ID must be either a system transliterator ID or a ID registered
1550      * using <code>registerClass()</code>.
1551      *
1552      * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1553      * @param dir either FORWARD or REVERSE.  If REVERSE then the
1554      * inverse of the given ID is instantiated.
1555      * @return A <code>Transliterator</code> object with the given ID
1556      * @exception IllegalArgumentException if the given ID is invalid.
1557      * @see #registerClass
1558      * @see #getAvailableIDs
1559      * @see #getID
1560      */
getInstance(String ID, int dir)1561     public static Transliterator getInstance(String ID,
1562                                              int dir) {
1563         StringBuffer canonID = new StringBuffer();
1564         List<SingleID> list = new ArrayList<>();
1565         UnicodeSet[] globalFilter = new UnicodeSet[1];
1566         if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID, list, globalFilter)) {
1567             throw new IllegalArgumentException("Invalid ID " + ID);
1568         }
1569 
1570         List<Transliterator> translits = TransliteratorIDParser.instantiateList(list);
1571 
1572         // assert(list.size() > 0);
1573         Transliterator t = null;
1574         if (list.size() > 1 || canonID.indexOf(";") >= 0) {
1575             // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
1576             // has one child transliterator.  This is so that toRules() will return the right thing
1577             // (without any inactive ID), but our main ID still comes out correct.  That is, if we
1578             // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
1579             // even though the ID is "(Lower);Latin-Greek;".
1580             t = new CompoundTransliterator(translits);
1581         }
1582         else {
1583             t = translits.get(0);
1584         }
1585 
1586         t.setID(canonID.toString());
1587         if (globalFilter[0] != null) {
1588             t.setFilter(globalFilter[0]);
1589         }
1590         return t;
1591     }
1592 
1593     /**
1594      * Create a transliterator from a basic ID.  This is an ID
1595      * containing only the forward direction source, target, and
1596      * variant.
1597      * @param id a basic ID of the form S-T or S-T/V.
1598      * @param canonID canonical ID to apply to the result, or
1599      * null to leave the ID unchanged
1600      * @return a newly created Transliterator or null if the ID is
1601      * invalid.
1602      */
getBasicInstance(String id, String canonID)1603     static Transliterator getBasicInstance(String id, String canonID) {
1604         StringBuffer s = new StringBuffer();
1605         Transliterator t = registry.get(id, s);
1606         if (s.length() != 0) {
1607             // assert(t==0);
1608             // Instantiate an alias
1609             t = getInstance(s.toString(), FORWARD);
1610         }
1611         if (t != null && canonID != null) {
1612             t.setID(canonID);
1613         }
1614         return t;
1615     }
1616 
1617     /**
1618      * Returns a <code>Transliterator</code> object constructed from
1619      * the given rule string.  This will be a rule-based Transliterator,
1620      * if the rule string contains only rules, or a
1621      * compound Transliterator, if it contains ID blocks, or a
1622      * null Transliterator, if it contains ID blocks which parse as
1623      * empty for the given direction.
1624      *
1625      * @param ID the id for the transliterator.
1626      * @param rules rules, separated by ';'
1627      * @param dir either FORWARD or REVERSE.
1628      * @return a newly created Transliterator
1629      * @throws IllegalArgumentException if there is a problem with the ID or the rules
1630      */
createFromRules(String ID, String rules, int dir)1631     public static final Transliterator createFromRules(String ID, String rules, int dir) {
1632         Transliterator t = null;
1633 
1634         TransliteratorParser parser = new TransliteratorParser();
1635         parser.parse(rules, dir);
1636 
1637         // NOTE: The logic here matches that in TransliteratorRegistry.
1638         if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) {
1639             t = new NullTransliterator();
1640         }
1641         else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
1642             t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter);
1643         }
1644         else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
1645             // idBlock, no data -- this is an alias.  The ID has
1646             // been munged from reverse into forward mode, if
1647             // necessary, so instantiate the ID in the forward
1648             // direction.
1649             if (parser.compoundFilter != null) {
1650                 t = getInstance(parser.compoundFilter.toPattern(false) + ";"
1651                         + parser.idBlockVector.get(0));
1652             } else {
1653                 t = getInstance(parser.idBlockVector.get(0));
1654             }
1655 
1656             if (t != null) {
1657                 t.setID(ID);
1658             }
1659         }
1660         else {
1661             List<Transliterator> transliterators = new ArrayList<>();
1662             int passNumber = 1;
1663 
1664             int limit = Math.max(parser.idBlockVector.size(), parser.dataVector.size());
1665             for (int i = 0; i < limit; i++) {
1666                 if (i < parser.idBlockVector.size()) {
1667                     String idBlock = parser.idBlockVector.get(i);
1668                     if (idBlock.length() > 0) {
1669                         Transliterator temp = getInstance(idBlock);
1670                         if (!(temp instanceof NullTransliterator))
1671                             transliterators.add(getInstance(idBlock));
1672                     }
1673                 }
1674                 if (i < parser.dataVector.size()) {
1675                     Data data = parser.dataVector.get(i);
1676                     transliterators.add(new RuleBasedTransliterator("%Pass" + passNumber++, data, null));
1677                 }
1678             }
1679 
1680             t = new CompoundTransliterator(transliterators, passNumber - 1);
1681             t.setID(ID);
1682             if (parser.compoundFilter != null) {
1683                 t.setFilter(parser.compoundFilter);
1684             }
1685         }
1686 
1687         return t;
1688     }
1689 
1690     /**
1691      * Returns a rule string for this transliterator.
1692      * @param escapeUnprintable if true, then unprintable characters
1693      * will be converted to escape form backslash-'u' or
1694      * backslash-'U'.
1695      */
toRules(boolean escapeUnprintable)1696     public String toRules(boolean escapeUnprintable) {
1697         return baseToRules(escapeUnprintable);
1698     }
1699 
1700     /**
1701      * Returns a rule string for this transliterator.  This is
1702      * a non-overrideable base class implementation that subclasses
1703      * may call.  It simply munges the ID into the correct format,
1704      * that is, "foo" =&gt; "::foo".
1705      * @param escapeUnprintable if true, then unprintable characters
1706      * will be converted to escape form backslash-'u' or
1707      * backslash-'U'.
1708      * @hide unsupported on OHOS
1709      */
baseToRules(boolean escapeUnprintable)1710     protected final String baseToRules(boolean escapeUnprintable) {
1711         // The base class implementation of toRules munges the ID into
1712         // the correct format.  That is: foo => ::foo
1713         // KEEP in sync with rbt_pars
1714         if (escapeUnprintable) {
1715             StringBuffer rulesSource = new StringBuffer();
1716             String id = getID();
1717             for (int i=0; i<id.length();) {
1718                 int c = UTF16.charAt(id, i);
1719                 if (!Utility.escapeUnprintable(rulesSource, c)) {
1720                     UTF16.append(rulesSource, c);
1721                 }
1722                 i += UTF16.getCharCount(c);
1723             }
1724             rulesSource.insert(0, "::");
1725             rulesSource.append(ID_DELIM);
1726             return rulesSource.toString();
1727         }
1728         return "::" + getID() + ID_DELIM;
1729     }
1730 
1731     /**
1732      * Return the elements that make up this transliterator.  For
1733      * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
1734      * were created, the return value of this method would be an array
1735      * of the three transliterator objects that make up that
1736      * transliterator: [NFD, Jamo-Latin, Latin-Greek].
1737      *
1738      * <p>If this transliterator is not composed of other
1739      * transliterators, then this method will return an array of
1740      * length one containing a reference to this transliterator.
1741      * @return an array of one or more transliterators that make up
1742      * this transliterator
1743      */
getElements()1744     public Transliterator[] getElements() {
1745         Transliterator result[];
1746         if (this instanceof CompoundTransliterator) {
1747             CompoundTransliterator cpd = (CompoundTransliterator) this;
1748             result = new Transliterator[cpd.getCount()];
1749             for (int i=0; i<result.length; ++i) {
1750                 result[i] = cpd.getTransliterator(i);
1751             }
1752         } else {
1753             result = new Transliterator[] { this };
1754         }
1755         return result;
1756     }
1757 
1758     /**
1759      * Returns the set of all characters that may be modified in the
1760      * input text by this Transliterator.  This incorporates this
1761      * object's current filter; if the filter is changed, the return
1762      * value of this function will change.  The default implementation
1763      * returns an empty set.  Some subclasses may override {@link
1764      * #handleGetSourceSet} to return a more precise result.  The
1765      * return result is approximate in any case and is intended for
1766      * use by tests, tools, or utilities.
1767      * @see #getTargetSet
1768      * @see #handleGetSourceSet
1769      */
getSourceSet()1770     public final UnicodeSet getSourceSet() {
1771         UnicodeSet result = new UnicodeSet();
1772         addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), result, new UnicodeSet());
1773         return result;
1774     }
1775 
1776     /**
1777      * Framework method that returns the set of all characters that
1778      * may be modified in the input text by this Transliterator,
1779      * ignoring the effect of this object's filter.  The base class
1780      * implementation returns the empty set.  Subclasses that wish to
1781      * implement this should override this method.
1782      * @return the set of characters that this transliterator may
1783      * modify.  The set may be modified, so subclasses should return a
1784      * newly-created object.
1785      * @see #getSourceSet
1786      * @see #getTargetSet
1787      * @hide unsupported on OHOS
1788      */
handleGetSourceSet()1789     protected UnicodeSet handleGetSourceSet() {
1790         return new UnicodeSet();
1791     }
1792 
1793     /**
1794      * Returns the set of all characters that may be generated as
1795      * replacement text by this transliterator.  The default
1796      * implementation returns the empty set.  Some subclasses may
1797      * override this method to return a more precise result.  The
1798      * return result is approximate in any case and is intended for
1799      * use by tests, tools, or utilities requiring such
1800      * meta-information.
1801      * <p>Warning. You might expect an empty filter to always produce an empty target.
1802      * However, consider the following:
1803      * <pre>
1804      * [Pp]{}[\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB] &gt; \';
1805      * </pre>
1806      * With a filter of [], you still get some elements in the target set, because this rule will still match. It could
1807      * be recast to the following if it were important.
1808      * <pre>
1809      * [Pp]{([\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB])} &gt; \' | $1;
1810      * </pre>
1811      * @see #getTargetSet
1812      */
getTargetSet()1813     public UnicodeSet getTargetSet() {
1814         UnicodeSet result = new UnicodeSet();
1815         addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), new UnicodeSet(), result);
1816         return result;
1817     }
1818 
1819     /**
1820      * Returns the set of all characters that may be generated as
1821      * replacement text by this transliterator, filtered by BOTH the input filter, and the current getFilter().
1822      * <p>SHOULD BE OVERRIDEN BY SUBCLASSES.
1823      * It is probably an error for any transliterator to NOT override this, but we can't force them to
1824      * for backwards compatibility.
1825      * <p>Other methods vector through this.
1826      * <p>When gathering the information on source and target, the compound transliterator makes things complicated.
1827      * For example, suppose we have:
1828      * <pre>
1829      * Global FILTER = [ax]
1830      * a &gt; b;
1831      * :: NULL;
1832      * b &gt; c;
1833      * x &gt; d;
1834      * </pre>
1835      * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and target sets
1836      * cannot be gathered independently. What we have to do is filter the sources for the first transliterator according to
1837      * the global filter, intersect that transliterator's filter. Based on that we get the target.
1838      * The next transliterator gets as a global filter (global + last target). And so on.
1839      * <p>There is another complication:
1840      * <pre>
1841      * Global FILTER = [ax]
1842      * a &gt;|b;
1843      * b &gt;c;
1844      * </pre>
1845      * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input. So ideally we will
1846      * change the global filter as we go.
1847      * @param targetSet TODO
1848      * @see #getTargetSet
1849      * @deprecated  This API is ICU internal only.
1850      * @hide deprecated on icu4j-org
1851      * @hide draft / provisional / internal are hidden on OHOS
1852      */
1853     @Deprecated
addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)1854     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
1855         UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
1856         UnicodeSet temp = new UnicodeSet(handleGetSourceSet()).retainAll(myFilter);
1857         // use old method, if we don't have anything better
1858         sourceSet.addAll(temp);
1859         // clumsy guess with target
1860         for (String s : temp) {
1861             String t = transliterate(s);
1862             if (!s.equals(t)) {
1863                 targetSet.addAll(t);
1864             }
1865         }
1866     }
1867 
1868     /**
1869      * Returns the intersectionof this instance's filter intersected with an external filter.
1870      * The externalFilter must be frozen (it is frozen if not).
1871      * The result may be frozen, so don't attempt to modify.
1872      * @deprecated  This API is ICU internal only.
1873      * @hide deprecated on icu4j-org
1874      * @hide draft / provisional / internal are hidden on OHOS
1875      */
1876     @Deprecated
1877    // TODO change to getMergedFilter
getFilterAsUnicodeSet(UnicodeSet externalFilter)1878     public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) {
1879         if (filter == null) {
1880             return externalFilter;
1881         }
1882         UnicodeSet filterSet = new UnicodeSet(externalFilter);
1883         // Most, but not all filters will be UnicodeSets.  Optimize for
1884         // the high-runner case.
1885         UnicodeSet temp;
1886         try {
1887             temp = filter;
1888         } catch (ClassCastException e) {
1889             filter.addMatchSetTo(temp = new UnicodeSet());
1890         }
1891         return filterSet.retainAll(temp).freeze();
1892     }
1893 
1894     /**
1895      * Returns this transliterator's inverse.  See the class
1896      * documentation for details.  This implementation simply inverts
1897      * the two entities in the ID and attempts to retrieve the
1898      * resulting transliterator.  That is, if <code>getID()</code>
1899      * returns "A-B", then this method will return the result of
1900      * <code>getInstance("B-A")</code>, or <code>null</code> if that
1901      * call fails.
1902      *
1903      * <p>Subclasses with knowledge of their inverse may wish to
1904      * override this method.
1905      *
1906      * @return a transliterator that is an inverse, not necessarily
1907      * exact, of this transliterator, or <code>null</code> if no such
1908      * transliterator is registered.
1909      * @see #registerClass
1910      */
getInverse()1911     public final Transliterator getInverse() {
1912         return getInstance(ID, REVERSE);
1913     }
1914 
1915     /**
1916      * Registers a subclass of <code>Transliterator</code> with the
1917      * system.  This subclass must have a public constructor taking no
1918      * arguments.  When that constructor is called, the resulting
1919      * object must return the <code>ID</code> passed to this method if
1920      * its <code>getID()</code> method is called.
1921      *
1922      * @param ID the result of <code>getID()</code> for this
1923      * transliterator
1924      * @param transClass a subclass of <code>Transliterator</code>
1925      * @see #unregister
1926      * @hide unsupported on OHOS
1927      */
registerClass(String ID, Class<? extends Transliterator> transClass, String displayName)1928     public static void registerClass(String ID, Class<? extends Transliterator> transClass, String displayName) {
1929         registry.put(ID, transClass, true);
1930         if (displayName != null) {
1931             displayNameCache.put(new CaseInsensitiveString(ID), displayName);
1932         }
1933     }
1934 
1935     /**
1936      * Register a factory object with the given ID.  The factory
1937      * method should return a new instance of the given transliterator.
1938      *
1939      * <p>Because ICU may choose to cache Transliterator objects internally, this must
1940      * be called at application startup, prior to any calls to
1941      * Transliterator.getInstance to avoid undefined behavior.
1942      *
1943      * @param ID the ID of this transliterator
1944      * @param factory the factory object
1945      * @hide unsupported on OHOS
1946      */
registerFactory(String ID, Factory factory)1947     public static void registerFactory(String ID, Factory factory) {
1948         registry.put(ID, factory, true);
1949     }
1950 
1951     /**
1952      * Register a Transliterator object with the given ID.
1953      *
1954      * <p>Because ICU may choose to cache Transliterator objects internally, this must
1955      * be called at application startup, prior to any calls to
1956      * Transliterator.getInstance to avoid undefined behavior.
1957      *
1958      * @param trans the Transliterator object
1959      * @hide unsupported on OHOS
1960      */
registerInstance(Transliterator trans)1961     public static void registerInstance(Transliterator trans) {
1962         registry.put(trans.getID(), trans, true);
1963     }
1964 
1965     /**
1966      * Register a Transliterator object.
1967      *
1968      * <p>Because ICU may choose to cache Transliterator objects internally, this must
1969      * be called at application startup, prior to any calls to
1970      * Transliterator.getInstance to avoid undefined behavior.
1971      *
1972      * @param trans the Transliterator object
1973      */
registerInstance(Transliterator trans, boolean visible)1974     static void registerInstance(Transliterator trans, boolean visible) {
1975         registry.put(trans.getID(), trans, visible);
1976     }
1977 
1978     /**
1979      * Register an ID as an alias of another ID.  Instantiating
1980      * alias ID produces the same result as instantiating the original ID.
1981      * This is generally used to create short aliases of compound IDs.
1982      *
1983      * <p>Because ICU may choose to cache Transliterator objects internally, this must
1984      * be called at application startup, prior to any calls to
1985      * Transliterator.getInstance to avoid undefined behavior.
1986      *
1987      * @param aliasID The new ID being registered.
1988      * @param realID The existing ID that the new ID should be an alias of.
1989      * @hide unsupported on OHOS
1990      */
registerAlias(String aliasID, String realID)1991     public static void registerAlias(String aliasID, String realID) {
1992         registry.put(aliasID, realID, true);
1993     }
1994 
1995     /**
1996      * Register two targets as being inverses of one another.  For
1997      * example, calling registerSpecialInverse("NFC", "NFD", true) causes
1998      * Transliterator to form the following inverse relationships:
1999      *
2000      * <pre>NFC =&gt; NFD
2001      * Any-NFC =&gt; Any-NFD
2002      * NFD =&gt; NFC
2003      * Any-NFD =&gt; Any-NFC</pre>
2004      *
2005      * (Without the special inverse registration, the inverse of NFC
2006      * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
2007      * that the presence or absence of "Any-" is preserved.
2008      *
2009      * <p>The relationship is symmetrical; registering (a, b) is
2010      * equivalent to registering (b, a).
2011      *
2012      * <p>The relevant IDs must still be registered separately as
2013      * factories or classes.
2014      *
2015      * <p>Only the targets are specified.  Special inverses always
2016      * have the form Any-Target1 &lt;=&gt; Any-Target2.  The target should
2017      * have canonical casing (the casing desired to be produced when
2018      * an inverse is formed) and should contain no whitespace or other
2019      * extraneous characters.
2020      *
2021      * @param target the target against which to register the inverse
2022      * @param inverseTarget the inverse of target, that is
2023      * Any-target.getInverse() =&gt; Any-inverseTarget
2024      * @param bidirectional if true, register the reverse relation
2025      * as well, that is, Any-inverseTarget.getInverse() =&gt; Any-target
2026      */
registerSpecialInverse(String target, String inverseTarget, boolean bidirectional)2027     static void registerSpecialInverse(String target,
2028                                        String inverseTarget,
2029                                        boolean bidirectional) {
2030         TransliteratorIDParser.registerSpecialInverse(target, inverseTarget, bidirectional);
2031     }
2032 
2033     /**
2034      * Unregisters a transliterator or class.  This may be either
2035      * a system transliterator or a user transliterator or class.
2036      *
2037      * @param ID the ID of the transliterator or class
2038      * @see #registerClass
2039      * @hide unsupported on OHOS
2040      */
unregister(String ID)2041     public static void unregister(String ID) {
2042         displayNameCache.remove(new CaseInsensitiveString(ID));
2043         registry.remove(ID);
2044     }
2045 
2046     /**
2047      * Returns an enumeration over the programmatic names of registered
2048      * <code>Transliterator</code> objects.  This includes both system
2049      * transliterators and user transliterators registered using
2050      * <code>registerClass()</code>.  The enumerated names may be
2051      * passed to <code>getInstance()</code>.
2052      *
2053      * @return An <code>Enumeration</code> over <code>String</code> objects
2054      * @see #getInstance
2055      * @see #registerClass
2056      */
getAvailableIDs()2057     public static final Enumeration<String> getAvailableIDs() {
2058         return registry.getAvailableIDs();
2059     }
2060 
2061     /**
2062      * Returns an enumeration over the source names of registered
2063      * transliterators.  Source names may be passed to
2064      * getAvailableTargets() to obtain available targets for each
2065      * source.
2066      */
getAvailableSources()2067     public static final Enumeration<String> getAvailableSources() {
2068         return registry.getAvailableSources();
2069     }
2070 
2071     /**
2072      * Returns an enumeration over the target names of registered
2073      * transliterators having a given source name.  Target names may
2074      * be passed to getAvailableVariants() to obtain available
2075      * variants for each source and target pair.
2076      */
getAvailableTargets(String source)2077     public static final Enumeration<String> getAvailableTargets(String source) {
2078         return registry.getAvailableTargets(source);
2079     }
2080 
2081     /**
2082      * Returns an enumeration over the variant names of registered
2083      * transliterators having a given source name and target name.
2084      */
getAvailableVariants(String source, String target)2085     public static final Enumeration<String> getAvailableVariants(String source,
2086                                                          String target) {
2087         return registry.getAvailableVariants(source, target);
2088     }
2089     private static final String ROOT = "root",
2090                                 RB_RULE_BASED_IDS ="RuleBasedTransliteratorIDs";
2091     static {
2092         registry = new TransliteratorRegistry();
2093 
2094         // The display name cache starts out empty
2095         displayNameCache = Collections.synchronizedMap(new HashMap<CaseInsensitiveString, String>());
2096         /* The following code parses the index table located in
2097          * icu/data/translit/root.txt.  The index is an n x 4 table
2098          * that follows this format:
2099          *  <id>{
2100          *      file{
2101          *          resource{"<resource>"}
2102          *          direction{"<direction>"}
2103          *      }
2104          *  }
2105          *  <id>{
2106          *      internal{
2107          *          resource{"<resource>"}
2108          *          direction{"<direction"}
2109          *       }
2110          *  }
2111          *  <id>{
2112          *      alias{"<getInstanceArg"}
2113          *  }
2114          * <id> is the ID of the system transliterator being defined.  These
2115          * are public IDs enumerated by Transliterator.getAvailableIDs(),
2116          * unless the second field is "internal".
2117          *
2118          * <resource> is a ResourceReader resource name.  Currently these refer
2119          * to file names under com/ibm/text/resources.  This string is passed
2120          * directly to ResourceReader, together with <encoding>.
2121          *
2122          * <direction> is either "FORWARD" or "REVERSE".
2123          *
2124          * <getInstanceArg> is a string to be passed directly to
2125          * Transliterator.getInstance().  The returned Transliterator object
2126          * then has its ID changed to <id> and is returned.
2127          *
2128          * The extra blank field on "alias" lines is to make the array square.
2129          */
2130         UResourceBundle bundle, transIDs, colBund;
2131         bundle = UResourceBundle.getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, ROOT);
2132         transIDs = bundle.get(RB_RULE_BASED_IDS);
2133 
2134         int row, maxRows;
2135         maxRows = transIDs.getSize();
2136         for (row = 0; row < maxRows; row++) {
2137             colBund = transIDs.get(row);
2138             String ID = colBund.getKey();
2139             if (ID.indexOf("-t-") >= 0) {
2140                 continue;
2141             }
2142             UResourceBundle res = colBund.get(0);
2143             String type = res.getKey();
2144             if (type.equals("file") || type.equals("internal")) {
2145                 // Rest of line is <resource>:<encoding>:<direction>
2146                 //                pos       colon      c2
2147                 String resString = res.getString("resource");
2148                 int dir;
2149                 String direction = res.getString("direction");
2150                 switch (direction.charAt(0)) {
2151                 case 'F':
2152                     dir = FORWARD;
2153                     break;
2154                 case 'R':
2155                     dir = REVERSE;
2156                     break;
2157                 default:
2158                     throw new RuntimeException("Can't parse direction: " + direction);
2159                 }
registry.put(ID, resString, dir, !type.equals("internal"))2160                 registry.put(ID,
2161                              resString, // resource
2162                              dir,
2163                              !type.equals("internal"));
2164             } else if (type.equals("alias")) {
2165                 //'alias'; row[2]=createInstance argument
2166                 String resString = res.getString();
registry.put(ID, resString, true)2167                 registry.put(ID, resString, true);
2168             } else {
2169                 // Unknown type
2170                 throw new RuntimeException("Unknow type: " + type);
2171             }
2172         }
2173 
registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false)2174         registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false);
2175 
2176         // Register non-rule-based transliterators
registerClass(NullTransliterator._ID, NullTransliterator.class, null)2177         registerClass(NullTransliterator._ID,
2178                       NullTransliterator.class, null);
RemoveTransliterator.register()2179         RemoveTransliterator.register();
EscapeTransliterator.register()2180         EscapeTransliterator.register();
UnescapeTransliterator.register()2181         UnescapeTransliterator.register();
LowercaseTransliterator.register()2182         LowercaseTransliterator.register();
UppercaseTransliterator.register()2183         UppercaseTransliterator.register();
TitlecaseTransliterator.register()2184         TitlecaseTransliterator.register();
CaseFoldTransliterator.register()2185         CaseFoldTransliterator.register();
UnicodeNameTransliterator.register()2186         UnicodeNameTransliterator.register();
NameUnicodeTransliterator.register()2187         NameUnicodeTransliterator.register();
NormalizationTransliterator.register()2188         NormalizationTransliterator.register();
BreakTransliterator.register()2189         BreakTransliterator.register();
AnyTransliterator.register()2190         AnyTransliterator.register(); // do this last!
2191     }
2192 
2193     /**
2194      * Register the script-based "Any" transliterators: Any-Latin, Any-Greek
2195      * @deprecated This API is ICU internal only.
2196      * @hide deprecated on icu4j-org
2197      * @hide draft / provisional / internal are hidden on OHOS
2198      */
2199     @Deprecated
registerAny()2200     public static void registerAny() {
2201         AnyTransliterator.register();
2202     }
2203 
2204     /**
2205      * The factory interface for transliterators.  Transliterator
2206      * subclasses can register factory objects for IDs using the
2207      * registerFactory() method of Transliterator.  When invoked, the
2208      * factory object will be passed the ID being instantiated.  This
2209      * makes it possible to register one factory method to more than
2210      * one ID, or for a factory method to parameterize its result
2211      * based on the variant.
2212      * @hide exposed on OHOS
2213      */
2214     public static interface Factory {
2215         /**
2216          * Return a transliterator for the given ID.
2217          */
getInstance(String ID)2218         Transliterator getInstance(String ID);
2219     }
2220 
2221     /**
2222      * Implements StringTransform via this method.
2223      * @param source text to be transformed (eg lowercased)
2224      * @return result
2225      * @hide unsupported on OHOS
2226      */
2227     @Override
transform(String source)2228     public String transform(String source) {
2229         return transliterate(source);
2230     }
2231 }
2232