• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2013-2014, International Business Machines
7 * Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 * TailoredSet.java, ported from collationsets.h/.cpp
10 *
11 * C++ version created on: 2013feb09
12 * created by: Markus W. Scherer
13 */
14 
15 package ohos.global.icu.impl.coll;
16 
17 import java.util.Iterator;
18 
19 import ohos.global.icu.impl.Normalizer2Impl.Hangul;
20 import ohos.global.icu.impl.Trie2;
21 import ohos.global.icu.impl.Utility;
22 import ohos.global.icu.text.UnicodeSet;
23 import ohos.global.icu.util.CharsTrie;
24 import ohos.global.icu.util.CharsTrie.Entry;
25 
26 /**
27  * Finds the set of characters and strings that sort differently in the tailoring
28  * from the base data.
29  *
30  * Every mapping in the tailoring needs to be compared to the base,
31  * because some mappings are copied for optimization, and
32  * all contractions for a character are copied if any contractions for that character
33  * are added, modified or removed.
34  *
35  * It might be simpler to re-parse the rule string, but:
36  * - That would require duplicating some of the from-rules builder code.
37  * - That would make the runtime code depend on the builder.
38  * - That would only work if we have the rule string, and we allow users to
39  *   omit the rule string from data files.
40  * @hide exposed on OHOS
41  */
42 public final class TailoredSet {
43 
44     private CollationData data;
45     private CollationData baseData;
46     private UnicodeSet tailored;
47     private StringBuilder unreversedPrefix = new StringBuilder();
48     private String suffix;
49 
TailoredSet(UnicodeSet t)50     public TailoredSet(UnicodeSet t) {
51         tailored = t;
52     }
53 
forData(CollationData d)54     public void forData(CollationData d) {
55         data = d;
56         baseData = d.base;
57         assert (baseData != null);
58         // utrie2_enum(data->trie, NULL, enumTailoredRange, this);
59         Iterator<Trie2.Range> trieIterator = data.trie.iterator();
60         Trie2.Range range;
61         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
62             enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this);
63         }
64     }
65 
enumTailoredRange(int start, int end, int ce32, TailoredSet ts)66     private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) {
67         if (ce32 == Collation.FALLBACK_CE32) {
68             return; // fallback to base, not tailored
69         }
70         ts.handleCE32(start, end, ce32);
71     }
72 
73     // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
74     //  Also, ICU4C requires handleCE32() to be public because it is used by the callback
75     //  function (enumTailoredRange()). This is not necessary for Java implementation.
handleCE32(int start, int end, int ce32)76     private void handleCE32(int start, int end, int ce32) {
77         assert (ce32 != Collation.FALLBACK_CE32);
78         if (Collation.isSpecialCE32(ce32)) {
79             ce32 = data.getIndirectCE32(ce32);
80             if (ce32 == Collation.FALLBACK_CE32) {
81                 return;
82             }
83         }
84         do {
85             int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start));
86             // Do not just continue if ce32 == baseCE32 because
87             // contractions and expansions in different data objects
88             // normally differ even if they have the same data offsets.
89             if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) {
90                 // fastpath
91                 if (ce32 != baseCE32) {
92                     tailored.add(start);
93                 }
94             } else {
95                 compare(start, ce32, baseCE32);
96             }
97         } while (++start <= end);
98     }
99 
compare(int c, int ce32, int baseCE32)100     private void compare(int c, int ce32, int baseCE32) {
101         if (Collation.isPrefixCE32(ce32)) {
102             int dataIndex = Collation.indexFromCE32(ce32);
103             ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
104             if (Collation.isPrefixCE32(baseCE32)) {
105                 int baseIndex = Collation.indexFromCE32(baseCE32);
106                 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
107                 comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
108             } else {
109                 addPrefixes(data, c, data.contexts, dataIndex + 2);
110             }
111         } else if (Collation.isPrefixCE32(baseCE32)) {
112             int baseIndex = Collation.indexFromCE32(baseCE32);
113             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
114             addPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
115         }
116 
117         if (Collation.isContractionCE32(ce32)) {
118             int dataIndex = Collation.indexFromCE32(ce32);
119             if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
120                 ce32 = Collation.NO_CE32;
121             } else {
122                 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
123             }
124             if (Collation.isContractionCE32(baseCE32)) {
125                 int baseIndex = Collation.indexFromCE32(baseCE32);
126                 if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
127                     baseCE32 = Collation.NO_CE32;
128                 } else {
129                     baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
130                 }
131                 compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
132             } else {
133                 addContractions(c, data.contexts, dataIndex + 2);
134             }
135         } else if (Collation.isContractionCE32(baseCE32)) {
136             int baseIndex = Collation.indexFromCE32(baseCE32);
137             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
138             addContractions(c, baseData.contexts, baseIndex + 2);
139         }
140 
141         int tag;
142         if (Collation.isSpecialCE32(ce32)) {
143             tag = Collation.tagFromCE32(ce32);
144             assert (tag != Collation.PREFIX_TAG);
145             assert (tag != Collation.CONTRACTION_TAG);
146             // Currently, the tailoring data builder does not write offset tags.
147             // They might be useful for saving space,
148             // but they would complicate the builder,
149             // and in tailorings we assume that performance of tailored characters is more important.
150             assert (tag != Collation.OFFSET_TAG);
151         } else {
152             tag = -1;
153         }
154         int baseTag;
155         if (Collation.isSpecialCE32(baseCE32)) {
156             baseTag = Collation.tagFromCE32(baseCE32);
157             assert (baseTag != Collation.PREFIX_TAG);
158             assert (baseTag != Collation.CONTRACTION_TAG);
159         } else {
160             baseTag = -1;
161         }
162 
163         // Non-contextual mappings, expansions, etc.
164         if (baseTag == Collation.OFFSET_TAG) {
165             // We might be comparing a tailoring CE which is a copy of
166             // a base offset-tag CE, via the [optimize [set]] syntax
167             // or when a single-character mapping was copied for tailored contractions.
168             // Offset tags always result in long-primary CEs,
169             // with common secondary/tertiary weights.
170             if (!Collation.isLongPrimaryCE32(ce32)) {
171                 add(c);
172                 return;
173             }
174             long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)];
175             long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
176             if (Collation.primaryFromLongPrimaryCE32(ce32) != p) {
177                 add(c);
178                 return;
179             }
180         }
181 
182         if (tag != baseTag) {
183             add(c);
184             return;
185         }
186 
187         if (tag == Collation.EXPANSION32_TAG) {
188             int length = Collation.lengthFromCE32(ce32);
189             int baseLength = Collation.lengthFromCE32(baseCE32);
190 
191             if (length != baseLength) {
192                 add(c);
193                 return;
194             }
195 
196             int idx0 = Collation.indexFromCE32(ce32);
197             int idx1 = Collation.indexFromCE32(baseCE32);
198 
199             for (int i = 0; i < length; ++i) {
200                 if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) {
201                     add(c);
202                     break;
203                 }
204             }
205         } else if (tag == Collation.EXPANSION_TAG) {
206             int length = Collation.lengthFromCE32(ce32);
207             int baseLength = Collation.lengthFromCE32(baseCE32);
208 
209             if (length != baseLength) {
210                 add(c);
211                 return;
212             }
213 
214             int idx0 = Collation.indexFromCE32(ce32);
215             int idx1 = Collation.indexFromCE32(baseCE32);
216 
217             for (int i = 0; i < length; ++i) {
218                 if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) {
219                     add(c);
220                     break;
221                 }
222             }
223         } else if (tag == Collation.HANGUL_TAG) {
224             StringBuilder jamos = new StringBuilder();
225             int length = Hangul.decompose(c, jamos);
226             if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1))
227                     || (length == 3 && tailored.contains(jamos.charAt(2)))) {
228                 add(c);
229             }
230         } else if (ce32 != baseCE32) {
231             add(c);
232         }
233     }
234 
comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx)235     private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
236         // Parallel iteration over prefixes of both tables.
237         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
238         CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator();
239         String tp = null; // Tailoring prefix.
240         String bp = null; // Base prefix.
241         // Use a string with a U+FFFF as the limit sentinel.
242         // U+FFFF is untailorable and will not occur in prefixes.
243         String none = "\uffff";
244         Entry te = null, be = null;
245         for (;;) {
246             if (tp == null) {
247                 if (prefixes.hasNext()) {
248                     te = prefixes.next();
249                     tp = te.chars.toString();
250                 } else {
251                     te = null;
252                     tp = none;
253                 }
254             }
255             if (bp == null) {
256                 if (basePrefixes.hasNext()) {
257                     be = basePrefixes.next();
258                     bp = be.chars.toString();
259                 } else {
260                     be = null;
261                     bp = none;
262                 }
263             }
264             if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) {
265                 break;
266             }
267             int cmp = tp.compareTo(bp);
268             if (cmp < 0) {
269                 // tp occurs in the tailoring but not in the base.
270                 assert (te != null);
271                 addPrefix(data, tp, c, te.value);
272                 te = null;
273                 tp = null;
274             } else if (cmp > 0) {
275                 // bp occurs in the base but not in the tailoring.
276                 assert (be != null);
277                 addPrefix(baseData, bp, c, be.value);
278                 be = null;
279                 bp = null;
280             } else {
281                 setPrefix(tp);
282                 assert (te != null && be != null);
283                 compare(c, te.value, be.value);
284                 resetPrefix();
285                 te = be = null;
286                 tp = bp = null;
287             }
288         }
289     }
290 
compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx)291     private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
292         // Parallel iteration over suffixes of both tables.
293         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
294         CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator();
295         String ts = null; // Tailoring suffix.
296         String bs = null; // Base suffix.
297         // Use a string with two U+FFFF as the limit sentinel.
298         // U+FFFF is untailorable and will not occur in contractions except maybe
299         // as a single suffix character for a root-collator boundary contraction.
300         String none = "\uffff\uffff";
301         Entry te = null, be = null;
302         for (;;) {
303             if (ts == null) {
304                 if (suffixes.hasNext()) {
305                     te = suffixes.next();
306                     ts = te.chars.toString();
307                 } else {
308                     te = null;
309                     ts = none;
310                 }
311             }
312             if (bs == null) {
313                 if (baseSuffixes.hasNext()) {
314                     be = baseSuffixes.next();
315                     bs = be.chars.toString();
316                 } else {
317                     be = null;
318                     bs = none;
319                 }
320             }
321             if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) {
322                 break;
323             }
324             int cmp = ts.compareTo(bs);
325             if (cmp < 0) {
326                 // ts occurs in the tailoring but not in the base.
327                 addSuffix(c, ts);
328                 te = null;
329                 ts = null;
330             } else if (cmp > 0) {
331                 // bs occurs in the base but not in the tailoring.
332                 addSuffix(c, bs);
333                 be = null;
334                 bs = null;
335             } else {
336                 suffix = ts;
337                 compare(c, te.value, be.value);
338                 suffix = null;
339                 te = be = null;
340                 ts = bs = null;
341             }
342         }
343     }
344 
addPrefixes(CollationData d, int c, CharSequence p, int pidx)345     private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) {
346         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
347         while (prefixes.hasNext()) {
348             Entry e = prefixes.next();
349             addPrefix(d, e.chars, c, e.value);
350         }
351     }
352 
addPrefix(CollationData d, CharSequence pfx, int c, int ce32)353     private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) {
354         setPrefix(pfx);
355         ce32 = d.getFinalCE32(ce32);
356         if (Collation.isContractionCE32(ce32)) {
357             int idx = Collation.indexFromCE32(ce32);
358             addContractions(c, d.contexts, idx + 2);
359         }
360         tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c)));
361         resetPrefix();
362     }
363 
addContractions(int c, CharSequence p, int pidx)364     private void addContractions(int c, CharSequence p, int pidx) {
365         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
366         while (suffixes.hasNext()) {
367             Entry e = suffixes.next();
368             addSuffix(c, e.chars);
369         }
370     }
371 
addSuffix(int c, CharSequence sfx)372     private void addSuffix(int c, CharSequence sfx) {
373         tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx));
374     }
375 
add(int c)376     private void add(int c) {
377         if (unreversedPrefix.length() == 0 && suffix == null) {
378             tailored.add(c);
379         } else {
380             StringBuilder s = new StringBuilder(unreversedPrefix);
381             s.appendCodePoint(c);
382             if (suffix != null) {
383                 s.append(suffix);
384             }
385             tailored.add(s);
386         }
387     }
388 
389     // Prefixes are reversed in the data structure.
setPrefix(CharSequence pfx)390     private void setPrefix(CharSequence pfx) {
391         unreversedPrefix.setLength(0);
392         unreversedPrefix.append(pfx).reverse();
393     }
394 
resetPrefix()395     private void resetPrefix() {
396         unreversedPrefix.setLength(0);
397     }
398 }
399 
400