• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 1996-2014, International Business Machines Corporation and    *
7  * others. All Rights Reserved.                                                *
8  *******************************************************************************
9  */
10 package ohos.global.icu.text;
11 
12 import java.util.ArrayList;
13 import java.util.HashSet;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Set;
17 
18 import ohos.global.icu.impl.Norm2AllModes;
19 import ohos.global.icu.impl.Normalizer2Impl;
20 import ohos.global.icu.impl.Utility;
21 import ohos.global.icu.lang.UCharacter;
22 
23 /**
24  * This class allows one to iterate through all the strings that are canonically equivalent to a given
25  * string. For example, here are some sample results:
26  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
27  * <pre>
28  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
29  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
30  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
31  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
32  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
33  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
34  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
35  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
36  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
37 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
38 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
39 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
40  *</pre>
41  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
42  * since it has not been optimized for that situation.
43  * @author M. Davis
44  * @hide exposed on OHOS
45  */
46 
47 public final class CanonicalIterator {
48     /**
49      * Construct a CanonicalIterator object
50      * @param source string to get results for
51      */
CanonicalIterator(String source)52     public CanonicalIterator(String source) {
53         Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
54         nfd = allModes.decomp;
55         nfcImpl = allModes.impl.ensureCanonIterData();
56         setSource(source);
57     }
58 
59     /**
60      * Gets the NFD form of the current source we are iterating over.
61      * @return gets the source: NOTE: it is the NFD form of the source originally passed in
62      */
getSource()63     public String getSource() {
64       return source;
65     }
66 
67     /**
68      * Resets the iterator so that one can start again from the beginning.
69      */
reset()70     public void reset() {
71         done = false;
72         for (int i = 0; i < current.length; ++i) {
73             current[i] = 0;
74         }
75     }
76 
77     /**
78      * Get the next canonically equivalent string.
79      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
80      * @return the next string that is canonically equivalent. The value null is returned when
81      * the iteration is done.
82      */
next()83     public String next() {
84         if (done) return null;
85 
86         // construct return value
87 
88         buffer.setLength(0); // delete old contents
89         for (int i = 0; i < pieces.length; ++i) {
90             buffer.append(pieces[i][current[i]]);
91         }
92         String result = buffer.toString();
93 
94         // find next value for next time
95 
96         for (int i = current.length - 1; ; --i) {
97             if (i < 0) {
98                 done = true;
99                 break;
100             }
101             current[i]++;
102             if (current[i] < pieces[i].length) break; // got sequence
103             current[i] = 0;
104         }
105         return result;
106     }
107 
108     /**
109      * Set a new source for this iterator. Allows object reuse.
110      * @param newSource the source string to iterate against. This allows the same iterator to be used
111      * while changing the source string, saving object creation.
112      */
setSource(String newSource)113     public void setSource(String newSource) {
114         source = nfd.normalize(newSource);
115         done = false;
116 
117         // catch degenerate case
118         if (newSource.length() == 0) {
119             pieces = new String[1][];
120             current = new int[1];
121             pieces[0] = new String[]{""};
122             return;
123         }
124 
125         // find the segments
126         List<String> segmentList = new ArrayList<String>();
127         int cp;
128         int start = 0;
129 
130         // i should be the end of the first code point
131         // break up the string into segements
132 
133         int i = UTF16.findOffsetFromCodePoint(source, 1);
134 
135         for (; i < source.length(); i += Character.charCount(cp)) {
136             cp = source.codePointAt(i);
137             if (nfcImpl.isCanonSegmentStarter(cp)) {
138                 segmentList.add(source.substring(start, i)); // add up to i
139                 start = i;
140             }
141         }
142         segmentList.add(source.substring(start, i)); // add last one
143 
144         // allocate the arrays, and find the strings that are CE to each segment
145         pieces = new String[segmentList.size()][];
146         current = new int[segmentList.size()];
147         for (i = 0; i < pieces.length; ++i) {
148             if (PROGRESS) System.out.println("SEGMENT");
149             pieces[i] = getEquivalents(segmentList.get(i));
150         }
151     }
152 
153     /**
154      * Simple implementation of permutation.
155      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
156      * @param source the string to find permutations for
157      * @param skipZeros set to true to skip characters with canonical combining class zero
158      * @param output the set to add the results to
159      * @deprecated This API is ICU internal only.
160      * @hide draft / provisional / internal are hidden on OHOS
161      */
162     @Deprecated
permute(String source, boolean skipZeros, Set<String> output)163     public static void permute(String source, boolean skipZeros, Set<String> output) {
164         // TODO: optimize
165         //if (PROGRESS) System.out.println("Permute: " + source);
166 
167         // optimization:
168         // if zero or one character, just return a set with it
169         // we check for length < 2 to keep from counting code points all the time
170         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
171             output.add(source);
172             return;
173         }
174 
175         // otherwise iterate through the string, and recursively permute all the other characters
176         Set<String> subpermute = new HashSet<String>();
177         int cp;
178         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
179             cp = UTF16.charAt(source, i);
180 
181             // optimization:
182             // if the character is canonical combining class zero,
183             // don't permute it
184             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
185                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
186                 continue;
187             }
188 
189             // see what the permutations of the characters before and after this one are
190             subpermute.clear();
191             permute(source.substring(0,i)
192                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
193 
194             // prefix this character to all of them
195             String chStr = UTF16.valueOf(source, i);
196             for (String s : subpermute) {
197                 String piece = chStr + s;
198                 //if (PROGRESS) System.out.println("  Piece: " + piece);
199                 output.add(piece);
200             }
201         }
202     }
203 
204     // FOR TESTING
205 
206     /*
207      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
208      *
209     public static UnicodeSet getSafeStart() {
210         return (UnicodeSet) SAFE_START.clone();
211     }
212     */
213     /*
214      *@return the set of characters whose decompositions start with the given character
215      *
216     public static UnicodeSet getStarts(int cp) {
217         UnicodeSet result = AT_START.get(cp);
218         if (result == null) result = EMPTY;
219         return (UnicodeSet) result.clone();
220     }
221     */
222 
223     // ===================== PRIVATES ==============================
224 
225     // debug
226     private static boolean PROGRESS = false; // debug progress
227     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
228     private static boolean SKIP_ZEROS = true;
229 
230     // fields
231     private final Normalizer2 nfd;
232     private final Normalizer2Impl nfcImpl;
233     private String source;
234     private boolean done;
235     private String[][] pieces;
236     private int[] current;
237     // Note: C will need two more fields, since arrays there don't have lengths
238     // int pieces_length;
239     // int[] pieces_lengths;
240 
241     // transient fields
242     private transient StringBuilder buffer = new StringBuilder();
243 
244 
245     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
getEquivalents(String segment)246     private String[] getEquivalents(String segment) {
247         Set<String> result = new HashSet<String>();
248         Set<String> basic = getEquivalents2(segment);
249         Set<String> permutations = new HashSet<String>();
250 
251         // now get all the permutations
252         // add only the ones that are canonically equivalent
253         // TODO: optimize by not permuting any class zero.
254         Iterator<String> it = basic.iterator();
255         while (it.hasNext()) {
256             String item = it.next();
257             permutations.clear();
258             permute(item, SKIP_ZEROS, permutations);
259             Iterator<String> it2 = permutations.iterator();
260             while (it2.hasNext()) {
261                 String possible = it2.next();
262 
263 /*
264                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
265                 if (attempt.equals(segment)) {
266 */
267                 if (Normalizer.compare(possible, segment,0)==0) {
268 
269                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
270                     result.add(possible);
271 
272                 } else {
273                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
274                 }
275             }
276         }
277 
278         // convert into a String[] to clean up storage
279         String[] finalResult = new String[result.size()];
280         result.toArray(finalResult);
281         return finalResult;
282     }
283 
284 
getEquivalents2(String segment)285     private Set<String> getEquivalents2(String segment) {
286 
287         Set<String> result = new HashSet<String>();
288 
289         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
290 
291         result.add(segment);
292         StringBuffer workingBuffer = new StringBuffer();
293         UnicodeSet starts = new UnicodeSet();
294 
295         // cycle through all the characters
296         int cp;
297         for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
298 
299             // see if any character is at the start of some decomposition
300             cp = segment.codePointAt(i);
301             if (!nfcImpl.getCanonStartSet(cp, starts)) {
302               continue;
303             }
304             // if so, see which decompositions match
305             for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
306                 int cp2 = iter.codepoint;
307                 Set<String> remainder = extract(cp2, segment, i, workingBuffer);
308                 if (remainder == null) {
309                     continue;
310                 }
311 
312                 // there were some matches, so add all the possibilities to the set.
313                 String prefix= segment.substring(0,i);
314                 prefix += UTF16.valueOf(cp2);
315                 for (String item : remainder) {
316                     result.add(prefix + item);
317                 }
318             }
319         }
320         return result;
321         /*
322         Set result = new HashSet();
323         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
324         result.add(segment);
325         StringBuffer workingBuffer = new StringBuffer();
326 
327         // cycle through all the characters
328         int cp;
329 
330         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
331             // see if any character is at the start of some decomposition
332             cp = UTF16.charAt(segment, i);
333             NormalizerImpl.getCanonStartSet(c,fillSet)
334             UnicodeSet starts = AT_START.get(cp);
335             if (starts == null) continue;
336             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
337             // if so, see which decompositions match
338             while (usi.next()) {
339                 int cp2 = usi.codepoint;
340                 // we know that there are no strings in it
341                 // so we don't have to check CharacterIterator.IS_STRING
342                 Set remainder = extract(cp2, segment, i, workingBuffer);
343                 if (remainder == null) continue;
344 
345                 // there were some matches, so add all the possibilities to the set.
346                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
347                 Iterator it = remainder.iterator();
348                 while (it.hasNext()) {
349                     String item = (String) it.next();
350                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
351                     result.add(prefix + item);
352                 }
353             }
354         }
355         return result;
356         */
357     }
358 
359     /**
360      * See if the decomposition of cp2 is at segment starting at segmentPos
361      * (with canonical rearrangment!)
362      * If so, take the remainder, and return the equivalents
363      */
extract(int comp, String segment, int segmentPos, StringBuffer buf)364     private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
365         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
366             + ", " + Utility.hex(segment.substring(segmentPos)));
367 
368         String decomp = nfcImpl.getDecomposition(comp);
369         if (decomp == null) {
370             decomp = UTF16.valueOf(comp);
371         }
372 
373         // See if it matches the start of segment (at segmentPos)
374         boolean ok = false;
375         int cp;
376         int decompPos = 0;
377         int decompCp = UTF16.charAt(decomp,0);
378         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
379         //int decompClass = getClass(decompCp);
380         buf.setLength(0); // initialize working buffer, shared among callees
381 
382         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
383             cp = UTF16.charAt(segment, i);
384             if (cp == decompCp) { // if equal, eat another cp from decomp
385                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
386                 if (decompPos == decomp.length()) { // done, have all decomp characters!
387                     buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
388                     ok = true;
389                     break;
390                 }
391                 decompCp = UTF16.charAt(decomp, decompPos);
392                 decompPos += UTF16.getCharCount(decompCp);
393                 //decompClass = getClass(decompCp);
394             } else {
395                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
396                 // brute force approach
397                 UTF16.append(buf, cp);
398                 /* TODO: optimize
399                 // since we know that the classes are monotonically increasing, after zero
400                 // e.g. 0 5 7 9 0 3
401                 // we can do an optimization
402                 // there are only a few cases that work: zero, less, same, greater
403                 // if both classes are the same, we fail
404                 // if the decomp class < the segment class, we fail
405 
406                 segClass = getClass(cp);
407                 if (decompClass <= segClass) return null;
408                 */
409             }
410         }
411         if (!ok) return null; // we failed, characters left over
412         if (PROGRESS) System.out.println("Matches");
413         if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
414         String remainder = buf.toString();
415 
416         // brute force approach
417         // to check to make sure result is canonically equivalent
418         /*
419         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
420         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
421         */
422 
423         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
424 
425         // get the remaining combinations
426         return getEquivalents2(remainder);
427     }
428 
429     /*
430     // TODO: fix once we have a codepoint interface to get the canonical combining class
431     // TODO: Need public access to canonical combining class in UCharacter!
432     private static int getClass(int cp) {
433         return Normalizer.getClass((char)cp);
434     }
435     */
436 
437    // ================= BUILDER =========================
438     // TODO: Flatten this data so it doesn't have to be reconstructed each time!
439 
440     //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
441     private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
442     static {
443         SET_WITH_NULL_STRING.add("");
444     }
445 
446   //  private static UnicodeSet SAFE_START = new UnicodeSet();
447   //  private static CharMap AT_START = new CharMap();
448 
449         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
450         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
451   //  private static int LAST_UNICODE = 0x10FFFF;
452     /*
453     static {
454         buildData();
455     }
456     */
457     /*
458     private static void buildData() {
459 
460         if (PROGRESS) System.out.println("Getting Safe Start");
461         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
462             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
463             int cc = UCharacter.getCombiningClass(cp);
464             if (cc == 0) SAFE_START.add(cp);
465             // will fix to be really safe below
466         }
467         if (PROGRESS) System.out.println();
468 
469         if (PROGRESS) System.out.println("Getting Containment");
470         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
471             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
472 
473             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
474 
475             //String istr = UTF16.valueOf(cp);
476             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
477             //if (decomp.equals(istr)) continue;
478 
479             // add each character in the decomposition to canBeIn
480 
481             int component;
482             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
483                 component = UTF16.charAt(decomp, i);
484                 if (i == 0) {
485                     AT_START.add(component, cp);
486                 } else if (UCharacter.getCombiningClass(component) == 0) {
487                     SAFE_START.remove(component);
488                 }
489             }
490         }
491         if (PROGRESS) System.out.println();
492     }
493         // the following is just for a map from characters to a set of characters
494 
495     private static class CharMap {
496         Map storage = new HashMap();
497         MutableInt probe = new MutableInt();
498         boolean converted = false;
499 
500         public void add(int cp, int whatItIsIn) {
501             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
502             if (result == null) {
503                 result = new UnicodeSet();
504                 storage.put(probe, result);
505             }
506             result.add(whatItIsIn);
507         }
508 
509         public UnicodeSet get(int cp) {
510             return (UnicodeSet) storage.get(probe.set(cp));
511         }
512     }
513 
514     private static class MutableInt {
515         public int contents;
516         public int hashCode() { return contents; }
517         public boolean equals(Object other) {
518             return ((MutableInt)other).contents == contents;
519         }
520         // allows chaining
521         public MutableInt set(int contents) {
522             this.contents = contents;
523             return this;
524         }
525     }
526     */
527 
528 }
529