• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2014, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package com.ibm.icu.text;
8 
9 import java.util.ArrayList;
10 import java.util.HashSet;
11 import java.util.Iterator;
12 import java.util.List;
13 import java.util.Set;
14 
15 import com.ibm.icu.impl.Norm2AllModes;
16 import com.ibm.icu.impl.Normalizer2Impl;
17 import com.ibm.icu.impl.Utility;
18 import com.ibm.icu.lang.UCharacter;
19 
20 /**
21  * This class allows one to iterate through all the strings that are canonically equivalent to a given
22  * string. For example, here are some sample results:
23  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
24  * <pre>
25  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
26  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
27  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
28  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
29  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
30  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
31  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
32  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
33  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
34 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
35 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
36 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
37  *</pre>
38  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
39  * since it has not been optimized for that situation.
40  * @author M. Davis
41  * @stable ICU 2.4
42  */
43 
44 public final class CanonicalIterator {
45     /**
46      * Construct a CanonicalIterator object
47      * @param source string to get results for
48      * @stable ICU 2.4
49      */
CanonicalIterator(String source)50     public CanonicalIterator(String source) {
51         Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
52         nfd = allModes.decomp;
53         nfcImpl = allModes.impl.ensureCanonIterData();
54         setSource(source);
55     }
56 
57     /**
58      * Gets the NFD form of the current source we are iterating over.
59      * @return gets the source: NOTE: it is the NFD form of the source originally passed in
60      * @stable ICU 2.4
61      */
getSource()62     public String getSource() {
63       return source;
64     }
65 
66     /**
67      * Resets the iterator so that one can start again from the beginning.
68      * @stable ICU 2.4
69      */
reset()70     public void reset() {
71         done = false;
72         for (int i = 0; i < current.length; ++i) {
73             current[i] = 0;
74         }
75     }
76 
77     /**
78      * Get the next canonically equivalent string.
79      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
80      * @return the next string that is canonically equivalent. The value null is returned when
81      * the iteration is done.
82      * @stable ICU 2.4
83      */
next()84     public String next() {
85         if (done) return null;
86 
87         // construct return value
88 
89         buffer.setLength(0); // delete old contents
90         for (int i = 0; i < pieces.length; ++i) {
91             buffer.append(pieces[i][current[i]]);
92         }
93         String result = buffer.toString();
94 
95         // find next value for next time
96 
97         for (int i = current.length - 1; ; --i) {
98             if (i < 0) {
99                 done = true;
100                 break;
101             }
102             current[i]++;
103             if (current[i] < pieces[i].length) break; // got sequence
104             current[i] = 0;
105         }
106         return result;
107     }
108 
109     /**
110      * Set a new source for this iterator. Allows object reuse.
111      * @param newSource the source string to iterate against. This allows the same iterator to be used
112      * while changing the source string, saving object creation.
113      * @stable ICU 2.4
114      */
setSource(String newSource)115     public void setSource(String newSource) {
116         source = nfd.normalize(newSource);
117         done = false;
118 
119         // catch degenerate case
120         if (newSource.length() == 0) {
121             pieces = new String[1][];
122             current = new int[1];
123             pieces[0] = new String[]{""};
124             return;
125         }
126 
127         // find the segments
128         List<String> segmentList = new ArrayList<String>();
129         int cp;
130         int start = 0;
131 
132         // i should be the end of the first code point
133         // break up the string into segements
134 
135         int i = UTF16.findOffsetFromCodePoint(source, 1);
136 
137         for (; i < source.length(); i += Character.charCount(cp)) {
138             cp = source.codePointAt(i);
139             if (nfcImpl.isCanonSegmentStarter(cp)) {
140                 segmentList.add(source.substring(start, i)); // add up to i
141                 start = i;
142             }
143         }
144         segmentList.add(source.substring(start, i)); // add last one
145 
146         // allocate the arrays, and find the strings that are CE to each segment
147         pieces = new String[segmentList.size()][];
148         current = new int[segmentList.size()];
149         for (i = 0; i < pieces.length; ++i) {
150             if (PROGRESS) System.out.println("SEGMENT");
151             pieces[i] = getEquivalents(segmentList.get(i));
152         }
153     }
154 
155     /**
156      * Simple implementation of permutation.
157      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
158      * @param source the string to find permutations for
159      * @param skipZeros set to true to skip characters with canonical combining class zero
160      * @param output the set to add the results to
161      * @internal
162      * @deprecated This API is ICU internal only.
163      */
164     @Deprecated
permute(String source, boolean skipZeros, Set<String> output)165     public static void permute(String source, boolean skipZeros, Set<String> output) {
166         // TODO: optimize
167         //if (PROGRESS) System.out.println("Permute: " + source);
168 
169         // optimization:
170         // if zero or one character, just return a set with it
171         // we check for length < 2 to keep from counting code points all the time
172         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
173             output.add(source);
174             return;
175         }
176 
177         // otherwise iterate through the string, and recursively permute all the other characters
178         Set<String> subpermute = new HashSet<String>();
179         int cp;
180         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
181             cp = UTF16.charAt(source, i);
182 
183             // optimization:
184             // if the character is canonical combining class zero,
185             // don't permute it
186             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
187                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
188                 continue;
189             }
190 
191             // see what the permutations of the characters before and after this one are
192             subpermute.clear();
193             permute(source.substring(0,i)
194                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
195 
196             // prefix this character to all of them
197             String chStr = UTF16.valueOf(source, i);
198             for (String s : subpermute) {
199                 String piece = chStr + s;
200                 //if (PROGRESS) System.out.println("  Piece: " + piece);
201                 output.add(piece);
202             }
203         }
204     }
205 
206     // FOR TESTING
207 
208     /*
209      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
210      *
211     public static UnicodeSet getSafeStart() {
212         return (UnicodeSet) SAFE_START.clone();
213     }
214     */
215     /*
216      *@return the set of characters whose decompositions start with the given character
217      *
218     public static UnicodeSet getStarts(int cp) {
219         UnicodeSet result = AT_START.get(cp);
220         if (result == null) result = EMPTY;
221         return (UnicodeSet) result.clone();
222     }
223     */
224 
225     // ===================== PRIVATES ==============================
226 
227     // debug
228     private static boolean PROGRESS = false; // debug progress
229     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
230     private static boolean SKIP_ZEROS = true;
231 
232     // fields
233     private final Normalizer2 nfd;
234     private final Normalizer2Impl nfcImpl;
235     private String source;
236     private boolean done;
237     private String[][] pieces;
238     private int[] current;
239     // Note: C will need two more fields, since arrays there don't have lengths
240     // int pieces_length;
241     // int[] pieces_lengths;
242 
243     // transient fields
244     private transient StringBuilder buffer = new StringBuilder();
245 
246 
247     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
getEquivalents(String segment)248     private String[] getEquivalents(String segment) {
249         Set<String> result = new HashSet<String>();
250         Set<String> basic = getEquivalents2(segment);
251         Set<String> permutations = new HashSet<String>();
252 
253         // now get all the permutations
254         // add only the ones that are canonically equivalent
255         // TODO: optimize by not permuting any class zero.
256         Iterator<String> it = basic.iterator();
257         while (it.hasNext()) {
258             String item = it.next();
259             permutations.clear();
260             permute(item, SKIP_ZEROS, permutations);
261             Iterator<String> it2 = permutations.iterator();
262             while (it2.hasNext()) {
263                 String possible = it2.next();
264 
265 /*
266                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
267                 if (attempt.equals(segment)) {
268 */
269                 if (Normalizer.compare(possible, segment,0)==0) {
270 
271                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
272                     result.add(possible);
273 
274                 } else {
275                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
276                 }
277             }
278         }
279 
280         // convert into a String[] to clean up storage
281         String[] finalResult = new String[result.size()];
282         result.toArray(finalResult);
283         return finalResult;
284     }
285 
286 
getEquivalents2(String segment)287     private Set<String> getEquivalents2(String segment) {
288 
289         Set<String> result = new HashSet<String>();
290 
291         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
292 
293         result.add(segment);
294         StringBuffer workingBuffer = new StringBuffer();
295         UnicodeSet starts = new UnicodeSet();
296 
297         // cycle through all the characters
298         int cp;
299         for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
300 
301             // see if any character is at the start of some decomposition
302             cp = segment.codePointAt(i);
303             if (!nfcImpl.getCanonStartSet(cp, starts)) {
304               continue;
305             }
306             // if so, see which decompositions match
307             for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
308                 int cp2 = iter.codepoint;
309                 Set<String> remainder = extract(cp2, segment, i, workingBuffer);
310                 if (remainder == null) {
311                     continue;
312                 }
313 
314                 // there were some matches, so add all the possibilities to the set.
315                 String prefix= segment.substring(0,i);
316                 prefix += UTF16.valueOf(cp2);
317                 for (String item : remainder) {
318                     result.add(prefix + item);
319                 }
320             }
321         }
322         return result;
323         /*
324         Set result = new HashSet();
325         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
326         result.add(segment);
327         StringBuffer workingBuffer = new StringBuffer();
328 
329         // cycle through all the characters
330         int cp;
331 
332         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
333             // see if any character is at the start of some decomposition
334             cp = UTF16.charAt(segment, i);
335             NormalizerImpl.getCanonStartSet(c,fillSet)
336             UnicodeSet starts = AT_START.get(cp);
337             if (starts == null) continue;
338             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
339             // if so, see which decompositions match
340             while (usi.next()) {
341                 int cp2 = usi.codepoint;
342                 // we know that there are no strings in it
343                 // so we don't have to check CharacterIterator.IS_STRING
344                 Set remainder = extract(cp2, segment, i, workingBuffer);
345                 if (remainder == null) continue;
346 
347                 // there were some matches, so add all the possibilities to the set.
348                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
349                 Iterator it = remainder.iterator();
350                 while (it.hasNext()) {
351                     String item = (String) it.next();
352                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
353                     result.add(prefix + item);
354                 }
355             }
356         }
357         return result;
358         */
359     }
360 
361     /**
362      * See if the decomposition of cp2 is at segment starting at segmentPos
363      * (with canonical rearrangment!)
364      * If so, take the remainder, and return the equivalents
365      */
extract(int comp, String segment, int segmentPos, StringBuffer buf)366     private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
367         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
368             + ", " + Utility.hex(segment.substring(segmentPos)));
369 
370         String decomp = nfcImpl.getDecomposition(comp);
371         if (decomp == null) {
372             decomp = UTF16.valueOf(comp);
373         }
374 
375         // See if it matches the start of segment (at segmentPos)
376         boolean ok = false;
377         int cp;
378         int decompPos = 0;
379         int decompCp = UTF16.charAt(decomp,0);
380         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
381         //int decompClass = getClass(decompCp);
382         buf.setLength(0); // initialize working buffer, shared among callees
383 
384         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
385             cp = UTF16.charAt(segment, i);
386             if (cp == decompCp) { // if equal, eat another cp from decomp
387                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
388                 if (decompPos == decomp.length()) { // done, have all decomp characters!
389                     buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
390                     ok = true;
391                     break;
392                 }
393                 decompCp = UTF16.charAt(decomp, decompPos);
394                 decompPos += UTF16.getCharCount(decompCp);
395                 //decompClass = getClass(decompCp);
396             } else {
397                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
398                 // brute force approach
399                 UTF16.append(buf, cp);
400                 /* TODO: optimize
401                 // since we know that the classes are monotonically increasing, after zero
402                 // e.g. 0 5 7 9 0 3
403                 // we can do an optimization
404                 // there are only a few cases that work: zero, less, same, greater
405                 // if both classes are the same, we fail
406                 // if the decomp class < the segment class, we fail
407 
408                 segClass = getClass(cp);
409                 if (decompClass <= segClass) return null;
410                 */
411             }
412         }
413         if (!ok) return null; // we failed, characters left over
414         if (PROGRESS) System.out.println("Matches");
415         if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
416         String remainder = buf.toString();
417 
418         // brute force approach
419         // to check to make sure result is canonically equivalent
420         /*
421         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
422         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
423         */
424 
425         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
426 
427         // get the remaining combinations
428         return getEquivalents2(remainder);
429     }
430 
431     /*
432     // TODO: fix once we have a codepoint interface to get the canonical combining class
433     // TODO: Need public access to canonical combining class in UCharacter!
434     private static int getClass(int cp) {
435         return Normalizer.getClass((char)cp);
436     }
437     */
438 
439    // ================= BUILDER =========================
440     // TODO: Flatten this data so it doesn't have to be reconstructed each time!
441 
442     //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
443     private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
444     static {
445         SET_WITH_NULL_STRING.add("");
446     }
447 
448   //  private static UnicodeSet SAFE_START = new UnicodeSet();
449   //  private static CharMap AT_START = new CharMap();
450 
451         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
452         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
453   //  private static int LAST_UNICODE = 0x10FFFF;
454     /*
455     static {
456         buildData();
457     }
458     */
459     /*
460     private static void buildData() {
461 
462         if (PROGRESS) System.out.println("Getting Safe Start");
463         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
464             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
465             int cc = UCharacter.getCombiningClass(cp);
466             if (cc == 0) SAFE_START.add(cp);
467             // will fix to be really safe below
468         }
469         if (PROGRESS) System.out.println();
470 
471         if (PROGRESS) System.out.println("Getting Containment");
472         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
473             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
474 
475             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
476 
477             //String istr = UTF16.valueOf(cp);
478             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
479             //if (decomp.equals(istr)) continue;
480 
481             // add each character in the decomposition to canBeIn
482 
483             int component;
484             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
485                 component = UTF16.charAt(decomp, i);
486                 if (i == 0) {
487                     AT_START.add(component, cp);
488                 } else if (UCharacter.getCombiningClass(component) == 0) {
489                     SAFE_START.remove(component);
490                 }
491             }
492         }
493         if (PROGRESS) System.out.println();
494     }
495         // the following is just for a map from characters to a set of characters
496 
497     private static class CharMap {
498         Map storage = new HashMap();
499         MutableInt probe = new MutableInt();
500         boolean converted = false;
501 
502         public void add(int cp, int whatItIsIn) {
503             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
504             if (result == null) {
505                 result = new UnicodeSet();
506                 storage.put(probe, result);
507             }
508             result.add(whatItIsIn);
509         }
510 
511         public UnicodeSet get(int cp) {
512             return (UnicodeSet) storage.get(probe.set(cp));
513         }
514     }
515 
516     private static class MutableInt {
517         public int contents;
518         public int hashCode() { return contents; }
519         public boolean equals(Object other) {
520             return ((MutableInt)other).contents == contents;
521         }
522         // allows chaining
523         public MutableInt set(int contents) {
524             this.contents = contents;
525             return this;
526         }
527     }
528     */
529 
530 }
531