// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 *******************************************************************************
 * Copyright (C) 2002-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.dev.tool.layout;

import java.util.Vector;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * Groups characters by script and, for each one, keeps the strings that a
 * {@link CanonicalIterator} reports for it and that are longer than one code point.
 *
 * Characters are filed with {@link #add(int)} (or in bulk with
 * {@link #factory(UnicodeSet)}) under the script code returned by
 * {@code UScript.getScript}. The per-script records and the largest number of
 * equivalents seen for each script can then be queried.
 */
public class CanonicalCharacterData
{
    /** A record with more equivalents than this is dumped to {@code System.out} while it is built. */
    private static final int THRESHOLD = 4;

    /**
     * One character together with its sorted multi-code-point equivalents.
     *
     * This is an inner (non-static) class because its constructor updates the
     * enclosing object's {@code maxEquivalents} table.
     */
    public class Record
    {
        // TODO: might want to save arrays of Char32's rather than UTF16 strings...
        /**
         * Builds the record for {@code character} and updates the enclosing
         * object's maximum equivalent count for {@code script}.
         *
         * @param character the code point to record
         * @param script    index into the enclosing {@code maxEquivalents} table;
         *                  {@link CanonicalCharacterData#add(int)} passes the
         *                  result of {@code UScript.getScript(character)}
         */
        Record(int character, int script)
        {
            String char32 = UCharacter.toString(character);
            CanonicalIterator iterator = new CanonicalIterator(char32);
            Vector equivs = new Vector();

            composed = character;

            for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) {
                // Skip all equivalents of length 1; it's either the original
                // character or something like Angstrom for A-Ring, which we don't care about
                if (UTF16.countCodePoint(equiv) > 1) {
                    equivs.add(equiv);
                }
            }

            int nEquivalents = equivs.size();

            if (nEquivalents > maxEquivalents[script]) {
                maxEquivalents[script] = nEquivalents;
            }

            // When nothing was kept, 'equivalents' stays null; the accessors below cope with that.
            if (nEquivalents > 0) {
                equivalents = new String[nEquivalents];

                if (nEquivalents > THRESHOLD) {
                    dumpEquivalents(character, equivs);
                }

                sortEquivalents(equivalents, equivs);
            }
        }

        /**
         * @return the code point this record was built for
         */
        public int getComposedCharacter()
        {
            return composed;
        }

        /**
         * @return the number of stored equivalents, or 0 if there are none
         */
        public int countEquivalents()
        {
            if (equivalents == null) {
                return 0;
            }

            return equivalents.length;
        }

        /**
         * @return the internal sorted array of equivalents (not a copy), or
         *         {@code null} if no equivalent was stored
         */
        public String[] getEquivalents()
        {
            return equivalents;
        }

        /**
         * @param index position in the sorted equivalents array
         * @return the equivalent at {@code index}, or {@code null} if there are
         *         no equivalents or {@code index} is out of range
         */
        public String getEquivalent(int index)
        {
            if (equivalents == null || index < 0 || index >= equivalents.length) {
                return null;
            }

            return equivalents[index];
        }

        /**
         * Prints {@code character} and every string in {@code equivs} to
         * {@code System.out}: a header line with the character in hex and the
         * count, then one line per string listing its code points in hex
         * separated by spaces, then a blank line.
         *
         * @param character the code point the equivalents belong to
         * @param equivs    the equivalent strings, in the order they were collected
         */
        private void dumpEquivalents(int character, Vector equivs)
        {
            int count = equivs.size();

            System.out.println(Utility.hex(character, 6) + " - " + count + ":");

            for (int i = 0; i < count; i += 1) {
                String equiv = (String) equivs.elementAt(i);
                int length = equiv.length();
                int offset = 0;

                // UTF16.charAt() takes a UTF-16 code unit offset, not a code point
                // index, so step by the number of code units each code point
                // occupies; otherwise supplementary characters are printed twice
                // and the tail of the string is dropped.
                while (offset < length) {
                    int codePoint = UTF16.charAt(equiv, offset);

                    if (offset > 0) {
                        System.out.print(" ");
                    }

                    System.out.print(Utility.hex(codePoint, 6));
                    offset += UTF16.getCharCount(codePoint);
                }

                System.out.println();
            }

            System.out.println();
        }

        private final int composed;
        private String[] equivalents = null;
    }

    /**
     * Creates an empty data set; use {@link #add(int)} to populate it.
     */
    public CanonicalCharacterData()
    {
        // nothing to do...
    }

    /**
     * Builds a {@link Record} for {@code character} and appends it to the list
     * for the character's script, creating that list on first use.
     *
     * @param character the code point to add
     */
    public void add(int character)
    {
        int script = UScript.getScript(character);
        Vector recordVector = recordVectors[script];

        if (recordVector == null) {
            recordVector = recordVectors[script] = new Vector();
        }

        recordVector.add(new Record(character, script));
    }

    /**
     * @param script a script code
     * @return the largest equivalent count of any record added for
     *         {@code script}, or 0 if {@code script} is outside
     *         {@code [0, UScript.CODE_LIMIT)}
     */
    public int getMaxEquivalents(int script)
    {
        if (script < 0 || script >= UScript.CODE_LIMIT) {
            return 0;
        }

        return maxEquivalents[script];
    }

    /**
     * @param script a script code
     * @param index  position of the record in the order it was added
     * @return the requested record, or {@code null} if {@code script} is out of
     *         range, no record was added for it, or {@code index} is out of range
     */
    public Record getRecord(int script, int index)
    {
        if (script < 0 || script >= UScript.CODE_LIMIT) {
            return null;
        }

        Vector recordVector = recordVectors[script];

        if (recordVector == null || index < 0 || index >= recordVector.size()) {
            return null;
        }

        return (Record) recordVector.elementAt(index);
    }

    /**
     * @param script a script code
     * @return the number of records added for {@code script}, or 0 if
     *         {@code script} is out of range or has no records
     */
    public int countRecords(int script)
    {
        if (script < 0 || script >= UScript.CODE_LIMIT ||
            recordVectors[script] == null) {
            return 0;
        }

        return recordVectors[script].size();
    }

    /**
     * Creates a data set holding one record for each of the first
     * {@code characterSet.size()} characters of {@code characterSet}, and
     * prints that size to {@code System.out}.
     *
     * @param characterSet the characters to add
     * @return the populated data set
     */
    public static CanonicalCharacterData factory(UnicodeSet characterSet)
    {
        int charCount = characterSet.size();
        CanonicalCharacterData data = new CanonicalCharacterData();

        System.out.println("There are " + charCount + " characters with a canonical decomposition.");

        for (int i = 0; i < charCount; i += 1) {
            data.add(characterSet.charAt(i));
        }

        return data;
    }

    /**
     * Orders equivalents by code point count first and, for equal counts, by
     * {@link String#compareTo(String)}.
     *
     * @return a negative value, zero, or a positive value as {@code a} sorts
     *         before, equal to, or after {@code b}
     */
    private static int compareEquivalents(String a, String b)
    {
        int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b);

        if (result == 0) {
            return a.compareTo(b);
        }

        return result;
    }

    //
    // Straight insertion sort from Knuth vol. III, pg. 81
    //
    /**
     * Fills {@code equivalents} with the first {@code equivalents.length}
     * elements of {@code unsorted}, arranged in {@link #compareEquivalents}
     * order.
     *
     * @param equivalents destination array; its length determines how many
     *                    elements are taken from {@code unsorted}
     * @param unsorted    source strings; must hold at least
     *                    {@code equivalents.length} elements
     */
    private static void sortEquivalents(String[] equivalents, Vector unsorted)
    {
        int nEquivalents = equivalents.length;

        for (int e = 0; e < nEquivalents; e += 1) {
            String v = (String) unsorted.elementAt(e);
            int i;

            // Shift larger entries up one slot until v's position is found.
            for (i = e - 1; i >= 0; i -= 1) {
                if (compareEquivalents(v, equivalents[i]) >= 0) {
                    break;
                }

                equivalents[i + 1] = equivalents[i];
            }

            equivalents[i + 1] = v;
        }
    }

    // Both tables are indexed by script code.
    private final Vector[] recordVectors = new Vector[UScript.CODE_LIMIT];
    private final int[] maxEquivalents = new int[UScript.CODE_LIMIT];

}