1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2001-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.dev.demo.translit; 10 import java.io.BufferedWriter; 11 import java.io.File; 12 import java.io.FileOutputStream; 13 import java.io.IOException; 14 import java.io.OutputStreamWriter; 15 import java.io.PrintWriter; 16 import java.util.Comparator; 17 import java.util.HashMap; 18 import java.util.Iterator; 19 import java.util.Set; 20 import java.util.TreeSet; 21 22 import com.ibm.icu.impl.Utility; 23 import com.ibm.icu.lang.UCharacter; 24 import com.ibm.icu.lang.UScript; 25 import com.ibm.icu.text.Normalizer; 26 import com.ibm.icu.text.Transliterator; 27 import com.ibm.icu.text.UTF16; 28 import com.ibm.icu.text.UnicodeSet; 29 import com.ibm.icu.text.UnicodeSetIterator; 30 31 public class TransliterationChart { main(String[] args)32 public static void main(String[] args) throws IOException { 33 System.out.println("Start"); 34 UnicodeSet lengthMarks = new UnicodeSet("[\u09D7\u0B56-\u0B57\u0BD7\u0C56\u0CD5-\u0CD6\u0D57\u0C55\u0CD5]"); 35 int[] indicScripts = { 36 UScript.LATIN, 37 UScript.DEVANAGARI, 38 UScript.BENGALI, 39 UScript.GURMUKHI, 40 UScript.GUJARATI, 41 UScript.ORIYA, 42 UScript.TAMIL, 43 UScript.TELUGU, 44 UScript.KANNADA, 45 UScript.MALAYALAM, 46 }; 47 String[] names = new String[indicScripts.length]; 48 UnicodeSet[] sets = new UnicodeSet[indicScripts.length]; 49 Transliterator[] fallbacks = new Transliterator[indicScripts.length]; 50 for (int i = 0; i < indicScripts.length; ++i) { 51 names[i] = UScript.getName(indicScripts[i]); 52 sets[i] = new UnicodeSet("[[:" + names[i] + ":]&[[:L:][:M:]]&[:age=3.1:]]"); 53 fallbacks[i] = Transliterator.getInstance("any-" + names[i]); 54 } 55 EquivClass eq = new EquivClass(new ReverseComparator()); 56 PrintWriter pw = openPrintWriter("transChart.html"); 57 pw.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>"); 58 pw.println("<title>Indic Transliteration Chart</title><style>"); 59 pw.println("td { text-align: Center; font-size: 200% }"); 60 pw.println("tt { font-size: 50% }"); 61 pw.println("td.miss { background-color: #CCCCFF }"); 62 pw.println("</style></head><body bgcolor='#FFFFFF'>"); 63 64 Transliterator anyToLatin = Transliterator.getInstance("any-latin"); 65 66 String testString = "\u0946\u093E"; 67 68 UnicodeSet failNorm = new UnicodeSet(); 69 Set latinFail = new TreeSet(); 70 71 for (int i = 0; i < indicScripts.length; ++i) { 72 if (indicScripts[i] == UScript.LATIN) continue; 73 String source = names[i]; 74 System.out.println(source); 75 UnicodeSet sourceChars = sets[i]; 76 77 for (int j = 0; j < indicScripts.length; ++j) { 78 if (i == j) continue; 79 String target = names[j]; 80 Transliterator forward = Transliterator.getInstance(source + '-' + target); 81 Transliterator backward = forward.getInverse(); 82 UnicodeSetIterator it = new UnicodeSetIterator(sourceChars); 83 while (it.next()) { 84 if (lengthMarks.contains(it.codepoint)) continue; 85 String s = Normalizer.normalize(it.codepoint,Normalizer.NFC,0); 86 //if (!Normalizer.isNormalized(s,Normalizer.NFC,0)) continue; 87 if (!s.equals(Normalizer.normalize(s,Normalizer.NFD,0))) { 88 failNorm.add(it.codepoint); 89 } 90 String t = fix(forward.transliterate(s)); 91 if (t.equals(testString)) { 92 System.out.println("debug"); 93 } 94 95 String r = fix(backward.transliterate(t)); 96 if (Normalizer.compare(s,r,0) == 0) { 97 if (indicScripts[j] != UScript.LATIN) eq.add(s,t); 98 } else { 99 if (indicScripts[j] == UScript.LATIN) { 100 latinFail.add(s + " - " + t + " - " + r); 101 } 102 } 103 } 104 } 105 } 106 // collect equivalents 107 pw.println("<table border='1' cellspacing='0'><tr>"); 108 for (int i = 0; i < indicScripts.length; ++i) { 109 pw.print("<th width='10%'>" + names[i].substring(0,3) + "</th>"); 110 } 111 pw.println("</tr>"); 112 113 Iterator rit = eq.getSetIterator(new MyComparator()); 114 while(rit.hasNext()) { 115 Set equivs = (Set)rit.next(); 116 pw.print("<tr>"); 117 Iterator sit = equivs.iterator(); 118 String source = (String)sit.next(); 119 String item = anyToLatin.transliterate(source); 120 if (item.equals("") || source.equals(item)) item = " "; 121 pw.print("<td>" + item + "</td>"); 122 for (int i = 1; i < indicScripts.length; ++i) { 123 sit = equivs.iterator(); 124 item = ""; 125 while (sit.hasNext()) { 126 String trial = (String)sit.next(); 127 if (!sets[i].containsAll(trial)) continue; 128 item = trial; 129 break; 130 } 131 String classString = ""; 132 if (item.equals("")) { 133 classString = " class='miss'"; 134 String temp = fallbacks[i].transliterate(source); 135 if (!temp.equals("") && !temp.equals(source)) item = temp; 136 } 137 String backup = item.equals("") ? " " : item; 138 pw.print("<td" + classString + " title='" + getName(item, "; ") + "'>" 139 + backup + "<br><tt>" + Utility.hex(item) + "</tt></td>"); 140 } 141 /* 142 Iterator sit = equivs.iterator(); 143 while (sit.hasNext()) { 144 String item = (String)sit.next(); 145 pw.print("<td>" + item + "</td>"); 146 } 147 */ 148 pw.println("</tr>"); 149 } 150 pw.println("</table>"); 151 if (true) { 152 pw.println("<h2>Failed Normalization</h2>"); 153 154 UnicodeSetIterator it = new UnicodeSetIterator(failNorm); 155 UnicodeSet pieces = new UnicodeSet(); 156 while (it.next()) { 157 String s = UTF16.valueOf(it.codepoint); 158 String d = Normalizer.normalize(s,Normalizer.NFD,0); 159 pw.println("Norm:" + s + ", " + Utility.hex(s) + " " + UCharacter.getName(it.codepoint) 160 + "; " + d + ", " + Utility.hex(d) + ", "); 161 pw.println(UCharacter.getName(d.charAt(1)) + "<br>"); 162 if (UCharacter.getName(d.charAt(1)).indexOf("LENGTH") >= 0) pieces.add(d.charAt(1)); 163 } 164 pw.println(pieces); 165 166 pw.println("<h2>Failed Round-Trip</h2>"); 167 Iterator cit = latinFail.iterator(); 168 while (cit.hasNext()) { 169 pw.println(cit.next() + "<br>"); 170 } 171 } 172 173 pw.println("</table></body></html>"); 174 pw.close(); 175 System.out.println("Done"); 176 } 177 fix(String s)178 public static String fix(String s) { 179 if (s.equals("\u0946\u093E")) return "\u094A"; 180 if (s.equals("\u0C46\u0C3E")) return "\u0C4A"; 181 if (s.equals("\u0CC6\u0CBE")) return "\u0CCA"; 182 183 if (s.equals("\u0947\u093E")) return "\u094B"; 184 if (s.equals("\u0A47\u0A3E")) return "\u0A4B"; 185 if (s.equals("\u0AC7\u0ABE")) return "\u0ACB"; 186 if (s.equals("\u0C47\u0C3E")) return "\u0C4B"; 187 if (s.equals("\u0CC7\u0CBE")) return "\u0CCB"; 188 189 //return Normalizer.normalize(s,Normalizer.NFD,0); 190 return s; 191 } 192 openPrintWriter(String fileName)193 public static PrintWriter openPrintWriter(String fileName) throws IOException { 194 File lf = new File(fileName); 195 System.out.println("Creating file: " + lf.getAbsoluteFile()); 196 197 return new PrintWriter( 198 new BufferedWriter( 199 new OutputStreamWriter( 200 new FileOutputStream(fileName), "UTF8"), 4*1024)); 201 } 202 203 getName(String s, String separator)204 public static String getName(String s, String separator) { 205 int cp; 206 StringBuffer sb = new StringBuffer(); 207 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 208 cp = UTF16.charAt(s,i); 209 if (i != 0) sb.append(separator); 210 sb.append(UCharacter.getName(cp)); 211 } 212 return sb.toString(); 213 } 214 215 static class MyComparator implements Comparator { compare(Object o1, Object o2)216 public int compare(Object o1, Object o2) { 217 Iterator i1 = ((TreeSet) o1).iterator(); 218 Iterator i2 = ((TreeSet) o2).iterator(); 219 while (i1.hasNext() && i2.hasNext()) { 220 String a = (String)i1.next(); 221 String b = (String)i2.next(); 222 int result = a.compareTo(b); 223 if (result != 0) return result; 224 } 225 if (i1.hasNext()) return 1; 226 if (i2.hasNext()) return -1; 227 return 0; 228 } 229 230 } 231 static class ReverseComparator implements Comparator { compare(Object o1, Object o2)232 public int compare(Object o1, Object o2) { 233 String a = o1.toString(); 234 char a1 = a.charAt(0); 235 String b = o2.toString(); 236 char b1 = b.charAt(0); 237 if (a1 < 0x900 && b1 > 0x900) return -1; 238 if (a1 > 0x900 && b1 < 0x900) return +1; 239 return a.compareTo(b); 240 } 241 } 242 243 static class EquivClass { EquivClass(Comparator c)244 EquivClass(Comparator c) { 245 comparator = c; 246 } 247 private HashMap itemToSet = new HashMap(); 248 private Comparator comparator; 249 add(Object a, Object b)250 void add(Object a, Object b) { 251 Set sa = (Set)itemToSet.get(a); 252 Set sb = (Set)itemToSet.get(b); 253 if (sa == null && sb == null) { // new set! 254 Set s = new TreeSet(comparator); 255 s.add(a); 256 s.add(b); 257 itemToSet.put(a, s); 258 itemToSet.put(b, s); 259 } else if (sa == null) { 260 sb.add(a); 261 } else if (sb == null) { 262 sa.add(b); 263 } else { // merge sets, dumping sb 264 sa.addAll(sb); 265 Iterator it = sb.iterator(); 266 while (it.hasNext()) { 267 itemToSet.put(it.next(), sa); 268 } 269 } 270 } 271 272 private class MyIterator implements Iterator { 273 private Iterator it; MyIterator(Comparator comp)274 MyIterator (Comparator comp) { 275 TreeSet values = new TreeSet(comp); 276 values.addAll(itemToSet.values()); 277 it = values.iterator(); 278 } 279 hasNext()280 public boolean hasNext() { 281 return it.hasNext(); 282 } next()283 public Object next() { 284 return it.next(); 285 } remove()286 public void remove() { 287 throw new IllegalArgumentException("can't remove"); 288 } 289 } 290 getSetIterator(Comparator comp)291 public Iterator getSetIterator (Comparator comp) { 292 return new MyIterator(comp); 293 } 294 295 } 296 }