1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2002-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 11 package com.ibm.icu.dev.tool.layout; 12 13 import com.ibm.icu.lang.UCharacter; 14 import com.ibm.icu.lang.UScript; 15 import com.ibm.icu.text.UTF16; 16 import com.ibm.icu.text.UnicodeSet; 17 18 /** 19 * @author Eric Mader 20 * 21 * Notes: 22 * 23 * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical 24 * decomposition. 25 * 26 * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]" 27 * will match all Latin, Greek and Cyrillic characters with a canonical decomposition. 28 * 29 * Are these three scripts enough? Do we want to collect them all at once and distribute by script, 30 * or process them one script at a time. It's probably a good idea to build a single table for 31 * however many scripts there are. 32 * 33 * It might be better to collect all the characters that have a canonical decomposition and just 34 * sort them into however many scripts there are... unless we'll get characters in COMMON??? 35 */ 36 public class CanonGSUBBuilder 37 { convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)38 static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable) 39 { 40 int leftType = ArabicShaping.VALUE_NONE; 41 int rightType = ArabicShaping.VALUE_NONE; 42 43 switch (type) { 44 case UCharacter.DecompositionType.ISOLATED: 45 break; 46 47 case UCharacter.DecompositionType.FINAL: 48 rightType = ArabicShaping.VALUE_LEFT; 49 break; 50 51 case UCharacter.DecompositionType.INITIAL: 52 leftType = ArabicShaping.VALUE_RIGHT; 53 break; 54 55 case UCharacter.DecompositionType.MEDIAL: 56 rightType = ArabicShaping.VALUE_LEFT; 57 leftType = ArabicShaping.VALUE_RIGHT; 58 break; 59 60 default: 61 return decomp + UCharacter.toString(ligature); 62 } 63 64 char[] chars = decomp.toCharArray(); 65 66 ArabicShaping.shape(chars, leftType, rightType, isolClassTable); 67 68 return new String(chars) + UCharacter.toString(ligature); 69 } 70 buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable, ClassTable finaClassTable, ClassTable isolClassTable)71 static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable, 72 ClassTable finaClassTable, ClassTable isolClassTable) 73 { 74 System.out.print("Finding Arabic contextual forms... "); 75 76 for (int i = 0; i < data.countRecords(); i += 1) { 77 ArabicCharacterData.Record record = data.getRecord(i); 78 String decomposition = record.getDecomposition(); 79 80 if (decomposition != null && decomposition.length() == 1) { 81 int contextual = record.getCodePoint(); 82 int isolated = UTF16.charAt(record.getDecomposition(), 0); 83 84 switch (record.getDecompositionType()) { 85 case UCharacter.DecompositionType.INITIAL: 86 initClassTable.addMapping(isolated, contextual); 87 break; 88 89 case UCharacter.DecompositionType.MEDIAL: 90 mediClassTable.addMapping(isolated, contextual); 91 break; 92 93 case UCharacter.DecompositionType.FINAL: 94 finaClassTable.addMapping(isolated, contextual); 95 break; 96 97 case UCharacter.DecompositionType.ISOLATED: 98 isolClassTable.addMapping(isolated, contextual); 99 break; 100 101 default: 102 // issue some error message? 103 break; 104 } 105 } 106 } 107 108 System.out.println("Done."); 109 } 110 buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)111 static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable) 112 { 113 LigatureTree contextualTree = new LigatureTree(); 114 int ligatureCount = 0; 115 116 System.out.print("Building Arabic ligature tree... "); 117 118 for (int i = 0; i < data.countRecords(); i += 1) { 119 ArabicCharacterData.Record record = data.getRecord(i); 120 String decomposition = record.getDecomposition(); 121 122 if (decomposition != null && decomposition.length() > 1) { 123 int ligature = record.getCodePoint(); 124 int decompType = record.getDecompositionType(); 125 126 switch (decompType) { 127 case UCharacter.DecompositionType.FINAL: 128 case UCharacter.DecompositionType.INITIAL: 129 case UCharacter.DecompositionType.MEDIAL: 130 case UCharacter.DecompositionType.ISOLATED: 131 contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable)); 132 ligatureCount += 1; 133 break; 134 135 case UCharacter.DecompositionType.CANONICAL: 136 //cannonicalTree.insert(decomposition + UCharacter.toString(ligature)); 137 break; 138 } 139 } 140 } 141 142 System.out.println(ligatureCount + " ligatures."); 143 144 return contextualTree; 145 } 146 147 static final int SIMPLE_GLYPH = 1; 148 static final int LIGATURE_GLYPH = 2; 149 static final int MARK_GLYPH = 3; 150 static final int COMPONENT_GLYPH = 4; 151 152 static final int categoryClassMap[] = { 153 0, // UNASSIGNED 154 SIMPLE_GLYPH, // UPPERCASE_LETTER 155 SIMPLE_GLYPH, // LOWERCASE_LETTER 156 SIMPLE_GLYPH, // TITLECASE_LETTER 157 SIMPLE_GLYPH, // MODIFIER_LETTER 158 SIMPLE_GLYPH, // OTHER_LETTER 159 MARK_GLYPH, // NON_SPACING_MARK 160 MARK_GLYPH, // ENCLOSING_MARK ?? 161 MARK_GLYPH, // COMBINING_SPACING_MARK ?? 162 SIMPLE_GLYPH, // DECIMAL_NUMBER 163 SIMPLE_GLYPH, // LETTER_NUMBER 164 SIMPLE_GLYPH, // OTHER_NUMBER; 165 0, // SPACE_SEPARATOR 166 0, // LINE_SEPARATOR 167 0, // PARAGRAPH_SEPARATOR 168 0, // CONTROL 169 0, // FORMAT 170 0, // PRIVATE_USE 171 0, // SURROGATE 172 SIMPLE_GLYPH, // DASH_PUNCTUATION 173 SIMPLE_GLYPH, // START_PUNCTUATION 174 SIMPLE_GLYPH, // END_PUNCTUATION 175 SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION 176 SIMPLE_GLYPH, // OTHER_PUNCTUATION 177 SIMPLE_GLYPH, // MATH_SYMBOL; 178 SIMPLE_GLYPH, // CURRENCY_SYMBOL 179 SIMPLE_GLYPH, // MODIFIER_SYMBOL 180 SIMPLE_GLYPH, // OTHER_SYMBOL 181 SIMPLE_GLYPH, // INITIAL_PUNCTUATION 182 SIMPLE_GLYPH // FINAL_PUNCTUATION 183 }; 184 getGlyphClass(ArabicCharacterData.Record record)185 static int getGlyphClass(ArabicCharacterData.Record record) 186 { 187 String decomp = record.getDecomposition(); 188 189 if (decomp != null && decomp.length() > 1) { 190 return LIGATURE_GLYPH; 191 } 192 193 return categoryClassMap[record.getGeneralCategory()]; 194 } 195 addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)196 static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable) 197 { 198 System.out.print("Adding Arabic glyph classes... "); 199 200 for (int i = 0; i < data.countRecords(); i += 1) { 201 ArabicCharacterData.Record record = data.getRecord(i); 202 classTable.addMapping(record.getCodePoint(), getGlyphClass(record)); 203 } 204 205 System.out.println("Done."); 206 } 207 buildArabicTables(ScriptList scriptList, FeatureList featureList, LookupList lookupList, ClassTable classTable)208 private static void buildArabicTables(ScriptList scriptList, FeatureList featureList, 209 LookupList lookupList, ClassTable classTable) { 210 // TODO: Might want to have the ligature table builder explicitly check for ligatures 211 // which start with space and tatweel rather than pulling them out here... 212 UnicodeSet arabicBlock = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]"); 213 UnicodeSet oddLigatures = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]"); 214 UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]"); 215 ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures)); 216 217 addArabicGlyphClasses(arabicData, classTable); 218 219 ClassTable initClassTable = new ClassTable(); 220 ClassTable mediClassTable = new ClassTable(); 221 ClassTable finaClassTable = new ClassTable(); 222 ClassTable isolClassTable = new ClassTable(); 223 224 buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable); 225 isolClassTable.snapshot(); 226 LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable); 227 228 LigatureTreeWalker ligaWalker = new LigatureTreeWalker(); 229 230 ligaTree.walk(ligaWalker); 231 232 Lookup initLookup, mediLookup, finaLookup, ligaLookup; 233 234 initLookup = new Lookup(Lookup.GSST_Single, 0); 235 initLookup.addSubtable(initClassTable); 236 237 mediLookup = new Lookup(Lookup.GSST_Single, 0); 238 mediLookup.addSubtable(mediClassTable); 239 240 finaLookup = new Lookup(Lookup.GSST_Single, 0); 241 finaLookup.addSubtable(finaClassTable); 242 243 ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks); 244 ligaLookup.addSubtable(ligaWalker); 245 246 Feature init = new Feature("init"); 247 Feature medi = new Feature("medi"); 248 Feature fina = new Feature("fina"); 249 Feature liga = new Feature("liga"); 250 251 init.addLookup(lookupList.addLookup(initLookup)); 252 medi.addLookup(lookupList.addLookup(mediLookup)); 253 fina.addLookup(lookupList.addLookup(finaLookup)); 254 liga.addLookup(lookupList.addLookup(ligaLookup)); 255 256 featureList.addFeature(init); 257 featureList.addFeature(medi); 258 featureList.addFeature(fina); 259 featureList.addFeature(liga); 260 261 scriptList.addFeature("arab", "(default)", init); 262 scriptList.addFeature("arab", "(default)", medi); 263 scriptList.addFeature("arab", "(default)", fina); 264 scriptList.addFeature("arab", "(default)", liga); 265 266 System.out.println(); 267 } 268 buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)269 public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree) 270 { 271 int ligatureCount = 0; 272 273 System.out.print("building composition ligature tree for " + UScript.getName(script) + "... "); 274 275 for (int i = 0; i < data.countRecords(script); i += 1) { 276 CanonicalCharacterData.Record record = data.getRecord(script, i); 277 String composed = UCharacter.toString(record.getComposedCharacter()); 278 279 for (int e = 0; e < record.countEquivalents(); e += 1) { 280 String equivalent = record.getEquivalent(e); 281 282 ligatureTree.insert(equivalent + composed); 283 ligatureCount += 1; 284 } 285 } 286 287 System.out.println(ligatureCount + " ligatures."); 288 } 289 buildDecompTables(CanonicalCharacterData data, int script)290 public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script) 291 { 292 int maxDecompCount = data.getMaxEquivalents(script); 293 DecompTable[] decompTables = new DecompTable[maxDecompCount]; 294 295 System.out.print("Building decompositon tables for " + UScript.getName(script) + 296 "... total decompositions: " + data.countRecords(script) + 297 ", max: " + maxDecompCount + "..."); 298 299 for (int i = 0; i < maxDecompCount; i += 1) { 300 DecompTable table = new DecompTable(); 301 302 for (int r = 0; r < data.countRecords(script); r += 1) { 303 CanonicalCharacterData.Record record = data.getRecord(script, r); 304 305 if (record.countEquivalents() > i) { 306 table.add(record.getComposedCharacter(), record.getEquivalent(i)); 307 } 308 } 309 310 decompTables[i] = table; 311 } 312 313 System.out.println(" Done."); 314 315 return decompTables; 316 } 317 buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)318 public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script) 319 { 320 int[] lookups = new int[2]; 321 322 DecompTable[] decompTables = buildDecompTables(data, script); 323 324 LigatureTree compTree = new LigatureTree(); 325 326 buildLigatureTree(data, script, compTree); 327 328 System.out.println(); 329 330 LigatureTreeWalker compWalker = new LigatureTreeWalker(); 331 332 compTree.walk(compWalker); 333 334 Lookup compLookup, dcmpLookup; 335 //int compLookupIndex, dcmpLookupIndex; 336 337 compLookup = new Lookup(Lookup.GSST_Ligature, 0); 338 compLookup.addSubtable(compWalker); 339 340 dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0); 341 for (int i = 0; i < decompTables.length; i += 1) { 342 dcmpLookup.addSubtable(decompTables[i]); 343 } 344 345 lookups[0] = lookupList.addLookup(compLookup); 346 lookups[1] = lookupList.addLookup(dcmpLookup); 347 348 return lookups; 349 } 350 addLookups(Feature feature, int[] lookups)351 public static void addLookups(Feature feature, int[] lookups) 352 { 353 for (int i = 0; i < lookups.length; i += 1) { 354 feature.addLookup(lookups[i]); 355 } 356 } 357 358 /* 359 * Hebrew mark order taken from the SBL Hebrew Font manual 360 * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks 361 */ buildCombiningClassTable()362 public static ClassTable buildCombiningClassTable() 363 { 364 UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]"); 365 ClassTable exceptions = new ClassTable(); 366 ClassTable combiningClasses = new ClassTable(); 367 int markCount = markSet.size(); 368 369 exceptions.addMapping(0x05C1, 10); // Point Shin Dot 370 exceptions.addMapping(0x05C2, 11); // Point Sin Dot 371 exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq 372 exceptions.addMapping(0x05BF, 23); // Point Rafe 373 exceptions.addMapping(0x05B9, 27); // Point Holam 374 exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum) 375 exceptions.addMapping(0x0591, 220); // Accent Etnahta 376 exceptions.addMapping(0x0596, 220); // Accent Tipeha 377 exceptions.addMapping(0x059B, 220); // Accent Tevir 378 exceptions.addMapping(0x05A3, 220); // Accent Munah 379 exceptions.addMapping(0x05A4, 220); // Accent Mahapakh 380 exceptions.addMapping(0x05A5, 220); // Accent Merkha 381 exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula 382 exceptions.addMapping(0x05A7, 220); // Accent Darga 383 exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo 384 exceptions.addMapping(0x05B0, 220); // Point Sheva 385 exceptions.addMapping(0x05B1, 220); // Point Hataf Segol 386 exceptions.addMapping(0x05B2, 220); // Point Hataf Patah 387 exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats 388 exceptions.addMapping(0x05B4, 220); // Point Hiriq 389 exceptions.addMapping(0x05B5, 220); // Point Tsere 390 exceptions.addMapping(0x05B6, 220); // Point Segol 391 exceptions.addMapping(0x05B7, 220); // Point Patah 392 exceptions.addMapping(0x05B8, 220); // Point Qamats 393 exceptions.addMapping(0x05BB, 220); // Point Qubuts 394 exceptions.addMapping(0x05BD, 220); // Point Meteg 395 exceptions.addMapping(0x059A, 222); // Accent Yetiv 396 exceptions.addMapping(0x05AD, 222); // Accent Dehi 397 exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum) 398 exceptions.addMapping(0x0593, 230); // Accent Shalshelet 399 exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan 400 exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol 401 exceptions.addMapping(0x0597, 230); // Accent Revia 402 exceptions.addMapping(0x0598, 230); // Accent Zarqa 403 exceptions.addMapping(0x059F, 230); // Accent Qarney Para 404 exceptions.addMapping(0x059E, 230); // Accent Gershayim 405 exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam 406 exceptions.addMapping(0x059C, 230); // Accent Geresh 407 exceptions.addMapping(0x0592, 230); // Accent Segolta 408 exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola 409 exceptions.addMapping(0x05AC, 230); // Accent Iluy 410 exceptions.addMapping(0x05A8, 230); // Accent Qadma 411 exceptions.addMapping(0x05AB, 230); // Accent Ole 412 exceptions.addMapping(0x05AF, 230); // Mark Masora Circle 413 exceptions.addMapping(0x05A1, 230); // Accent Pazer 414 //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot 415 exceptions.addMapping(0x05AE, 232); // Accent Zinor 416 exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana 417 exceptions.addMapping(0x0599, 232); // Accent Pashta 418 419 exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW 420 exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE 421 422 exceptions.addMapping(0x0651, 28); // ARABIC SHADDA 423 424 exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF 425 exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF 426 427 exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN 428 exceptions.addMapping(0x0650, 30); // ARABIC KASRA 429 430 exceptions.addMapping(0x0652, 31); // ARABIC SUKUN 431 exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH 432 433 exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN 434 exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN 435 exceptions.addMapping(0x064E, 31); // ARABIC FATHA 436 exceptions.addMapping(0x064F, 31); // ARABIC DAMMA 437 exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA 438 exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA 439 440 exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE 441 442 exceptions.snapshot(); 443 444 for (int i = 0; i < markCount; i += 1) { 445 int mark = markSet.charAt(i); 446 int markClass = exceptions.getGlyphClassID(mark); 447 448 if (markClass == 0) { 449 markClass = UCharacter.getCombiningClass(mark); 450 } 451 452 combiningClasses.addMapping(mark, markClass); 453 } 454 455 combiningClasses.snapshot(); 456 return combiningClasses; 457 } 458 buildDecompTables(String fileName)459 public static void buildDecompTables(String fileName) 460 { 461 // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored. 462 //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]"); 463 UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]"); 464 CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet); 465 ClassTable classTable = new ClassTable(); 466 467 LookupList lookupList = new LookupList(); 468 FeatureList featureList = new FeatureList(); 469 ScriptList scriptList = new ScriptList(); 470 471 // build common, inherited lookups... 472 // int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON); 473 // int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED); 474 475 for (int script = 0; script < UScript.CODE_LIMIT; script += 1) { 476 477 // This is a bit lame, but it's the only way I can think of 478 // to make this work w/o knowing the values of COMMON and INHERITED... 479 if (script == UScript.COMMON || script == UScript.INHERITED || 480 data.getMaxEquivalents(script) == 0) { 481 continue; 482 } 483 484 int[] lookups = buildLookups(data, lookupList, script); 485 486 Feature ccmp = new Feature("ccmp"); 487 488 addLookups(ccmp, lookups); 489 // addLookups(ccmp, commonLookups); 490 // addLookups(ccmp, inheritedLookups); 491 492 featureList.addFeature(ccmp); 493 494 String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script)); 495 496 scriptList.addFeature(scriptTag, "(default)", ccmp); 497 498 if (script == UScript.ARABIC) { 499 buildArabicTables(scriptList, featureList, lookupList, classTable); 500 } 501 } 502 503 featureList.finalizeFeatureList(); 504 505 ClassTable markClassTable = buildCombiningClassTable(); 506 507 GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList); 508 GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable); 509 String[] includeFiles = {"LETypes.h", "CanonShaping.h"}; 510 511 LigatureModuleWriter writer = new LigatureModuleWriter(); 512 513 writer.openFile(fileName); 514 writer.writeHeader(null, includeFiles); 515 writer.writeTable(gsubWriter); 516 writer.writeTable(gdefWriter); 517 writer.writeTrailer(); 518 writer.closeFile(); 519 } 520 main(String[] args)521 public static void main(String[] args) 522 { 523 buildDecompTables(args[0]); 524 } 525 } 526