/*
 *******************************************************************************
 * Copyright (C) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package org.unicode.cldr.util;

import java.io.BufferedReader;
import java.io.IOException;

import org.unicode.cldr.draft.FileUtilities;

import com.ibm.icu.text.Transliterator;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * Static helpers for working with ICU {@link Transliterator}s: registering
 * rule-based transliterators loaded from text files, and ready-made
 * transliterators that escape and unescape XML/HTML character entities.
 */
public class TransliteratorUtilities {
    /**
     * When true, {@link #registerTransliteratorFromFile(String, String)} prints
     * the registered ids to {@code System.out}.
     */
    public static boolean DEBUG = false;

    /**
     * Loads transliterator rules from a file and registers both the forward and
     * the reverse transliterator built from them, replacing any transliterators
     * previously registered under the same ids.
     *
     * <p>The rules are read from the file named {@code id} with every {@code '-'}
     * replaced by {@code '_'} and {@code ".txt"} appended. If {@code id} has the
     * form {@code Source-Target}, the forward transliterator is registered as
     * {@code Source-Target} and the reverse one as {@code Target-Source}. If
     * {@code id} contains no {@code '-'}, the forward transliterator is registered
     * as {@code Any-<id>} and the reverse one as {@code <id>-Any}.
     *
     * @param dir directory passed to {@link #getFileContents(String, String)}
     * @param id  transliterator id, also used to derive the rule file name
     * @throws ICUUncheckedIOException if the rule file cannot be read
     */
    public static void registerTransliteratorFromFile(String dir, String id) {
        try {
            // The file name is derived from the id as passed in, before any "Any-" prefix is added.
            String filename = id.replace('-', '_') + ".txt";
            String rules = getFileContents(dir, filename);
            Transliterator t;
            int pos = id.indexOf('-');
            String rid;
            if (pos < 0) {
                rid = id + "-Any";
                id = "Any-" + id;
            } else {
                // Swap source and target to obtain the id of the reverse direction.
                rid = id.substring(pos + 1) + "-" + id.substring(0, pos);
            }
            t = Transliterator.createFromRules(id, rules, Transliterator.FORWARD);
            Transliterator.unregister(id);
            Transliterator.registerInstance(t);

            /*String test = "\u049A\u0430\u0437\u0430\u049B";
            System.out.println(t.transliterate(test));
            t = Transliterator.getInstance(id);
            System.out.println(t.transliterate(test));
            */

            t = Transliterator.createFromRules(rid, rules, Transliterator.REVERSE);
            Transliterator.unregister(rid);
            Transliterator.registerInstance(t);
            if (DEBUG) System.out.println("Registered new Transliterator: " + id + ", " + rid);
        } catch (IOException e) {
            //#if defined(FOUNDATION10) || defined(J2SE13)
            //##        throw (IllegalArgumentException) new IllegalArgumentException("Can't open " + dir + ", " + id+" "+ e.getMessage());
            //#else
            throw new ICUUncheckedIOException("Can't open " + dir + ", " + id, e);
            //#endif
        }
    }

    /**
     * Reads a whole text file into a string.
     *
     * <p>The file is opened with {@code FileUtilities.openUTF8Reader}. Every line
     * is terminated with {@code "\r\n"} in the result regardless of the line
     * terminator used in the file, and a leading byte order mark (U+FEFF) is
     * removed from any line that starts with one.
     *
     * @param dir      directory passed to {@code FileUtilities.openUTF8Reader}
     * @param filename name of the file within {@code dir}
     * @return the file contents, one {@code "\r\n"}-terminated line per input line
     * @throws IOException if the file cannot be opened, read, or closed
     */
    public static String getFileContents(String dir, String filename) throws IOException {
        //#if defined(FOUNDATION10) || defined(J2SE13)
        //##        BufferedReader br = TestUtil.openUTF8Reader(dir, filename);
        //#else
        BufferedReader br = FileUtilities.openUTF8Reader(dir, filename);
        //#endif
        try {
            StringBuilder buffer = new StringBuilder();
            while (true) {
                String line = br.readLine();
                if (line == null) break;
                if (line.length() > 0 && line.charAt(0) == '\uFEFF') line = line.substring(1);
                buffer.append(line).append("\r\n");
            }
            return buffer.toString();
        } finally {
            // Close the reader even when reading fails, so the file handle is not leaked.
            br.close();
        }
    }

    /**
     * Rules shared by all entity transliterators. Forward: escapes {@code <} and
     * {@code &}. Reverse: decodes the five named entities (matched
     * case-insensitively) and, via the parenthesized {@code ::} rules, numeric
     * character references.
     */
    private static final String BASE_RULES = ":: (hex-any/xml);" +
        ":: (hex-any/xml10);" +
        "'<' > '&lt;' ;" +
        "'<' < '&'[lL][Tt]';' ;" +
        "'&' > '&amp;' ;" +
        "'&' < '&'[aA][mM][pP]';' ;" +
        "'>' < '&'[gG][tT]';' ;" +
        "'\"' < '&'[qQ][uU][oO][tT]';' ; " +
        "'' < '&'[aA][pP][oO][sS]';' ; ";

    /** Forward-only rule that escapes {@code >}. */
    private static final String CONTENT_RULES = "'>' > '&gt;' ;";

    /** HTML escaping: the base and content rules plus a forward rule for the double quote. */
    private static final String HTML_RULES = BASE_RULES + CONTENT_RULES +
        "'\"' > '&quot;' ; ";

    /**
     * HTML escaping followed by a {@code hex/unicode} pass restricted to the
     * characters in the listed control, separator, whitespace and
     * default-ignorable sets.
     */
    private static final String HTML_RULES_CONTROLS = HTML_RULES +
        ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ";

    /**
     * HTML escaping followed by an {@code any-hex/xml} pass restricted to
     * characters in {@code [:C:]} or outside ASCII.
     */
    private static final String HTML_RULES_ASCII = HTML_RULES +
        ":: [[:C:][:^ASCII:]] any-hex/xml ; ";

    /**
     * XML escaping: the HTML rules plus a forward rule for the apostrophe
     * (written as two single quotes in rule syntax).
     */
    private static final String XML_RULES = HTML_RULES +
        "'' > '&apos;' ; ";

    /*
     * The ampersand character (&) and the left angle bracket (<) MUST NOT appear
     * in their literal form, except when used as markup delimiters, or within a
     * comment, a processing instruction, or a CDATA section. If they are needed
     * elsewhere, they MUST be escaped using either numeric character references or
     * the strings "&amp;" and "&lt;" respectively. The right angle bracket (>) MAY
     * be represented using the string "&gt;", and MUST, for compatibility, be
     * escaped using either "&gt;" or a character reference when it appears in the string
     * "]]>" in content, when that string is not marking the end of a CDATA section.
     *
     * In the content of elements, character data is any string of characters which does
     * not contain the start-delimiter of any markup and does not include the
     * CDATA-section-close delimiter, "]]>". In a CDATA section, character data is
     * any string of characters not including the CDATA-section-close delimiter,
     * "]]>".
     *
     * To allow attribute values to contain both single and double quotes, the
     * apostrophe or single-quote character (') MAY be represented as "&apos;", and
     * the double-quote character (") as "&quot;".
     */

    /** Escapes text for XML, built from {@code XML_RULES} in the forward direction. */
    public static final Transliterator toXML = Transliterator.createFromRules(
        "any-xml", XML_RULES, Transliterator.FORWARD);

    /** Unescapes XML text, built from {@code XML_RULES} in the reverse direction. */
    public static final Transliterator fromXML = Transliterator.createFromRules(
        "xml-any", XML_RULES, Transliterator.REVERSE);

    /** Escapes text for HTML, built from {@code HTML_RULES} in the forward direction. */
    public static final Transliterator toHTML = Transliterator.createFromRules(
        "any-html", HTML_RULES, Transliterator.FORWARD);

    // NOTE(review): toHTMLControl and toHTMLAscii reuse the id "any-html" of toHTML.
    // None of them is registered here, so the ids do not collide in this file, but
    // getID() cannot tell the three apart - confirm whether that is intended.

    /** Like {@link #toHTML}, built from {@code HTML_RULES_CONTROLS}. */
    public static final Transliterator toHTMLControl = Transliterator.createFromRules(
        "any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);

    /** Like {@link #toHTML}, built from {@code HTML_RULES_ASCII}. */
    public static final Transliterator toHTMLAscii = Transliterator.createFromRules(
        "any-html", HTML_RULES_ASCII, Transliterator.FORWARD);

    /** Unescapes HTML text, built from {@code HTML_RULES} in the reverse direction. */
    public static final Transliterator fromHTML = Transliterator.createFromRules(
        "html-any", HTML_RULES, Transliterator.REVERSE);
}