1 /* 2 ****************************************************************************** 3 * Copyright (C) 2004-2005, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ****************************************************************************** 6 * 7 * 8 */ 9 package org.unicode.cldr.util; 10 11 import java.util.Iterator; 12 import java.util.Map; 13 import java.util.TreeMap; 14 15 import com.ibm.icu.text.Normalizer; 16 import com.ibm.icu.text.UTF16; 17 import com.ibm.icu.text.UnicodeSet; 18 19 public class ExtractCollationRules { 20 Map<String, String> type_rules = new TreeMap<>(); 21 StringBuffer rules = new StringBuffer(); 22 set(CLDRFile file)23 public ExtractCollationRules set(CLDRFile file) { 24 type_rules.clear(); 25 String lastType = ""; 26 rules.setLength(0); 27 28 String context = null; 29 30 for (Iterator it = file.iterator("//ldml/collations", file.getComparator()); it.hasNext();) { 31 32 // System.out.print(rules.substring(lastLen, rules.length())); 33 // lastLen = rules.length(); 34 35 String path = (String) it.next(); 36 String value = file.getStringValue(path); 37 XPathParts parts = XPathParts.getFrozenInstance(path); 38 String type = parts.findAttributeValue("collation", "type"); 39 if (!type.equals(lastType)) { 40 lastType = type; 41 type_rules.put(lastType, rules.toString()); 42 rules.setLength(0); 43 } 44 String mainType = parts.getElement(3); 45 // base?, settings?, suppress_contractions?, optimize? 46 // x: context?, ( p | pc | s | sc | t | tc | i | ic )*, extend? 47 if (mainType.equals("settings")) { 48 writeSettings(parts.getAttributes(3), rules); 49 continue; 50 } else if (mainType.equals("rules")) { 51 String ruleType = parts.getElement(4); 52 char c = ruleType.charAt(0); 53 if (c == 'x') { 54 ruleType = parts.getElement(5); 55 c = ruleType.charAt(0); 56 } 57 boolean isMultiple = ruleType.length() > 1 && ruleType.charAt(1) == 'c'; 58 String lastContext = context; 59 context = null; 60 switch (c) { 61 case 'r': 62 appendOrdering("&", null, value, false, true); 63 break; 64 case 'p': 65 appendOrdering("<", lastContext, value, isMultiple, true); 66 break; 67 case 's': 68 appendOrdering("<<", lastContext, value, isMultiple, true); 69 break; 70 case 't': 71 appendOrdering("<<<", lastContext, value, isMultiple, false); 72 break; 73 case 'i': 74 appendOrdering("=", lastContext, value, isMultiple, false); 75 break; 76 case 'c': 77 context = value; 78 break; 79 case 'e': 80 appendOrdering("/", null, value, false, false); 81 break; 82 default: 83 System.out.println("Couldn't handle: " + path + "\t" + value); 84 } 85 continue; 86 } else { 87 88 } 89 System.out.println("Couldn't handle: " + path + "\t" + value); 90 } 91 type_rules.put(lastType, rules.toString()); 92 return this; 93 } 94 appendOrdering(String relation, String context, String valueAfter, boolean isMultiple, boolean lineBreakBefore)95 private void appendOrdering(String relation, String context, String valueAfter, boolean isMultiple, 96 boolean lineBreakBefore) { 97 if (isMultiple) { 98 int cp; 99 for (int i = 0; i < valueAfter.length(); i += UTF16.getCharCount(cp)) { 100 cp = UTF16.charAt(valueAfter, i); 101 if (lineBreakBefore) 102 rules.append(CldrUtility.LINE_SEPARATOR); 103 else 104 rules.append(' '); 105 rules.append(relation); 106 if (context != null) rules.append(' ').append(quote(context)); 107 rules.append(' ').append(quote(UTF16.valueOf(cp))); 108 } 109 } else { 110 if (lineBreakBefore) 111 rules.append(CldrUtility.LINE_SEPARATOR); 112 else 113 rules.append(' '); 114 rules.append(relation); 115 if (context != null) rules.append(' ').append(quote(context)); 116 rules.append(' ').append(quote(valueAfter)); 117 } 118 } 119 writeSettings(Map<String, String> attributes, StringBuffer results)120 private void writeSettings(Map<String, String> attributes, StringBuffer results) { 121 for (Iterator<String> it = attributes.keySet().iterator(); it.hasNext();) { 122 String attribute = it.next(); 123 String value = attributes.get(attribute); 124 // TODO fix different cases 125 results.append("[" + attribute + " " + value + "]" + CldrUtility.LINE_SEPARATOR); 126 // if (attribute.equals("normalization")) { 127 // 128 // } 129 } 130 } 131 iterator()132 public Iterator<String> iterator() { 133 return type_rules.keySet().iterator(); 134 } 135 getRules(Object key)136 public String getRules(Object key) { 137 return type_rules.get(key); 138 } 139 140 static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster 141 142 static UnicodeSet needsQuoting = null; 143 static UnicodeSet needsUnicodeForm = null; 144 quote(String s)145 static final String quote(String s) { 146 if (needsQuoting == null) { 147 /* 148 * c >= 'a' && c <= 'z' 149 * || c >= 'A' && c <= 'Z' 150 * || c >= '0' && c <= '9' 151 * || (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c)) 152 */ 153 needsQuoting = new UnicodeSet( 154 "[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]"); // 155 // "[[:ascii:]-[a-zA-Z0-9]-[:c:]-[:z:]]"); // [:whitespace:][:c:][:z:] 156 // for (int i = 0; i <= 0x10FFFF; ++i) { 157 // if (UCharacterProperty.isRuleWhiteSpace(i)) needsQuoting.add(i); 158 // } 159 // needsQuoting.remove(); 160 needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]"); 161 } 162 s = Normalizer.compose(s, false); 163 quoteOperandBuffer.setLength(0); 164 boolean noQuotes = true; 165 boolean inQuote = false; 166 int cp; 167 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 168 cp = UTF16.charAt(s, i); 169 if (!needsQuoting.contains(cp)) { 170 if (inQuote) { 171 quoteOperandBuffer.append('\''); 172 inQuote = false; 173 } 174 quoteOperandBuffer.append(UTF16.valueOf(cp)); 175 } else { 176 noQuotes = false; 177 if (cp == '\'') { 178 quoteOperandBuffer.append("''"); 179 } else { 180 if (!inQuote) { 181 quoteOperandBuffer.append('\''); 182 inQuote = true; 183 } 184 if (!needsUnicodeForm.contains(cp)) 185 quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028 186 else if (cp > 0xFFFF) { 187 quoteOperandBuffer.append("\\U").append(hex(cp, 8)); 188 } else if (cp <= 0x20 || cp > 0x7E) { 189 quoteOperandBuffer.append("\\u").append(hex(cp, 4)); 190 } else { 191 quoteOperandBuffer.append(UTF16.valueOf(cp)); 192 } 193 } 194 } 195 /* 196 * switch (c) { 197 * case '<': case '>': case '#': case '=': case '&': case '/': 198 * quoteOperandBuffer.append('\'').append(c).append('\''); 199 * break; 200 * case '\'': 201 * quoteOperandBuffer.append("''"); 202 * break; 203 * default: 204 * if (0 <= c && c < 0x20 || 0x7F <= c && c < 0xA0) { 205 * quoteOperandBuffer.append("\\u").append(Utility.hex(c)); 206 * break; 207 * } 208 * quoteOperandBuffer.append(c); 209 * break; 210 * } 211 */ 212 } 213 if (inQuote) { 214 quoteOperandBuffer.append('\''); 215 } 216 if (noQuotes) return s; // faster 217 return quoteOperandBuffer.toString(); 218 } 219 hex(long i, int places)220 static public String hex(long i, int places) { 221 if (i == Long.MIN_VALUE) return "-8000000000000000"; 222 boolean negative = i < 0; 223 if (negative) { 224 i = -i; 225 } 226 String result = Long.toString(i, 16).toUpperCase(); 227 if (result.length() < places) { 228 result = "0000000000000000".substring(result.length(), places) + result; 229 } 230 if (negative) { 231 return '-' + result; 232 } 233 return result; 234 } 235 236 } 237