• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  * Copyright (C) 2004-2005, International Business Machines Corporation and        *
4  * others. All Rights Reserved.                                               *
5  ******************************************************************************
6  *
7  * in shell:  (such as .cldrrc)
8  *   export CWDEBUG="-DCLDR_DTD_CACHE=/tmp/cldrdtd/"
9  *   export CWDEFS="-DCLDR_DTD_CACHE_DEBUG=y ${CWDEBUG}"
10  *
11  *
12  * in code:
13  *   docBuilder.setEntityResolver(new CachingEntityResolver());
14  *
15  */
16 package org.unicode.cldr.util;
17 
18 import java.util.Iterator;
19 import java.util.Map;
20 import java.util.TreeMap;
21 
22 import com.ibm.icu.text.Normalizer;
23 import com.ibm.icu.text.UTF16;
24 import com.ibm.icu.text.UnicodeSet;
25 
26 public class ExtractCollationRules {
27     Map<String, String> type_rules = new TreeMap<String, String>();
28     XPathParts parts = new XPathParts();
29     StringBuffer rules = new StringBuffer();
30 
set(CLDRFile file)31     public ExtractCollationRules set(CLDRFile file) {
32         type_rules.clear();
33         String lastType = "";
34         rules.setLength(0);
35 
36         String context = null;
37 
38         for (Iterator it = file.iterator("//ldml/collations", file.getComparator()); it.hasNext();) {
39 
40             // System.out.print(rules.substring(lastLen, rules.length()));
41             // lastLen = rules.length();
42 
43             String path = (String) it.next();
44             String value = file.getStringValue(path);
45             parts.set(path);
46             String type = parts.findAttributeValue("collation", "type");
47             if (!type.equals(lastType)) {
48                 lastType = type;
49                 type_rules.put(lastType, rules.toString());
50                 rules.setLength(0);
51             }
52             String mainType = parts.getElement(3);
53             // base?, settings?, suppress_contractions?, optimize?
54             // x: context?, ( p | pc | s | sc | t | tc | i | ic )*, extend?
55             if (mainType.equals("settings")) {
56                 writeSettings(parts.getAttributes(3), rules);
57                 continue;
58             } else if (mainType.equals("rules")) {
59                 String ruleType = parts.getElement(4);
60                 char c = ruleType.charAt(0);
61                 if (c == 'x') {
62                     ruleType = parts.getElement(5);
63                     c = ruleType.charAt(0);
64                 }
65                 boolean isMultiple = ruleType.length() > 1 && ruleType.charAt(1) == 'c';
66                 String lastContext = context;
67                 context = null;
68                 switch (c) {
69                 case 'r':
70                     appendOrdering("&", null, value, false, true);
71                     break;
72                 case 'p':
73                     appendOrdering("<", lastContext, value, isMultiple, true);
74                     break;
75                 case 's':
76                     appendOrdering("<<", lastContext, value, isMultiple, true);
77                     break;
78                 case 't':
79                     appendOrdering("<<<", lastContext, value, isMultiple, false);
80                     break;
81                 case 'i':
82                     appendOrdering("=", lastContext, value, isMultiple, false);
83                     break;
84                 case 'c':
85                     context = value;
86                     break;
87                 case 'e':
88                     appendOrdering("/", null, value, false, false);
89                     break;
90                 default:
91                     System.out.println("Couldn't handle: " + path + "\t" + value);
92                 }
93                 continue;
94             } else {
95 
96             }
97             System.out.println("Couldn't handle: " + path + "\t" + value);
98         }
99         type_rules.put(lastType, rules.toString());
100         return this;
101     }
102 
appendOrdering(String relation, String context, String valueAfter, boolean isMultiple, boolean lineBreakBefore)103     private void appendOrdering(String relation, String context, String valueAfter, boolean isMultiple,
104         boolean lineBreakBefore) {
105         if (isMultiple) {
106             int cp;
107             for (int i = 0; i < valueAfter.length(); i += UTF16.getCharCount(cp)) {
108                 cp = UTF16.charAt(valueAfter, i);
109                 if (lineBreakBefore)
110                     rules.append(CldrUtility.LINE_SEPARATOR);
111                 else
112                     rules.append(' ');
113                 rules.append(relation);
114                 if (context != null) rules.append(' ').append(quote(context));
115                 rules.append(' ').append(quote(UTF16.valueOf(cp)));
116             }
117         } else {
118             if (lineBreakBefore)
119                 rules.append(CldrUtility.LINE_SEPARATOR);
120             else
121                 rules.append(' ');
122             rules.append(relation);
123             if (context != null) rules.append(' ').append(quote(context));
124             rules.append(' ').append(quote(valueAfter));
125         }
126     }
127 
writeSettings(Map<String, String> attributes, StringBuffer results)128     private void writeSettings(Map<String, String> attributes, StringBuffer results) {
129         for (Iterator<String> it = attributes.keySet().iterator(); it.hasNext();) {
130             String attribute = it.next();
131             String value = attributes.get(attribute);
132             // TODO fix different cases
133             results.append("[" + attribute + " " + value + "]" + CldrUtility.LINE_SEPARATOR);
134             // if (attribute.equals("normalization")) {
135             //
136             // }
137         }
138     }
139 
iterator()140     public Iterator<String> iterator() {
141         return type_rules.keySet().iterator();
142     }
143 
getRules(Object key)144     public String getRules(Object key) {
145         return (String) type_rules.get(key);
146     }
147 
148     static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
149 
150     static UnicodeSet needsQuoting = null;
151     static UnicodeSet needsUnicodeForm = null;
152 
quote(String s)153     static final String quote(String s) {
154         if (needsQuoting == null) {
155             /*
156              * c >= 'a' && c <= 'z'
157              * || c >= 'A' && c <= 'Z'
158              * || c >= '0' && c <= '9'
159              * || (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c))
160              */
161             needsQuoting = new UnicodeSet(
162                 "[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]"); //
163             // "[[:ascii:]-[a-zA-Z0-9]-[:c:]-[:z:]]"); // [:whitespace:][:c:][:z:]
164             // for (int i = 0; i <= 0x10FFFF; ++i) {
165             // if (UCharacterProperty.isRuleWhiteSpace(i)) needsQuoting.add(i);
166             // }
167             // needsQuoting.remove();
168             needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
169         }
170         s = Normalizer.compose(s, false);
171         quoteOperandBuffer.setLength(0);
172         boolean noQuotes = true;
173         boolean inQuote = false;
174         int cp;
175         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
176             cp = UTF16.charAt(s, i);
177             if (!needsQuoting.contains(cp)) {
178                 if (inQuote) {
179                     quoteOperandBuffer.append('\'');
180                     inQuote = false;
181                 }
182                 quoteOperandBuffer.append(UTF16.valueOf(cp));
183             } else {
184                 noQuotes = false;
185                 if (cp == '\'') {
186                     quoteOperandBuffer.append("''");
187                 } else {
188                     if (!inQuote) {
189                         quoteOperandBuffer.append('\'');
190                         inQuote = true;
191                     }
192                     if (!needsUnicodeForm.contains(cp))
193                         quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028
194                     else if (cp > 0xFFFF) {
195                         quoteOperandBuffer.append("\\U").append(hex(cp, 8));
196                     } else if (cp <= 0x20 || cp > 0x7E) {
197                         quoteOperandBuffer.append("\\u").append(hex(cp, 4));
198                     } else {
199                         quoteOperandBuffer.append(UTF16.valueOf(cp));
200                     }
201                 }
202             }
203             /*
204              * switch (c) {
205              * case '<': case '>': case '#': case '=': case '&': case '/':
206              * quoteOperandBuffer.append('\'').append(c).append('\'');
207              * break;
208              * case '\'':
209              * quoteOperandBuffer.append("''");
210              * break;
211              * default:
212              * if (0 <= c && c < 0x20 || 0x7F <= c && c < 0xA0) {
213              * quoteOperandBuffer.append("\\u").append(Utility.hex(c));
214              * break;
215              * }
216              * quoteOperandBuffer.append(c);
217              * break;
218              * }
219              */
220         }
221         if (inQuote) {
222             quoteOperandBuffer.append('\'');
223         }
224         if (noQuotes) return s; // faster
225         return quoteOperandBuffer.toString();
226     }
227 
hex(long i, int places)228     static public String hex(long i, int places) {
229         if (i == Long.MIN_VALUE) return "-8000000000000000";
230         boolean negative = i < 0;
231         if (negative) {
232             i = -i;
233         }
234         String result = Long.toString(i, 16).toUpperCase();
235         if (result.length() < places) {
236             result = "0000000000000000".substring(result.length(), places) + result;
237         }
238         if (negative) {
239             return '-' + result;
240         }
241         return result;
242     }
243 
244 }
245