• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.icu;
2 
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.io.StringWriter;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
9 import java.util.List;
10 import java.util.regex.Matcher;
11 import java.util.regex.Pattern;
12 
13 import org.unicode.cldr.draft.FileUtilities;
14 import org.unicode.cldr.util.FileCopier;
15 import org.unicode.cldr.util.PatternCache;
16 
17 import com.ibm.icu.impl.Utility;
18 import com.ibm.icu.util.Calendar;
19 
20 /**
21  * Writes an IcuData object to a text file.
22  *
23  * @author jchye
24  */
25 public class IcuTextWriter {
26     /**
27      * The default tab indent (actually spaces)
28      */
29     private static final String TAB = "    ";
30     // List of characters to escape in UnicodeSets.
31     private static final Pattern UNICODESET_ESCAPE = PatternCache.get("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]");
32     // Only escape \ and " from other strings.
33     private static final Pattern STRING_ESCAPE = PatternCache.get("(?!')\\\\\\\\(?!')");
34     private static final Pattern QUOTE_ESCAPE = PatternCache.get("\\\\?\"");
35 
36     private static String headerText;
37 
38     /**
39      * ICU paths have a simple comparison, alphabetical within a level. We do
40      * have to catch the / so that it is lower than everything.
41      */
42     public static final Comparator<String> PATH_COMPARATOR = new Comparator<String>() {
43         @Override
44         public int compare(String arg0, String arg1) {
45             int min = Math.min(arg0.length(), arg1.length());
46             for (int i = 0; i < min; ++i) {
47                 int ch0 = arg0.charAt(i);
48                 int ch1 = arg1.charAt(i);
49                 int diff = ch0 - ch1;
50                 if (diff == 0) {
51                     continue;
52                 }
53                 if (ch0 == '/') {
54                     return -1;
55                 } else if (ch1 == '/') {
56                     return 1;
57                 }
58                 // make * greater than everything, because of languageMatch
59                 // while it is a pain to have it be unordered, this fix is sufficient to put all the *'s after anything else
60                 if (ch0 == '*') {
61                     return 1;
62                 } else if (ch1 == '*') {
63                     return -1;
64                 }
65                 return diff;
66             }
67             return arg0.length() - arg1.length();
68         }
69     };
70 
getHeader()71     private static String getHeader() {
72         if (headerText != null) {
73             return headerText;
74         }
75         try (StringWriter stringWriter = new StringWriter();) {
76             FileCopier.copy(NewLdml2IcuConverter.class, "ldml2icu_header.txt", stringWriter);
77             headerText = stringWriter.toString();
78             headerText = headerText.replace("%year%", String.valueOf(Calendar.getInstance().get(Calendar.YEAR)));
79             return headerText;
80         } catch (IOException ioe) {
81             throw new IllegalArgumentException(ioe);
82         }
83     }
84 
85     /**
86      * Write a file in ICU format. LDML2ICUConverter currently has some
87      * funny formatting in a few cases; don't try to match everything.
88      *
89      * @param icuData
90      *            the icu data structure to be written
91      * @param dirPath
92      *            the directory to write the file to
93      * @param hasSpecial
94      *            true if a special file was used to create the ICU data
95      */
writeToFile(IcuData icuData, String dirPath)96     public static void writeToFile(IcuData icuData, String dirPath) throws IOException {
97         String name = icuData.getName();
98         PrintWriter out = FileUtilities.openUTF8Writer(dirPath, name + ".txt");
99         out.write('\uFEFF');
100         // Append the header.
101         String header = getHeader().replace("%source%", icuData.getSourceFile());
102         out.print(header);
103         if (icuData.getFileComment() != null) {
104             out.println("/**");
105             out.append(" * ").append(icuData.getFileComment()).println();
106             out.println(" */");
107         }
108 
109         // Write the ICU data to file.
110         out.append(name);
111         if (!icuData.hasFallback()) out.append(":table(nofallback)");
112         List<String> sortedPaths = new ArrayList<String>(icuData.keySet());
113         Collections.sort(sortedPaths, PATH_COMPARATOR);
114         String[] lastLabels = new String[] {};
115         boolean wasSingular = false;
116         for (String path : sortedPaths) {
117             // Write values to file.
118             String[] labels = path.split("/", -1); // Don't discard trailing slashes.
119             int common = getCommon(lastLabels, labels);
120             for (int i = lastLabels.length - 1; i > common; --i) {
121                 if (wasSingular) {
122                     wasSingular = false;
123                 } else {
124                     out.append(Utility.repeat(TAB, i));
125                 }
126                 out.println("}");
127             }
128             for (int i = common + 1; i < labels.length; ++i) {
129                 final String pad = Utility.repeat(TAB, i);
130                 out.append(pad);
131                 String label = labels[i];
132                 if (!label.startsWith("<") && !label.endsWith(">")) {
133                     out.append(label);
134                 }
135                 out.append('{');
136                 if (i != labels.length - 1) {
137                     out.println();
138                 }
139             }
140             List<String[]> values = icuData.get(path);
141             try {
142                 wasSingular = appendValues(name, path, values, labels.length, out);
143             } catch (NullPointerException npe) {
144                 System.err.println("Null value encountered in " + path);
145             }
146             out.flush();
147             lastLabels = labels;
148         }
149         // Add last closing braces.
150         for (int i = lastLabels.length - 1; i > 0; --i) {
151             if (wasSingular) {
152                 wasSingular = false;
153             } else {
154                 out.append(Utility.repeat(TAB, i));
155             }
156             out.println("}");
157         }
158         out.println("}");
159         out.close();
160     }
161 
162     /**
163      * Inserts padding and values between braces.
164      * @param name
165      * @param rbPath
166      * @param values
167      * @param numTabs
168      * @param out
169      * @return
170      */
appendValues(String name, String rbPath, List<String[]> values, int numTabs, PrintWriter out)171     private static boolean appendValues(String name, String rbPath, List<String[]> values, int numTabs,
172         PrintWriter out) {
173         String[] firstArray;
174         boolean wasSingular = false;
175         boolean quote = !IcuData.isIntRbPath(rbPath);
176         boolean isSequence = rbPath.endsWith("/Sequence");
177         if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
178             if ((firstArray = values.get(0)).length == 1 && !mustBeArray(false, name, rbPath)) {
179                 String value = firstArray[0];
180                 if (quote) {
181                     value = quoteInside(value);
182                 }
183                 int maxWidth = 84 - Math.min(4, numTabs) * TAB.length();
184                 if (value.length() <= maxWidth) {
185                     // Single value for path: don't add newlines.
186                     appendValue(value, quote, out);
187                     wasSingular = true;
188                 } else {
189                     // Value too long to fit in one line, so wrap.
190                     final String pad = Utility.repeat(TAB, numTabs);
191                     out.println();
192                     int end;
193                     for (int i = 0; i < value.length(); i = end) {
194                         end = goodBreak(value, i + maxWidth);
195                         String part = value.substring(i, end);
196                         out.append(pad);
197                         appendValue(part, quote, out).println();
198                     }
199                 }
200             } else {
201                 // Only one array for the rbPath, so don't add an extra set of braces.
202                 final String pad = Utility.repeat(TAB, numTabs);
203                 out.println();
204                 appendArray(pad, firstArray, quote, isSequence, out);
205             }
206         } else {
207             final String pad = Utility.repeat(TAB, numTabs);
208             out.println();
209             for (String[] valueArray : values) {
210                 if (valueArray.length == 1) {
211                     // Single-value array: print normally.
212                     appendArray(pad, valueArray, quote, isSequence, out);
213                 } else {
214                     // Enclose this array in braces to separate it from other
215                     // values.
216                     out.append(pad).println("{");
217                     appendArray(pad + TAB, valueArray, quote, isSequence, out);
218                     out.append(pad).println("}");
219                 }
220             }
221         }
222         return wasSingular;
223     }
224 
225     /**
226      * Wrapper for a hack to determine if the given rb path should always
227      * present its values as an array. This hack is required for an ICU data test to pass.
228      *
229      * @param topValues
230      * @param name
231      * @param rbPath
232      * @return
233      */
mustBeArray(boolean topValues, String name, String rbPath)234     private static boolean mustBeArray(boolean topValues, String name, String rbPath) {
235         // TODO(jchye): Add this as an option to the locale file instead of hardcoding.
236         // System.out.println(name + "\t" + rbPath);
237         if (topValues) {
238             return (rbPath.startsWith("/rules/set")
239                 && name.equals("pluralRanges"));
240         }
241         return rbPath.equals("/LocaleScript")
242             || (rbPath.contains("/eras/") && !rbPath.endsWith(":alias") && !rbPath.endsWith("/named"))
243             || rbPath.startsWith("/calendarPreferenceData")
244             || rbPath.startsWith("/metazoneInfo");
245     }
246 
appendArray(String padding, String[] valueArray, boolean quote, boolean isSequence, PrintWriter out)247     private static PrintWriter appendArray(String padding, String[] valueArray,
248         boolean quote, boolean isSequence, PrintWriter out) {
249         for (String value : valueArray) {
250             out.append(padding);
251             appendValue(quoteInside(value), quote, out);
252             if (!isSequence) {
253                 out.print(",");
254             }
255             out.println();
256         }
257         return out;
258     }
259 
appendValue(String value, boolean quote, PrintWriter out)260     private static PrintWriter appendValue(String value, boolean quote, PrintWriter out) {
261         if (quote) {
262             return out.append('"').append(value).append('"');
263         } else {
264             return out.append(value);
265         }
266     }
267 
268     /**
269      * Can a string be broken here? If not, backup until we can.
270      *
271      * @param quoted
272      * @param end
273      * @return
274      */
goodBreak(String quoted, int end)275     private static int goodBreak(String quoted, int end) {
276         if (end > quoted.length()) {
277             return quoted.length();
278         }
279         // Don't break escaped Unicode characters.
280         // Need to handle both e.g. \u4E00 and \U00020000
281         for (int i = end - 1; i > end - 10;) {
282             char current = quoted.charAt(i--);
283             if (!Character.toString(current).matches("[0-9A-Fa-f]")) {
284                 if ((current == 'u' || current == 'U') && i > end - 10 && quoted.charAt(i) == '\\') {
285                     return i;
286                 }
287                 break;
288             }
289         }
290         while (end > 0) {
291             char ch = quoted.charAt(end - 1);
292             if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
293                 break;
294             }
295             --end;
296         }
297         return end;
298     }
299 
300     /**
301      * Fix characters inside strings.
302      *
303      * @param item
304      * @return
305      */
quoteInside(String item)306     private static String quoteInside(String item) {
307         // Unicode-escape all quotes.
308         item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
309         // Double up on backslashes, ignoring Unicode-escaped characters.
310         Pattern pattern = item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
311         Matcher matcher = pattern.matcher(item);
312 
313         if (!matcher.find()) {
314             return item;
315         }
316         StringBuffer buffer = new StringBuffer();
317         int start = 0;
318         do {
319             buffer.append(item.substring(start, matcher.start()));
320             int punctuationChar = item.codePointAt(matcher.end() - 1);
321             buffer.append("\\");
322             if (punctuationChar == '\\') {
323                 buffer.append('\\');
324             }
325             buffer.append(matcher.group());
326             start = matcher.end();
327         } while (matcher.find());
328         buffer.append(item.substring(start));
329         return buffer.toString();
330     }
331 
332     /**
333      * find the initial labels (from a path) that are identical.
334      *
335      * @param item
336      * @return
337      */
getCommon(String[] lastLabels, String[] labels)338     private static int getCommon(String[] lastLabels, String[] labels) {
339         int min = Math.min(lastLabels.length, labels.length);
340         int i;
341         for (i = 0; i < min; ++i) {
342             if (!lastLabels[i].equals(labels[i])) {
343                 return i - 1;
344             }
345         }
346         return i - 1;
347     }
348 }
349