• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu;
4 
5 import static com.google.common.base.Preconditions.checkArgument;
6 import static com.google.common.base.Preconditions.checkNotNull;
7 import static java.nio.charset.StandardCharsets.UTF_8;
8 import static java.nio.file.StandardOpenOption.CREATE;
9 import static java.nio.file.StandardOpenOption.CREATE_NEW;
10 import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
11 import static java.util.stream.Collectors.joining;
12 
13 import java.io.IOException;
14 import java.io.PrintWriter;
15 import java.io.Writer;
16 import java.nio.file.Files;
17 import java.nio.file.OpenOption;
18 import java.nio.file.Path;
19 import java.util.List;
20 import java.util.regex.Matcher;
21 import java.util.regex.Pattern;
22 
23 import com.google.common.collect.Iterables;
24 
25 /**
26  * Writes an IcuData object to a text file. A lot of this class was copied directly from the
27  * original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic
28  * behaviours. The behaviour of this class is currently tuned to produce perfect parity with
29  * the original conversion tools, but once migration of the tools is complete, it should
30  * probably be revisited and tidied up.
31  */
32 // TODO: Link to a definitive specification for the ICU data files and remove the hacks!
33 final class IcuTextWriter {
34     private static final String INDENT = "    ";
35     // List of characters to escape in UnicodeSets
36     // ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '=').
37     private static final Pattern UNICODESET_ESCAPE =
38         Pattern.compile("\\\\[\\\\\\[\\]{}\\-&:^=]");
39     // Only escape \ and " from other strings.
40     private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
41     private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");
42 
43     private static final OpenOption[] ONLY_NEW_FILES = { CREATE_NEW };
44     private static final OpenOption[] OVERWRITE_FILES = { CREATE, TRUNCATE_EXISTING };
45 
46     /** Write a file in ICU data format with the specified header. */
writeToFile( IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite)47     static void writeToFile(
48         IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite) {
49 
50         try {
51             Files.createDirectories(outDir);
52             Path file = outDir.resolve(icuData.getName() + ".txt");
53             OpenOption[] fileOptions = allowOverwrite ? OVERWRITE_FILES : ONLY_NEW_FILES;
54             try (Writer w = Files.newBufferedWriter(file, UTF_8, fileOptions);
55                 PrintWriter out = new PrintWriter(w)) {
56                 new IcuTextWriter(icuData).writeTo(out, header);
57             }
58         } catch (IOException e) {
59             throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e);
60         }
61     }
62 
63     private final IcuData icuData;
64     private int depth = 0;
65     private boolean valueWasInline = false;
66 
IcuTextWriter(IcuData icuData)67     IcuTextWriter(IcuData icuData) {
68         this.icuData = checkNotNull(icuData);
69     }
70 
71     // TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197).
writeTo(PrintWriter out, List<String> header)72     private void writeTo(PrintWriter out, List<String> header) {
73         out.write('\uFEFF');
74         writeHeaderAndComments(out, header, icuData.getFileComment());
75 
76         // Write the ICU data to file. This takes the form:
77         // ----
78         // <name>{
79         //     foo{
80         //         bar{baz}
81         //     }
82         // }
83         // ----
84         // So it's like every RbPath has an implicit prefix of the IcuData name.
85         String root = icuData.getName();
86         if (!icuData.hasFallback()) {
87             root += ":table(nofallback)";
88         }
89         // TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line).
90         out.print(root);
91         out.print("{");
92         depth++;
93 
94         RbPath lastPath = RbPath.of();
95         for (RbPath path : icuData.getPaths()) {
96             // Close any blocks up to the common path length. Since paths are all distinct, the
97             // common length should always be shorter than either path. We add 1 since we must also
98             // account for the implicit root segment.
99             int commonDepth = RbPath.getCommonPrefixLength(lastPath, path) + 1;
100             // Before closing, the "cursor" is at the end of the last value written.
101             closeLastPath(commonDepth, out);
102             // After opening the value will be ready for the next value to be written.
103             openNextPath(path, out);
104             valueWasInline = appendValues(icuData.getName(), path, icuData.get(path), out);
105             lastPath = path;
106         }
107         closeLastPath(0, out);
108         out.println();
109         out.close();
110     }
111 
112     // Before: Cursor is at the end of the previous line.
113     // After: Cursor is positioned immediately after the last closed '}'
closeLastPath(int minDepth, PrintWriter out)114     private void closeLastPath(int minDepth, PrintWriter out) {
115         if (valueWasInline) {
116             depth--;
117             out.print('}');
118             valueWasInline = false;
119         }
120         while (depth > minDepth) {
121             close(out);
122         }
123     }
124 
125     // Before: Cursor is at the end of the previous line.
126     // After: Cursor is positioned immediately after the newly opened '{'
openNextPath(RbPath path, PrintWriter out)127     private void openNextPath(RbPath path, PrintWriter out) {
128         while (depth <= path.length()) {
129             // The -1 is to adjust for the implicit root element which means indentation (depth)
130             // no longer matches the index of the segment we are writing.
131             open(path.getSegment(depth - 1), out);
132         }
133     }
134 
open(String label, PrintWriter out)135     private void open(String label, PrintWriter out) {
136         newLineAndIndent(out, FormatOptions.PATH_FORMAT);
137         depth++;
138         // This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
139         // These take the form of "<any-string>" and are used to ensure that path order can be
140         // well defined even for anonymous lists of items.
141         if (!label.startsWith("<") && !label.endsWith(">")) {
142             out.print(label);
143         }
144         out.print('{');
145     }
146 
close(PrintWriter out)147     private void close(PrintWriter out) {
148         depth--;
149         newLineAndIndent(out, FormatOptions.PATH_FORMAT);
150         out.print('}');
151     }
152 
newLineAndIndent(PrintWriter out, FormatOptions format)153     private void newLineAndIndent(PrintWriter out, FormatOptions format) {
154         out.println();
155         if (format.shouldIndent) {
156             for (int i = 0; i < depth; i++) {
157                 out.print(INDENT);
158             }
159         }
160     }
161 
162     // Currently the "header" uses '//' line comments but the comments are in a block.
163     // TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
writeHeaderAndComments( PrintWriter out, List<String> header, List<String> comments)164     private static void writeHeaderAndComments(
165         PrintWriter out, List<String> header, List<String> comments) {
166 
167         header.forEach(s -> out.println("// " + s));
168         if (!comments.isEmpty()) {
169             // TODO: Don't use /* */ block quotes, just use inline // quotes.
170             out.println(
171                 comments.stream().collect(joining("\n * ", "/**\n * ", "\n */")));
172         }
173     }
174 
175     private static final class FormatOptions {
176         // Only the indent flag is used
177         final static FormatOptions PATH_FORMAT = new FormatOptions(true, true, true);
178 
forPath(RbPath rbPath)179         static FormatOptions forPath(RbPath rbPath) {
180             return new FormatOptions(
181                     !rbPath.isIntPath() && !rbPath.isBinPath(),
182                     !rbPath.endsWith(RB_SEQUENCE) && !rbPath.isBinPath(),
183                     !rbPath.isBinPath());
184         }
185 
186         final boolean shouldQuote;
187         final boolean shouldUseComma;
188         final boolean shouldIndent;
189 
FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent)190         private FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent) {
191             this.shouldQuote = shouldQuote;
192             this.shouldUseComma = shouldUseComma;
193             this.shouldIndent = shouldIndent;
194         }
195     }
196 
197     /** Inserts padding and values between braces. */
198     // TODO: Get rid of the need for icuDataName by adding type information to RbPath.
appendValues( String icuDataName, RbPath rbPath, List<RbValue> values, PrintWriter out)199     private boolean appendValues(
200         String icuDataName, RbPath rbPath, List<RbValue> values, PrintWriter out) {
201 
202         RbValue onlyValue;
203         boolean wasSingular = false;
204         FormatOptions format = FormatOptions.forPath(rbPath);
205         if (values.size() == 1 && !mustBeArray(true, icuDataName, rbPath)) {
206             onlyValue = values.get(0);
207             if (onlyValue.isSingleton() && !mustBeArray(false, icuDataName, rbPath)) {
208                 // Value has a single element and is not being forced to be an array.
209                 String onlyElement = Iterables.getOnlyElement(onlyValue.getElements());
210                 if (format.shouldQuote) {
211                     onlyElement = quoteInside(onlyElement);
212                 }
213                 // The numbers below are simply tuned to match the line wrapping in the original
214                 // CLDR code. The behaviour it produces is sometimes strange (wrapping a line just
215                 // for a single character) and could definitely be improved.
216                 // TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping.
217                 int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
218                 if (onlyElement.length() <= maxWidth) {
219                     // Single element for path: don't add newlines.
220                     printValue(out, onlyElement, format);
221                     wasSingular = true;
222                 } else {
223                     // Element too long to fit in one line, so wrap.
224                     int end;
225                     for (int i = 0; i < onlyElement.length(); i = end) {
226                         end = goodBreak(onlyElement, i + maxWidth);
227                         String part = onlyElement.substring(i, end);
228                         newLineAndIndent(out, format);
229                         printValue(out, part, format);
230                     }
231                 }
232             } else {
233                 // Only one array for the rbPath, so don't add an extra set of braces.
234                 printElements(out, onlyValue, format);
235             }
236         } else {
237             for (RbValue value : values) {
238                 if (value.isSingleton()) {
239                     // Single-value array: print normally.
240                     printElements(out, value, format);
241                 } else {
242                     // Enclose this array in braces to separate it from other values.
243                     open("", out);
244                     printElements(out, value, format);
245                     close(out);
246                 }
247             }
248         }
249         return wasSingular;
250     }
251 
252     private static final RbPath RB_SEQUENCE = RbPath.of("Sequence");
253     private static final RbPath RB_RULES = RbPath.of("rules");
254     private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript");
255     private static final RbPath RB_ERAS = RbPath.of("eras");
256     private static final RbPath RB_NAMED = RbPath.of("named");
257     private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData");
258     private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo");
259 
260     /**
261      * Wrapper for a hack to determine if the given rb path should always present its values as an
262      * array.
263      */
264     // TODO: Verify this is still needed, and either make it less hacky, or delete it.
mustBeArray(boolean topValues, String name, RbPath rbPath)265     private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) {
266         if (topValues) {
267             // matches "rules/setNN" (hence the mucking about with raw segments).
268             return name.equals("pluralRanges")
269                 && rbPath.startsWith(RB_RULES)
270                 && rbPath.getSegment(1).startsWith("set");
271         }
272         return rbPath.equals(RB_LOCALE_SCRIPT)
273             || (rbPath.contains(RB_ERAS)
274                 && !rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")
275                 && !rbPath.endsWith(RB_NAMED))
276             || rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA)
277             || rbPath.startsWith(RB_METAZONE_INFO);
278     }
279 
printElements(PrintWriter out, RbValue rbValue, FormatOptions format)280     private void printElements(PrintWriter out, RbValue rbValue, FormatOptions format) {
281         // TODO: If "shouldUseComma" is made obsolete, just use the "else" block always.
282         if (rbValue.getElementsPerLine() == 1) {
283             for (String v : rbValue.getElements()) {
284                 newLineAndIndent(out, format);
285                 printValue(out, quoteInside(v), format);
286                 if (format.shouldUseComma) {
287                     out.print(",");
288                 }
289             }
290         } else {
291             checkArgument(format.shouldUseComma, "cannot group non-sequence values");
292             Iterable<List<String>> partitions =
293                     Iterables.partition(rbValue.getElements(), rbValue.getElementsPerLine());
294             for (List<String> tuple : partitions) {
295                 newLineAndIndent(out, format);
296                 for (String v : tuple) {
297                     printValue(out, quoteInside(v), format);
298                     out.print(",");
299                 }
300             }
301         }
302     }
303 
printValue(PrintWriter out, String value, FormatOptions format)304     private static void printValue(PrintWriter out, String value, FormatOptions format) {
305         if (format.shouldQuote) {
306             out.append('"').append(value).append('"');
307         } else {
308             out.append(value);
309         }
310     }
311 
312     // Can a string be broken here? If not, backup until we can.
313     // TODO: Either don't bother line wrapping or look at making this use a line-break iterator.
goodBreak(String quoted, int end)314     private static int goodBreak(String quoted, int end) {
315         if (end > quoted.length()) {
316             return quoted.length();
317         }
318         // Don't break escaped Unicode characters.
319         // Need to handle both e.g. \u4E00 and \U00020000
320         for (int i = end - 1; i > end - 10;) {
321             char current = quoted.charAt(i--);
322             if (!Character.toString(current).matches("[0-9A-Fa-f]")) {
323                 if ((current == 'u' || current == 'U') && i > end - 10
324                     && quoted.charAt(i) == '\\') {
325                     return i;
326                 }
327                 break;
328             }
329         }
330         while (end > 0) {
331             char ch = quoted.charAt(end - 1);
332             if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
333                 break;
334             }
335             --end;
336         }
337         return end;
338     }
339 
340     // Fix characters inside strings.
quoteInside(String item)341     private static String quoteInside(String item) {
342         // Unicode-escape all quotes.
343         item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
344         // Double up on backslashes, ignoring Unicode-escaped characters.
345         Pattern pattern =
346             item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
347         Matcher matcher = pattern.matcher(item);
348 
349         if (!matcher.find()) {
350             return item;
351         }
352         StringBuilder buffer = new StringBuilder();
353         int start = 0;
354         do {
355             buffer.append(item, start, matcher.start());
356             int punctuationChar = item.codePointAt(matcher.end() - 1);
357             buffer.append("\\");
358             if (punctuationChar == '\\') {
359                 buffer.append('\\');
360             }
361             buffer.append(matcher.group());
362             start = matcher.end();
363         } while (matcher.find());
364         buffer.append(item.substring(start));
365         return buffer.toString();
366     }
367 }
368