1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkNotNull; 7 import static java.nio.charset.StandardCharsets.UTF_8; 8 import static java.nio.file.StandardOpenOption.CREATE; 9 import static java.nio.file.StandardOpenOption.CREATE_NEW; 10 import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING; 11 import static java.util.stream.Collectors.joining; 12 13 import java.io.IOException; 14 import java.io.PrintWriter; 15 import java.io.Writer; 16 import java.nio.file.Files; 17 import java.nio.file.OpenOption; 18 import java.nio.file.Path; 19 import java.util.List; 20 import java.util.regex.Matcher; 21 import java.util.regex.Pattern; 22 23 import com.google.common.collect.Iterables; 24 25 /** 26 * Writes an IcuData object to a text file. A lot of this class was copied directly from the 27 * original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic 28 * behaviours. The behaviour of this class is currently tuned to produce perfect parity with 29 * the original conversion tools, but once migration of the tools is complete, it should 30 * probably be revisited and tidied up. 31 */ 32 // TODO: Link to a definitive specification for the ICU data files and remove the hacks! 33 final class IcuTextWriter { 34 private static final String INDENT = " "; 35 // List of characters to escape in UnicodeSets 36 // ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '='). 37 private static final Pattern UNICODESET_ESCAPE = 38 Pattern.compile("\\\\[\\\\\\[\\]{}\\-&:^=]"); 39 // Only escape \ and " from other strings. 40 private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')"); 41 private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\""); 42 43 private static final OpenOption[] ONLY_NEW_FILES = { CREATE_NEW }; 44 private static final OpenOption[] OVERWRITE_FILES = { CREATE, TRUNCATE_EXISTING }; 45 46 /** Write a file in ICU data format with the specified header. */ writeToFile( IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite)47 static void writeToFile( 48 IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite) { 49 50 try { 51 Files.createDirectories(outDir); 52 Path file = outDir.resolve(icuData.getName() + ".txt"); 53 OpenOption[] fileOptions = allowOverwrite ? OVERWRITE_FILES : ONLY_NEW_FILES; 54 try (Writer w = Files.newBufferedWriter(file, UTF_8, fileOptions); 55 PrintWriter out = new PrintWriter(w)) { 56 new IcuTextWriter(icuData).writeTo(out, header); 57 } 58 } catch (IOException e) { 59 throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e); 60 } 61 } 62 63 private final IcuData icuData; 64 private int depth = 0; 65 private boolean valueWasInline = false; 66 IcuTextWriter(IcuData icuData)67 IcuTextWriter(IcuData icuData) { 68 this.icuData = checkNotNull(icuData); 69 } 70 71 // TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197). writeTo(PrintWriter out, List<String> header)72 private void writeTo(PrintWriter out, List<String> header) { 73 out.write('\uFEFF'); 74 writeHeaderAndComments(out, header, icuData.getFileComment()); 75 76 // Write the ICU data to file. This takes the form: 77 // ---- 78 // <name>{ 79 // foo{ 80 // bar{baz} 81 // } 82 // } 83 // ---- 84 // So it's like every RbPath has an implicit prefix of the IcuData name. 85 String root = icuData.getName(); 86 if (!icuData.hasFallback()) { 87 root += ":table(nofallback)"; 88 } 89 // TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line). 90 out.print(root); 91 out.print("{"); 92 depth++; 93 94 RbPath lastPath = RbPath.of(); 95 for (RbPath path : icuData.getPaths()) { 96 // Close any blocks up to the common path length. Since paths are all distinct, the 97 // common length should always be shorter than either path. We add 1 since we must also 98 // account for the implicit root segment. 99 int commonDepth = RbPath.getCommonPrefixLength(lastPath, path) + 1; 100 // Before closing, the "cursor" is at the end of the last value written. 101 closeLastPath(commonDepth, out); 102 // After opening the value will be ready for the next value to be written. 103 openNextPath(path, out); 104 valueWasInline = appendValues(icuData.getName(), path, icuData.get(path), out); 105 lastPath = path; 106 } 107 closeLastPath(0, out); 108 out.println(); 109 out.close(); 110 } 111 112 // Before: Cursor is at the end of the previous line. 113 // After: Cursor is positioned immediately after the last closed '}' closeLastPath(int minDepth, PrintWriter out)114 private void closeLastPath(int minDepth, PrintWriter out) { 115 if (valueWasInline) { 116 depth--; 117 out.print('}'); 118 valueWasInline = false; 119 } 120 while (depth > minDepth) { 121 close(out); 122 } 123 } 124 125 // Before: Cursor is at the end of the previous line. 126 // After: Cursor is positioned immediately after the newly opened '{' openNextPath(RbPath path, PrintWriter out)127 private void openNextPath(RbPath path, PrintWriter out) { 128 while (depth <= path.length()) { 129 // The -1 is to adjust for the implicit root element which means indentation (depth) 130 // no longer matches the index of the segment we are writing. 131 open(path.getSegment(depth - 1), out); 132 } 133 } 134 open(String label, PrintWriter out)135 private void open(String label, PrintWriter out) { 136 newLineAndIndent(out, FormatOptions.PATH_FORMAT); 137 depth++; 138 // This handles the "magic" pseudo indexing paths that are added by RegexTransformer. 139 // These take the form of "<any-string>" and are used to ensure that path order can be 140 // well defined even for anonymous lists of items. 141 if (!label.startsWith("<") && !label.endsWith(">")) { 142 out.print(label); 143 } 144 out.print('{'); 145 } 146 close(PrintWriter out)147 private void close(PrintWriter out) { 148 depth--; 149 newLineAndIndent(out, FormatOptions.PATH_FORMAT); 150 out.print('}'); 151 } 152 newLineAndIndent(PrintWriter out, FormatOptions format)153 private void newLineAndIndent(PrintWriter out, FormatOptions format) { 154 out.println(); 155 if (format.shouldIndent) { 156 for (int i = 0; i < depth; i++) { 157 out.print(INDENT); 158 } 159 } 160 } 161 162 // Currently the "header" uses '//' line comments but the comments are in a block. 163 // TODO: Sort this out so there isn't a messy mix of comment styles in the data files. writeHeaderAndComments( PrintWriter out, List<String> header, List<String> comments)164 private static void writeHeaderAndComments( 165 PrintWriter out, List<String> header, List<String> comments) { 166 167 header.forEach(s -> out.println("// " + s)); 168 if (!comments.isEmpty()) { 169 // TODO: Don't use /* */ block quotes, just use inline // quotes. 170 out.println( 171 comments.stream().collect(joining("\n * ", "/**\n * ", "\n */"))); 172 } 173 } 174 175 private static final class FormatOptions { 176 // Only the indent flag is used 177 final static FormatOptions PATH_FORMAT = new FormatOptions(true, true, true); 178 forPath(RbPath rbPath)179 static FormatOptions forPath(RbPath rbPath) { 180 return new FormatOptions( 181 !rbPath.isIntPath() && !rbPath.isBinPath(), 182 !rbPath.endsWith(RB_SEQUENCE) && !rbPath.isBinPath(), 183 !rbPath.isBinPath()); 184 } 185 186 final boolean shouldQuote; 187 final boolean shouldUseComma; 188 final boolean shouldIndent; 189 FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent)190 private FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent) { 191 this.shouldQuote = shouldQuote; 192 this.shouldUseComma = shouldUseComma; 193 this.shouldIndent = shouldIndent; 194 } 195 } 196 197 /** Inserts padding and values between braces. */ 198 // TODO: Get rid of the need for icuDataName by adding type information to RbPath. appendValues( String icuDataName, RbPath rbPath, List<RbValue> values, PrintWriter out)199 private boolean appendValues( 200 String icuDataName, RbPath rbPath, List<RbValue> values, PrintWriter out) { 201 202 RbValue onlyValue; 203 boolean wasSingular = false; 204 FormatOptions format = FormatOptions.forPath(rbPath); 205 if (values.size() == 1 && !mustBeArray(true, icuDataName, rbPath)) { 206 onlyValue = values.get(0); 207 if (onlyValue.isSingleton() && !mustBeArray(false, icuDataName, rbPath)) { 208 // Value has a single element and is not being forced to be an array. 209 String onlyElement = Iterables.getOnlyElement(onlyValue.getElements()); 210 if (format.shouldQuote) { 211 onlyElement = quoteInside(onlyElement); 212 } 213 // The numbers below are simply tuned to match the line wrapping in the original 214 // CLDR code. The behaviour it produces is sometimes strange (wrapping a line just 215 // for a single character) and could definitely be improved. 216 // TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping. 217 int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length()); 218 if (onlyElement.length() <= maxWidth) { 219 // Single element for path: don't add newlines. 220 printValue(out, onlyElement, format); 221 wasSingular = true; 222 } else { 223 // Element too long to fit in one line, so wrap. 224 int end; 225 for (int i = 0; i < onlyElement.length(); i = end) { 226 end = goodBreak(onlyElement, i + maxWidth); 227 String part = onlyElement.substring(i, end); 228 newLineAndIndent(out, format); 229 printValue(out, part, format); 230 } 231 } 232 } else { 233 // Only one array for the rbPath, so don't add an extra set of braces. 234 printElements(out, onlyValue, format); 235 } 236 } else { 237 for (RbValue value : values) { 238 if (value.isSingleton()) { 239 // Single-value array: print normally. 240 printElements(out, value, format); 241 } else { 242 // Enclose this array in braces to separate it from other values. 243 open("", out); 244 printElements(out, value, format); 245 close(out); 246 } 247 } 248 } 249 return wasSingular; 250 } 251 252 private static final RbPath RB_SEQUENCE = RbPath.of("Sequence"); 253 private static final RbPath RB_RULES = RbPath.of("rules"); 254 private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript"); 255 private static final RbPath RB_ERAS = RbPath.of("eras"); 256 private static final RbPath RB_NAMED = RbPath.of("named"); 257 private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData"); 258 private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo"); 259 260 /** 261 * Wrapper for a hack to determine if the given rb path should always present its values as an 262 * array. 263 */ 264 // TODO: Verify this is still needed, and either make it less hacky, or delete it. mustBeArray(boolean topValues, String name, RbPath rbPath)265 private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) { 266 if (topValues) { 267 // matches "rules/setNN" (hence the mucking about with raw segments). 268 return name.equals("pluralRanges") 269 && rbPath.startsWith(RB_RULES) 270 && rbPath.getSegment(1).startsWith("set"); 271 } 272 return rbPath.equals(RB_LOCALE_SCRIPT) 273 || (rbPath.contains(RB_ERAS) 274 && !rbPath.getSegment(rbPath.length() - 1).endsWith(":alias") 275 && !rbPath.endsWith(RB_NAMED)) 276 || rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA) 277 || rbPath.startsWith(RB_METAZONE_INFO); 278 } 279 printElements(PrintWriter out, RbValue rbValue, FormatOptions format)280 private void printElements(PrintWriter out, RbValue rbValue, FormatOptions format) { 281 // TODO: If "shouldUseComma" is made obsolete, just use the "else" block always. 282 if (rbValue.getElementsPerLine() == 1) { 283 for (String v : rbValue.getElements()) { 284 newLineAndIndent(out, format); 285 printValue(out, quoteInside(v), format); 286 if (format.shouldUseComma) { 287 out.print(","); 288 } 289 } 290 } else { 291 checkArgument(format.shouldUseComma, "cannot group non-sequence values"); 292 Iterable<List<String>> partitions = 293 Iterables.partition(rbValue.getElements(), rbValue.getElementsPerLine()); 294 for (List<String> tuple : partitions) { 295 newLineAndIndent(out, format); 296 for (String v : tuple) { 297 printValue(out, quoteInside(v), format); 298 out.print(","); 299 } 300 } 301 } 302 } 303 printValue(PrintWriter out, String value, FormatOptions format)304 private static void printValue(PrintWriter out, String value, FormatOptions format) { 305 if (format.shouldQuote) { 306 out.append('"').append(value).append('"'); 307 } else { 308 out.append(value); 309 } 310 } 311 312 // Can a string be broken here? If not, backup until we can. 313 // TODO: Either don't bother line wrapping or look at making this use a line-break iterator. goodBreak(String quoted, int end)314 private static int goodBreak(String quoted, int end) { 315 if (end > quoted.length()) { 316 return quoted.length(); 317 } 318 // Don't break escaped Unicode characters. 319 // Need to handle both e.g. \u4E00 and \U00020000 320 for (int i = end - 1; i > end - 10;) { 321 char current = quoted.charAt(i--); 322 if (!Character.toString(current).matches("[0-9A-Fa-f]")) { 323 if ((current == 'u' || current == 'U') && i > end - 10 324 && quoted.charAt(i) == '\\') { 325 return i; 326 } 327 break; 328 } 329 } 330 while (end > 0) { 331 char ch = quoted.charAt(end - 1); 332 if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) { 333 break; 334 } 335 --end; 336 } 337 return end; 338 } 339 340 // Fix characters inside strings. quoteInside(String item)341 private static String quoteInside(String item) { 342 // Unicode-escape all quotes. 343 item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022"); 344 // Double up on backslashes, ignoring Unicode-escaped characters. 345 Pattern pattern = 346 item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE; 347 Matcher matcher = pattern.matcher(item); 348 349 if (!matcher.find()) { 350 return item; 351 } 352 StringBuilder buffer = new StringBuilder(); 353 int start = 0; 354 do { 355 buffer.append(item, start, matcher.start()); 356 int punctuationChar = item.codePointAt(matcher.end() - 1); 357 buffer.append("\\"); 358 if (punctuationChar == '\\') { 359 buffer.append('\\'); 360 } 361 buffer.append(matcher.group()); 362 start = matcher.end(); 363 } while (matcher.find()); 364 buffer.append(item.substring(start)); 365 return buffer.toString(); 366 } 367 } 368