package org.unicode.cldr.icu;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.FileCopier;
import org.unicode.cldr.util.PatternCache;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.util.Calendar;

/**
 * Writes an IcuData object to a text file.
 *
 * @author jchye
 */
public class IcuTextWriter {
    /**
     * The default tab indent (actually spaces)
     */
    // NOTE(review): the description says "spaces" (plural) but the literal is one
    // character wide - confirm the intended indent width.
    private static final String TAB = " ";
    // List of characters to escape in UnicodeSets.
    private static final Pattern UNICODESET_ESCAPE = PatternCache.get("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]");
    // Only escape \ and " from other strings.
    private static final Pattern STRING_ESCAPE = PatternCache.get("(?!')\\\\\\\\(?!')");
    private static final Pattern QUOTE_ESCAPE = PatternCache.get("\\\\?\"");

    /** Lazily loaded header template; see {@link #getHeader()}. */
    private static String headerText;

    /**
     * ICU paths have a simple comparison, alphabetical within a level. We do
     * have to catch the / so that it is lower than everything.
     */
    public static final Comparator<String> PATH_COMPARATOR = new Comparator<String>() {
        @Override
        public int compare(String arg0, String arg1) {
            int min = Math.min(arg0.length(), arg1.length());
            for (int i = 0; i < min; ++i) {
                int ch0 = arg0.charAt(i);
                int ch1 = arg1.charAt(i);
                int diff = ch0 - ch1;
                if (diff == 0) {
                    continue;
                }
                // The path separator sorts below every other character.
                if (ch0 == '/') {
                    return -1;
                } else if (ch1 == '/') {
                    return 1;
                }
                // make * greater than everything, because of languageMatch
                // while it is a pain to have it be unordered, this fix is sufficient
                // to put all the *'s after anything else
                if (ch0 == '*') {
                    return 1;
                } else if (ch1 == '*') {
                    return -1;
                }
                return diff;
            }
            // One path is a prefix of the other: the shorter one sorts first.
            return arg0.length() - arg1.length();
        }
    };

    /**
     * Returns the header text written at the top of every generated file. The
     * text is read once from the "ldml2icu_header.txt" resource, its %year%
     * placeholder is replaced with the current year, and the result is cached
     * in {@link #headerText} for later calls.
     *
     * @return the header text with %year% already substituted
     * @throws IllegalArgumentException
     *             if the header resource cannot be read
     */
    private static String getHeader() {
        if (headerText != null) {
            return headerText;
        }
        try (StringWriter stringWriter = new StringWriter();) {
            FileCopier.copy(NewLdml2IcuConverter.class, "ldml2icu_header.txt", stringWriter);
            headerText = stringWriter.toString();
            headerText = headerText.replace("%year%", String.valueOf(Calendar.getInstance().get(Calendar.YEAR)));
            return headerText;
        } catch (IOException ioe) {
            throw new IllegalArgumentException(ioe);
        }
    }

    /**
     * Write a file in ICU format. LDML2ICUConverter currently has some
     * funny formatting in a few cases; don't try to match everything.
     * The output file is named after the ICU data (name + ".txt") and is
     * closed even if writing fails part-way.
     *
     * @param icuData
     *            the icu data structure to be written
     * @param dirPath
     *            the directory to write the file to
     * @throws IOException
     *             if the output file cannot be opened
     * @throws IllegalStateException
     *             if a long value cannot be wrapped because it offers no safe
     *             break position
     */
    public static void writeToFile(IcuData icuData, String dirPath) throws IOException {
        String name = icuData.getName();
        // try-with-resources: the writer must be closed on every path, not only on success.
        try (PrintWriter out = FileUtilities.openUTF8Writer(dirPath, name + ".txt")) {
            // Byte order mark.
            out.write('\uFEFF');
            // Append the header.
            String header = getHeader().replace("%source%", icuData.getSourceFile());
            out.print(header);
            if (icuData.getFileComment() != null) {
                out.println("/**");
                out.append(" * ").append(icuData.getFileComment()).println();
                out.println(" */");
            }

            // Write the ICU data to file.
            out.append(name);
            if (!icuData.hasFallback()) {
                out.append(":table(nofallback)");
            }
            List<String> sortedPaths = new ArrayList<String>(icuData.keySet());
            Collections.sort(sortedPaths, PATH_COMPARATOR);
            String[] lastLabels = new String[] {};
            boolean wasSingular = false;
            for (String path : sortedPaths) {
                // Write values to file.
                String[] labels = path.split("/", -1); // Don't discard trailing slashes.
                int common = getCommon(lastLabels, labels);
                // Close the levels of the previous path that this path does not share.
                wasSingular = closeBraces(out, lastLabels.length - 1, common, wasSingular);
                // Open the levels that are new in this path.
                for (int i = common + 1; i < labels.length; ++i) {
                    final String pad = Utility.repeat(TAB, i);
                    out.append(pad);
                    String label = labels[i];
                    if (!label.startsWith("<") && !label.endsWith(">")) {
                        out.append(label);
                    }
                    out.append('{');
                    if (i != labels.length - 1) {
                        out.println();
                    }
                }
                List<String[]> values = icuData.get(path);
                try {
                    wasSingular = appendValues(name, path, values, labels.length, out);
                } catch (NullPointerException npe) {
                    // Deliberate best effort: report the offending path and keep
                    // writing the remaining paths rather than aborting the file.
                    System.err.println("Null value encountered in " + path);
                }
                out.flush();
                lastLabels = labels;
            }
            // Add last closing braces.
            closeBraces(out, lastLabels.length - 1, 0, wasSingular);
            out.println("}");
        }
    }

    /**
     * Writes one closing brace per nesting level, from {@code deepest} down to
     * (but not including) {@code shallowest}. If the previously written value
     * was a single inline value, the first brace is written without padding so
     * that it lands on the same line as that value.
     *
     * @param out
     *            the writer to append to
     * @param deepest
     *            the deepest level to close (inclusive)
     * @param shallowest
     *            the level at which to stop (exclusive)
     * @param wasSingular
     *            whether the last value written was a single inline value
     * @return the updated singular flag: false once at least one brace has
     *         been written, otherwise the value passed in
     */
    private static boolean closeBraces(PrintWriter out, int deepest, int shallowest, boolean wasSingular) {
        for (int depth = deepest; depth > shallowest; --depth) {
            if (wasSingular) {
                wasSingular = false;
            } else {
                out.append(Utility.repeat(TAB, depth));
            }
            out.println("}");
        }
        return wasSingular;
    }

    /**
     * Inserts padding and values between braces.
     *
     * @param name
     *            the name of the ICU data being written
     * @param rbPath
     *            the resource bundle path the values belong to
     * @param values
     *            the value arrays stored for the path
     * @param numTabs
     *            the indent level at which multi-line values are written
     * @param out
     *            the writer to append to
     * @return true if a single value was written inline (without a trailing
     *         newline), false otherwise
     * @throws IllegalStateException
     *             if a long value cannot be wrapped because it offers no safe
     *             break position
     */
    private static boolean appendValues(String name, String rbPath, List<String[]> values, int numTabs,
        PrintWriter out) {
        String[] firstArray;
        boolean wasSingular = false;
        boolean quote = !IcuData.isIntRbPath(rbPath);
        boolean isSequence = rbPath.endsWith("/Sequence");
        if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
            if ((firstArray = values.get(0)).length == 1 && !mustBeArray(false, name, rbPath)) {
                String value = firstArray[0];
                if (quote) {
                    value = quoteInside(value);
                }
                int maxWidth = 84 - Math.min(4, numTabs) * TAB.length();
                if (value.length() <= maxWidth) {
                    // Single value for path: don't add newlines.
                    appendValue(value, quote, out);
                    wasSingular = true;
                } else {
                    // Value too long to fit in one line, so wrap.
                    final String pad = Utility.repeat(TAB, numTabs);
                    out.println();
                    int end;
                    for (int i = 0; i < value.length(); i = end) {
                        end = goodBreak(value, i + maxWidth);
                        if (end <= i) {
                            // goodBreak backed up over the whole chunk (for example a
                            // line-wide run of backslashes or surrogates). Without this
                            // check the loop would emit empty lines forever.
                            throw new IllegalStateException("Cannot wrap value for " + rbPath
                                + ": no safe break position after index " + i);
                        }
                        String part = value.substring(i, end);
                        out.append(pad);
                        appendValue(part, quote, out).println();
                    }
                }
            } else {
                // Only one array for the rbPath, so don't add an extra set of braces.
                final String pad = Utility.repeat(TAB, numTabs);
                out.println();
                appendArray(pad, firstArray, quote, isSequence, out);
            }
        } else {
            final String pad = Utility.repeat(TAB, numTabs);
            out.println();
            for (String[] valueArray : values) {
                if (valueArray.length == 1) {
                    // Single-value array: print normally.
                    appendArray(pad, valueArray, quote, isSequence, out);
                } else {
                    // Enclose this array in braces to separate it from other
                    // values.
                    out.append(pad).println("{");
                    appendArray(pad + TAB, valueArray, quote, isSequence, out);
                    out.append(pad).println("}");
                }
            }
        }
        return wasSingular;
    }

    /**
     * Wrapper for a hack to determine if the given rb path should always
     * present its values as an array. This hack is required for an ICU data test to pass.
     *
     * @param topValues
     *            true to ask about the list of value arrays as a whole, false
     *            to ask about a single value array
     * @param name
     *            the name of the ICU data being written
     * @param rbPath
     *            the resource bundle path being written
     * @return true if the values must be written in array form
     */
    private static boolean mustBeArray(boolean topValues, String name, String rbPath) {
        // TODO(jchye): Add this as an option to the locale file instead of hardcoding.
        if (topValues) {
            return (rbPath.startsWith("/rules/set")
                && name.equals("pluralRanges"));
        }
        return rbPath.equals("/LocaleScript")
            || (rbPath.contains("/eras/") && !rbPath.endsWith(":alias") && !rbPath.endsWith("/named"))
            || rbPath.startsWith("/calendarPreferenceData")
            || rbPath.startsWith("/metazoneInfo");
    }

    /**
     * Writes each element of an array on its own padded line, followed by a
     * comma unless the path is a sequence.
     *
     * @param padding
     *            the indent written before each element
     * @param valueArray
     *            the elements to write
     * @param quote
     *            whether each element is wrapped in double quotes
     * @param isSequence
     *            true to omit the trailing comma after each element
     * @param out
     *            the writer to append to
     * @return {@code out}, to allow chaining
     */
    private static PrintWriter appendArray(String padding, String[] valueArray,
        boolean quote, boolean isSequence, PrintWriter out) {
        for (String value : valueArray) {
            out.append(padding);
            appendValue(quoteInside(value), quote, out);
            if (!isSequence) {
                out.print(",");
            }
            out.println();
        }
        return out;
    }

    /**
     * Writes a single value, optionally wrapped in double quotes.
     *
     * @param value
     *            the already-escaped text to write
     * @param quote
     *            whether to wrap the value in double quotes
     * @param out
     *            the writer to append to
     * @return {@code out}, to allow chaining
     */
    private static PrintWriter appendValue(String value, boolean quote, PrintWriter out) {
        if (quote) {
            return out.append('"').append(value).append('"');
        } else {
            return out.append(value);
        }
    }

    /**
     * Can a string be broken here? If not, backup until we can.
     *
     * @param quoted
     *            the escaped string being wrapped
     * @param end
     *            the proposed break position (exclusive end index of the chunk)
     * @return a break position no greater than {@code end} (and no greater
     *         than the string length) that does not fall inside an escaped
     *         Unicode character and does not directly follow a backslash or a
     *         surrogate code unit
     */
    private static int goodBreak(String quoted, int end) {
        if (end > quoted.length()) {
            return quoted.length();
        }
        // Don't break escaped Unicode characters.
        // Need to handle both e.g. \u4E00 and \U00020000
        // so look back over at most 9 characters, never past the start of the string.
        final int lookbackLimit = end - 10;
        for (int i = end - 1; i >= 0 && i > lookbackLimit;) {
            char current = quoted.charAt(i--);
            if (!isAsciiHexDigit(current)) {
                if ((current == 'u' || current == 'U') && i >= 0 && i > lookbackLimit
                    && quoted.charAt(i) == '\\') {
                    // Break just before the backslash that starts the escape.
                    return i;
                }
                break;
            }
        }
        // Back up past any trailing backslashes and surrogate code units.
        while (end > 0) {
            char ch = quoted.charAt(end - 1);
            if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
                break;
            }
            --end;
        }
        return end;
    }

    /**
     * Tests for an ASCII hexadecimal digit. Deliberately ASCII-only, unlike
     * Character.digit, which also accepts non-ASCII digits.
     *
     * @param c
     *            the character to test
     * @return true if {@code c} is one of 0-9, A-F or a-f
     */
    private static boolean isAsciiHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    /**
     * Fix characters inside strings.
     *
     * @param item
     *            the raw value
     * @return the value with double quotes Unicode-escaped and with an extra
     *         backslash inserted before every match of the applicable escape
     *         pattern (two when the match ends in a backslash)
     */
    private static String quoteInside(String item) {
        // Unicode-escape all quotes.
        item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
        // Double up on backslashes, ignoring Unicode-escaped characters.
        // Values that look like a UnicodeSet ([...]) use the UnicodeSet escape list.
        Pattern pattern = item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
        Matcher matcher = pattern.matcher(item);

        if (!matcher.find()) {
            return item;
        }
        StringBuilder buffer = new StringBuilder();
        int start = 0;
        do {
            buffer.append(item.substring(start, matcher.start()));
            int punctuationChar = item.codePointAt(matcher.end() - 1);
            buffer.append("\\");
            if (punctuationChar == '\\') {
                buffer.append('\\');
            }
            buffer.append(matcher.group());
            start = matcher.end();
        } while (matcher.find());
        buffer.append(item.substring(start));
        return buffer.toString();
    }

    /**
     * find the initial labels (from a path) that are identical.
     *
     * @param lastLabels
     *            the labels of the previously written path
     * @param labels
     *            the labels of the path about to be written
     * @return the index of the last leading label shared by both arrays, or -1
     *         if they share none
     */
    private static int getCommon(String[] lastLabels, String[] labels) {
        int min = Math.min(lastLabels.length, labels.length);
        for (int i = 0; i < min; ++i) {
            if (!lastLabels[i].equals(labels[i])) {
                return i - 1;
            }
        }
        return min - 1;
    }
}