package org.unicode.cldr.icu; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.FileCopier; import org.unicode.cldr.util.PatternCache; import com.ibm.icu.impl.Utility; import com.ibm.icu.util.Calendar; /** * Writes an IcuData object to a text file. * * @author jchye */ public class IcuTextWriter { /** * The default tab indent (actually spaces) */ private static final String TAB = " "; // List of characters to escape in UnicodeSets. private static final Pattern UNICODESET_ESCAPE = PatternCache.get("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]"); // Only escape \ and " from other strings. private static final Pattern STRING_ESCAPE = PatternCache.get("(?!')\\\\\\\\(?!')"); private static final Pattern QUOTE_ESCAPE = PatternCache.get("\\\\?\""); private static String headerText; /** * ICU paths have a simple comparison, alphabetical within a level. We do * have to catch the / so that it is lower than everything. */ public static final Comparator PATH_COMPARATOR = new Comparator() { @Override public int compare(String arg0, String arg1) { int min = Math.min(arg0.length(), arg1.length()); for (int i = 0; i < min; ++i) { int ch0 = arg0.charAt(i); int ch1 = arg1.charAt(i); int diff = ch0 - ch1; if (diff == 0) { continue; } if (ch0 == '/') { return -1; } else if (ch1 == '/') { return 1; } // make * greater than everything, because of languageMatch // while it is a pain to have it be unordered, this fix is sufficient to put all the *'s after anything else if (ch0 == '*') { return 1; } else if (ch1 == '*') { return -1; } return diff; } return arg0.length() - arg1.length(); } }; private static String getHeader() { if (headerText != null) { return headerText; } try (StringWriter stringWriter = new StringWriter();) { FileCopier.copy(NewLdml2IcuConverter.class, "ldml2icu_header.txt", stringWriter); headerText = stringWriter.toString(); headerText = headerText.replace("%year%", String.valueOf(Calendar.getInstance().get(Calendar.YEAR))); return headerText; } catch (IOException ioe) { throw new IllegalArgumentException(ioe); } } /** * Write a file in ICU format. LDML2ICUConverter currently has some * funny formatting in a few cases; don't try to match everything. * * @param icuData * the icu data structure to be written * @param dirPath * the directory to write the file to * @param hasSpecial * true if a special file was used to create the ICU data */ public static void writeToFile(IcuData icuData, String dirPath) throws IOException { String name = icuData.getName(); PrintWriter out = FileUtilities.openUTF8Writer(dirPath, name + ".txt"); out.write('\uFEFF'); // Append the header. String header = getHeader().replace("%source%", icuData.getSourceFile()); out.print(header); if (icuData.getFileComment() != null) { out.println("/**"); out.append(" * ").append(icuData.getFileComment()).println(); out.println(" */"); } // Write the ICU data to file. out.append(name); if (!icuData.hasFallback()) out.append(":table(nofallback)"); List sortedPaths = new ArrayList(icuData.keySet()); Collections.sort(sortedPaths, PATH_COMPARATOR); String[] lastLabels = new String[] {}; boolean wasSingular = false; for (String path : sortedPaths) { // Write values to file. String[] labels = path.split("/", -1); // Don't discard trailing slashes. int common = getCommon(lastLabels, labels); for (int i = lastLabels.length - 1; i > common; --i) { if (wasSingular) { wasSingular = false; } else { out.append(Utility.repeat(TAB, i)); } out.println("}"); } for (int i = common + 1; i < labels.length; ++i) { final String pad = Utility.repeat(TAB, i); out.append(pad); String label = labels[i]; if (!label.startsWith("<") && !label.endsWith(">")) { out.append(label); } out.append('{'); if (i != labels.length - 1) { out.println(); } } List values = icuData.get(path); try { wasSingular = appendValues(name, path, values, labels.length, out); } catch (NullPointerException npe) { System.err.println("Null value encountered in " + path); } out.flush(); lastLabels = labels; } // Add last closing braces. for (int i = lastLabels.length - 1; i > 0; --i) { if (wasSingular) { wasSingular = false; } else { out.append(Utility.repeat(TAB, i)); } out.println("}"); } out.println("}"); out.close(); } /** * Inserts padding and values between braces. * @param name * @param rbPath * @param values * @param numTabs * @param out * @return */ private static boolean appendValues(String name, String rbPath, List values, int numTabs, PrintWriter out) { String[] firstArray; boolean wasSingular = false; boolean quote = !IcuData.isIntRbPath(rbPath); boolean isSequence = rbPath.endsWith("/Sequence"); if (values.size() == 1 && !mustBeArray(true, name, rbPath)) { if ((firstArray = values.get(0)).length == 1 && !mustBeArray(false, name, rbPath)) { String value = firstArray[0]; if (quote) { value = quoteInside(value); } int maxWidth = 84 - Math.min(4, numTabs) * TAB.length(); if (value.length() <= maxWidth) { // Single value for path: don't add newlines. appendValue(value, quote, out); wasSingular = true; } else { // Value too long to fit in one line, so wrap. final String pad = Utility.repeat(TAB, numTabs); out.println(); int end; for (int i = 0; i < value.length(); i = end) { end = goodBreak(value, i + maxWidth); String part = value.substring(i, end); out.append(pad); appendValue(part, quote, out).println(); } } } else { // Only one array for the rbPath, so don't add an extra set of braces. final String pad = Utility.repeat(TAB, numTabs); out.println(); appendArray(pad, firstArray, quote, isSequence, out); } } else { final String pad = Utility.repeat(TAB, numTabs); out.println(); for (String[] valueArray : values) { if (valueArray.length == 1) { // Single-value array: print normally. appendArray(pad, valueArray, quote, isSequence, out); } else { // Enclose this array in braces to separate it from other // values. out.append(pad).println("{"); appendArray(pad + TAB, valueArray, quote, isSequence, out); out.append(pad).println("}"); } } } return wasSingular; } /** * Wrapper for a hack to determine if the given rb path should always * present its values as an array. This hack is required for an ICU data test to pass. * * @param topValues * @param name * @param rbPath * @return */ private static boolean mustBeArray(boolean topValues, String name, String rbPath) { // TODO(jchye): Add this as an option to the locale file instead of hardcoding. // System.out.println(name + "\t" + rbPath); if (topValues) { return (rbPath.startsWith("/rules/set") && name.equals("pluralRanges")); } return rbPath.equals("/LocaleScript") || (rbPath.contains("/eras/") && !rbPath.endsWith(":alias") && !rbPath.endsWith("/named")) || rbPath.startsWith("/calendarPreferenceData") || rbPath.startsWith("/metazoneInfo"); } private static PrintWriter appendArray(String padding, String[] valueArray, boolean quote, boolean isSequence, PrintWriter out) { for (String value : valueArray) { out.append(padding); appendValue(quoteInside(value), quote, out); if (!isSequence) { out.print(","); } out.println(); } return out; } private static PrintWriter appendValue(String value, boolean quote, PrintWriter out) { if (quote) { return out.append('"').append(value).append('"'); } else { return out.append(value); } } /** * Can a string be broken here? If not, backup until we can. * * @param quoted * @param end * @return */ private static int goodBreak(String quoted, int end) { if (end > quoted.length()) { return quoted.length(); } // Don't break escaped Unicode characters. // Need to handle both e.g. \u4E00 and \U00020000 for (int i = end - 1; i > end - 10;) { char current = quoted.charAt(i--); if (!Character.toString(current).matches("[0-9A-Fa-f]")) { if ((current == 'u' || current == 'U') && i > end - 10 && quoted.charAt(i) == '\\') { return i; } break; } } while (end > 0) { char ch = quoted.charAt(end - 1); if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) { break; } --end; } return end; } /** * Fix characters inside strings. * * @param item * @return */ private static String quoteInside(String item) { // Unicode-escape all quotes. item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022"); // Double up on backslashes, ignoring Unicode-escaped characters. Pattern pattern = item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE; Matcher matcher = pattern.matcher(item); if (!matcher.find()) { return item; } StringBuffer buffer = new StringBuffer(); int start = 0; do { buffer.append(item.substring(start, matcher.start())); int punctuationChar = item.codePointAt(matcher.end() - 1); buffer.append("\\"); if (punctuationChar == '\\') { buffer.append('\\'); } buffer.append(matcher.group()); start = matcher.end(); } while (matcher.find()); buffer.append(item.substring(start)); return buffer.toString(); } /** * find the initial labels (from a path) that are identical. * * @param item * @return */ private static int getCommon(String[] lastLabels, String[] labels) { int min = Math.min(lastLabels.length, labels.length); int i; for (i = 0; i < min; ++i) { if (!lastLabels[i].equals(labels[i])) { return i - 1; } } return i - 1; } }