OpenHarmony-v4.1-Release/s

// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.CREATE_NEW;
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
import static java.util.stream.Collectors.joining;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.collect.Iterables;

/**
 * Writes an IcuData object to a text file. A lot of this class was copied directly from the
 * original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic
 * behaviours. The behaviour of this class is currently tuned to produce perfect parity with
 * the original conversion tools, but once migration of the tools is complete, it should
 * probably be revisited and tidied up.
 */
// TODO: Link to a definitive specification for the ICU data files and remove the hacks!
final class IcuTextWriter {
    private static final String INDENT = "    ";
    // List of characters to escape in UnicodeSets
    // ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '=').
    private static final Pattern UNICODESET_ESCAPE =
        Pattern.compile("\\\\[\\\\\\[\\]{}\\-&:^=]");
    // Only escape \ and " from other strings.
    private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
    private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");

    private static final OpenOption[] ONLY_NEW_FILES = { CREATE_NEW };
    private static final OpenOption[] OVERWRITE_FILES = { CREATE, TRUNCATE_EXISTING };

    /** Write a file in ICU data format with the specified header. */
    static void writeToFile(
        IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite) {

        try {
            Files.createDirectories(outDir);
            Path file = outDir.resolve(icuData.getName() + ".txt");
            OpenOption[] fileOptions = allowOverwrite ? OVERWRITE_FILES : ONLY_NEW_FILES;
            try (Writer w = Files.newBufferedWriter(file, UTF_8, fileOptions);
                PrintWriter out = new PrintWriter(w)) {
                new IcuTextWriter(icuData).writeTo(out, header);
            }
        } catch (IOException e) {
            throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e);
        }
    }

    private final IcuData icuData;
    private int depth = 0;
    private boolean valueWasInline = false;

    IcuTextWriter(IcuData icuData) {
        this.icuData = checkNotNull(icuData);
    }

    // TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197).
    private void writeTo(PrintWriter out, List<String> header) {
        out.write('\uFEFF');
        writeHeaderAndComments(out, header, icuData.getFileComment());

        // Write the ICU data to file. This takes the form:
        // ----
        // <name>{
        //     foo{
        //         bar{baz}
        //     }
        // }
        // ----
        // So it's like every RbPath has an implicit prefix of the IcuData name.
        String root = icuData.getName();
        if (!icuData.hasFallback()) {
            root += ":table(nofallback)";
        }
        // TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line).
        out.print(root);
        out.print("{");
        depth++;

        RbPath lastPath = RbPath.of();
        for (RbPath path : icuData.getPaths()) {
            // Close any blocks up to the common path length. Since paths are all distinct, the
            // common length should always be shorter than either path. We add 1 since we must also
            // account for the implicit root segment.
            int commonDepth = RbPath.getCommonPrefixLength(lastPath, path) + 1;
            // Before closing, the "cursor" is at the end of the last value written.
            closeLastPath(commonDepth, out);
            // After opening the value will be ready for the next value to be written.
            openNextPath(path, out);
            valueWasInline = appendValues(icuData.getName(), path, icuData.get(path), out);
            lastPath = path;
        }
        closeLastPath(0, out);
        out.println();
        out.close();
    }

    // Before: Cursor is at the end of the previous line.
    // After: Cursor is positioned immediately after the last closed '}'
    private void closeLastPath(int minDepth, PrintWriter out) {
        if (valueWasInline) {
            depth--;
            out.print('}');
            valueWasInline = false;
        }
        while (depth > minDepth) {
            close(out);
        }
    }

    // Before: Cursor is at the end of the previous line.
    // After: Cursor is positioned immediately after the newly opened '{'
    private void openNextPath(RbPath path, PrintWriter out) {
        while (depth <= path.length()) {
            // The -1 is to adjust for the implicit root element which means indentation (depth)
            // no longer matches the index of the segment we are writing.
            open(path.getSegment(depth - 1), out);
        }
    }

    private void open(String label, PrintWriter out) {
        newLineAndIndent(out, FormatOptions.PATH_FORMAT);
        depth++;
        // This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
        // These take the form of "<any-string>" and are used to ensure that path order can be
        // well defined even for anonymous lists of items.
        if (!label.startsWith("<") && !label.endsWith(">")) {
            out.print(label);
        }
        out.print('{');
    }

    private void close(PrintWriter out) {
        depth--;
        newLineAndIndent(out, FormatOptions.PATH_FORMAT);
        out.print('}');
    }

    private void newLineAndIndent(PrintWriter out, FormatOptions format) {
        out.println();
        if (format.shouldIndent) {
            for (int i = 0; i < depth; i++) {
                out.print(INDENT);
            }
        }
    }

    // Currently the "header" uses '//' line comments but the comments are in a block.
    // TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
    private static void writeHeaderAndComments(
        PrintWriter out, List<String> header, List<String> comments) {

        header.forEach(s -> out.println("// " + s));
        if (!comments.isEmpty()) {
            // TODO: Don't use /* */ block quotes, just use inline // quotes.
            out.println(
                comments.stream().collect(joining("\n * ", "/**\n * ", "\n */")));
        }
    }

    private static final class FormatOptions {
        // Only the indent flag is used
        final static FormatOptions PATH_FORMAT = new FormatOptions(true, true, true);

        static FormatOptions forPath(RbPath rbPath) {
            return new FormatOptions(
                    !rbPath.isIntPath() && !rbPath.isBinPath(),
                    !rbPath.endsWith(RB_SEQUENCE) && !rbPath.isBinPath(),
                    !rbPath.isBinPath());
        }

        final boolean shouldQuote;
        final boolean shouldUseComma;
        final boolean shouldIndent;

        private FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent) {
            this.shouldQuote = shouldQuote;
            this.shouldUseComma = shouldUseComma;
            this.shouldIndent = shouldIndent;
        }
    }

    /** Inserts padding and values between braces. */
    // TODO: Get rid of the need for icuDataName by adding type information to RbPath.
    private boolean appendValues(
        String icuDataName, RbPath rbPath, List<RbValue> values, PrintWriter out) {

        RbValue onlyValue;
        boolean wasSingular = false;
        FormatOptions format = FormatOptions.forPath(rbPath);
        if (values.size() == 1 && !mustBeArray(true, icuDataName, rbPath)) {
            onlyValue = values.get(0);
            if (onlyValue.isSingleton() && !mustBeArray(false, icuDataName, rbPath)) {
                // Value has a single element and is not being forced to be an array.
                String onlyElement = Iterables.getOnlyElement(onlyValue.getElements());
                if (format.shouldQuote) {
                    onlyElement = quoteInside(onlyElement);
                }
                // The numbers below are simply tuned to match the line wrapping in the original
                // CLDR code. The behaviour it produces is sometimes strange (wrapping a line just
                // for a single character) and could definitely be improved.
                // TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping.
                int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
                if (onlyElement.length() <= maxWidth) {
                    // Single element for path: don't add newlines.
                    printValue(out, onlyElement, format);
                    wasSingular = true;
                } else {
                    // Element too long to fit in one line, so wrap.
                    int end;
                    for (int i = 0; i < onlyElement.length(); i = end) {
                        end = goodBreak(onlyElement, i + maxWidth);
                        String part = onlyElement.substring(i, end);
                        newLineAndIndent(out, format);
                        printValue(out, part, format);
                    }
                }
            } else {
                // Only one array for the rbPath, so don't add an extra set of braces.
                printElements(out, onlyValue, format);
            }
        } else {
            for (RbValue value : values) {
                if (value.isSingleton()) {
                    // Single-value array: print normally.
                    printElements(out, value, format);
                } else {
                    // Enclose this array in braces to separate it from other values.
                    open("", out);
                    printElements(out, value, format);
                    close(out);
                }
            }
        }
        return wasSingular;
    }

    private static final RbPath RB_SEQUENCE = RbPath.of("Sequence");
    private static final RbPath RB_RULES = RbPath.of("rules");
    private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript");
    private static final RbPath RB_ERAS = RbPath.of("eras");
    private static final RbPath RB_NAMED = RbPath.of("named");
    private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData");
    private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo");

    /**
     * Wrapper for a hack to determine if the given rb path should always present its values as an
     * array.
     */
    // TODO: Verify this is still needed, and either make it less hacky, or delete it.
    private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) {
        if (topValues) {
            // matches "rules/setNN" (hence the mucking about with raw segments).
            return name.equals("pluralRanges")
                && rbPath.startsWith(RB_RULES)
                && rbPath.getSegment(1).startsWith("set");
        }
        return rbPath.equals(RB_LOCALE_SCRIPT)
            || (rbPath.contains(RB_ERAS)
                && !rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")
                && !rbPath.endsWith(RB_NAMED))
            || rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA)
            || rbPath.startsWith(RB_METAZONE_INFO);
    }

    private void printElements(PrintWriter out, RbValue rbValue, FormatOptions format) {
        // TODO: If "shouldUseComma" is made obsolete, just use the "else" block always.
        if (rbValue.getElementsPerLine() == 1) {
            for (String v : rbValue.getElements()) {
                newLineAndIndent(out, format);
                printValue(out, quoteInside(v), format);
                if (format.shouldUseComma) {
                    out.print(",");
                }
            }
        } else {
            checkArgument(format.shouldUseComma, "cannot group non-sequence values");
            Iterable<List<String>> partitions =
                    Iterables.partition(rbValue.getElements(), rbValue.getElementsPerLine());
            for (List<String> tuple : partitions) {
                newLineAndIndent(out, format);
                for (String v : tuple) {
                    printValue(out, quoteInside(v), format);
                    out.print(",");
                }
            }
        }
    }

    private static void printValue(PrintWriter out, String value, FormatOptions format) {
        if (format.shouldQuote) {
            out.append('"').append(value).append('"');
        } else {
            out.append(value);
        }
    }

    // Can a string be broken here? If not, backup until we can.
    // TODO: Either don't bother line wrapping or look at making this use a line-break iterator.
    private static int goodBreak(String quoted, int end) {
        if (end > quoted.length()) {
            return quoted.length();
        }
        // Don't break escaped Unicode characters.
        // Need to handle both e.g. \u4E00 and \U00020000
        for (int i = end - 1; i > end - 10;) {
            char current = quoted.charAt(i--);
            if (!Character.toString(current).matches("[0-9A-Fa-f]")) {
                if ((current == 'u' || current == 'U') && i > end - 10
                    && quoted.charAt(i) == '\\') {
                    return i;
                }
                break;
            }
        }
        while (end > 0) {
            char ch = quoted.charAt(end - 1);
            if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
                break;
            }
            --end;
        }
        return end;
    }

    // Fix characters inside strings.
    private static String quoteInside(String item) {
        // Unicode-escape all quotes.
        item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
        // Double up on backslashes, ignoring Unicode-escaped characters.
        Pattern pattern =
            item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
        Matcher matcher = pattern.matcher(item);

        if (!matcher.find()) {
            return item;
        }
        StringBuilder buffer = new StringBuilder();
        int start = 0;
        do {
            buffer.append(item, start, matcher.start());
            int punctuationChar = item.codePointAt(matcher.end() - 1);
            buffer.append("\\");
            if (punctuationChar == '\\') {
                buffer.append('\\');
            }
            buffer.append(matcher.group());
            start = matcher.end();
        } while (matcher.find());
        buffer.append(item.substring(start));
        return buffer.toString();
    }
}