android-14.0.0_r21/s

// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;

import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkElementIndex;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import com.google.common.base.Joiner;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;

/**
 * Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable
 * ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be
 * extended to be a more fully featured "diff" tool or a proper ICU data file parser.
 *
 * <p>This is a temporary debugging tool and should not be relied upon during any part of the data
 * generation process.
 */
final class IcuDataDumper {
    private static final Joiner LIST_JOINER = Joiner.on(',');
    private static final RbPath VERSION = RbPath.of("Version");

    public static void main(String... args) throws IOException {
        Path fileOrDir;
        Optional<Pattern> name = Optional.empty();
        switch (args.length) {
        case 2:
            name = Optional.of(Pattern.compile(args[1]));
        case 1:
            fileOrDir = Paths.get(args[0]);
            break;
        default:
            throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]");
        }

        if (Files.isDirectory(fileOrDir)) {
            walkDirectory(fileOrDir, name);
        } else {
            checkArgument(!name.isPresent(),
                "cannot specificy a name pattern for a non-directory file: %s", fileOrDir);
            IcuDataParser parser = new IcuDataParser(fileOrDir);
            parser.parse();
            dump(parser.icuData);
        }
    }

    private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException {
        Predicate<Path> matchesName =
            f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true);
        List<IcuDataParser> icuParsers;
        try (Stream<Path> files = Files.walk(fileOrDir)) {
            icuParsers = files
                .filter(Files::isRegularFile)
                .filter(matchesName)
                .map(IcuDataParser::new)
                .collect(toImmutableList());
        }
        ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create();
        for (IcuDataParser p : icuParsers) {
            p.parse();
            for (RbPath k : p.icuData.keySet()) {
                List<RbValue> values = p.icuData.get(k);
                if (!allPaths.containsKey(k)) {
                    allPaths.putAll(k, values);
                } else if (!VERSION.equals(k)) {
                    checkState(allPaths.get(k).equals(values), "inconsistent data for path: ", k);
                }
            }
        }
        dump(allPaths);
    }

    private static void dump(ListMultimap<RbPath, RbValue> allPaths) {
        allPaths.keySet().stream()
            .sorted()
            .forEach(k -> System.out.println(k + " :: " + LIST_JOINER.join(allPaths.get(k))));
    }

    private static final class IcuDataParser {
        // Path of file being parsed.
        private final Path path;

        // Comments in header (before data starts), without comment characters.
        private final List<String> headerComment = new ArrayList<>();
        // ICU data name (the name of the root element).
        private String name = null;
        // ICU data values.
        private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create();

        // Current line number (1-indexed).
        private int lineNumber = 0;
        // The type of the previous line that was processed.
        private LineType lastType = LineType.COMMENT;
        // True when inside /* .. */ comments in the header.
        private boolean inBlockComment = false;
        // True when in the final top-level group at the end of parsing.
        private boolean inFinalGroup = false;
        // True when a partial (line wrapped) value has been read.
        private boolean isLineContinuation = false;
        // Current path while parsing (NOT including the root element).
        private Deque<String> pathStack = new ArrayDeque<>();
        // Current sequence of values for the path (as defined in the current path stack).
        private List<String> currentValue = new ArrayList<>();
        // Current partially read value of a multi-line value.
        private String wrappedValue = "";
        // Map of indices used to auto-generate names for anonymous path segments.
        // TODO: Check if this is even needed and remove if not.
        private Multiset<Integer> indices = HashMultiset.create();

        IcuDataParser(Path path) {
            this.path = checkNotNull(path);
        }

        public boolean parse() throws IOException {
            List<String> lines = Files.readAllLines(path);
            // Best approximation to a magic number be have (BOM plus inline comment). This stops
            // use trying to parse the transliteration files, which are a different type.
            if (!lines.get(0).startsWith("\uFEFF//")) {
                return false;
            }
            lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck);

            // Sanity check for expected final state. Just checking the "lastType" should be enough
            // to catch everything else (due to transition rules and how the code tidies up) but it
            // seems prudent to sanity check everything just in case.
            checkState(lastType == LineType.GROUP_END);
            checkState(!inBlockComment);
            checkState(name != null);
            checkState(pathStack.isEmpty() && inFinalGroup);
            checkState(wrappedValue.isEmpty() && currentValue.isEmpty());
            return true;
        }

        void processLineWithCheck(String line) {
            lineNumber++;
            if (lineNumber == 1 && line.startsWith("\uFEFF")) {
                line = line.substring(1);
            }
            try {
                processLine(line);
            } catch (RuntimeException e) {
                throw new RuntimeException(
                    String.format("[%s:%s] %s (%s)", path, lineNumber, e.getMessage(), line),
                    e);
            }
        }

        void processLine(String line) {
            line = maybeTrimEndOfLineComment(line);
            if (line.isEmpty()) {
                return;
            }
            LineMatch match = LineType.match(line, inBlockComment);
            checkState(match.getType().isValidTransitionFrom(lastType),
                "invalid state transition: %s --//-> %s", lastType, match.getType());
            boolean isEndOfWrappedValue = false;
            switch (match.getType()) {
            case COMMENT:
                if (name != null) {
                    // Comments in data are ignored since they cannot be properly associated with
                    // paths or values in an IcuData instance (only legacy tooling emits these).
                    break;
                }
                if (line.startsWith("/*")) {
                    inBlockComment = true;
                }
                headerComment.add(match.get(0));
                if (inBlockComment && line.contains("*/")) {
                    checkState(line.indexOf("*/") == line.length() - 2,
                        "unexpected end of comment block");
                    inBlockComment = false;
                }
                break;

            case INLINE_VALUE:
                icuData.put(
                    getPathFromStack().extendBy(getSegment(match.get(0))),
                    RbValue.of(unquote(match.get(1))));
                break;

            case GROUP_START:
                checkState(currentValue.isEmpty());
                if (name == null) {
                    name = match.get(0);
                    checkState(name != null, "cannot have anonymous top-level group");
                } else {
                    pathStack.push(getSegment(match.get(0)));
                }
                wrappedValue = "";
                isLineContinuation = false;
                break;

            case QUOTED_VALUE:
                wrappedValue += unquote(match.get(0));
                isLineContinuation = !line.endsWith(",");
                if (!isLineContinuation) {
                    currentValue.add(wrappedValue);
                    wrappedValue = "";
                }
                break;

            case VALUE:
                checkState(!isLineContinuation, "unexpected unquoted value");
                currentValue.add(match.get(0));
                break;

            case GROUP_END:
                // Account for quoted values without trailing ',' just before group end.
                if (isLineContinuation) {
                    currentValue.add(wrappedValue);
                    isLineContinuation = false;
                }
                // Emit the collection sequence of values for the current path as an RbValue.
                if (!currentValue.isEmpty()) {
                    icuData.put(getPathFromStack(), RbValue.of(currentValue));
                    currentValue.clear();
                }
                // Annoyingly the name is outside the stack so the stack will empty before the last
                // end group.
                if (!pathStack.isEmpty()) {
                    pathStack.pop();
                    indices.setCount(pathStack.size(), 0);
                } else {
                    checkState(!inFinalGroup, "unexpected group end");
                    inFinalGroup = true;
                }
                break;

            case UNKNOWN:
                throw new IllegalStateException("cannot parse line: " + match.get(0));
            }
            lastType = match.getType();
        }

        private RbPath getPathFromStack() {
            if (pathStack.isEmpty()) {
                return RbPath.of();
            }
            List<String> segments = new ArrayList<>();
            Iterables.addAll(segments, pathStack);
            if (segments.get(0).matches("<[0-9]{4}>")) {
                segments.remove(0);
            }
            return RbPath.of(Lists.reverse(segments));
        }

        private String getSegment(String segmentOrNull) {
            if (segmentOrNull != null) {
                return segmentOrNull;
            }
            int depth = pathStack.size();
            int index = indices.count(depth);
            indices.add(depth, 1);
            return String.format("<%04d>", index);
        }

        private String maybeTrimEndOfLineComment(String line) {
            // Once the name is set, we are past the header and into the data.
            if (name != null) {
                // Index to search for '//' from - must skip quoted values.
                int startIdx = line.startsWith("\"") ? line.indexOf('"', 1) + 1 : 0;
                int commentIdx = line.indexOf("//", startIdx);
                if (commentIdx != -1) {
                    line = whitespace().trimTrailingFrom(line.substring(0, commentIdx));
                }
            }
            return line;
        }

        private static String unquote(String s) {
            if (s.startsWith("\"") && s.endsWith("\"")) {
                return s.substring(1, s.length() - 1).replaceAll("\\\\([\"\\\\])", "$1");
            }
            checkState(!s.contains("\""), "invalid unquoted value: %s", s);
            return s;
        }

        private static final class LineMatch {
            private final LineType type;
            private final Function<Integer, String> args;

            LineMatch(LineType type, Function<Integer, String> args) {
                this.type = checkNotNull(type);
                this.args = checkNotNull(args);
            }

            String get(int n) {
                return args.apply(n);
            }

            LineType getType() {
                return type;
            }
        }

        private enum LineType {
            // Comment _start_ with any comment value captured.
            COMMENT("(?://|/\\*)\\s*(.*)"),
            // A combination of GROUP_START, VALUE and GROUP_END with whitespace.
            INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"),
            // Allows for empty segment names (anonymous arrays) which match 'null'.
            GROUP_START("(?:(.*\\S)\\s*)?\\{"),
            GROUP_END("\\}"),
            QUOTED_VALUE("(\".*\"),?"),
            VALUE("([^\"{}]+),?"),
            UNKNOWN(".*");

            // Table of allowed transitions expected during parsing.
            // key=current state, values=set of permitted previous states
            private static ImmutableSetMultimap<LineType, LineType> TRANSITIONS =
                ImmutableSetMultimap.<LineType, LineType>builder()
                    .putAll(COMMENT, COMMENT)
                    .putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END)
                    .putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE)
                    .putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE)
                    .putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE)
                    .putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE)
                    .build();

            private final Pattern pattern;

            LineType(String regex) {
                this.pattern = Pattern.compile(regex);
            }

            boolean isValidTransitionFrom(LineType lastType) {
                return TRANSITIONS.get(this).contains(lastType);
            }

            static LineMatch match(String line, boolean inBlockComment) {
                // Block comments kinda suck and it'd be great if the ICU data only used '//' style
                // comments (if would definitely simplify any parsers out there). Once the
                // transition to the new transformation tools is complete, they can be changed to
                // only emit '//' style comments.
                if (inBlockComment) {
                    if (line.startsWith("*")) {
                        line = whitespace().trimLeadingFrom(line.substring(1));
                    }
                    return new LineMatch(COMMENT, ImmutableList.of(line)::get);
                }
                for (LineType type : TRANSITIONS.keySet()) {
                    // Regex groups start at 1, but we want the getter function to be zero-indexed.
                    Matcher m = type.pattern.matcher(line);
                    if (m.matches()) {
                        return new LineMatch(type, n -> {
                            checkElementIndex(n, m.groupCount());
                            return m.group(n + 1);
                        });
                    }
                }
                return new LineMatch(UNKNOWN, ImmutableList.of(line)::get);
            }
        }
    }
}