android-12.0.0_r34/s

// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT;
import static java.util.function.Function.identity;
import static java.util.regex.Pattern.CASE_INSENSITIVE;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;

import java.util.Arrays;
import java.util.Set;
import java.util.function.Function;
import java.util.function.IntUnaryOperator;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.IntStream;

import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataSupplier.CldrResolution;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrDraftStatus;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.cldr.api.FilteredData;
import org.unicode.cldr.api.PathMatcher;

import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

/**
 * A factory for wrapping data suppliers to add synthetic locales for debugging. The currently
 * supported synthetic locales are:
 * <ul>
 *     <li>{@code en_XA}: A pseudo locale which generates expanded text with many non-Latin accents.
 *     <li>{@code ar_XB}: A pseudo locale which generates BiDi text for debugging.
 * </ul>
 *
 * <p>Both pseudo locales are based on {@code "en"} data, and generate values which are readable
 * by English speaking developers. For example, the CLDR value "Hello World" will be turned into
 * something like:
 * <ul>
 *     <li>{@code en_XA}: [Ĥéļļö Ŵöŕļð one two]
 *     <li>{@code ar_XB}: dlroW elloH
 * </ul>
 *
 * <p>In the case of BiDi pseudo localization, bi-directional markers are also inserted into the
 * text so that, if the system using the data is configured correctly, the results will look
 * "normal" (i.e. Latin text will appear displayed left-to-right because of the BiDi markers).
 */
// TODO(CLDR-13381): Move this all into the CLDR API once the dust has settled.
public final class PseudoLocales {
    // Right-to-left override character.
    private static final String RLO = "\u202e";
    // Arabic letter mark character.
    private static final String ALM = "\u061C";
    // Pop direction formatting character.
    private static final String PDF = "\u202c";
    // Prefix to add before each LTR word.
    private static final String BIDI_PREFIX = ALM + RLO;
    // Postfix to add after each LTR word.
    private static final String BIDI_POSTFIX = PDF + ALM;

    // See getExemplarValue() method for why we don't extract the exemplar list from "en".
    private enum PseudoType {
        BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz" + ALM + RLO + PDF),
        EXPAND("en_XA", PseudoLocales::expanding,
            "a\u00e5b\u0180c\u00e7d\u00f0e\u00e9f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm"
                + "\u0271n\u00f1o\u00f6p\u00feq\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175"
                + "x\u1e8by\u00fdz\u017e");

        private static final ImmutableMap<String, PseudoType> ID_MAP =
            Arrays.stream(values()).collect(toImmutableMap(PseudoType::getLocaleId, identity()));

        private static PseudoType fromId(String localeId) {
            return checkNotNull(ID_MAP.get(localeId), "unknown pseduo locale: %s", localeId);
        }

        private static ImmutableSet<String> getLocaleIds() {
            return ID_MAP.keySet();
        }

        private final String localeId;
        private final Function<Boolean, PseudoText> textSupplier;
        // A string whose code points form the exemplar set for the pseudo locale.
        private final String exemplars;

        PseudoType(String localeId, Function<Boolean, PseudoText> textSupplier, String exemplars) {
            this.localeId = localeId;
            this.textSupplier = textSupplier;
            this.exemplars = exemplars;
        }

        String getLocaleId() {
            return localeId;
        }

        PseudoText getText(boolean isPattern) {
            return textSupplier.apply(isPattern);
        }

        String getExemplars() {
            return exemplars;
        }
    }

    /**
     * Returns a wrapped data supplier which will inject {@link CldrData} for the pseudo locales
     * {@code en_XA} and {@code ar_XB}. These locales should behave in all respects like normal
     * locales and can be processed accordingly.
     */
    public static CldrDataSupplier addPseudoLocalesTo(CldrDataSupplier src) {
        return new PseudoSupplier(src);
    }

    private static final class PseudoSupplier extends CldrDataSupplier {
        private final CldrDataSupplier src;
        private final Set<String> srcIds;
        private final CldrData enData;
        private final ImmutableSet<CldrPath> pathsToProcess;

        PseudoSupplier(CldrDataSupplier src) {
            this.src = checkNotNull(src);
            this.srcIds = src.getAvailableLocaleIds();
            // Start with resolved data so we can merge values from "en" and "en_001" for coverage
            // and supply the unfiltered values if someone wants the resolved version of the pseudo
            // locale data.
            this.enData = src.getDataForLocale("en", RESOLVED);
            // But since we don't want to filter paths which come from the "root" locale (such as
            // aliases) then we need to find the union of "English" paths we expect to filter.
            this.pathsToProcess = getUnresolvedPaths(src, "en", "en_001");
            // Just check that we aren't wrapping an already wrapped supplier.
            PseudoType.getLocaleIds()
                .forEach(id -> checkArgument(!srcIds.contains(id),
                    "pseudo locale %s already supported by given data supplier", id));
        }

        private static ImmutableSet<CldrPath> getUnresolvedPaths(
            CldrDataSupplier src, String... ids) {

            ImmutableSet.Builder<CldrPath> paths = ImmutableSet.builder();
            for (String id : ids) {
                src.getDataForLocale(id, UNRESOLVED).accept(ARBITRARY, v -> paths.add(v.getPath()));
            }
            return paths.build();
        }

        @Override public CldrDataSupplier withDraftStatusAtLeast(CldrDraftStatus draftStatus) {
            return new PseudoSupplier(src.withDraftStatusAtLeast(draftStatus));
        }

        @Override public CldrData getDataForLocale(String localeId, CldrResolution resolution) {
            if (PseudoType.getLocaleIds().contains(localeId)) {
                return new PseudoLocaleData(
                    enData, pathsToProcess, resolution, PseudoType.fromId(localeId));
            } else {
                return src.getDataForLocale(localeId, resolution);
            }
        }

        @Override public Set<String> getAvailableLocaleIds() {
            return Sets.union(src.getAvailableLocaleIds(), PseudoType.getLocaleIds());
        }

        @Override public CldrData getDataForType(CldrDataType type) {
            return src.getDataForType(type);
        }
    }

    private interface PseudoText {
        void addFragment(String text, boolean isLocalizable);
    }

    private static final class PseudoLocaleData extends FilteredData {
        private static final PathMatcher LDML = PathMatcher.of("//ldml");

        private static final PathMatcher AUX_EXEMPLARS =
            ldml("characters/exemplarCharacters[@type=\"auxiliary\"]");

        private static final PathMatcher NUMBERING_SYSTEM =
            ldml("numbers/defaultNumberingSystem");

        private static final PathMatcher GREGORIAN_SHORT_STANDARD_PATTERN =
            ldml("dates/calendars/calendar[@type=\"gregorian\"]/timeFormats/timeFormatLength[@type=\"short\"]/timeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]");

        // These paths were mostly derived from looking at the previous implementation's behaviour
        // and can be modified as needed.
        private static final Predicate<CldrPath> IS_PSEUDO_PATH =
            matchAnyLdmlPrefix(
                "localeDisplayNames",
                "delimiters",
                "dates/calendars/calendar",
                "dates/fields",
                "dates/timeZoneNames",
                "listPatterns",
                "posix/messages",
                "characterLabels",
                "typographicNames",
                "units")
                .and(matchAnyLdmlPrefix(
                    "localeDisplayNames/localeDisplayPattern",
                    "dates/timeZoneNames/fallbackFormat")
                    .negate());

        // The expectation is that all non-alias paths with values under these roots are "date/time
        // pattern like" (such as "E h:mm:ss B") in which care must be taken to not pseudo localize
        // the patterns in such as way as to break them. This list must be accurate.
        private static final Predicate<CldrPath> IS_PATTERN_PATH = matchAnyLdmlPrefix(
            "dates/calendars/calendar/timeFormats",
            "dates/calendars/calendar/dateFormats",
            "dates/calendars/calendar/dateTimeFormats",
            "dates/timeZoneNames/hourFormat");

        private static PathMatcher ldml(String paths) {
            return LDML.withSuffix(paths);
        }

        private static Predicate<CldrPath> matchAnyLdmlPrefix(String... paths) {
            ImmutableList<Predicate<CldrPath>> collect =
                Arrays.stream(paths)
                    .map(s -> (Predicate<CldrPath>) ldml(s)::matchesPrefixOf)
                    .collect(toImmutableList());
            return p -> collect.stream().anyMatch(e -> e.test(p));
        }

        // Look for any attribute in the path with "narrow" in its value. Since "narrow" values
        // have strong expectations of width, we should not expand these (but might alter them
        // otherwise).
        private static final Predicate<String> IS_NARROW =
            Pattern.compile("\\[@[a-z]+=\"[^\"]*narrow[^\"]*\"]", CASE_INSENSITIVE).asPredicate();

        private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}");
        private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'");

        private final PseudoType type;
        private final boolean isResolved;
        private final ImmutableSet<CldrPath> pathsToProcess;

        private PseudoLocaleData(
            CldrData srcData,
            ImmutableSet<CldrPath> pathsToProcess,
            CldrResolution resolution,
            PseudoType type) {

            super(srcData);
            this.isResolved = checkNotNull(resolution) == RESOLVED;
            this.type = checkNotNull(type);
            this.pathsToProcess = pathsToProcess;
        }

        @Override
        protected CldrValue filter(CldrValue value) {
            CldrPath path = value.getPath();

            // Special case(s) first...
            // We add the exemplar character list according to the pseudo type.
            if (AUX_EXEMPLARS.matches(path)) {
                return getExemplarValue(path);
            }
            // Force "latn" for the "ar_XB" pseudo locale (since otherwise it inherits from "ar".
            // The path we get here was from "en" so should already be "latn", but we just have
            // to return it in order for it to take effect.
            if (type == PseudoType.BIDI && NUMBERING_SYSTEM.matches(path)) {
                checkArgument(value.getValue().equals("latn"));
                return value;
            }

            CldrValue defaultReturnValue = isResolved ? value : null;
            // This makes it look like we have explicit values only for the included paths.
            if (!pathsToProcess.contains(path) || !IS_PSEUDO_PATH.test(path)) {
                return defaultReturnValue;
            }
            String fullPath = value.getFullPath();
            // For now don't do anything with "narrow" data (this matches the previous behaviour).
            // We can always add something here later if necessary.
            if (IS_NARROW.test(fullPath)) {
                return defaultReturnValue;
            }
            // Explicitly return 24 hrs format pattern for the Gregorian short standard pattern
            // entry to be consistent with the time cycle specified in supplemental.xml for
            // region 001. 001 is the region the pseudolocales en_XA/ar_XB default to.
            // This prevents ICU unit test failure.
            if (GREGORIAN_SHORT_STANDARD_PATTERN.matches(path)) {
                return CldrValue.parseValue(fullPath, "[H:mm]");
            }
            String text = createMessage(value.getValue(), IS_PATTERN_PATH.test(path));

            return CldrValue.parseValue(fullPath, text);
        }

        // It's tempting to think that the existing exemplar list in "en" could be parsed to
        // generate list automatically (rather than having a hard coded list in the type) but
        // https://unicode.org/reports/tr35/tr35-general.html#ExemplarSyntax
        // makes it quite clear that this is infeasible, since there are many equivalent
        // representations of the examplar characters that could appear in the value
        // (e.g. "[a b ... z]", "[a-z]", "[{a} {b} ... {z}]")
        private CldrValue getExemplarValue(CldrPath path) {
            StringBuilder exemplarList = new StringBuilder("[");
            type.getExemplars().codePoints()
                .forEach(cp -> appendExemplarCodePoint(exemplarList, cp).append(' '));
            exemplarList.setCharAt(exemplarList.length() - 1, ']');
            return CldrValue.parseValue(path.toString(), exemplarList.toString());
        }

        // Append a (possibly escaped) representation of the exemaplar character.
        private static StringBuilder appendExemplarCodePoint(StringBuilder out, int cp) {
            // This could be fixed if needed, but for now it's safer to check.
            checkArgument(
                Character.isBmpCodePoint(cp),
                "Only BMP code points are supported for exemplars: 0x%s", Integer.toHexString(cp));
            if (Character.isAlphabetic(cp)) {
                out.appendCodePoint(cp);
            } else {
                out.append(String.format("\\u%04X", cp));
            }
            return out;
        }

        private String createMessage(String text, boolean isPattern) {
            // Pattern text is split by the quoted sections (which are localizable) whereas
            // non-pattern text is split by placeholder (e.g. {0}) which are not localizable.
            // This is why "isPattern" is used to signal "isLocalizable" in addFragment().
            Matcher match = (isPattern ? QUOTED_TEXT : NUMERIC_PLACEHOLDER).matcher(text);
            // Alternate between unmatched and matched sections in the text, always localizing one
            // but not the other (depending the type). Append the trailing section at the end.
            PseudoText out = type.getText(isPattern);
            int start = 0;
            for (; match.find(); start = match.end()) {
                out.addFragment(text.substring(start, match.start()), !isPattern);
                out.addFragment(match.group(), isPattern);
            }
            out.addFragment(text.substring(start), !isPattern);
            return out.toString();
        }
    }

    // ---- Expanding Pseudo-localizer (e.g. "November" --> "[Ñöṽéɱƀéŕ one two]") ----

    // A map from a string of alternating key/value code-points; e.g. '1' -> '①'.
    // Note that a subset of this is also used to form the "exemplar" set (see PseudoType).
    private static final IntUnaryOperator CONVERT_CODEPOINT = toCodePointFunction(
        " \u2003!\u00a1\"\u2033#\u266f$\u20ac%\u2030&\u214b*\u204e+\u207a,\u060c-\u2010.\u00b7"
            + "/\u20440\u24ea1\u24602\u24613\u24624\u24635\u24646\u24657\u24668\u24679\u2468"
            + ":\u2236;\u204f<\u2264=\u2242>\u2265?\u00bf@\u055eA\u00c5B\u0181C\u00c7D\u00d0"
            + "E\u00c9F\u0191G\u011cH\u0124I\u00ceJ\u0134K\u0136L\u013bM\u1e40N\u00d1O\u00d6"
            + "P\u00deQ\u01eaR\u0154S\u0160T\u0162U\u00dbV\u1e7cW\u0174X\u1e8aY\u00ddZ\u017d"
            + "[\u2045\\\u2216]\u2046^\u02c4_\u203f`\u2035a\u00e5b\u0180c\u00e7d\u00f0e\u00e9"
            + "f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm\u0271n\u00f1o\u00f6p\u00fe"
            + "q\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175x\u1e8by\u00fdz\u017e|\u00a6"
            + "~\u02de");

    // Converts a source/target alternating code-points into a map.
    private static IntUnaryOperator toCodePointFunction(String s) {
        // Not pretty, but there's no nice way to "pair up" successive stream elements without
        // extra library dependencies, so we collect them and then iterate via index.
        int[] codePoints = s.codePoints().toArray();
        checkArgument((codePoints.length & 1) == 0,
            "must have an even number of code points (was %s)", codePoints.length);
        ImmutableMap<Integer, Integer> map =
            IntStream.range(0, codePoints.length / 2)
                .boxed()
                .collect(toImmutableMap(n -> codePoints[2 * n], n -> codePoints[(2 * n) + 1]));
        return cp -> map.getOrDefault(cp, cp);
    }

    // A list of words to be added to text when it is expanded. A whole number of words are
    // always added (and the fact they are numeric words is irrelevant, could be Lorem Ipsum).
    // So far nothing goes above "ten" in en_XA, but this can always be trivially extended.
    private static final String PADDING = "one two three four five six seven eight nine ten";

    private static PseudoText expanding(boolean isPattern) {
        return new PseudoText() {
            IntStream.Builder codePoints = IntStream.builder();

            @Override
            public void addFragment(String text, boolean isLocalizable) {
                text.codePoints()
                    .map(isLocalizable ? CONVERT_CODEPOINT : cp -> cp)
                    .forEach(codePoints::add);
            }

            @Override
            public String toString() {
                int[] cp = codePoints.build().toArray();
                // Copy the original code and round up the 50% calculation (it's not important).
                int endIndex = CharMatcher.whitespace().indexIn(PADDING, (cp.length + 1) / 2);
                String suffix = PADDING.substring(0, Math.min(endIndex, PADDING.length()));
                // For pattern strings, any literal text must be quoted (the fragment text
                // already was). Note that this is why we don't transform single-quotes.
                if (isPattern) {
                    suffix = "'" + suffix.replace(" ", "' '") + "'";
                }
                // Final output is something like "November" --> "[Ñöṽéɱƀéŕ one two]"
                // Where the additional padding adds at least 50% to the length of the text.
                return "[" + new String(cp, 0, cp.length) + " " + suffix + "]";
            }
        };
    }

    // ---- Bidi Pseudo-localizer (e.g. "November" --> "rebmevoN" using BiDi tags)----

    // Bidi localization doesn't care if the fragment is a pattern or not.
    @SuppressWarnings("unused")
    private static PseudoText bidi(boolean isPattern) {
        return new PseudoText() {
            private final StringBuilder out = new StringBuilder();

            // This was largely copied from the original CLDRFilePseudolocalizer class and
            // while it appears to work fine, I don't know enough to comment it clearly.
            // TODO: Find someone who can add a decent comment here!
            @Override
            public void addFragment(String text, boolean isLocalizable) {
                if (isLocalizable) {
                    boolean wrapping = false;
                    for (int index = 0; index < text.length(); ) {
                        int codePoint = text.codePointAt(index);
                        index += Character.charCount(codePoint);
                        byte directionality = Character.getDirectionality(codePoint);
                        boolean needsWrap = (directionality == DIRECTIONALITY_LEFT_TO_RIGHT);
                        if (needsWrap != wrapping) {
                            wrapping = needsWrap;
                            out.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX);
                        }
                        out.appendCodePoint(codePoint);
                    }
                    if (wrapping) {
                        out.append(BIDI_POSTFIX);
                    }
                } else {
                    out.append(text);
                }
            }

            @Override
            public String toString() {
                return out.toString();
            }
        };
    }

    private PseudoLocales() {
    }
}