• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu;
4 
5 import static com.google.common.base.Preconditions.checkArgument;
6 import static com.google.common.base.Preconditions.checkNotNull;
7 import static com.google.common.collect.ImmutableList.toImmutableList;
8 import static com.google.common.collect.ImmutableMap.toImmutableMap;
9 import static java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT;
10 import static java.util.function.Function.identity;
11 import static java.util.regex.Pattern.CASE_INSENSITIVE;
12 import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
13 import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED;
14 import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
15 
16 import java.util.Arrays;
17 import java.util.Set;
18 import java.util.function.Function;
19 import java.util.function.IntUnaryOperator;
20 import java.util.function.Predicate;
21 import java.util.regex.Matcher;
22 import java.util.regex.Pattern;
23 import java.util.stream.IntStream;
24 
25 import org.unicode.cldr.api.CldrData;
26 import org.unicode.cldr.api.CldrDataSupplier;
27 import org.unicode.cldr.api.CldrDataSupplier.CldrResolution;
28 import org.unicode.cldr.api.CldrDataType;
29 import org.unicode.cldr.api.CldrDraftStatus;
30 import org.unicode.cldr.api.CldrPath;
31 import org.unicode.cldr.api.CldrValue;
32 import org.unicode.cldr.api.FilteredData;
33 import org.unicode.cldr.api.PathMatcher;
34 
35 import com.google.common.base.CharMatcher;
36 import com.google.common.collect.ImmutableList;
37 import com.google.common.collect.ImmutableMap;
38 import com.google.common.collect.ImmutableSet;
39 import com.google.common.collect.Sets;
40 
41 /**
42  * A factory for wrapping data suppliers to add synthetic locales for debugging. The currently
43  * supported synthetic locales are:
44  * <ul>
45  *     <li>{@code en_XA}: A pseudo locale which generates expanded text with many non-Latin accents.
46  *     <li>{@code ar_XB}: A pseudo locale which generates BiDi text for debugging.
47  * </ul>
48  *
49  * <p>Both pseudo locales are based on {@code "en"} data, and generate values which are readable
50  * by English speaking developers. For example, the CLDR value "Hello World" will be turned into
51  * something like:
52  * <ul>
53  *     <li>{@code en_XA}: [Ĥéļļö Ŵöŕļð one two]
54  *     <li>{@code ar_XB}: dlroW elloH
55  * </ul>
56  *
57  * <p>In the case of BiDi pseudo localization, bi-directional markers are also inserted into the
58  * text so that, if the system using the data is configured correctly, the results will look
59  * "normal" (i.e. Latin text will appear displayed left-to-right because of the BiDi markers).
60  */
61 // TODO(CLDR-13381): Move this all into the CLDR API once the dust has settled.
62 public final class PseudoLocales {
63     // Right-to-left override character.
64     private static final String RLO = "\u202e";
65     // Arabic letter mark character.
66     private static final String ALM = "\u061C";
67     // Pop direction formatting character.
68     private static final String PDF = "\u202c";
69     // Prefix to add before each LTR word.
70     private static final String BIDI_PREFIX = ALM + RLO;
71     // Postfix to add after each LTR word.
72     private static final String BIDI_POSTFIX = PDF + ALM;
73 
74     // See getExemplarValue() method for why we don't extract the exemplar list from "en".
75     private enum PseudoType {
76         BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz" + ALM + RLO + PDF),
77         EXPAND("en_XA", PseudoLocales::expanding,
78             "a\u00e5b\u0180c\u00e7d\u00f0e\u00e9f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm"
79                 + "\u0271n\u00f1o\u00f6p\u00feq\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175"
80                 + "x\u1e8by\u00fdz\u017e");
81 
82         private static final ImmutableMap<String, PseudoType> ID_MAP =
83             Arrays.stream(values()).collect(toImmutableMap(PseudoType::getLocaleId, identity()));
84 
fromId(String localeId)85         private static PseudoType fromId(String localeId) {
86             return checkNotNull(ID_MAP.get(localeId), "unknown pseduo locale: %s", localeId);
87         }
88 
getLocaleIds()89         private static ImmutableSet<String> getLocaleIds() {
90             return ID_MAP.keySet();
91         }
92 
93         private final String localeId;
94         private final Function<Boolean, PseudoText> textSupplier;
95         // A string whose code points form the exemplar set for the pseudo locale.
96         private final String exemplars;
97 
PseudoType(String localeId, Function<Boolean, PseudoText> textSupplier, String exemplars)98         PseudoType(String localeId, Function<Boolean, PseudoText> textSupplier, String exemplars) {
99             this.localeId = localeId;
100             this.textSupplier = textSupplier;
101             this.exemplars = exemplars;
102         }
103 
getLocaleId()104         String getLocaleId() {
105             return localeId;
106         }
107 
getText(boolean isPattern)108         PseudoText getText(boolean isPattern) {
109             return textSupplier.apply(isPattern);
110         }
111 
getExemplars()112         String getExemplars() {
113             return exemplars;
114         }
115     }
116 
117     /**
118      * Returns a wrapped data supplier which will inject {@link CldrData} for the pseudo locales
119      * {@code en_XA} and {@code ar_XB}. These locales should behave in all respects like normal
120      * locales and can be processed accordingly.
121      */
addPseudoLocalesTo(CldrDataSupplier src)122     public static CldrDataSupplier addPseudoLocalesTo(CldrDataSupplier src) {
123         return new PseudoSupplier(src);
124     }
125 
126     private static final class PseudoSupplier extends CldrDataSupplier {
127         private final CldrDataSupplier src;
128         private final Set<String> srcIds;
129         private final CldrData enData;
130         private final ImmutableSet<CldrPath> pathsToProcess;
131 
PseudoSupplier(CldrDataSupplier src)132         PseudoSupplier(CldrDataSupplier src) {
133             this.src = checkNotNull(src);
134             this.srcIds = src.getAvailableLocaleIds();
135             // Start with resolved data so we can merge values from "en" and "en_001" for coverage
136             // and supply the unfiltered values if someone wants the resolved version of the pseudo
137             // locale data.
138             this.enData = src.getDataForLocale("en", RESOLVED);
139             // But since we don't want to filter paths which come from the "root" locale (such as
140             // aliases) then we need to find the union of "English" paths we expect to filter.
141             this.pathsToProcess = getUnresolvedPaths(src, "en", "en_001");
142             // Just check that we aren't wrapping an already wrapped supplier.
143             PseudoType.getLocaleIds()
144                 .forEach(id -> checkArgument(!srcIds.contains(id),
145                     "pseudo locale %s already supported by given data supplier", id));
146         }
147 
getUnresolvedPaths( CldrDataSupplier src, String... ids)148         private static ImmutableSet<CldrPath> getUnresolvedPaths(
149             CldrDataSupplier src, String... ids) {
150 
151             ImmutableSet.Builder<CldrPath> paths = ImmutableSet.builder();
152             for (String id : ids) {
153                 src.getDataForLocale(id, UNRESOLVED).accept(ARBITRARY, v -> paths.add(v.getPath()));
154             }
155             return paths.build();
156         }
157 
withDraftStatusAtLeast(CldrDraftStatus draftStatus)158         @Override public CldrDataSupplier withDraftStatusAtLeast(CldrDraftStatus draftStatus) {
159             return new PseudoSupplier(src.withDraftStatusAtLeast(draftStatus));
160         }
161 
getDataForLocale(String localeId, CldrResolution resolution)162         @Override public CldrData getDataForLocale(String localeId, CldrResolution resolution) {
163             if (PseudoType.getLocaleIds().contains(localeId)) {
164                 return new PseudoLocaleData(
165                     enData, pathsToProcess, resolution, PseudoType.fromId(localeId));
166             } else {
167                 return src.getDataForLocale(localeId, resolution);
168             }
169         }
170 
getAvailableLocaleIds()171         @Override public Set<String> getAvailableLocaleIds() {
172             return Sets.union(src.getAvailableLocaleIds(), PseudoType.getLocaleIds());
173         }
174 
getDataForType(CldrDataType type)175         @Override public CldrData getDataForType(CldrDataType type) {
176             return src.getDataForType(type);
177         }
178     }
179 
180     private interface PseudoText {
addFragment(String text, boolean isLocalizable)181         void addFragment(String text, boolean isLocalizable);
182     }
183 
184     private static final class PseudoLocaleData extends FilteredData {
185         private static final PathMatcher LDML = PathMatcher.of("//ldml");
186 
187         private static final PathMatcher AUX_EXEMPLARS =
188             ldml("characters/exemplarCharacters[@type=\"auxiliary\"]");
189 
190         private static final PathMatcher NUMBERING_SYSTEM =
191             ldml("numbers/defaultNumberingSystem");
192 
193         private static final PathMatcher GREGORIAN_SHORT_STANDARD_PATTERN =
194             ldml("dates/calendars/calendar[@type=\"gregorian\"]/timeFormats/timeFormatLength[@type=\"short\"]/timeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]");
195 
196         // These paths were mostly derived from looking at the previous implementation's behaviour
197         // and can be modified as needed.
198         private static final Predicate<CldrPath> IS_PSEUDO_PATH =
199             matchAnyLdmlPrefix(
200                 "localeDisplayNames",
201                 "delimiters",
202                 "dates/calendars/calendar",
203                 "dates/fields",
204                 "dates/timeZoneNames",
205                 "listPatterns",
206                 "posix/messages",
207                 "characterLabels",
208                 "typographicNames",
209                 "units")
210                 .and(matchAnyLdmlPrefix(
211                     "localeDisplayNames/localeDisplayPattern",
212                     "dates/timeZoneNames/fallbackFormat")
213                     .negate());
214 
215         // The expectation is that all non-alias paths with values under these roots are "date/time
216         // pattern like" (such as "E h:mm:ss B") in which care must be taken to not pseudo localize
217         // the patterns in such as way as to break them. This list must be accurate.
218         private static final Predicate<CldrPath> IS_PATTERN_PATH = matchAnyLdmlPrefix(
219             "dates/calendars/calendar/timeFormats",
220             "dates/calendars/calendar/dateFormats",
221             "dates/calendars/calendar/dateTimeFormats",
222             "dates/timeZoneNames/hourFormat");
223 
ldml(String paths)224         private static PathMatcher ldml(String paths) {
225             return LDML.withSuffix(paths);
226         }
227 
matchAnyLdmlPrefix(String... paths)228         private static Predicate<CldrPath> matchAnyLdmlPrefix(String... paths) {
229             ImmutableList<Predicate<CldrPath>> collect =
230                 Arrays.stream(paths)
231                     .map(s -> (Predicate<CldrPath>) ldml(s)::matchesPrefixOf)
232                     .collect(toImmutableList());
233             return p -> collect.stream().anyMatch(e -> e.test(p));
234         }
235 
236         // Look for any attribute in the path with "narrow" in its value. Since "narrow" values
237         // have strong expectations of width, we should not expand these (but might alter them
238         // otherwise).
239         private static final Predicate<String> IS_NARROW =
240             Pattern.compile("\\[@[a-z]+=\"[^\"]*narrow[^\"]*\"]", CASE_INSENSITIVE).asPredicate();
241 
242         private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}");
243         private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'");
244 
245         private final PseudoType type;
246         private final boolean isResolved;
247         private final ImmutableSet<CldrPath> pathsToProcess;
248 
PseudoLocaleData( CldrData srcData, ImmutableSet<CldrPath> pathsToProcess, CldrResolution resolution, PseudoType type)249         private PseudoLocaleData(
250             CldrData srcData,
251             ImmutableSet<CldrPath> pathsToProcess,
252             CldrResolution resolution,
253             PseudoType type) {
254 
255             super(srcData);
256             this.isResolved = checkNotNull(resolution) == RESOLVED;
257             this.type = checkNotNull(type);
258             this.pathsToProcess = pathsToProcess;
259         }
260 
261         @Override
filter(CldrValue value)262         protected CldrValue filter(CldrValue value) {
263             CldrPath path = value.getPath();
264 
265             // Special case(s) first...
266             // We add the exemplar character list according to the pseudo type.
267             if (AUX_EXEMPLARS.matches(path)) {
268                 return getExemplarValue(path);
269             }
270             // Force "latn" for the "ar_XB" pseudo locale (since otherwise it inherits from "ar".
271             // The path we get here was from "en" so should already be "latn", but we just have
272             // to return it in order for it to take effect.
273             if (type == PseudoType.BIDI && NUMBERING_SYSTEM.matches(path)) {
274                 checkArgument(value.getValue().equals("latn"));
275                 return value;
276             }
277 
278             CldrValue defaultReturnValue = isResolved ? value : null;
279             // This makes it look like we have explicit values only for the included paths.
280             if (!pathsToProcess.contains(path) || !IS_PSEUDO_PATH.test(path)) {
281                 return defaultReturnValue;
282             }
283             String fullPath = value.getFullPath();
284             // For now don't do anything with "narrow" data (this matches the previous behaviour).
285             // We can always add something here later if necessary.
286             if (IS_NARROW.test(fullPath)) {
287                 return defaultReturnValue;
288             }
289             // Explicitly return 24 hrs format pattern for the Gregorian short standard pattern
290             // entry to be consistent with the time cycle specified in supplemental.xml for
291             // region 001. 001 is the region the pseudolocales en_XA/ar_XB default to.
292             // This prevents ICU unit test failure.
293             if (GREGORIAN_SHORT_STANDARD_PATTERN.matches(path)) {
294                 return CldrValue.parseValue(fullPath, "[H:mm]");
295             }
296             String text = createMessage(value.getValue(), IS_PATTERN_PATH.test(path));
297 
298             return CldrValue.parseValue(fullPath, text);
299         }
300 
301         // It's tempting to think that the existing exemplar list in "en" could be parsed to
302         // generate list automatically (rather than having a hard coded list in the type) but
303         // https://unicode.org/reports/tr35/tr35-general.html#ExemplarSyntax
304         // makes it quite clear that this is infeasible, since there are many equivalent
305         // representations of the examplar characters that could appear in the value
306         // (e.g. "[a b ... z]", "[a-z]", "[{a} {b} ... {z}]")
getExemplarValue(CldrPath path)307         private CldrValue getExemplarValue(CldrPath path) {
308             StringBuilder exemplarList = new StringBuilder("[");
309             type.getExemplars().codePoints()
310                 .forEach(cp -> appendExemplarCodePoint(exemplarList, cp).append(' '));
311             exemplarList.setCharAt(exemplarList.length() - 1, ']');
312             return CldrValue.parseValue(path.toString(), exemplarList.toString());
313         }
314 
315         // Append a (possibly escaped) representation of the exemaplar character.
appendExemplarCodePoint(StringBuilder out, int cp)316         private static StringBuilder appendExemplarCodePoint(StringBuilder out, int cp) {
317             // This could be fixed if needed, but for now it's safer to check.
318             checkArgument(
319                 Character.isBmpCodePoint(cp),
320                 "Only BMP code points are supported for exemplars: 0x%s", Integer.toHexString(cp));
321             if (Character.isAlphabetic(cp)) {
322                 out.appendCodePoint(cp);
323             } else {
324                 out.append(String.format("\\u%04X", cp));
325             }
326             return out;
327         }
328 
createMessage(String text, boolean isPattern)329         private String createMessage(String text, boolean isPattern) {
330             // Pattern text is split by the quoted sections (which are localizable) whereas
331             // non-pattern text is split by placeholder (e.g. {0}) which are not localizable.
332             // This is why "isPattern" is used to signal "isLocalizable" in addFragment().
333             Matcher match = (isPattern ? QUOTED_TEXT : NUMERIC_PLACEHOLDER).matcher(text);
334             // Alternate between unmatched and matched sections in the text, always localizing one
335             // but not the other (depending the type). Append the trailing section at the end.
336             PseudoText out = type.getText(isPattern);
337             int start = 0;
338             for (; match.find(); start = match.end()) {
339                 out.addFragment(text.substring(start, match.start()), !isPattern);
340                 out.addFragment(match.group(), isPattern);
341             }
342             out.addFragment(text.substring(start), !isPattern);
343             return out.toString();
344         }
345     }
346 
347     // ---- Expanding Pseudo-localizer (e.g. "November" --> "[Ñöṽéɱƀéŕ one two]") ----
348 
349     // A map from a string of alternating key/value code-points; e.g. '1' -> '①'.
350     // Note that a subset of this is also used to form the "exemplar" set (see PseudoType).
351     private static final IntUnaryOperator CONVERT_CODEPOINT = toCodePointFunction(
352         " \u2003!\u00a1\"\u2033#\u266f$\u20ac%\u2030&\u214b*\u204e+\u207a,\u060c-\u2010.\u00b7"
353             + "/\u20440\u24ea1\u24602\u24613\u24624\u24635\u24646\u24657\u24668\u24679\u2468"
354             + ":\u2236;\u204f<\u2264=\u2242>\u2265?\u00bf@\u055eA\u00c5B\u0181C\u00c7D\u00d0"
355             + "E\u00c9F\u0191G\u011cH\u0124I\u00ceJ\u0134K\u0136L\u013bM\u1e40N\u00d1O\u00d6"
356             + "P\u00deQ\u01eaR\u0154S\u0160T\u0162U\u00dbV\u1e7cW\u0174X\u1e8aY\u00ddZ\u017d"
357             + "[\u2045\\\u2216]\u2046^\u02c4_\u203f`\u2035a\u00e5b\u0180c\u00e7d\u00f0e\u00e9"
358             + "f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm\u0271n\u00f1o\u00f6p\u00fe"
359             + "q\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175x\u1e8by\u00fdz\u017e|\u00a6"
360             + "~\u02de");
361 
362     // Converts a source/target alternating code-points into a map.
toCodePointFunction(String s)363     private static IntUnaryOperator toCodePointFunction(String s) {
364         // Not pretty, but there's no nice way to "pair up" successive stream elements without
365         // extra library dependencies, so we collect them and then iterate via index.
366         int[] codePoints = s.codePoints().toArray();
367         checkArgument((codePoints.length & 1) == 0,
368             "must have an even number of code points (was %s)", codePoints.length);
369         ImmutableMap<Integer, Integer> map =
370             IntStream.range(0, codePoints.length / 2)
371                 .boxed()
372                 .collect(toImmutableMap(n -> codePoints[2 * n], n -> codePoints[(2 * n) + 1]));
373         return cp -> map.getOrDefault(cp, cp);
374     }
375 
376     // A list of words to be added to text when it is expanded. A whole number of words are
377     // always added (and the fact they are numeric words is irrelevant, could be Lorem Ipsum).
378     // So far nothing goes above "ten" in en_XA, but this can always be trivially extended.
379     private static final String PADDING = "one two three four five six seven eight nine ten";
380 
expanding(boolean isPattern)381     private static PseudoText expanding(boolean isPattern) {
382         return new PseudoText() {
383             IntStream.Builder codePoints = IntStream.builder();
384 
385             @Override
386             public void addFragment(String text, boolean isLocalizable) {
387                 text.codePoints()
388                     .map(isLocalizable ? CONVERT_CODEPOINT : cp -> cp)
389                     .forEach(codePoints::add);
390             }
391 
392             @Override
393             public String toString() {
394                 int[] cp = codePoints.build().toArray();
395                 // Copy the original code and round up the 50% calculation (it's not important).
396                 int endIndex = CharMatcher.whitespace().indexIn(PADDING, (cp.length + 1) / 2);
397                 String suffix = PADDING.substring(0, Math.min(endIndex, PADDING.length()));
398                 // For pattern strings, any literal text must be quoted (the fragment text
399                 // already was). Note that this is why we don't transform single-quotes.
400                 if (isPattern) {
401                     suffix = "'" + suffix.replace(" ", "' '") + "'";
402                 }
403                 // Final output is something like "November" --> "[Ñöṽéɱƀéŕ one two]"
404                 // Where the additional padding adds at least 50% to the length of the text.
405                 return "[" + new String(cp, 0, cp.length) + " " + suffix + "]";
406             }
407         };
408     }
409 
410     // ---- Bidi Pseudo-localizer (e.g. "November" --> "rebmevoN" using BiDi tags)----
411 
412     // Bidi localization doesn't care if the fragment is a pattern or not.
413     @SuppressWarnings("unused")
bidi(boolean isPattern)414     private static PseudoText bidi(boolean isPattern) {
415         return new PseudoText() {
416             private final StringBuilder out = new StringBuilder();
417 
418             // This was largely copied from the original CLDRFilePseudolocalizer class and
419             // while it appears to work fine, I don't know enough to comment it clearly.
420             // TODO: Find someone who can add a decent comment here!
421             @Override
422             public void addFragment(String text, boolean isLocalizable) {
423                 if (isLocalizable) {
424                     boolean wrapping = false;
425                     for (int index = 0; index < text.length(); ) {
426                         int codePoint = text.codePointAt(index);
427                         index += Character.charCount(codePoint);
428                         byte directionality = Character.getDirectionality(codePoint);
429                         boolean needsWrap = (directionality == DIRECTIONALITY_LEFT_TO_RIGHT);
430                         if (needsWrap != wrapping) {
431                             wrapping = needsWrap;
432                             out.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX);
433                         }
434                         out.appendCodePoint(codePoint);
435                     }
436                     if (wrapping) {
437                         out.append(BIDI_POSTFIX);
438                     }
439                 } else {
440                     out.append(text);
441                 }
442             }
443 
444             @Override
445             public String toString() {
446                 return out.toString();
447             }
448         };
449     }
450 
451     private PseudoLocales() {
452     }
453 }
454