1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkNotNull; 7 import static com.google.common.collect.ImmutableList.toImmutableList; 8 import static com.google.common.collect.ImmutableMap.toImmutableMap; 9 import static java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT; 10 import static java.util.function.Function.identity; 11 import static java.util.regex.Pattern.CASE_INSENSITIVE; 12 import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY; 13 import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED; 14 import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED; 15 16 import java.util.Arrays; 17 import java.util.Set; 18 import java.util.function.Function; 19 import java.util.function.IntUnaryOperator; 20 import java.util.function.Predicate; 21 import java.util.regex.Matcher; 22 import java.util.regex.Pattern; 23 import java.util.stream.IntStream; 24 25 import org.unicode.cldr.api.CldrData; 26 import org.unicode.cldr.api.CldrDataSupplier; 27 import org.unicode.cldr.api.CldrDataSupplier.CldrResolution; 28 import org.unicode.cldr.api.CldrDataType; 29 import org.unicode.cldr.api.CldrDraftStatus; 30 import org.unicode.cldr.api.CldrPath; 31 import org.unicode.cldr.api.CldrValue; 32 import org.unicode.cldr.api.FilteredData; 33 import org.unicode.cldr.api.PathMatcher; 34 35 import com.google.common.base.CharMatcher; 36 import com.google.common.collect.ImmutableList; 37 import com.google.common.collect.ImmutableMap; 38 import com.google.common.collect.ImmutableSet; 39 import com.google.common.collect.Sets; 40 41 /** 42 * A factory for wrapping data suppliers to add synthetic locales for debugging. The currently 43 * supported synthetic locales are: 44 * <ul> 45 * <li>{@code en_XA}: A pseudo locale which generates expanded text with many non-Latin accents. 46 * <li>{@code ar_XB}: A pseudo locale which generates BiDi text for debugging. 47 * </ul> 48 * 49 * <p>Both pseudo locales are based on {@code "en"} data, and generate values which are readable 50 * by English speaking developers. For example, the CLDR value "Hello World" will be turned into 51 * something like: 52 * <ul> 53 * <li>{@code en_XA}: [Ĥéļļö Ŵöŕļð one two] 54 * <li>{@code ar_XB}: dlroW elloH 55 * </ul> 56 * 57 * <p>In the case of BiDi pseudo localization, bi-directional markers are also inserted into the 58 * text so that, if the system using the data is configured correctly, the results will look 59 * "normal" (i.e. Latin text will appear displayed left-to-right because of the BiDi markers). 60 */ 61 // TODO(CLDR-13381): Move this all into the CLDR API once the dust has settled. 62 public final class PseudoLocales { 63 // Right-to-left override character. 64 private static final String RLO = "\u202e"; 65 // Arabic letter mark character. 66 private static final String ALM = "\u061C"; 67 // Pop direction formatting character. 68 private static final String PDF = "\u202c"; 69 // Prefix to add before each LTR word. 70 private static final String BIDI_PREFIX = ALM + RLO; 71 // Postfix to add after each LTR word. 72 private static final String BIDI_POSTFIX = PDF + ALM; 73 74 // See getExemplarValue() method for why we don't extract the exemplar list from "en". 75 private enum PseudoType { 76 BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz" + ALM + RLO + PDF), 77 EXPAND("en_XA", PseudoLocales::expanding, 78 "a\u00e5b\u0180c\u00e7d\u00f0e\u00e9f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm" 79 + "\u0271n\u00f1o\u00f6p\u00feq\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175" 80 + "x\u1e8by\u00fdz\u017e"); 81 82 private static final ImmutableMap<String, PseudoType> ID_MAP = 83 Arrays.stream(values()).collect(toImmutableMap(PseudoType::getLocaleId, identity())); 84 fromId(String localeId)85 private static PseudoType fromId(String localeId) { 86 return checkNotNull(ID_MAP.get(localeId), "unknown pseduo locale: %s", localeId); 87 } 88 getLocaleIds()89 private static ImmutableSet<String> getLocaleIds() { 90 return ID_MAP.keySet(); 91 } 92 93 private final String localeId; 94 private final Function<Boolean, PseudoText> textSupplier; 95 // A string whose code points form the exemplar set for the pseudo locale. 96 private final String exemplars; 97 PseudoType(String localeId, Function<Boolean, PseudoText> textSupplier, String exemplars)98 PseudoType(String localeId, Function<Boolean, PseudoText> textSupplier, String exemplars) { 99 this.localeId = localeId; 100 this.textSupplier = textSupplier; 101 this.exemplars = exemplars; 102 } 103 getLocaleId()104 String getLocaleId() { 105 return localeId; 106 } 107 getText(boolean isPattern)108 PseudoText getText(boolean isPattern) { 109 return textSupplier.apply(isPattern); 110 } 111 getExemplars()112 String getExemplars() { 113 return exemplars; 114 } 115 } 116 117 /** 118 * Returns a wrapped data supplier which will inject {@link CldrData} for the pseudo locales 119 * {@code en_XA} and {@code ar_XB}. These locales should behave in all respects like normal 120 * locales and can be processed accordingly. 121 */ addPseudoLocalesTo(CldrDataSupplier src)122 public static CldrDataSupplier addPseudoLocalesTo(CldrDataSupplier src) { 123 return new PseudoSupplier(src); 124 } 125 126 private static final class PseudoSupplier extends CldrDataSupplier { 127 private final CldrDataSupplier src; 128 private final Set<String> srcIds; 129 private final CldrData enData; 130 private final ImmutableSet<CldrPath> pathsToProcess; 131 PseudoSupplier(CldrDataSupplier src)132 PseudoSupplier(CldrDataSupplier src) { 133 this.src = checkNotNull(src); 134 this.srcIds = src.getAvailableLocaleIds(); 135 // Start with resolved data so we can merge values from "en" and "en_001" for coverage 136 // and supply the unfiltered values if someone wants the resolved version of the pseudo 137 // locale data. 138 this.enData = src.getDataForLocale("en", RESOLVED); 139 // But since we don't want to filter paths which come from the "root" locale (such as 140 // aliases) then we need to find the union of "English" paths we expect to filter. 141 this.pathsToProcess = getUnresolvedPaths(src, "en", "en_001"); 142 // Just check that we aren't wrapping an already wrapped supplier. 143 PseudoType.getLocaleIds() 144 .forEach(id -> checkArgument(!srcIds.contains(id), 145 "pseudo locale %s already supported by given data supplier", id)); 146 } 147 getUnresolvedPaths( CldrDataSupplier src, String... ids)148 private static ImmutableSet<CldrPath> getUnresolvedPaths( 149 CldrDataSupplier src, String... ids) { 150 151 ImmutableSet.Builder<CldrPath> paths = ImmutableSet.builder(); 152 for (String id : ids) { 153 src.getDataForLocale(id, UNRESOLVED).accept(ARBITRARY, v -> paths.add(v.getPath())); 154 } 155 return paths.build(); 156 } 157 withDraftStatusAtLeast(CldrDraftStatus draftStatus)158 @Override public CldrDataSupplier withDraftStatusAtLeast(CldrDraftStatus draftStatus) { 159 return new PseudoSupplier(src.withDraftStatusAtLeast(draftStatus)); 160 } 161 getDataForLocale(String localeId, CldrResolution resolution)162 @Override public CldrData getDataForLocale(String localeId, CldrResolution resolution) { 163 if (PseudoType.getLocaleIds().contains(localeId)) { 164 return new PseudoLocaleData( 165 enData, pathsToProcess, resolution, PseudoType.fromId(localeId)); 166 } else { 167 return src.getDataForLocale(localeId, resolution); 168 } 169 } 170 getAvailableLocaleIds()171 @Override public Set<String> getAvailableLocaleIds() { 172 return Sets.union(src.getAvailableLocaleIds(), PseudoType.getLocaleIds()); 173 } 174 getDataForType(CldrDataType type)175 @Override public CldrData getDataForType(CldrDataType type) { 176 return src.getDataForType(type); 177 } 178 } 179 180 private interface PseudoText { addFragment(String text, boolean isLocalizable)181 void addFragment(String text, boolean isLocalizable); 182 } 183 184 private static final class PseudoLocaleData extends FilteredData { 185 private static final PathMatcher LDML = PathMatcher.of("//ldml"); 186 187 private static final PathMatcher AUX_EXEMPLARS = 188 ldml("characters/exemplarCharacters[@type=\"auxiliary\"]"); 189 190 private static final PathMatcher NUMBERING_SYSTEM = 191 ldml("numbers/defaultNumberingSystem"); 192 193 private static final PathMatcher GREGORIAN_SHORT_STANDARD_PATTERN = 194 ldml("dates/calendars/calendar[@type=\"gregorian\"]/timeFormats/timeFormatLength[@type=\"short\"]/timeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]"); 195 196 // These paths were mostly derived from looking at the previous implementation's behaviour 197 // and can be modified as needed. 198 private static final Predicate<CldrPath> IS_PSEUDO_PATH = 199 matchAnyLdmlPrefix( 200 "localeDisplayNames", 201 "delimiters", 202 "dates/calendars/calendar", 203 "dates/fields", 204 "dates/timeZoneNames", 205 "listPatterns", 206 "posix/messages", 207 "characterLabels", 208 "typographicNames", 209 "units") 210 .and(matchAnyLdmlPrefix( 211 "localeDisplayNames/localeDisplayPattern", 212 "dates/timeZoneNames/fallbackFormat") 213 .negate()); 214 215 // The expectation is that all non-alias paths with values under these roots are "date/time 216 // pattern like" (such as "E h:mm:ss B") in which care must be taken to not pseudo localize 217 // the patterns in such as way as to break them. This list must be accurate. 218 private static final Predicate<CldrPath> IS_PATTERN_PATH = matchAnyLdmlPrefix( 219 "dates/calendars/calendar/timeFormats", 220 "dates/calendars/calendar/dateFormats", 221 "dates/calendars/calendar/dateTimeFormats", 222 "dates/timeZoneNames/hourFormat"); 223 ldml(String paths)224 private static PathMatcher ldml(String paths) { 225 return LDML.withSuffix(paths); 226 } 227 matchAnyLdmlPrefix(String... paths)228 private static Predicate<CldrPath> matchAnyLdmlPrefix(String... paths) { 229 ImmutableList<Predicate<CldrPath>> collect = 230 Arrays.stream(paths) 231 .map(s -> (Predicate<CldrPath>) ldml(s)::matchesPrefixOf) 232 .collect(toImmutableList()); 233 return p -> collect.stream().anyMatch(e -> e.test(p)); 234 } 235 236 // Look for any attribute in the path with "narrow" in its value. Since "narrow" values 237 // have strong expectations of width, we should not expand these (but might alter them 238 // otherwise). 239 private static final Predicate<String> IS_NARROW = 240 Pattern.compile("\\[@[a-z]+=\"[^\"]*narrow[^\"]*\"]", CASE_INSENSITIVE).asPredicate(); 241 242 private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}"); 243 private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'"); 244 245 private final PseudoType type; 246 private final boolean isResolved; 247 private final ImmutableSet<CldrPath> pathsToProcess; 248 PseudoLocaleData( CldrData srcData, ImmutableSet<CldrPath> pathsToProcess, CldrResolution resolution, PseudoType type)249 private PseudoLocaleData( 250 CldrData srcData, 251 ImmutableSet<CldrPath> pathsToProcess, 252 CldrResolution resolution, 253 PseudoType type) { 254 255 super(srcData); 256 this.isResolved = checkNotNull(resolution) == RESOLVED; 257 this.type = checkNotNull(type); 258 this.pathsToProcess = pathsToProcess; 259 } 260 261 @Override filter(CldrValue value)262 protected CldrValue filter(CldrValue value) { 263 CldrPath path = value.getPath(); 264 265 // Special case(s) first... 266 // We add the exemplar character list according to the pseudo type. 267 if (AUX_EXEMPLARS.matches(path)) { 268 return getExemplarValue(path); 269 } 270 // Force "latn" for the "ar_XB" pseudo locale (since otherwise it inherits from "ar". 271 // The path we get here was from "en" so should already be "latn", but we just have 272 // to return it in order for it to take effect. 273 if (type == PseudoType.BIDI && NUMBERING_SYSTEM.matches(path)) { 274 checkArgument(value.getValue().equals("latn")); 275 return value; 276 } 277 278 CldrValue defaultReturnValue = isResolved ? value : null; 279 // This makes it look like we have explicit values only for the included paths. 280 if (!pathsToProcess.contains(path) || !IS_PSEUDO_PATH.test(path)) { 281 return defaultReturnValue; 282 } 283 String fullPath = value.getFullPath(); 284 // For now don't do anything with "narrow" data (this matches the previous behaviour). 285 // We can always add something here later if necessary. 286 if (IS_NARROW.test(fullPath)) { 287 return defaultReturnValue; 288 } 289 // Explicitly return 24 hrs format pattern for the Gregorian short standard pattern 290 // entry to be consistent with the time cycle specified in supplemental.xml for 291 // region 001. 001 is the region the pseudolocales en_XA/ar_XB default to. 292 // This prevents ICU unit test failure. 293 if (GREGORIAN_SHORT_STANDARD_PATTERN.matches(path)) { 294 return CldrValue.parseValue(fullPath, "[H:mm]"); 295 } 296 String text = createMessage(value.getValue(), IS_PATTERN_PATH.test(path)); 297 298 return CldrValue.parseValue(fullPath, text); 299 } 300 301 // It's tempting to think that the existing exemplar list in "en" could be parsed to 302 // generate list automatically (rather than having a hard coded list in the type) but 303 // https://unicode.org/reports/tr35/tr35-general.html#ExemplarSyntax 304 // makes it quite clear that this is infeasible, since there are many equivalent 305 // representations of the examplar characters that could appear in the value 306 // (e.g. "[a b ... z]", "[a-z]", "[{a} {b} ... {z}]") getExemplarValue(CldrPath path)307 private CldrValue getExemplarValue(CldrPath path) { 308 StringBuilder exemplarList = new StringBuilder("["); 309 type.getExemplars().codePoints() 310 .forEach(cp -> appendExemplarCodePoint(exemplarList, cp).append(' ')); 311 exemplarList.setCharAt(exemplarList.length() - 1, ']'); 312 return CldrValue.parseValue(path.toString(), exemplarList.toString()); 313 } 314 315 // Append a (possibly escaped) representation of the exemaplar character. appendExemplarCodePoint(StringBuilder out, int cp)316 private static StringBuilder appendExemplarCodePoint(StringBuilder out, int cp) { 317 // This could be fixed if needed, but for now it's safer to check. 318 checkArgument( 319 Character.isBmpCodePoint(cp), 320 "Only BMP code points are supported for exemplars: 0x%s", Integer.toHexString(cp)); 321 if (Character.isAlphabetic(cp)) { 322 out.appendCodePoint(cp); 323 } else { 324 out.append(String.format("\\u%04X", cp)); 325 } 326 return out; 327 } 328 createMessage(String text, boolean isPattern)329 private String createMessage(String text, boolean isPattern) { 330 // Pattern text is split by the quoted sections (which are localizable) whereas 331 // non-pattern text is split by placeholder (e.g. {0}) which are not localizable. 332 // This is why "isPattern" is used to signal "isLocalizable" in addFragment(). 333 Matcher match = (isPattern ? QUOTED_TEXT : NUMERIC_PLACEHOLDER).matcher(text); 334 // Alternate between unmatched and matched sections in the text, always localizing one 335 // but not the other (depending the type). Append the trailing section at the end. 336 PseudoText out = type.getText(isPattern); 337 int start = 0; 338 for (; match.find(); start = match.end()) { 339 out.addFragment(text.substring(start, match.start()), !isPattern); 340 out.addFragment(match.group(), isPattern); 341 } 342 out.addFragment(text.substring(start), !isPattern); 343 return out.toString(); 344 } 345 } 346 347 // ---- Expanding Pseudo-localizer (e.g. "November" --> "[Ñöṽéɱƀéŕ one two]") ---- 348 349 // A map from a string of alternating key/value code-points; e.g. '1' -> '①'. 350 // Note that a subset of this is also used to form the "exemplar" set (see PseudoType). 351 private static final IntUnaryOperator CONVERT_CODEPOINT = toCodePointFunction( 352 " \u2003!\u00a1\"\u2033#\u266f$\u20ac%\u2030&\u214b*\u204e+\u207a,\u060c-\u2010.\u00b7" 353 + "/\u20440\u24ea1\u24602\u24613\u24624\u24635\u24646\u24657\u24668\u24679\u2468" 354 + ":\u2236;\u204f<\u2264=\u2242>\u2265?\u00bf@\u055eA\u00c5B\u0181C\u00c7D\u00d0" 355 + "E\u00c9F\u0191G\u011cH\u0124I\u00ceJ\u0134K\u0136L\u013bM\u1e40N\u00d1O\u00d6" 356 + "P\u00deQ\u01eaR\u0154S\u0160T\u0162U\u00dbV\u1e7cW\u0174X\u1e8aY\u00ddZ\u017d" 357 + "[\u2045\\\u2216]\u2046^\u02c4_\u203f`\u2035a\u00e5b\u0180c\u00e7d\u00f0e\u00e9" 358 + "f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm\u0271n\u00f1o\u00f6p\u00fe" 359 + "q\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175x\u1e8by\u00fdz\u017e|\u00a6" 360 + "~\u02de"); 361 362 // Converts a source/target alternating code-points into a map. toCodePointFunction(String s)363 private static IntUnaryOperator toCodePointFunction(String s) { 364 // Not pretty, but there's no nice way to "pair up" successive stream elements without 365 // extra library dependencies, so we collect them and then iterate via index. 366 int[] codePoints = s.codePoints().toArray(); 367 checkArgument((codePoints.length & 1) == 0, 368 "must have an even number of code points (was %s)", codePoints.length); 369 ImmutableMap<Integer, Integer> map = 370 IntStream.range(0, codePoints.length / 2) 371 .boxed() 372 .collect(toImmutableMap(n -> codePoints[2 * n], n -> codePoints[(2 * n) + 1])); 373 return cp -> map.getOrDefault(cp, cp); 374 } 375 376 // A list of words to be added to text when it is expanded. A whole number of words are 377 // always added (and the fact they are numeric words is irrelevant, could be Lorem Ipsum). 378 // So far nothing goes above "ten" in en_XA, but this can always be trivially extended. 379 private static final String PADDING = "one two three four five six seven eight nine ten"; 380 expanding(boolean isPattern)381 private static PseudoText expanding(boolean isPattern) { 382 return new PseudoText() { 383 IntStream.Builder codePoints = IntStream.builder(); 384 385 @Override 386 public void addFragment(String text, boolean isLocalizable) { 387 text.codePoints() 388 .map(isLocalizable ? CONVERT_CODEPOINT : cp -> cp) 389 .forEach(codePoints::add); 390 } 391 392 @Override 393 public String toString() { 394 int[] cp = codePoints.build().toArray(); 395 // Copy the original code and round up the 50% calculation (it's not important). 396 int endIndex = CharMatcher.whitespace().indexIn(PADDING, (cp.length + 1) / 2); 397 String suffix = PADDING.substring(0, Math.min(endIndex, PADDING.length())); 398 // For pattern strings, any literal text must be quoted (the fragment text 399 // already was). Note that this is why we don't transform single-quotes. 400 if (isPattern) { 401 suffix = "'" + suffix.replace(" ", "' '") + "'"; 402 } 403 // Final output is something like "November" --> "[Ñöṽéɱƀéŕ one two]" 404 // Where the additional padding adds at least 50% to the length of the text. 405 return "[" + new String(cp, 0, cp.length) + " " + suffix + "]"; 406 } 407 }; 408 } 409 410 // ---- Bidi Pseudo-localizer (e.g. "November" --> "rebmevoN" using BiDi tags)---- 411 412 // Bidi localization doesn't care if the fragment is a pattern or not. 413 @SuppressWarnings("unused") bidi(boolean isPattern)414 private static PseudoText bidi(boolean isPattern) { 415 return new PseudoText() { 416 private final StringBuilder out = new StringBuilder(); 417 418 // This was largely copied from the original CLDRFilePseudolocalizer class and 419 // while it appears to work fine, I don't know enough to comment it clearly. 420 // TODO: Find someone who can add a decent comment here! 421 @Override 422 public void addFragment(String text, boolean isLocalizable) { 423 if (isLocalizable) { 424 boolean wrapping = false; 425 for (int index = 0; index < text.length(); ) { 426 int codePoint = text.codePointAt(index); 427 index += Character.charCount(codePoint); 428 byte directionality = Character.getDirectionality(codePoint); 429 boolean needsWrap = (directionality == DIRECTIONALITY_LEFT_TO_RIGHT); 430 if (needsWrap != wrapping) { 431 wrapping = needsWrap; 432 out.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX); 433 } 434 out.appendCodePoint(codePoint); 435 } 436 if (wrapping) { 437 out.append(BIDI_POSTFIX); 438 } 439 } else { 440 out.append(text); 441 } 442 } 443 444 @Override 445 public String toString() { 446 return out.toString(); 447 } 448 }; 449 } 450 451 private PseudoLocales() { 452 } 453 } 454