// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;

import static com.google.common.base.Preconditions.checkNotNull;
import static org.unicode.cldr.api.AttributeKey.keyOf;

import java.util.Optional;

import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor;
import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor.SubProcessor;

import com.google.common.escape.UnicodeEscaper;

21 /**
22  * A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under
23  * paths matching:
24  * <pre>{@code
25  *   //ldml/segmentations/segmentation/suppressions/suppression
26  *   //ldml/special/icu:breakIteratorData/...
27  * }</pre>
28  */
29 // TODO: This class can almost certainly be replace with a small RegexTransformer config.
30 public final class BreakIteratorMapper {
31 
32     private static final CldrDataProcessor<BreakIteratorMapper> CLDR_PROCESSOR;
33     static {
34         CldrDataProcessor.Builder<BreakIteratorMapper> processor = CldrDataProcessor.builder();
35         // The "type" attribute in /suppressions/ is not required so cannot be in the matcher. And
36         // its default (and only) value is "standard".
37         // TODO: Understand and document why this is the case.
38         processor.addValueAction(
39             "//ldml/segmentations/segmentation[@type=*]/suppressions/suppression",
40             BreakIteratorMapper::addSuppression);
41         SubProcessor<BreakIteratorMapper> specials =
42             processor.addSubprocessor("//ldml/special/icu:breakIteratorData");
43         specials.addValueAction("icu:boundaries/*", BreakIteratorMapper::addBoundary);
44         specials.addValueAction(
45             "icu:dictionaries/icu:dictionary", BreakIteratorMapper::addDictionary);
46         specials.addValueAction(
47             "icu:extensions/icu:extension", BreakIteratorMapper::addExtension);
48         specials.addValueAction(
49             "icu:lstm/icu:lstmdata", BreakIteratorMapper::addLstmdata);
50         CLDR_PROCESSOR = processor.build();
51     }
52 
53     private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type");
54     private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency");
55     private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type");
56     private static final AttributeKey LSTMDATA_DEP = keyOf("icu:lstmdata", "icu:dependency");
57     private static final AttributeKey LSTMDATA_TYPE = keyOf("icu:lstmdata", "type");
58 
59     /**
60      * Processes data from the given supplier to generate break-iterator data for a set of locale
61      * IDs.
62      *
63      * @param icuData the ICU data to be filled.
64      * @param cldrData the unresolved CLDR data to process.
65      * @param icuSpecialData additional ICU data (in the "icu:" namespace)
66      * @return IcuData containing break-iterator data for the given locale ID.
67      */
process( IcuData icuData, CldrData cldrData, Optional<CldrData> icuSpecialData)68     public static IcuData process(
69         IcuData icuData, CldrData cldrData, Optional<CldrData> icuSpecialData) {
70 
71         BreakIteratorMapper mapper = new BreakIteratorMapper(icuData);
72         icuSpecialData.ifPresent(d -> CLDR_PROCESSOR.process(d, mapper));
73         CLDR_PROCESSOR.process(cldrData, mapper);
74         return mapper.icuData;
75     }
76 
77     // The per-locale ICU data being collected by this visitor.
78     private final IcuData icuData;
79 
BreakIteratorMapper(IcuData icuData)80     private BreakIteratorMapper(IcuData icuData) {
81         this.icuData = checkNotNull(icuData);
82     }
83 
addSuppression(CldrValue v)84     private void addSuppression(CldrValue v) {
85         //System.out.println("addSuppression: " + v.toString()); // debug
86         String type = SEGMENTATION_TYPE.valueFrom(v);
87         // TODO: Understand and document why we escape values here, but not for collation data.
88         icuData.add(
89             RbPath.of("exceptions", type + ":array"), ESCAPE_NON_ASCII.escape(v.getValue()));
90     }
91 
addBoundary(CldrValue v)92     private void addBoundary(CldrValue v) {
93         //System.out.println("addBoundary: " + v.toString()); // debug
94         addDependency(getDependencyName(v), getBoundaryType(v), getBoundaryDependency(v));
95     }
96 
addDictionary(CldrValue v)97     private void addDictionary(CldrValue v) {
98         //System.out.println("addDictionary: " + v.toString()); // debug
99         addDependency(
100             getDependencyName(v),
101             DICTIONARY_TYPE.valueFrom(v),
102             DICTIONARY_DEP.optionalValueFrom(v));
103     }
104 
addExtension(CldrValue v)105     private void addExtension(CldrValue v) {
106         //System.out.println("addExtension: " + v.toString()); // debug
107         icuData.add(
108             RbPath.of("extensions"), v.getValue());
109     }
110 
addLstmdata(CldrValue v)111     private void addLstmdata(CldrValue v) {
112         //System.out.println("addLstmdata: " + v.toString()); // debug
113         addDependency(
114             getDependencyName(v),
115             LSTMDATA_TYPE.valueFrom(v),
116             LSTMDATA_DEP.optionalValueFrom(v));
117     }
118 
addDependency(String name, String type, Optional<String> dependency)119     private void addDependency(String name, String type, Optional<String> dependency) {
120         //System.out.println("addDependency: name " + name + ", type " + type + ", dependency " + dependency);
121         icuData.add(
122             RbPath.of(name, type + ":process(dependency)"),
123             dependency.orElseThrow(() -> new IllegalArgumentException("missing dependency")));
124     }
125 
126     // Must match the BOUNDARIES or DICTIONARY path.
getDependencyName(CldrValue value)127     private static String getDependencyName(CldrValue value) {
128         return stripXmlNamespace(value.getPath().getParent().getName());
129     }
130 
131     // Must match the BOUNDARIES path.
getBoundaryType(CldrValue value)132     private static String getBoundaryType(CldrValue value) {
133         String elementName = value.getPath().getName();
134         String type = stripXmlNamespace(elementName);
135         return keyOf(elementName, "alt")
136             .optionalValueFrom(value).map(a -> type + "_" + a).orElse(type);
137     }
138 
139     // Must match the BOUNDARIES path.
getBoundaryDependency(CldrValue value)140     private static Optional<String> getBoundaryDependency(CldrValue value) {
141         return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value);
142     }
143 
144     // Strips the first prefix of the form "xxx:" from a string.
stripXmlNamespace(String s)145     private static String stripXmlNamespace(String s) {
146         return s.substring(s.indexOf(':') + 1);
147     }
148 
149     /*
150      * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
151      * backslash to a double backslash. This class is super slow for non-ASCII escaping due to
152      * using "String.format()", however there's < 100 values that need any escaping, so it's fine.
153      */
154     private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() {
155         private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
156 
157         @Override
158         protected char[] escape(int cp) {
159             // Returning null means "do not escape".
160             if (0x0020 <= cp && cp <= 0x007F) {
161                 return cp == '\\' ? DOUBLE_BACKSLASH : null;
162             } else if (cp <= 0xFFFF) {
163                 return String.format("\\u%04X", cp).toCharArray();
164             }
165             return String.format("\\U%08X", cp).toCharArray();
166         }
167     };
168 }
169