1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.mapper; 4 5 import static com.google.common.base.Preconditions.checkNotNull; 6 import static org.unicode.cldr.api.AttributeKey.keyOf; 7 8 import java.util.Optional; 9 10 import org.unicode.cldr.api.AttributeKey; 11 import org.unicode.cldr.api.CldrData; 12 import org.unicode.cldr.api.CldrDataType; 13 import org.unicode.cldr.api.CldrValue; 14 import org.unicode.icu.tool.cldrtoicu.IcuData; 15 import org.unicode.icu.tool.cldrtoicu.RbPath; 16 import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; 17 import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor.SubProcessor; 18 19 import com.google.common.escape.UnicodeEscaper; 20 21 /** 22 * A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under 23 * paths matching: 24 * <pre>{@code 25 * //ldml/segmentations/segmentation/suppressions/suppression 26 * //ldml/special/icu:breakIteratorData/... 27 * }</pre> 28 */ 29 // TODO: This class can almost certainly be replace with a small RegexTransformer config. 30 public final class BreakIteratorMapper { 31 32 private static final CldrDataProcessor<BreakIteratorMapper> CLDR_PROCESSOR; 33 static { 34 CldrDataProcessor.Builder<BreakIteratorMapper> processor = CldrDataProcessor.builder(); 35 // The "type" attribute in /suppressions/ is not required so cannot be in the matcher. And 36 // its default (and only) value is "standard". 37 // TODO: Understand and document why this is the case. 38 processor.addValueAction( 39 "//ldml/segmentations/segmentation[@type=*]/suppressions/suppression", 40 BreakIteratorMapper::addSuppression); 41 SubProcessor<BreakIteratorMapper> specials = 42 processor.addSubprocessor("//ldml/special/icu:breakIteratorData"); 43 specials.addValueAction("icu:boundaries/*", BreakIteratorMapper::addBoundary); 44 specials.addValueAction( 45 "icu:dictionaries/icu:dictionary", BreakIteratorMapper::addDictionary); 46 specials.addValueAction( 47 "icu:extensions/icu:extension", BreakIteratorMapper::addExtension); 48 specials.addValueAction( 49 "icu:lstm/icu:lstmdata", BreakIteratorMapper::addLstmdata); 50 CLDR_PROCESSOR = processor.build(); 51 } 52 53 private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type"); 54 private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency"); 55 private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type"); 56 private static final AttributeKey LSTMDATA_DEP = keyOf("icu:lstmdata", "icu:dependency"); 57 private static final AttributeKey LSTMDATA_TYPE = keyOf("icu:lstmdata", "type"); 58 59 /** 60 * Processes data from the given supplier to generate break-iterator data for a set of locale 61 * IDs. 62 * 63 * @param icuData the ICU data to be filled. 64 * @param cldrData the unresolved CLDR data to process. 65 * @param icuSpecialData additional ICU data (in the "icu:" namespace) 66 * @return IcuData containing break-iterator data for the given locale ID. 67 */ process( IcuData icuData, CldrData cldrData, Optional<CldrData> icuSpecialData)68 public static IcuData process( 69 IcuData icuData, CldrData cldrData, Optional<CldrData> icuSpecialData) { 70 71 BreakIteratorMapper mapper = new BreakIteratorMapper(icuData); 72 icuSpecialData.ifPresent(d -> CLDR_PROCESSOR.process(d, mapper)); 73 CLDR_PROCESSOR.process(cldrData, mapper); 74 return mapper.icuData; 75 } 76 77 // The per-locale ICU data being collected by this visitor. 78 private final IcuData icuData; 79 BreakIteratorMapper(IcuData icuData)80 private BreakIteratorMapper(IcuData icuData) { 81 this.icuData = checkNotNull(icuData); 82 } 83 addSuppression(CldrValue v)84 private void addSuppression(CldrValue v) { 85 //System.out.println("addSuppression: " + v.toString()); // debug 86 String type = SEGMENTATION_TYPE.valueFrom(v); 87 // TODO: Understand and document why we escape values here, but not for collation data. 88 icuData.add( 89 RbPath.of("exceptions", type + ":array"), ESCAPE_NON_ASCII.escape(v.getValue())); 90 } 91 addBoundary(CldrValue v)92 private void addBoundary(CldrValue v) { 93 //System.out.println("addBoundary: " + v.toString()); // debug 94 addDependency(getDependencyName(v), getBoundaryType(v), getBoundaryDependency(v)); 95 } 96 addDictionary(CldrValue v)97 private void addDictionary(CldrValue v) { 98 //System.out.println("addDictionary: " + v.toString()); // debug 99 addDependency( 100 getDependencyName(v), 101 DICTIONARY_TYPE.valueFrom(v), 102 DICTIONARY_DEP.optionalValueFrom(v)); 103 } 104 addExtension(CldrValue v)105 private void addExtension(CldrValue v) { 106 //System.out.println("addExtension: " + v.toString()); // debug 107 icuData.add( 108 RbPath.of("extensions"), v.getValue()); 109 } 110 addLstmdata(CldrValue v)111 private void addLstmdata(CldrValue v) { 112 //System.out.println("addLstmdata: " + v.toString()); // debug 113 addDependency( 114 getDependencyName(v), 115 LSTMDATA_TYPE.valueFrom(v), 116 LSTMDATA_DEP.optionalValueFrom(v)); 117 } 118 addDependency(String name, String type, Optional<String> dependency)119 private void addDependency(String name, String type, Optional<String> dependency) { 120 //System.out.println("addDependency: name " + name + ", type " + type + ", dependency " + dependency); 121 icuData.add( 122 RbPath.of(name, type + ":process(dependency)"), 123 dependency.orElseThrow(() -> new IllegalArgumentException("missing dependency"))); 124 } 125 126 // Must match the BOUNDARIES or DICTIONARY path. getDependencyName(CldrValue value)127 private static String getDependencyName(CldrValue value) { 128 return stripXmlNamespace(value.getPath().getParent().getName()); 129 } 130 131 // Must match the BOUNDARIES path. getBoundaryType(CldrValue value)132 private static String getBoundaryType(CldrValue value) { 133 String elementName = value.getPath().getName(); 134 String type = stripXmlNamespace(elementName); 135 return keyOf(elementName, "alt") 136 .optionalValueFrom(value).map(a -> type + "_" + a).orElse(type); 137 } 138 139 // Must match the BOUNDARIES path. getBoundaryDependency(CldrValue value)140 private static Optional<String> getBoundaryDependency(CldrValue value) { 141 return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value); 142 } 143 144 // Strips the first prefix of the form "xxx:" from a string. stripXmlNamespace(String s)145 private static String stripXmlNamespace(String s) { 146 return s.substring(s.indexOf(':') + 1); 147 } 148 149 /* 150 * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert 151 * backslash to a double backslash. This class is super slow for non-ASCII escaping due to 152 * using "String.format()", however there's < 100 values that need any escaping, so it's fine. 153 */ 154 private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() { 155 private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray(); 156 157 @Override 158 protected char[] escape(int cp) { 159 // Returning null means "do not escape". 160 if (0x0020 <= cp && cp <= 0x007F) { 161 return cp == '\\' ? DOUBLE_BACKSLASH : null; 162 } else if (cp <= 0xFFFF) { 163 return String.format("\\u%04X", cp).toCharArray(); 164 } 165 return String.format("\\U%08X", cp).toCharArray(); 166 } 167 }; 168 } 169