1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu; 4 5 import static com.google.common.base.CharMatcher.whitespace; 6 import static com.google.common.base.Preconditions.checkArgument; 7 import static com.google.common.base.Preconditions.checkElementIndex; 8 import static com.google.common.base.Preconditions.checkNotNull; 9 import static com.google.common.base.Preconditions.checkState; 10 import static com.google.common.collect.ImmutableList.toImmutableList; 11 12 import java.io.IOException; 13 import java.nio.file.Files; 14 import java.nio.file.Path; 15 import java.nio.file.Paths; 16 import java.util.ArrayDeque; 17 import java.util.ArrayList; 18 import java.util.Deque; 19 import java.util.List; 20 import java.util.Optional; 21 import java.util.function.Function; 22 import java.util.function.Predicate; 23 import java.util.regex.Matcher; 24 import java.util.regex.Pattern; 25 import java.util.stream.Stream; 26 27 import com.google.common.base.Joiner; 28 import com.google.common.collect.ArrayListMultimap; 29 import com.google.common.collect.HashMultiset; 30 import com.google.common.collect.ImmutableList; 31 import com.google.common.collect.ImmutableSetMultimap; 32 import com.google.common.collect.Iterables; 33 import com.google.common.collect.ListMultimap; 34 import com.google.common.collect.Lists; 35 import com.google.common.collect.Multiset; 36 37 /** 38 * Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable 39 * ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be 40 * extended to be a more fully featured "diff" tool or a proper ICU data file parser. 41 * 42 * <p>This is a temporary debugging tool and should not be relied upon during any part of the data 43 * generation process. 44 */ 45 final class IcuDataDumper { 46 private static final Joiner LIST_JOINER = Joiner.on(','); 47 private static final RbPath VERSION = RbPath.of("Version"); 48 main(String... args)49 public static void main(String... args) throws IOException { 50 Path fileOrDir; 51 Optional<Pattern> name = Optional.empty(); 52 switch (args.length) { 53 case 2: 54 name = Optional.of(Pattern.compile(args[1])); 55 case 1: 56 fileOrDir = Paths.get(args[0]); 57 break; 58 default: 59 throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]"); 60 } 61 62 if (Files.isDirectory(fileOrDir)) { 63 walkDirectory(fileOrDir, name); 64 } else { 65 checkArgument(!name.isPresent(), 66 "cannot specificy a name pattern for a non-directory file: %s", fileOrDir); 67 IcuDataParser parser = new IcuDataParser(fileOrDir); 68 parser.parse(); 69 dump(parser.icuData); 70 } 71 } 72 walkDirectory(Path fileOrDir, Optional<Pattern> name)73 private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException { 74 Predicate<Path> matchesName = 75 f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true); 76 List<IcuDataParser> icuParsers; 77 try (Stream<Path> files = Files.walk(fileOrDir)) { 78 icuParsers = files 79 .filter(Files::isRegularFile) 80 .filter(matchesName) 81 .map(IcuDataParser::new) 82 .collect(toImmutableList()); 83 } 84 ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create(); 85 for (IcuDataParser p : icuParsers) { 86 p.parse(); 87 for (RbPath k : p.icuData.keySet()) { 88 List<RbValue> values = p.icuData.get(k); 89 if (!allPaths.containsKey(k)) { 90 allPaths.putAll(k, values); 91 } else if (!VERSION.equals(k)) { 92 checkState(allPaths.get(k).equals(values), "inconsistent data for path: ", k); 93 } 94 } 95 } 96 dump(allPaths); 97 } 98 dump(ListMultimap<RbPath, RbValue> allPaths)99 private static void dump(ListMultimap<RbPath, RbValue> allPaths) { 100 allPaths.keySet().stream() 101 .sorted() 102 .forEach(k -> System.out.println(k + " :: " + LIST_JOINER.join(allPaths.get(k)))); 103 } 104 105 private static final class IcuDataParser { 106 // Path of file being parsed. 107 private final Path path; 108 109 // Comments in header (before data starts), without comment characters. 110 private final List<String> headerComment = new ArrayList<>(); 111 // ICU data name (the name of the root element). 112 private String name = null; 113 // ICU data values. 114 private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create(); 115 116 // Current line number (1-indexed). 117 private int lineNumber = 0; 118 // The type of the previous line that was processed. 119 private LineType lastType = LineType.COMMENT; 120 // True when inside /* .. */ comments in the header. 121 private boolean inBlockComment = false; 122 // True when in the final top-level group at the end of parsing. 123 private boolean inFinalGroup = false; 124 // True when a partial (line wrapped) value has been read. 125 private boolean isLineContinuation = false; 126 // Current path while parsing (NOT including the root element). 127 private Deque<String> pathStack = new ArrayDeque<>(); 128 // Current sequence of values for the path (as defined in the current path stack). 129 private List<String> currentValue = new ArrayList<>(); 130 // Current partially read value of a multi-line value. 131 private String wrappedValue = ""; 132 // Map of indices used to auto-generate names for anonymous path segments. 133 // TODO: Check if this is even needed and remove if not. 134 private Multiset<Integer> indices = HashMultiset.create(); 135 IcuDataParser(Path path)136 IcuDataParser(Path path) { 137 this.path = checkNotNull(path); 138 } 139 parse()140 public boolean parse() throws IOException { 141 List<String> lines = Files.readAllLines(path); 142 // Best approximation to a magic number be have (BOM plus inline comment). This stops 143 // use trying to parse the transliteration files, which are a different type. 144 if (!lines.get(0).startsWith("\uFEFF//")) { 145 return false; 146 } 147 lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck); 148 149 // Sanity check for expected final state. Just checking the "lastType" should be enough 150 // to catch everything else (due to transition rules and how the code tidies up) but it 151 // seems prudent to sanity check everything just in case. 152 checkState(lastType == LineType.GROUP_END); 153 checkState(!inBlockComment); 154 checkState(name != null); 155 checkState(pathStack.isEmpty() && inFinalGroup); 156 checkState(wrappedValue.isEmpty() && currentValue.isEmpty()); 157 return true; 158 } 159 processLineWithCheck(String line)160 void processLineWithCheck(String line) { 161 lineNumber++; 162 if (lineNumber == 1 && line.startsWith("\uFEFF")) { 163 line = line.substring(1); 164 } 165 try { 166 processLine(line); 167 } catch (RuntimeException e) { 168 throw new RuntimeException( 169 String.format("[%s:%s] %s (%s)", path, lineNumber, e.getMessage(), line), 170 e); 171 } 172 } 173 processLine(String line)174 void processLine(String line) { 175 line = maybeTrimEndOfLineComment(line); 176 if (line.isEmpty()) { 177 return; 178 } 179 LineMatch match = LineType.match(line, inBlockComment); 180 checkState(match.getType().isValidTransitionFrom(lastType), 181 "invalid state transition: %s --//-> %s", lastType, match.getType()); 182 boolean isEndOfWrappedValue = false; 183 switch (match.getType()) { 184 case COMMENT: 185 if (name != null) { 186 // Comments in data are ignored since they cannot be properly associated with 187 // paths or values in an IcuData instance (only legacy tooling emits these). 188 break; 189 } 190 if (line.startsWith("/*")) { 191 inBlockComment = true; 192 } 193 headerComment.add(match.get(0)); 194 if (inBlockComment && line.contains("*/")) { 195 checkState(line.indexOf("*/") == line.length() - 2, 196 "unexpected end of comment block"); 197 inBlockComment = false; 198 } 199 break; 200 201 case INLINE_VALUE: 202 icuData.put( 203 getPathFromStack().extendBy(getSegment(match.get(0))), 204 RbValue.of(unquote(match.get(1)))); 205 break; 206 207 case GROUP_START: 208 checkState(currentValue.isEmpty()); 209 if (name == null) { 210 name = match.get(0); 211 checkState(name != null, "cannot have anonymous top-level group"); 212 } else { 213 pathStack.push(getSegment(match.get(0))); 214 } 215 wrappedValue = ""; 216 isLineContinuation = false; 217 break; 218 219 case QUOTED_VALUE: 220 wrappedValue += unquote(match.get(0)); 221 isLineContinuation = !line.endsWith(","); 222 if (!isLineContinuation) { 223 currentValue.add(wrappedValue); 224 wrappedValue = ""; 225 } 226 break; 227 228 case VALUE: 229 checkState(!isLineContinuation, "unexpected unquoted value"); 230 currentValue.add(match.get(0)); 231 break; 232 233 case GROUP_END: 234 // Account for quoted values without trailing ',' just before group end. 235 if (isLineContinuation) { 236 currentValue.add(wrappedValue); 237 isLineContinuation = false; 238 } 239 // Emit the collection sequence of values for the current path as an RbValue. 240 if (!currentValue.isEmpty()) { 241 icuData.put(getPathFromStack(), RbValue.of(currentValue)); 242 currentValue.clear(); 243 } 244 // Annoyingly the name is outside the stack so the stack will empty before the last 245 // end group. 246 if (!pathStack.isEmpty()) { 247 pathStack.pop(); 248 indices.setCount(pathStack.size(), 0); 249 } else { 250 checkState(!inFinalGroup, "unexpected group end"); 251 inFinalGroup = true; 252 } 253 break; 254 255 case UNKNOWN: 256 throw new IllegalStateException("cannot parse line: " + match.get(0)); 257 } 258 lastType = match.getType(); 259 } 260 getPathFromStack()261 private RbPath getPathFromStack() { 262 if (pathStack.isEmpty()) { 263 return RbPath.of(); 264 } 265 List<String> segments = new ArrayList<>(); 266 Iterables.addAll(segments, pathStack); 267 if (segments.get(0).matches("<[0-9]{4}>")) { 268 segments.remove(0); 269 } 270 return RbPath.of(Lists.reverse(segments)); 271 } 272 getSegment(String segmentOrNull)273 private String getSegment(String segmentOrNull) { 274 if (segmentOrNull != null) { 275 return segmentOrNull; 276 } 277 int depth = pathStack.size(); 278 int index = indices.count(depth); 279 indices.add(depth, 1); 280 return String.format("<%04d>", index); 281 } 282 maybeTrimEndOfLineComment(String line)283 private String maybeTrimEndOfLineComment(String line) { 284 // Once the name is set, we are past the header and into the data. 285 if (name != null) { 286 // Index to search for '//' from - must skip quoted values. 287 int startIdx = line.startsWith("\"") ? line.indexOf('"', 1) + 1 : 0; 288 int commentIdx = line.indexOf("//", startIdx); 289 if (commentIdx != -1) { 290 line = whitespace().trimTrailingFrom(line.substring(0, commentIdx)); 291 } 292 } 293 return line; 294 } 295 unquote(String s)296 private static String unquote(String s) { 297 if (s.startsWith("\"") && s.endsWith("\"")) { 298 return s.substring(1, s.length() - 1).replaceAll("\\\\([\"\\\\])", "$1"); 299 } 300 checkState(!s.contains("\""), "invalid unquoted value: %s", s); 301 return s; 302 } 303 304 private static final class LineMatch { 305 private final LineType type; 306 private final Function<Integer, String> args; 307 LineMatch(LineType type, Function<Integer, String> args)308 LineMatch(LineType type, Function<Integer, String> args) { 309 this.type = checkNotNull(type); 310 this.args = checkNotNull(args); 311 } 312 get(int n)313 String get(int n) { 314 return args.apply(n); 315 } 316 getType()317 LineType getType() { 318 return type; 319 } 320 } 321 322 private enum LineType { 323 // Comment _start_ with any comment value captured. 324 COMMENT("(?://|/\\*)\\s*(.*)"), 325 // A combination of GROUP_START, VALUE and GROUP_END with whitespace. 326 INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"), 327 // Allows for empty segment names (anonymous arrays) which match 'null'. 328 GROUP_START("(?:(.*\\S)\\s*)?\\{"), 329 GROUP_END("\\}"), 330 QUOTED_VALUE("(\".*\"),?"), 331 VALUE("([^\"{}]+),?"), 332 UNKNOWN(".*"); 333 334 // Table of allowed transitions expected during parsing. 335 // key=current state, values=set of permitted previous states 336 private static ImmutableSetMultimap<LineType, LineType> TRANSITIONS = 337 ImmutableSetMultimap.<LineType, LineType>builder() 338 .putAll(COMMENT, COMMENT) 339 .putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END) 340 .putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE) 341 .putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE) 342 .putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE) 343 .putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE) 344 .build(); 345 346 private final Pattern pattern; 347 LineType(String regex)348 LineType(String regex) { 349 this.pattern = Pattern.compile(regex); 350 } 351 isValidTransitionFrom(LineType lastType)352 boolean isValidTransitionFrom(LineType lastType) { 353 return TRANSITIONS.get(this).contains(lastType); 354 } 355 match(String line, boolean inBlockComment)356 static LineMatch match(String line, boolean inBlockComment) { 357 // Block comments kinda suck and it'd be great if the ICU data only used '//' style 358 // comments (if would definitely simplify any parsers out there). Once the 359 // transition to the new transformation tools is complete, they can be changed to 360 // only emit '//' style comments. 361 if (inBlockComment) { 362 if (line.startsWith("*")) { 363 line = whitespace().trimLeadingFrom(line.substring(1)); 364 } 365 return new LineMatch(COMMENT, ImmutableList.of(line)::get); 366 } 367 for (LineType type : TRANSITIONS.keySet()) { 368 // Regex groups start at 1, but we want the getter function to be zero-indexed. 369 Matcher m = type.pattern.matcher(line); 370 if (m.matches()) { 371 return new LineMatch(type, n -> { 372 checkElementIndex(n, m.groupCount()); 373 return m.group(n + 1); 374 }); 375 } 376 } 377 return new LineMatch(UNKNOWN, ImmutableList.of(line)::get); 378 } 379 } 380 } 381 } 382