• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu;
4 
5 import static com.google.common.base.CharMatcher.whitespace;
6 import static com.google.common.base.Preconditions.checkArgument;
7 import static com.google.common.base.Preconditions.checkElementIndex;
8 import static com.google.common.base.Preconditions.checkNotNull;
9 import static com.google.common.base.Preconditions.checkState;
10 import static com.google.common.collect.ImmutableList.toImmutableList;
11 
12 import java.io.IOException;
13 import java.nio.file.Files;
14 import java.nio.file.Path;
15 import java.nio.file.Paths;
16 import java.util.ArrayDeque;
17 import java.util.ArrayList;
18 import java.util.Deque;
19 import java.util.List;
20 import java.util.Optional;
21 import java.util.function.Function;
22 import java.util.function.Predicate;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25 import java.util.stream.Stream;
26 
27 import com.google.common.base.Joiner;
28 import com.google.common.collect.ArrayListMultimap;
29 import com.google.common.collect.HashMultiset;
30 import com.google.common.collect.ImmutableList;
31 import com.google.common.collect.ImmutableSetMultimap;
32 import com.google.common.collect.Iterables;
33 import com.google.common.collect.ListMultimap;
34 import com.google.common.collect.Lists;
35 import com.google.common.collect.Multiset;
36 
37 /**
38  * Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable
39  * ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be
40  * extended to be a more fully featured "diff" tool or a proper ICU data file parser.
41  *
42  * <p>This is a temporary debugging tool and should not be relied upon during any part of the data
43  * generation process.
44  */
45 final class IcuDataDumper {
46     private static final Joiner LIST_JOINER = Joiner.on(',');
47     private static final RbPath VERSION = RbPath.of("Version");
48 
main(String... args)49     public static void main(String... args) throws IOException {
50         Path fileOrDir;
51         Optional<Pattern> name = Optional.empty();
52         switch (args.length) {
53         case 2:
54             name = Optional.of(Pattern.compile(args[1]));
55         case 1:
56             fileOrDir = Paths.get(args[0]);
57             break;
58         default:
59             throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]");
60         }
61 
62         if (Files.isDirectory(fileOrDir)) {
63             walkDirectory(fileOrDir, name);
64         } else {
65             checkArgument(!name.isPresent(),
66                 "cannot specificy a name pattern for a non-directory file: %s", fileOrDir);
67             IcuDataParser parser = new IcuDataParser(fileOrDir);
68             parser.parse();
69             dump(parser.icuData);
70         }
71     }
72 
walkDirectory(Path fileOrDir, Optional<Pattern> name)73     private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException {
74         Predicate<Path> matchesName =
75             f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true);
76         List<IcuDataParser> icuParsers;
77         try (Stream<Path> files = Files.walk(fileOrDir)) {
78             icuParsers = files
79                 .filter(Files::isRegularFile)
80                 .filter(matchesName)
81                 .map(IcuDataParser::new)
82                 .collect(toImmutableList());
83         }
84         ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create();
85         for (IcuDataParser p : icuParsers) {
86             p.parse();
87             for (RbPath k : p.icuData.keySet()) {
88                 List<RbValue> values = p.icuData.get(k);
89                 if (!allPaths.containsKey(k)) {
90                     allPaths.putAll(k, values);
91                 } else if (!VERSION.equals(k)) {
92                     checkState(allPaths.get(k).equals(values), "inconsistent data for path: ", k);
93                 }
94             }
95         }
96         dump(allPaths);
97     }
98 
dump(ListMultimap<RbPath, RbValue> allPaths)99     private static void dump(ListMultimap<RbPath, RbValue> allPaths) {
100         allPaths.keySet().stream()
101             .sorted()
102             .forEach(k -> System.out.println(k + " :: " + LIST_JOINER.join(allPaths.get(k))));
103     }
104 
105     private static final class IcuDataParser {
106         // Path of file being parsed.
107         private final Path path;
108 
109         // Comments in header (before data starts), without comment characters.
110         private final List<String> headerComment = new ArrayList<>();
111         // ICU data name (the name of the root element).
112         private String name = null;
113         // ICU data values.
114         private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create();
115 
116         // Current line number (1-indexed).
117         private int lineNumber = 0;
118         // The type of the previous line that was processed.
119         private LineType lastType = LineType.COMMENT;
120         // True when inside /* .. */ comments in the header.
121         private boolean inBlockComment = false;
122         // True when in the final top-level group at the end of parsing.
123         private boolean inFinalGroup = false;
124         // True when a partial (line wrapped) value has been read.
125         private boolean isLineContinuation = false;
126         // Current path while parsing (NOT including the root element).
127         private Deque<String> pathStack = new ArrayDeque<>();
128         // Current sequence of values for the path (as defined in the current path stack).
129         private List<String> currentValue = new ArrayList<>();
130         // Current partially read value of a multi-line value.
131         private String wrappedValue = "";
132         // Map of indices used to auto-generate names for anonymous path segments.
133         // TODO: Check if this is even needed and remove if not.
134         private Multiset<Integer> indices = HashMultiset.create();
135 
IcuDataParser(Path path)136         IcuDataParser(Path path) {
137             this.path = checkNotNull(path);
138         }
139 
parse()140         public boolean parse() throws IOException {
141             List<String> lines = Files.readAllLines(path);
142             // Best approximation to a magic number be have (BOM plus inline comment). This stops
143             // use trying to parse the transliteration files, which are a different type.
144             if (!lines.get(0).startsWith("\uFEFF//")) {
145                 return false;
146             }
147             lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck);
148 
149             // Sanity check for expected final state. Just checking the "lastType" should be enough
150             // to catch everything else (due to transition rules and how the code tidies up) but it
151             // seems prudent to sanity check everything just in case.
152             checkState(lastType == LineType.GROUP_END);
153             checkState(!inBlockComment);
154             checkState(name != null);
155             checkState(pathStack.isEmpty() && inFinalGroup);
156             checkState(wrappedValue.isEmpty() && currentValue.isEmpty());
157             return true;
158         }
159 
processLineWithCheck(String line)160         void processLineWithCheck(String line) {
161             lineNumber++;
162             if (lineNumber == 1 && line.startsWith("\uFEFF")) {
163                 line = line.substring(1);
164             }
165             try {
166                 processLine(line);
167             } catch (RuntimeException e) {
168                 throw new RuntimeException(
169                     String.format("[%s:%s] %s (%s)", path, lineNumber, e.getMessage(), line),
170                     e);
171             }
172         }
173 
processLine(String line)174         void processLine(String line) {
175             line = maybeTrimEndOfLineComment(line);
176             if (line.isEmpty()) {
177                 return;
178             }
179             LineMatch match = LineType.match(line, inBlockComment);
180             checkState(match.getType().isValidTransitionFrom(lastType),
181                 "invalid state transition: %s --//-> %s", lastType, match.getType());
182             boolean isEndOfWrappedValue = false;
183             switch (match.getType()) {
184             case COMMENT:
185                 if (name != null) {
186                     // Comments in data are ignored since they cannot be properly associated with
187                     // paths or values in an IcuData instance (only legacy tooling emits these).
188                     break;
189                 }
190                 if (line.startsWith("/*")) {
191                     inBlockComment = true;
192                 }
193                 headerComment.add(match.get(0));
194                 if (inBlockComment && line.contains("*/")) {
195                     checkState(line.indexOf("*/") == line.length() - 2,
196                         "unexpected end of comment block");
197                     inBlockComment = false;
198                 }
199                 break;
200 
201             case INLINE_VALUE:
202                 icuData.put(
203                     getPathFromStack().extendBy(getSegment(match.get(0))),
204                     RbValue.of(unquote(match.get(1))));
205                 break;
206 
207             case GROUP_START:
208                 checkState(currentValue.isEmpty());
209                 if (name == null) {
210                     name = match.get(0);
211                     checkState(name != null, "cannot have anonymous top-level group");
212                 } else {
213                     pathStack.push(getSegment(match.get(0)));
214                 }
215                 wrappedValue = "";
216                 isLineContinuation = false;
217                 break;
218 
219             case QUOTED_VALUE:
220                 wrappedValue += unquote(match.get(0));
221                 isLineContinuation = !line.endsWith(",");
222                 if (!isLineContinuation) {
223                     currentValue.add(wrappedValue);
224                     wrappedValue = "";
225                 }
226                 break;
227 
228             case VALUE:
229                 checkState(!isLineContinuation, "unexpected unquoted value");
230                 currentValue.add(match.get(0));
231                 break;
232 
233             case GROUP_END:
234                 // Account for quoted values without trailing ',' just before group end.
235                 if (isLineContinuation) {
236                     currentValue.add(wrappedValue);
237                     isLineContinuation = false;
238                 }
239                 // Emit the collection sequence of values for the current path as an RbValue.
240                 if (!currentValue.isEmpty()) {
241                     icuData.put(getPathFromStack(), RbValue.of(currentValue));
242                     currentValue.clear();
243                 }
244                 // Annoyingly the name is outside the stack so the stack will empty before the last
245                 // end group.
246                 if (!pathStack.isEmpty()) {
247                     pathStack.pop();
248                     indices.setCount(pathStack.size(), 0);
249                 } else {
250                     checkState(!inFinalGroup, "unexpected group end");
251                     inFinalGroup = true;
252                 }
253                 break;
254 
255             case UNKNOWN:
256                 throw new IllegalStateException("cannot parse line: " + match.get(0));
257             }
258             lastType = match.getType();
259         }
260 
getPathFromStack()261         private RbPath getPathFromStack() {
262             if (pathStack.isEmpty()) {
263                 return RbPath.of();
264             }
265             List<String> segments = new ArrayList<>();
266             Iterables.addAll(segments, pathStack);
267             if (segments.get(0).matches("<[0-9]{4}>")) {
268                 segments.remove(0);
269             }
270             return RbPath.of(Lists.reverse(segments));
271         }
272 
getSegment(String segmentOrNull)273         private String getSegment(String segmentOrNull) {
274             if (segmentOrNull != null) {
275                 return segmentOrNull;
276             }
277             int depth = pathStack.size();
278             int index = indices.count(depth);
279             indices.add(depth, 1);
280             return String.format("<%04d>", index);
281         }
282 
maybeTrimEndOfLineComment(String line)283         private String maybeTrimEndOfLineComment(String line) {
284             // Once the name is set, we are past the header and into the data.
285             if (name != null) {
286                 // Index to search for '//' from - must skip quoted values.
287                 int startIdx = line.startsWith("\"") ? line.indexOf('"', 1) + 1 : 0;
288                 int commentIdx = line.indexOf("//", startIdx);
289                 if (commentIdx != -1) {
290                     line = whitespace().trimTrailingFrom(line.substring(0, commentIdx));
291                 }
292             }
293             return line;
294         }
295 
unquote(String s)296         private static String unquote(String s) {
297             if (s.startsWith("\"") && s.endsWith("\"")) {
298                 return s.substring(1, s.length() - 1).replaceAll("\\\\([\"\\\\])", "$1");
299             }
300             checkState(!s.contains("\""), "invalid unquoted value: %s", s);
301             return s;
302         }
303 
304         private static final class LineMatch {
305             private final LineType type;
306             private final Function<Integer, String> args;
307 
LineMatch(LineType type, Function<Integer, String> args)308             LineMatch(LineType type, Function<Integer, String> args) {
309                 this.type = checkNotNull(type);
310                 this.args = checkNotNull(args);
311             }
312 
get(int n)313             String get(int n) {
314                 return args.apply(n);
315             }
316 
getType()317             LineType getType() {
318                 return type;
319             }
320         }
321 
322         private enum LineType {
323             // Comment _start_ with any comment value captured.
324             COMMENT("(?://|/\\*)\\s*(.*)"),
325             // A combination of GROUP_START, VALUE and GROUP_END with whitespace.
326             INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"),
327             // Allows for empty segment names (anonymous arrays) which match 'null'.
328             GROUP_START("(?:(.*\\S)\\s*)?\\{"),
329             GROUP_END("\\}"),
330             QUOTED_VALUE("(\".*\"),?"),
331             VALUE("([^\"{}]+),?"),
332             UNKNOWN(".*");
333 
334             // Table of allowed transitions expected during parsing.
335             // key=current state, values=set of permitted previous states
336             private static ImmutableSetMultimap<LineType, LineType> TRANSITIONS =
337                 ImmutableSetMultimap.<LineType, LineType>builder()
338                     .putAll(COMMENT, COMMENT)
339                     .putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END)
340                     .putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE)
341                     .putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE)
342                     .putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE)
343                     .putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE)
344                     .build();
345 
346             private final Pattern pattern;
347 
LineType(String regex)348             LineType(String regex) {
349                 this.pattern = Pattern.compile(regex);
350             }
351 
isValidTransitionFrom(LineType lastType)352             boolean isValidTransitionFrom(LineType lastType) {
353                 return TRANSITIONS.get(this).contains(lastType);
354             }
355 
match(String line, boolean inBlockComment)356             static LineMatch match(String line, boolean inBlockComment) {
357                 // Block comments kinda suck and it'd be great if the ICU data only used '//' style
358                 // comments (if would definitely simplify any parsers out there). Once the
359                 // transition to the new transformation tools is complete, they can be changed to
360                 // only emit '//' style comments.
361                 if (inBlockComment) {
362                     if (line.startsWith("*")) {
363                         line = whitespace().trimLeadingFrom(line.substring(1));
364                     }
365                     return new LineMatch(COMMENT, ImmutableList.of(line)::get);
366                 }
367                 for (LineType type : TRANSITIONS.keySet()) {
368                     // Regex groups start at 1, but we want the getter function to be zero-indexed.
369                     Matcher m = type.pattern.matcher(line);
370                     if (m.matches()) {
371                         return new LineMatch(type, n -> {
372                             checkElementIndex(n, m.groupCount());
373                             return m.group(n + 1);
374                         });
375                     }
376                 }
377                 return new LineMatch(UNKNOWN, ImmutableList.of(line)::get);
378             }
379         }
380     }
381 }
382