1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.ant; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkState; 7 import static com.google.common.collect.ImmutableSet.toImmutableSet; 8 import static java.nio.charset.StandardCharsets.UTF_8; 9 import static java.nio.file.LinkOption.NOFOLLOW_LINKS; 10 import static java.util.stream.Collectors.joining; 11 import static java.util.stream.Collectors.partitioningBy; 12 13 import java.io.BufferedReader; 14 import java.io.IOException; 15 import java.io.InputStream; 16 import java.io.InputStreamReader; 17 import java.io.UncheckedIOException; 18 import java.nio.file.Files; 19 import java.nio.file.Path; 20 import java.nio.file.Paths; 21 import java.util.ArrayList; 22 import java.util.Arrays; 23 import java.util.HashSet; 24 import java.util.List; 25 import java.util.Map; 26 import java.util.Set; 27 import java.util.TreeSet; 28 import java.util.stream.Collectors; 29 import java.util.stream.Stream; 30 31 import org.apache.tools.ant.BuildException; 32 import org.apache.tools.ant.Task; 33 import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir; 34 35 import com.google.common.base.CharMatcher; 36 import com.google.common.collect.ImmutableList; 37 import com.google.common.collect.ImmutableSet; 38 import com.google.common.collect.Iterables; 39 import com.google.common.io.CharStreams; 40 41 // Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed. 42 public final class CleanOutputDirectoryTask extends Task { 43 private static final ImmutableSet<String> ALLOWED_DIRECTORIES = 44 Stream 45 .concat( 46 Stream.of("misc", "translit"), 47 Arrays.stream(IcuLocaleDir.values()).map(IcuLocaleDir::getOutputDir)) 48 .sorted() 49 .collect(toImmutableSet()); 50 51 private static final CharMatcher NOT_WHITESPACE = CharMatcher.whitespace().negate(); 52 53 private static final String HEADER_FILE = "ldml2icu_header.txt"; 54 55 // If present in the header of a file, this line is used to determine that the file was 56 // auto-generated. This allows us to change the rest of the header freely without issue. 57 // However if it's not present in the file, we fallback to comparing the rest of the 58 // header without it (since that's the old behaviour). 59 // Once there's been an ICU release with this line included in the headers of all data 60 // files, we can remove the fallback and just test for this line and nothing else. 61 private static final String WAS_GENERATED_LABEL = 62 "Generated using tools/cldr/cldr-to-icu/build-icu-data.xml"; 63 64 // The number of header lines to check before giving up if we don't find the generated 65 // label. 66 private static final int MAX_HEADER_CHECK_LINES = 20; 67 68 private Path root = null; 69 private boolean forceDelete = false; 70 private final List<Dir> outputDirs = new ArrayList<>(); 71 private final ImmutableList<String> headerLines; 72 CleanOutputDirectoryTask()73 public CleanOutputDirectoryTask() { 74 // TODO: Consider passing in header lines via Ant? 75 this.headerLines = readLinesFromResource("/" + HEADER_FILE); 76 // For now assume that the generated label is the last line of the header. 77 checkState(Iterables.getLast(headerLines).equals(WAS_GENERATED_LABEL), 78 "Expected last line of %s header file to be:\n\t%s", HEADER_FILE, WAS_GENERATED_LABEL); 79 // Make sure we check at least a few more lines than is in the current header. 80 checkState(MAX_HEADER_CHECK_LINES >= headerLines.size() + 5, 81 "Unexpectedly large header file; please increase MAX_HEADER_CHECK_LINES constant"); 82 } 83 84 public static final class Retain extends Task { 85 private Path path = null; 86 87 // Don't use "Path" for the argument type because that always makes an absolute path (e.g. 88 // relative to the working directory for the Ant task). We want relative paths. 89 @SuppressWarnings("unused") setPath(String path)90 public void setPath(String path) { 91 Path p = Paths.get(path).normalize(); 92 checkBuild(!p.isAbsolute() && !p.startsWith(".."), "invalid path: %s", path); 93 this.path = p; 94 } 95 96 @Override init()97 public void init() throws BuildException { 98 checkBuild(path != null, "missing 'path' attribute"); 99 } 100 } 101 102 public static final class Dir extends Task { 103 private String name; 104 private final Set<Path> retained = new HashSet<>(); 105 106 @SuppressWarnings("unused") setName(String name)107 public void setName(String name) { 108 checkBuild(ALLOWED_DIRECTORIES.contains(name), 109 "unknown directory name '%s'; allowed values: %s", name, ALLOWED_DIRECTORIES); 110 this.name = name; 111 } 112 113 @SuppressWarnings("unused") addConfiguredRetain(Retain retain)114 public void addConfiguredRetain(Retain retain) { 115 retained.add(retain.path); 116 } 117 118 @Override init()119 public void init() throws BuildException { 120 checkBuild(name != null, "missing 'name' attribute"); 121 } 122 } 123 124 @SuppressWarnings("unused") setRoot(String root)125 public void setRoot(String root) { 126 // Use String here since on some systems Ant doesn't support automatically converting Path instances. 127 this.root = Paths.get(root); 128 } 129 130 @SuppressWarnings("unused") setForceDelete(boolean forceDelete)131 public void setForceDelete(boolean forceDelete) { 132 this.forceDelete = forceDelete; 133 } 134 135 @SuppressWarnings("unused") addConfiguredDir(Dir dir)136 public void addConfiguredDir(Dir dir) { 137 outputDirs.add(dir); 138 } 139 140 @Override execute()141 public void execute() throws BuildException { 142 checkBuild(root != null, "missing 'root' attribute"); 143 checkBuild(!outputDirs.isEmpty(), "missing <dir> elements"); 144 145 if (!Files.exists(root)) { 146 log("Root directory '" + root + "' does not exist (nothing to clean)"); 147 return; 148 } 149 checkBuild(Files.isDirectory(root), "specified root '%s' is not a directory", root); 150 151 Set<Path> autogenFiles = new TreeSet<>(); 152 Set<Path> unknownFiles = new TreeSet<>(); 153 for (Dir dirInfo : outputDirs) { 154 Path dirPath = root.resolve(dirInfo.name); 155 if (!Files.exists(dirPath)) { 156 continue; 157 } 158 checkBuild(Files.isDirectory(dirPath), "'%s' is not a directory", dirPath); 159 160 // Note: For now we just walk the immediate contents of each output directory and don't 161 // attempt to recursively process things. Only a couple of output directories have 162 // sub-directories anyway, and we never write files into them anyway. 163 try (Stream<Path> files = Files.list(dirPath)) { 164 Map<Boolean, List<Path>> map = files 165 .filter(p -> couldDelete(p, dirPath, dirInfo)) 166 .parallel() 167 .collect(partitioningBy(this::wasAutoGenerated)); 168 unknownFiles.addAll(map.get(false)); 169 autogenFiles.addAll(map.get(true)); 170 } catch (IOException e) { 171 throw new BuildException("Error processing directory: " + dirPath, e); 172 } 173 } 174 175 if (!unknownFiles.isEmpty() && !forceDelete) { 176 // If there are NO safe files, then something weird is going on (perhaps a change in 177 // the header file). 178 if (autogenFiles.isEmpty()) { 179 log("Error determining 'safe' files for deletion (no auto-generated files found)."); 180 log(unknownFiles.size() + " files would be deleted for 'clean' task"); 181 logPartioned(unknownFiles); 182 log("Set '-DforceDelete=true' to delete all files not listed in" 183 + " <outputDirectories>."); 184 } else { 185 // A mix of safe and unsafe files is weird, but in this case it should be a 186 // relatively small number of files (e.g. adding a new manually maintained file or 187 // accidental editing of header lines). 188 log("Unknown files exist which cannot be determined to be auto-generated"); 189 log("Files:"); 190 logPartioned(unknownFiles); 191 log(String.format("%d unknown files or directories found", unknownFiles.size())); 192 log("Set '-DforceDelete=true' to delete these files, or add them to" 193 + " <outputDirectories>."); 194 } 195 throw new BuildException("Unsafe files cannot be deleted"); 196 } 197 if (!unknownFiles.isEmpty()) { 198 checkState(forceDelete, "unexpected flag state (forceDelete should be true here)"); 199 List<Path> filesToDelete = 200 unknownFiles.stream() 201 .filter(p -> !Files.isDirectory(p)) 202 .collect(Collectors.toList()); 203 log(String.format("Force deleting %,d files...\n", filesToDelete.size())); 204 deleteAllFiles(filesToDelete); 205 206 List<Path> unknownDirs = 207 unknownFiles.stream() 208 .filter(p -> Files.isDirectory(p)) 209 .collect(Collectors.toList()); 210 if (!unknownDirs.isEmpty()) { 211 log("Add the following directories to the <outputDirectories> task:"); 212 logPartioned(unknownDirs); 213 throw new BuildException("Unsafe directories cannot be deleted"); 214 } 215 } 216 if (!autogenFiles.isEmpty()) { 217 log(String.format("Deleting %,d auto-generated files...\n", autogenFiles.size())); 218 deleteAllFiles(autogenFiles); 219 } 220 } 221 logPartioned(Iterable<Path> files)222 private void logPartioned(Iterable<Path> files) { 223 Iterables.partition(files, 5) 224 .forEach(f -> log( 225 f.stream().map(p -> root.relativize(p).toString()).collect(joining(", ")))); 226 } 227 couldDelete(Path path, Path dir, Dir dirInfo)228 private boolean couldDelete(Path path, Path dir, Dir dirInfo) { 229 return !dirInfo.retained.contains(dir.relativize(path)); 230 } 231 wasAutoGenerated(Path path)232 private boolean wasAutoGenerated(Path path) { 233 if (!Files.isRegularFile(path, NOFOLLOW_LINKS)) { 234 // Directories, symbolic links, devices etc. 235 return false; 236 } 237 try (BufferedReader r = Files.newBufferedReader(path, UTF_8)) { 238 return wasFileAutoGenerated(r, headerLines); 239 } catch (IOException e) { 240 throw new UncheckedIOException(e); 241 } 242 } 243 244 // TODO: Once the WAS_GENERATED_LABEL is in all auto-generated ICU data files, simplify this. 245 // Static and non-private for testing. wasFileAutoGenerated(BufferedReader fileReader, ImmutableList<String> headerLines)246 static boolean wasFileAutoGenerated(BufferedReader fileReader, ImmutableList<String> headerLines) 247 throws IOException { 248 // A byte-order-mark (BOM) is added to ICU data files, but not JSON deps files, so just 249 // treat it as optional everywhere (it's not the important thing we check here). 250 fileReader.mark(1); 251 int maybeByteOrderMark = fileReader.read(); 252 if (maybeByteOrderMark != '\uFEFF') { 253 // Also reset if the file was empty, but that should be harmless. 254 fileReader.reset(); 255 } 256 boolean isLenientHeaderMatchSoFar = true; 257 for (int n = 0; n < MAX_HEADER_CHECK_LINES ; n++) { 258 String line = fileReader.readLine(); 259 // True if we have processed the header, not including the trailing generated label. 260 boolean headerIsProcessed = n >= headerLines.size() - 1; 261 boolean isCompleteLenientMatch = isLenientHeaderMatchSoFar && headerIsProcessed; 262 if (line == null) { 263 // We ran off the end of the file, so we're done. 264 return isCompleteLenientMatch; 265 } 266 int headerStart = skipComment(line); 267 if (headerStart < 0) { 268 // We ran off the end of the expected comment section, so we're done. 269 return isCompleteLenientMatch; 270 } 271 if (matchesToEndOfLine(line, headerStart, WAS_GENERATED_LABEL)) { 272 // Finding the generated label trumps any lenient matching. 273 return true; 274 } 275 if (!isLenientHeaderMatchSoFar) { 276 // We already failed at lenient matching, so keep going in the hope of finding 277 // the generated label (we don't need to check the header any more). 278 continue; 279 } 280 if (headerIsProcessed) { 281 // We finishing processing the header and it matched (not including the 282 // generated label) but for older data files, that's fine. 283 return true; 284 } 285 // Check the next header line (not including the trailing generated label). 286 isLenientHeaderMatchSoFar = matchesToEndOfLine(line, headerStart, headerLines.get(n)); 287 } 288 // This is actually an unusual case. It corresponds to a file which: 289 // * has a leading comment section at least as long as MAX_HEADER_CHECK_LINES 290 // * does not contain WAS_GENERATED_LABEL anywhere in the first MAX_HEADER_CHECK_LINES 291 // Most files checked are expected to match, and so will not get here, but instead 292 // return via one of the return statements above. 293 return false; 294 } 295 matchesToEndOfLine(String line, int start, String expected)296 private static boolean matchesToEndOfLine(String line, int start, String expected) { 297 return line.length() - start == expected.length() 298 && line.regionMatches(start, expected, 0, expected.length()); 299 } 300 skipComment(String line)301 private static int skipComment(String line) { 302 if (line.startsWith("#")) { 303 return toCommentStart(line, 1); 304 } else if (line.startsWith("//")) { 305 return toCommentStart(line, 2); 306 } 307 return -1; 308 } 309 310 // Not just "index-of" since a comment start followed by only whitespace is NOT a failure to 311 // find a comment (since the header might have an empty line in it, which should be okay). toCommentStart(String line, int offset)312 private static int toCommentStart(String line, int offset) { 313 int index = NOT_WHITESPACE.indexIn(line, offset); 314 return index >= 0 ? index : line.length(); 315 } 316 deleteAllFiles(Iterable<Path> files)317 private static void deleteAllFiles(Iterable<Path> files) { 318 for (Path p : files) { 319 try { 320 // This is a code error, since only files should be passed here. 321 checkArgument(!Files.isDirectory(p), "Cannot delete directories: %s", p); 322 Files.deleteIfExists(p); 323 } catch (IOException e) { 324 throw new BuildException("Error deleting file: " + p, e); 325 } 326 } 327 } 328 checkBuild(boolean condition, String message, Object... args)329 private static void checkBuild(boolean condition, String message, Object... args) { 330 if (!condition) { 331 throw new BuildException(String.format(message, args)); 332 } 333 } 334 readLinesFromResource(String name)335 private static ImmutableList<String> readLinesFromResource(String name) { 336 try (InputStream in = CleanOutputDirectoryTask.class.getResourceAsStream(name)) { 337 return ImmutableList.copyOf(CharStreams.readLines(new InputStreamReader(in, UTF_8))); 338 } catch (IOException e) { 339 throw new RuntimeException("cannot read resource: " + name, e); 340 } 341 } 342 } 343