1 /* 2 * Copyright 2017 Google Inc. All Rights Reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.turbine.zip; 18 19 import static java.nio.charset.StandardCharsets.UTF_8; 20 21 import com.google.common.primitives.UnsignedInts; 22 import java.io.ByteArrayInputStream; 23 import java.io.Closeable; 24 import java.io.IOError; 25 import java.io.IOException; 26 import java.nio.ByteBuffer; 27 import java.nio.ByteOrder; 28 import java.nio.MappedByteBuffer; 29 import java.nio.channels.FileChannel; 30 import java.nio.channels.FileChannel.MapMode; 31 import java.nio.charset.CharacterCodingException; 32 import java.nio.charset.CharsetDecoder; 33 import java.nio.file.Path; 34 import java.nio.file.StandardOpenOption; 35 import java.util.Iterator; 36 import java.util.zip.Inflater; 37 import java.util.zip.InflaterInputStream; 38 import java.util.zip.ZipException; 39 40 /** 41 * A fast, minimal, and somewhat garbage zip implementation. This exists because graal <a 42 * href="http://mail.openjdk.java.net/pipermail/graal-dev/2017-August/005039.html">doesn't yet 43 * support</a> {@link java.util.zip.ZipFile}, and {@link java.util.zip.ZipInputStream} doesn't have 44 * the performance we'd like (*). If you're reading this, you almost certainly want {@code ZipFile} 45 * instead. 46 * 47 * <p>If you're reading this because you're fixing a bug, sorry. 48 * 49 * <p>(*) A benchmark that iterates over all of the entries in rt.jar takes 6.97ms to run with this 50 * implementation and 202.99ms with ZipInputStream. (Those are averages across 100 reps, and I 51 * verified they're doing the same work.) This is likely largely due to ZipInputStream reading the 52 * entire file from the beginning to scan the local headers, whereas this implementation (and 53 * ZipFile) only read the central directory. Iterating over the entries (but not reading the data) 54 * is an interesting benchmark because we typically only read ~10% of the compile-time classpath, so 55 * most time is spent just scanning entry names. And rt.jar is an interesting test case because 56 * every compilation has to read it, and it dominates the size of the classpath for small 57 * compilations. 58 * 59 * <p>Implementation notes: 60 * 61 * <ul> 62 * <li>Leading garbage may be supported, since the archive is read backwards using the central 63 * directory. Archives modified with zip -A may not be supported. Trailing garbage is not 64 * supported. 65 * <li>UTF-8 is the only supported encoding. 66 * <li>STORED and DEFLATE are the only supported compression methods. 67 * <li>Zip files larger than Integer.MAX_VALUE bytes are not supported. 68 * <li>The only supported ZIP64 field is ENDTOT. This implementation assumes that the ZIP64 end 69 * header is present only if ENDTOT in EOCD header is 0xFFFF. 70 * </ul> 71 */ 72 public final class Zip { 73 74 static final int ZIP64_ENDSIG = 0x06064b50; 75 static final int ZIP64_LOCSIG = 0x07064b50; 76 77 static final int LOCHDR = 30; // LOC header size 78 static final int CENHDR = 46; // CEN header size 79 static final int ENDHDR = 22; // END header size 80 static final int ZIP64_LOCHDR = 20; // ZIP64 end locator header size 81 static final int ZIP64_ENDHDR = 56; // ZIP64 end header size 82 83 static final int ENDTOT = 10; // total number of entries 84 static final int ENDSIZ = 12; // central directory size in bytes 85 static final int ENDCOM = 20; // zip file comment length 86 87 static final int CENHOW = 10; // compression method 88 static final int CENLEN = 24; // uncompressed size 89 static final int CENSIZ = 20; // compressed size 90 static final int CENNAM = 28; // filename length 91 static final int CENEXT = 30; // extra field length 92 static final int CENCOM = 32; // comment length 93 static final int CENOFF = 42; // LOC header offset 94 95 static final int LOCEXT = 28; // extra field length 96 97 static final int ZIP64_ENDSIZ = 40; // central directory size in bytes 98 99 static final int ZIP64_MAGICCOUNT = 0xFFFF; 100 101 /** Iterates over a zip archive. */ 102 static class ZipIterator implements Iterator<Entry> { 103 104 /** A reader for the backing storage. */ 105 private final FileChannel chan; 106 107 private final Path path; 108 private int cdindex = 0; 109 private final MappedByteBuffer cd; 110 private final CharsetDecoder decoder = UTF_8.newDecoder(); 111 ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd)112 ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd) { 113 this.path = path; 114 this.chan = chan; 115 this.cd = cd; 116 } 117 118 @Override hasNext()119 public boolean hasNext() { 120 return cdindex < cd.limit(); 121 } 122 123 /* Returns a {@link Entry} for the current CEN entry. */ 124 @Override next()125 public Entry next() { 126 // TODO(cushon): technically we're supposed to throw NSEE 127 checkSignature(path, cd, cdindex, 1, 2, "CENSIG"); 128 int nameLength = cd.getChar(cdindex + CENNAM); 129 int extLength = cd.getChar(cdindex + CENEXT); 130 int commentLength = cd.getChar(cdindex + CENCOM); 131 Entry entry = new Entry(path, chan, string(cd, cdindex + CENHDR, nameLength), cd, cdindex); 132 cdindex += CENHDR + nameLength + extLength + commentLength; 133 return entry; 134 } 135 string(ByteBuffer buf, int offset, int length)136 public String string(ByteBuffer buf, int offset, int length) { 137 buf = buf.duplicate(); 138 buf.position(offset); 139 buf.limit(offset + length); 140 decoder.reset(); 141 try { 142 return decoder.decode(buf).toString(); 143 } catch (CharacterCodingException e) { 144 throw new IOError(e); 145 } 146 } 147 } 148 149 /** Provides an {@link Iterable} of {@link Entry} over a zip archive. */ 150 public static class ZipIterable implements Iterable<Entry>, Closeable { 151 152 private final Path path; 153 private final FileChannel chan; 154 private final MappedByteBuffer cd; 155 ZipIterable(Path path)156 public ZipIterable(Path path) throws IOException { 157 this.path = path; 158 this.chan = FileChannel.open(path, StandardOpenOption.READ); 159 // Locate the EOCD 160 long size = chan.size(); 161 if (size < ENDHDR) { 162 throw new ZipException("invalid zip archive"); 163 } 164 long eocdOffset = size - ENDHDR; 165 MappedByteBuffer eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ENDHDR); 166 eocd.order(ByteOrder.LITTLE_ENDIAN); 167 int index = 0; 168 int commentSize = 0; 169 if (!isSignature(eocd, 0, 5, 6)) { 170 // The archive may contain a zip file comment; keep looking for the EOCD. 171 long start = Math.max(0, size - ENDHDR - 0xFFFF); 172 eocd = chan.map(MapMode.READ_ONLY, start, (size - start)); 173 eocd.order(ByteOrder.LITTLE_ENDIAN); 174 index = (int) ((size - start) - ENDHDR); 175 while (index > 0) { 176 index--; 177 eocd.position(index); 178 if (isSignature(eocd, index, 5, 6)) { 179 commentSize = (int) ((size - start) - ENDHDR) - index; 180 eocdOffset = start + index; 181 break; 182 } 183 } 184 } 185 checkSignature(path, eocd, index, 5, 6, "ENDSIG"); 186 int totalEntries = eocd.getChar(index + ENDTOT); 187 long cdsize = UnsignedInts.toLong(eocd.getInt(index + ENDSIZ)); 188 int actualCommentSize = eocd.getChar(index + ENDCOM); 189 if (commentSize != actualCommentSize) { 190 throw new ZipException( 191 String.format( 192 "zip file comment length was %d, expected %d", commentSize, actualCommentSize)); 193 } 194 // If the number of entries is 0xffff, check if the archive has a zip64 EOCD locator. 195 if (totalEntries == ZIP64_MAGICCOUNT) { 196 // Assume the zip64 EOCD has the usual size; we don't support zip64 extensible data sectors. 197 long zip64eocdOffset = size - ENDHDR - ZIP64_LOCHDR - ZIP64_ENDHDR; 198 // Note that zip reading is necessarily best-effort, since an archive could contain 0xFFFF 199 // entries and the last entry's data could contain a ZIP64_ENDSIG. Some implementations 200 // read the full EOCD records and compare them. 201 long zip64cdsize = zip64cdsize(chan, zip64eocdOffset); 202 if (zip64cdsize != -1) { 203 eocdOffset = zip64eocdOffset; 204 cdsize = zip64cdsize; 205 } else { 206 // If we couldn't find a zip64 EOCD at a fixed offset, either it doesn't exist 207 // or there was a zip64 extensible data sector, so try going through the 208 // locator. This approach doesn't work if data was prepended to the archive 209 // without updating the offset in the locator. 210 MappedByteBuffer zip64loc = 211 chan.map(MapMode.READ_ONLY, size - ENDHDR - ZIP64_LOCHDR, ZIP64_LOCHDR); 212 zip64loc.order(ByteOrder.LITTLE_ENDIAN); 213 if (zip64loc.getInt(0) == ZIP64_LOCSIG) { 214 zip64eocdOffset = zip64loc.getLong(8); 215 zip64cdsize = zip64cdsize(chan, zip64eocdOffset); 216 if (zip64cdsize != -1) { 217 eocdOffset = zip64eocdOffset; 218 cdsize = zip64cdsize; 219 } 220 } 221 } 222 } 223 this.cd = chan.map(MapMode.READ_ONLY, eocdOffset - cdsize, cdsize); 224 cd.order(ByteOrder.LITTLE_ENDIAN); 225 } 226 zip64cdsize(FileChannel chan, long eocdOffset)227 static long zip64cdsize(FileChannel chan, long eocdOffset) throws IOException { 228 MappedByteBuffer zip64eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ZIP64_ENDHDR); 229 zip64eocd.order(ByteOrder.LITTLE_ENDIAN); 230 if (zip64eocd.getInt(0) == ZIP64_ENDSIG) { 231 return zip64eocd.getLong(ZIP64_ENDSIZ); 232 } 233 return -1; 234 } 235 236 @Override iterator()237 public Iterator<Entry> iterator() { 238 return new ZipIterator(path, chan, cd); 239 } 240 241 @Override close()242 public void close() throws IOException { 243 chan.close(); 244 } 245 } 246 247 /** An entry in a zip archive. */ 248 public static class Entry { 249 250 private final Path path; 251 private final FileChannel chan; 252 private final String name; 253 private final ByteBuffer cd; 254 private final int cdindex; 255 Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex)256 public Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex) { 257 this.path = path; 258 this.chan = chan; 259 this.name = name; 260 this.cd = cd; 261 this.cdindex = cdindex; 262 } 263 264 /** The entry name. */ name()265 public String name() { 266 return name; 267 } 268 269 /** The entry data. */ data()270 public byte[] data() { 271 // Read the offset and variable lengths from the central directory and then try to map in the 272 // data section in one shot. 273 long offset = UnsignedInts.toLong(cd.getInt(cdindex + CENOFF)); 274 int nameLength = cd.getChar(cdindex + CENNAM); 275 int extLength = cd.getChar(cdindex + CENEXT); 276 int compression = cd.getChar(cdindex + CENHOW); 277 switch (compression) { 278 case 0x8: 279 return getBytes( 280 offset, 281 nameLength, 282 extLength, 283 UnsignedInts.toLong(cd.getInt(cdindex + CENSIZ)), 284 /*deflate=*/ true); 285 case 0x0: 286 return getBytes( 287 offset, 288 nameLength, 289 extLength, 290 UnsignedInts.toLong(cd.getInt(cdindex + CENLEN)), 291 /*deflate=*/ false); 292 default: 293 throw new AssertionError( 294 String.format("unsupported compression mode: 0x%x", compression)); 295 } 296 } 297 298 /** 299 * Number of extra bytes to read for each file, to avoid re-mapping the data if the local header 300 * reports more extra field data than the central directory. 301 */ 302 static final int EXTRA_FIELD_SLACK = 128; 303 getBytes( long offset, int nameLength, int cenExtLength, long size, boolean deflate)304 private byte[] getBytes( 305 long offset, int nameLength, int cenExtLength, long size, boolean deflate) { 306 if (size > Integer.MAX_VALUE) { 307 throw new IllegalArgumentException("unsupported zip entry size: " + size); 308 } 309 try { 310 MappedByteBuffer fc = 311 chan.map( 312 MapMode.READ_ONLY, 313 offset, 314 Math.min( 315 LOCHDR + nameLength + cenExtLength + size + EXTRA_FIELD_SLACK, 316 chan.size() - offset)); 317 fc.order(ByteOrder.LITTLE_ENDIAN); 318 checkSignature(path, fc, /* index= */ 0, 3, 4, "LOCSIG"); 319 int locExtLength = fc.getChar(LOCEXT); 320 if (locExtLength > cenExtLength + EXTRA_FIELD_SLACK) { 321 // If the local header's extra fields don't match the central directory and we didn't 322 // leave enough slac, re-map the data section with the correct extra field length. 323 fc = chan.map(MapMode.READ_ONLY, offset + LOCHDR + nameLength + locExtLength, size); 324 fc.order(ByteOrder.LITTLE_ENDIAN); 325 } else { 326 // Otherwise seek past the local header, name, and extra fields to the data. 327 fc.position(LOCHDR + nameLength + locExtLength); 328 fc.limit((int) (LOCHDR + nameLength + locExtLength + size)); 329 } 330 byte[] bytes = new byte[(int) size]; 331 fc.get(bytes); 332 if (deflate) { 333 bytes = 334 new InflaterInputStream( 335 new ByteArrayInputStream(bytes), new Inflater(/*nowrap=*/ true)) 336 .readAllBytes(); 337 } 338 return bytes; 339 } catch (IOException e) { 340 throw new IOError(e); 341 } 342 } 343 } 344 checkSignature( Path path, MappedByteBuffer buf, int index, int i, int j, String name)345 static void checkSignature( 346 Path path, MappedByteBuffer buf, int index, int i, int j, String name) { 347 if (!isSignature(buf, index, i, j)) { 348 throw new AssertionError( 349 String.format( 350 "%s: bad %s (expected: 0x%02x%02x%02x%02x, actual: 0x%08x)", 351 path, name, i, j, (int) 'K', (int) 'P', buf.getInt(index))); 352 } 353 } 354 isSignature(MappedByteBuffer buf, int index, int i, int j)355 static boolean isSignature(MappedByteBuffer buf, int index, int i, int j) { 356 return (buf.get(index) == 'P') 357 && (buf.get(index + 1) == 'K') 358 && (buf.get(index + 2) == i) 359 && (buf.get(index + 3) == j); 360 } 361 Zip()362 private Zip() {} 363 } 364