• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2017 Google Inc. All Rights Reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.turbine.zip;
18 
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.primitives.UnsignedInts;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOError;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import java.util.zip.ZipException;
39 
40 /**
41  * A fast, minimal, and somewhat garbage zip implementation. This exists because graal <a
42  * href="http://mail.openjdk.java.net/pipermail/graal-dev/2017-August/005039.html">doesn't yet
43  * support</a> {@link java.util.zip.ZipFile}, and {@link java.util.zip.ZipInputStream} doesn't have
44  * the performance we'd like (*). If you're reading this, you almost certainly want {@code ZipFile}
45  * instead.
46  *
47  * <p>If you're reading this because you're fixing a bug, sorry.
48  *
49  * <p>(*) A benchmark that iterates over all of the entries in rt.jar takes 6.97ms to run with this
50  * implementation and 202.99ms with ZipInputStream. (Those are averages across 100 reps, and I
51  * verified they're doing the same work.) This is likely largely due to ZipInputStream reading the
52  * entire file from the beginning to scan the local headers, whereas this implementation (and
53  * ZipFile) only read the central directory. Iterating over the entries (but not reading the data)
54  * is an interesting benchmark because we typically only read ~10% of the compile-time classpath, so
55  * most time is spent just scanning entry names. And rt.jar is an interesting test case because
56  * every compilation has to read it, and it dominates the size of the classpath for small
57  * compilations.
58  *
59  * <p>Implementation notes:
60  *
61  * <ul>
62  *   <li>Leading garbage may be supported, since the archive is read backwards using the central
63  *       directory. Archives modified with zip -A may not be supported. Trailing garbage is not
64  *       supported.
65  *   <li>UTF-8 is the only supported encoding.
66  *   <li>STORED and DEFLATE are the only supported compression methods.
67  *   <li>Zip files larger than Integer.MAX_VALUE bytes are not supported.
68  *   <li>The only supported ZIP64 field is ENDTOT. This implementation assumes that the ZIP64 end
69  *       header is present only if ENDTOT in EOCD header is 0xFFFF.
70  * </ul>
71  */
72 public final class Zip {
73 
74   static final int ZIP64_ENDSIG = 0x06064b50;
75   static final int ZIP64_LOCSIG = 0x07064b50;
76 
77   static final int LOCHDR = 30; // LOC header size
78   static final int CENHDR = 46; // CEN header size
79   static final int ENDHDR = 22; // END header size
80   static final int ZIP64_LOCHDR = 20; // ZIP64 end locator header size
81   static final int ZIP64_ENDHDR = 56; // ZIP64 end header size
82 
83   static final int ENDTOT = 10; // total number of entries
84   static final int ENDSIZ = 12; // central directory size in bytes
85   static final int ENDCOM = 20; // zip file comment length
86 
87   static final int CENHOW = 10; // compression method
88   static final int CENLEN = 24; // uncompressed size
89   static final int CENSIZ = 20; // compressed size
90   static final int CENNAM = 28; // filename length
91   static final int CENEXT = 30; // extra field length
92   static final int CENCOM = 32; // comment length
93   static final int CENOFF = 42; // LOC header offset
94 
95   static final int LOCEXT = 28; // extra field length
96 
97   static final int ZIP64_ENDSIZ = 40; // central directory size in bytes
98 
99   static final int ZIP64_MAGICCOUNT = 0xFFFF;
100 
101   /** Iterates over a zip archive. */
102   static class ZipIterator implements Iterator<Entry> {
103 
104     /** A reader for the backing storage. */
105     private final FileChannel chan;
106 
107     private final Path path;
108     private int cdindex = 0;
109     private final MappedByteBuffer cd;
110     private final CharsetDecoder decoder = UTF_8.newDecoder();
111 
ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd)112     ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd) {
113       this.path = path;
114       this.chan = chan;
115       this.cd = cd;
116     }
117 
118     @Override
hasNext()119     public boolean hasNext() {
120       return cdindex < cd.limit();
121     }
122 
123     /* Returns a {@link Entry} for the current CEN entry. */
124     @Override
next()125     public Entry next() {
126       // TODO(cushon): technically we're supposed to throw NSEE
127       checkSignature(path, cd, cdindex, 1, 2, "CENSIG");
128       int nameLength = cd.getChar(cdindex + CENNAM);
129       int extLength = cd.getChar(cdindex + CENEXT);
130       int commentLength = cd.getChar(cdindex + CENCOM);
131       Entry entry = new Entry(path, chan, string(cd, cdindex + CENHDR, nameLength), cd, cdindex);
132       cdindex += CENHDR + nameLength + extLength + commentLength;
133       return entry;
134     }
135 
string(ByteBuffer buf, int offset, int length)136     public String string(ByteBuffer buf, int offset, int length) {
137       buf = buf.duplicate();
138       buf.position(offset);
139       buf.limit(offset + length);
140       decoder.reset();
141       try {
142         return decoder.decode(buf).toString();
143       } catch (CharacterCodingException e) {
144         throw new IOError(e);
145       }
146     }
147   }
148 
149   /** Provides an {@link Iterable} of {@link Entry} over a zip archive. */
150   public static class ZipIterable implements Iterable<Entry>, Closeable {
151 
152     private final Path path;
153     private final FileChannel chan;
154     private final MappedByteBuffer cd;
155 
ZipIterable(Path path)156     public ZipIterable(Path path) throws IOException {
157       this.path = path;
158       this.chan = FileChannel.open(path, StandardOpenOption.READ);
159       // Locate the EOCD
160       long size = chan.size();
161       if (size < ENDHDR) {
162         throw new ZipException("invalid zip archive");
163       }
164       long eocdOffset = size - ENDHDR;
165       MappedByteBuffer eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ENDHDR);
166       eocd.order(ByteOrder.LITTLE_ENDIAN);
167       int index = 0;
168       int commentSize = 0;
169       if (!isSignature(eocd, 0, 5, 6)) {
170         // The archive may contain a zip file comment; keep looking for the EOCD.
171         long start = Math.max(0, size - ENDHDR - 0xFFFF);
172         eocd = chan.map(MapMode.READ_ONLY, start, (size - start));
173         eocd.order(ByteOrder.LITTLE_ENDIAN);
174         index = (int) ((size - start) - ENDHDR);
175         while (index > 0) {
176           index--;
177           eocd.position(index);
178           if (isSignature(eocd, index, 5, 6)) {
179             commentSize = (int) ((size - start) - ENDHDR) - index;
180             eocdOffset = start + index;
181             break;
182           }
183         }
184       }
185       checkSignature(path, eocd, index, 5, 6, "ENDSIG");
186       int totalEntries = eocd.getChar(index + ENDTOT);
187       long cdsize = UnsignedInts.toLong(eocd.getInt(index + ENDSIZ));
188       int actualCommentSize = eocd.getChar(index + ENDCOM);
189       if (commentSize != actualCommentSize) {
190         throw new ZipException(
191             String.format(
192                 "zip file comment length was %d, expected %d", commentSize, actualCommentSize));
193       }
194       // If the number of entries is 0xffff, check if the archive has a zip64 EOCD locator.
195       if (totalEntries == ZIP64_MAGICCOUNT) {
196         // Assume the zip64 EOCD has the usual size; we don't support zip64 extensible data sectors.
197         long zip64eocdOffset = size - ENDHDR - ZIP64_LOCHDR - ZIP64_ENDHDR;
198         // Note that zip reading is necessarily best-effort, since an archive could contain 0xFFFF
199         // entries and the last entry's data could contain a ZIP64_ENDSIG. Some implementations
200         // read the full EOCD records and compare them.
201         long zip64cdsize = zip64cdsize(chan, zip64eocdOffset);
202         if (zip64cdsize != -1) {
203           eocdOffset = zip64eocdOffset;
204           cdsize = zip64cdsize;
205         } else {
206           // If we couldn't find a zip64 EOCD at a fixed offset, either it doesn't exist
207           // or there was a zip64 extensible data sector, so try going through the
208           // locator. This approach doesn't work if data was prepended to the archive
209           // without updating the offset in the locator.
210           MappedByteBuffer zip64loc =
211               chan.map(MapMode.READ_ONLY, size - ENDHDR - ZIP64_LOCHDR, ZIP64_LOCHDR);
212           zip64loc.order(ByteOrder.LITTLE_ENDIAN);
213           if (zip64loc.getInt(0) == ZIP64_LOCSIG) {
214             zip64eocdOffset = zip64loc.getLong(8);
215             zip64cdsize = zip64cdsize(chan, zip64eocdOffset);
216             if (zip64cdsize != -1) {
217               eocdOffset = zip64eocdOffset;
218               cdsize = zip64cdsize;
219             }
220           }
221         }
222       }
223       this.cd = chan.map(MapMode.READ_ONLY, eocdOffset - cdsize, cdsize);
224       cd.order(ByteOrder.LITTLE_ENDIAN);
225     }
226 
zip64cdsize(FileChannel chan, long eocdOffset)227     static long zip64cdsize(FileChannel chan, long eocdOffset) throws IOException {
228       MappedByteBuffer zip64eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ZIP64_ENDHDR);
229       zip64eocd.order(ByteOrder.LITTLE_ENDIAN);
230       if (zip64eocd.getInt(0) == ZIP64_ENDSIG) {
231         return zip64eocd.getLong(ZIP64_ENDSIZ);
232       }
233       return -1;
234     }
235 
236     @Override
iterator()237     public Iterator<Entry> iterator() {
238       return new ZipIterator(path, chan, cd);
239     }
240 
241     @Override
close()242     public void close() throws IOException {
243       chan.close();
244     }
245   }
246 
247   /** An entry in a zip archive. */
248   public static class Entry {
249 
250     private final Path path;
251     private final FileChannel chan;
252     private final String name;
253     private final ByteBuffer cd;
254     private final int cdindex;
255 
Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex)256     public Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex) {
257       this.path = path;
258       this.chan = chan;
259       this.name = name;
260       this.cd = cd;
261       this.cdindex = cdindex;
262     }
263 
264     /** The entry name. */
name()265     public String name() {
266       return name;
267     }
268 
269     /** The entry data. */
data()270     public byte[] data() {
271       // Read the offset and variable lengths from the central directory and then try to map in the
272       // data section in one shot.
273       long offset = UnsignedInts.toLong(cd.getInt(cdindex + CENOFF));
274       int nameLength = cd.getChar(cdindex + CENNAM);
275       int extLength = cd.getChar(cdindex + CENEXT);
276       int compression = cd.getChar(cdindex + CENHOW);
277       switch (compression) {
278         case 0x8:
279           return getBytes(
280               offset,
281               nameLength,
282               extLength,
283               UnsignedInts.toLong(cd.getInt(cdindex + CENSIZ)),
284               /*deflate=*/ true);
285         case 0x0:
286           return getBytes(
287               offset,
288               nameLength,
289               extLength,
290               UnsignedInts.toLong(cd.getInt(cdindex + CENLEN)),
291               /*deflate=*/ false);
292         default:
293           throw new AssertionError(
294               String.format("unsupported compression mode: 0x%x", compression));
295       }
296     }
297 
298     /**
299      * Number of extra bytes to read for each file, to avoid re-mapping the data if the local header
300      * reports more extra field data than the central directory.
301      */
302     static final int EXTRA_FIELD_SLACK = 128;
303 
getBytes( long offset, int nameLength, int cenExtLength, long size, boolean deflate)304     private byte[] getBytes(
305         long offset, int nameLength, int cenExtLength, long size, boolean deflate) {
306       if (size > Integer.MAX_VALUE) {
307         throw new IllegalArgumentException("unsupported zip entry size: " + size);
308       }
309       try {
310         MappedByteBuffer fc =
311             chan.map(
312                 MapMode.READ_ONLY,
313                 offset,
314                 Math.min(
315                     LOCHDR + nameLength + cenExtLength + size + EXTRA_FIELD_SLACK,
316                     chan.size() - offset));
317         fc.order(ByteOrder.LITTLE_ENDIAN);
318         checkSignature(path, fc, /* index= */ 0, 3, 4, "LOCSIG");
319         int locExtLength = fc.getChar(LOCEXT);
320         if (locExtLength > cenExtLength + EXTRA_FIELD_SLACK) {
321           // If the local header's extra fields don't match the central directory and we didn't
322           // leave enough slac, re-map the data section with the correct extra field length.
323           fc = chan.map(MapMode.READ_ONLY, offset + LOCHDR + nameLength + locExtLength, size);
324           fc.order(ByteOrder.LITTLE_ENDIAN);
325         } else {
326           // Otherwise seek past the local header, name, and extra fields to the data.
327           fc.position(LOCHDR + nameLength + locExtLength);
328           fc.limit((int) (LOCHDR + nameLength + locExtLength + size));
329         }
330         byte[] bytes = new byte[(int) size];
331         fc.get(bytes);
332         if (deflate) {
333           bytes =
334               new InflaterInputStream(
335                       new ByteArrayInputStream(bytes), new Inflater(/*nowrap=*/ true))
336                   .readAllBytes();
337         }
338         return bytes;
339       } catch (IOException e) {
340         throw new IOError(e);
341       }
342     }
343   }
344 
checkSignature( Path path, MappedByteBuffer buf, int index, int i, int j, String name)345   static void checkSignature(
346       Path path, MappedByteBuffer buf, int index, int i, int j, String name) {
347     if (!isSignature(buf, index, i, j)) {
348       throw new AssertionError(
349           String.format(
350               "%s: bad %s (expected: 0x%02x%02x%02x%02x, actual: 0x%08x)",
351               path, name, i, j, (int) 'K', (int) 'P', buf.getInt(index)));
352     }
353   }
354 
isSignature(MappedByteBuffer buf, int index, int i, int j)355   static boolean isSignature(MappedByteBuffer buf, int index, int i, int j) {
356     return (buf.get(index) == 'P')
357         && (buf.get(index + 1) == 'K')
358         && (buf.get(index + 2) == i)
359         && (buf.get(index + 3) == j);
360   }
361 
Zip()362   private Zip() {}
363 }
364