/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarArchiveInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and to read each entry as a normal input stream
 * using read().
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {

    /** Size of the scratch buffer used when draining long-name entries. */
    private static final int SMALL_BUFFER_SIZE = 256;

    /** Reusable scratch buffer; safe because this class is not thread-safe. */
    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The size of a TAR header record. */
    private final int recordSize;

    /** The size of a block (a block is a fixed number of records). */
    private final int blockSize;

    /** True if the stream has hit the end-of-archive marker. */
    private boolean hasHitEOF;

    /** Size in bytes of the current entry's data. */
    private long entrySize;

    /** How far into the current entry's data the stream has read. */
    private long entryOffset;

    /** The wrapped input stream all records are read from. */
    private final InputStream is;

    /** The meta-data about the current entry. */
    private TarArchiveEntry currEntry;

    /** The encoding used to decode file names from header bytes. */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    // the global PAX header; applied to every subsequent entry unless
    // overridden by an entry-local PAX header
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    /**
     * Constructor for TarArchiveInputStream using the default block and
     * record sizes.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(final InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream using the default block and
     * record sizes.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }

    /**
     * Constructor for TarArchiveInputStream using the default record size.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream using the default record size.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize,
                                 final String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarArchiveInputStream using the platform's default
     * file-name encoding.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
                                 final String encoding) {
        this.is = is;
        this.hasHitEOF = false;
        this.encoding = encoding;
        // a null encoding selects the implementation's default mapping
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.blockSize = blockSize;
    }

    /**
     * Closes this stream by closing the underlying input stream.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Get the record size being used by this stream's buffer.
     *
     * @return The TarBuffer record size.
     */
    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException for signature
     */
    @Override
    public int available() throws IOException {
        if (isDirectory()) {
            // directory entries carry no data
            return 0;
        }
        if (entrySize - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (entrySize - entryOffset);
    }


    /**
     * Skips over and discards <code>n</code> bytes of data from this input
     * stream. The <code>skip</code> method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly <code>0</code>.
     * This may result from any of a number of conditions; reaching end of file
     * or end of entry before <code>n</code> bytes have been skipped; are only
     * two possibilities. The actual number of bytes skipped is returned. If
     * <code>n</code> is negative, no bytes are skipped.
     *
     *
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @throws IOException
     *             if some other I/O error occurs.
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0 || isDirectory()) {
            return 0;
        }

        // never skip past the end of the current entry's data
        final long available = entrySize - entryOffset;
        final long skipped = IOUtils.skip(is, Math.min(n, available));
        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return False.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public void mark(final int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry, and read the header and instantiate a new
     * TarEntry from the header bytes and return that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        final byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, zipEncoding);
        } catch (final IllegalArgumentException e) {
            // surface malformed headers as an I/O problem, preserving the cause
            throw new IOException("Error detected parsing the header", e);
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }
            currEntry.setName(zipEncoding.decode(longNameData));
        }

        if (currEntry.isGlobalPaxHeader()){ // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        if (currEntry.isPaxHeader()){ // Process Pax headers
            paxHeaders();
        } else if (!globalPaxHeaders.isEmpty()) {
            applyPaxHeadersToCurrentEntry(globalPaxHeaders);
        }

        if (currEntry.isOldGNUSparse()){ // Process sparse files
            readOldGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill a record after an entry.
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
            // round the entry size up to the next record boundary
            final long numRecords = (this.entrySize / this.recordSize) + 1;
            final long padding = (numRecords * this.recordSize) - this.entrySize;
            final long skipped = IOUtils.skip(is, padding);
            count(skipped);
        }
    }

    /**
     * Get the next entry in this tar archive as longname data.
     *
     * @return The next entry in the archive as longname data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(smallBuf)) >= 0) {
            longName.write(smallBuf, 0, length);
        }
        // advance to the real entry the long name belongs to
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            final byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * Get the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached.  At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            // a conforming archive ends with two all-zero records followed by
            // padding up to a full block; consume them so getBytesRead() covers
            // the whole archive
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Determine if an archive record indicate End of Archive. End of
     * archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(final byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {

        final byte[] record = new byte[recordSize];

        final int readNow = IOUtils.readFully(is, record);
        count(readNow);
        if (readNow != recordSize) {
            // short read means the archive is truncated mid-record
            return null;
        }

        return record;
    }

    /**
     * Reads the global PAX header entry and positions the stream at the
     * actual file entry that follows it.
     */
    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
    }

    /**
     * Reads an entry-local PAX header, positions the stream at the actual
     * file entry that follows it and applies the headers to that entry.
     */
    private void paxHeaders() throws IOException{
        final Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        applyPaxHeadersToCurrentEntry(headers);
    }

    // NOTE, using a Map here makes it impossible to ever support GNU
    // sparse files using the PAX Format 0.0, see
    // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188
    Map<String, String> parsePaxHeaders(final InputStream i)
        throws IOException {
        // start from the global headers so entry-local values override them
        final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
        // Format is "length keyword=value\n";
        while(true){ // get length
            int ch;
            int len = 0;
            int read = 0;
            while((ch = i.read()) != -1) {
                read++;
                if (ch == '\n') { // blank line in header
                    break;
                } else if (ch == ' '){ // End of length string
                    // Get keyword
                    final ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while((ch = i.read()) != -1) {
                        read++;
                        if (ch == '='){ // end of keyword
                            final String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            // "length" counts the whole record, so the value
                            // occupies what remains after length, keyword and '='
                            final int restLen = len - read;
                            if (restLen == 1) { // only NL
                                // an empty value unsets the (global) header
                                headers.remove(keyword);
                            } else {
                                final byte[] rest = new byte[restLen];
                                final int got = IOUtils.readFully(i, rest);
                                if (got != restLen) {
                                    throw new IOException("Failed to read "
                                                          + "Paxheader. Expected "
                                                          + restLen
                                                          + " bytes, read "
                                                          + got);
                                }
                                // Drop trailing NL
                                final String value = new String(rest, 0,
                                                                restLen - 1, CharsetNames.UTF_8);
                                headers.put(keyword, value);
                            }
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                // accumulate the decimal length field digit by digit
                // NOTE(review): non-digit characters are not rejected here, so a
                // malformed length field silently corrupts len — verify upstream
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1){ // EOF
                break;
            }
        }
        return headers;
    }

    /** Copies the parsed PAX headers onto the current entry's fields. */
    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers) {
        currEntry.updateEntryFromPaxHeaders(headers);

    }

    /**
     * Adds the sparse chunks from the current entry to the sparse chunks,
     * including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     *
     * @todo Sparse files get not yet really processed.
     */
    private void readOldGNUSparse() throws IOException {
        /* we do not really process sparse files yet
        sparses = new ArrayList();
        sparses.addAll(currEntry.getSparses());
        */
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                final byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                /* we do not really process sparse files yet
                sparses.addAll(entry.getSparses());
                */
            } while (entry.isExtended());
        }
    }

    /** Whether the current entry is a directory. */
    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Tries to read the next record rewinding the stream if it is not a EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected.  Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - ten records either so we probably have already read
     * beyond the archive anyway.</p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        final boolean marked = is.markSupported();
        if (marked) {
            is.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                // undo the read: the record belonged to whatever follows the archive
                pushedBackBytes(recordSize);
                is.reset();
            }
        }
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
        int totalRead = 0;

        if (isAtEOF() || isDirectory() || entryOffset >= entrySize) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        // clamp the request to the bytes remaining in this entry
        numToRead = Math.min(numToRead, available());

        totalRead = is.read(buf, offset, numToRead);

        if (totalRead == -1) {
            if (numToRead > 0) {
                // the header promised more data than the stream contains
                throw new IOException("Truncated TAR archive");
            }
            setAtEOF(true);
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * <p>May return false if the current entry is a sparse file.</p>
     *
     * @param ae the entry to check
     * @return true if the entry is a non-sparse tar entry
     */
    @Override
    public boolean canReadEntryData(final ArchiveEntry ae) {
        if (ae instanceof TarArchiveEntry) {
            final TarArchiveEntry te = (TarArchiveEntry) ae;
            return !te.isSparse();
        }
        return false;
    }

    /**
     * Get the current TAR Archive Entry that this input stream is processing
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    /**
     * Replaces the current entry; intended for use by subclasses.
     * @param e the entry to become the current entry
     */
    protected final void setCurrentEntry(final TarArchiveEntry e) {
        currEntry = e;
    }

    /** @return whether the end-of-archive marker has been seen */
    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    /** @param b the new end-of-archive state */
    protected final void setAtEOF(final boolean b) {
        hasHitEOF = b;
    }

    /**
     * This method is invoked once the end of the archive is hit, it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            final long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
            count(skipped);
        }
    }

    /**
     * Checks if the signature matches what is expected for a tar file.
     *
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(final byte[] signature, final int length) {
        if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
            return false;
        }

        // POSIX "ustar" magic + version
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ){
            return true;
        }
        // GNU tar magic, which uses two different version markers
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ||
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            )
                ){
            return true;
        }
        // COMPRESS-107 - recognise Ant tar files
        return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
    }

}