001    /*
002     *  Licensed to the Apache Software Foundation (ASF) under one or more
003     *  contributor license agreements.  See the NOTICE file distributed with
004     *  this work for additional information regarding copyright ownership.
005     *  The ASF licenses this file to You under the Apache License, Version 2.0
006     *  (the "License"); you may not use this file except in compliance with
007     *  the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     *  Unless required by applicable law or agreed to in writing, software
012     *  distributed under the License is distributed on an "AS IS" BASIS,
013     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     *  See the License for the specific language governing permissions and
015     *  limitations under the License.
016     *
017     */
018    
019    /*
020     * This package is based on the work done by Timothy Gerard Endres
021     * (time@ice.com) to whom the Ant project is very grateful for his great code.
022     */
023    
024    package org.apache.commons.compress.archivers.tar;
025    
026    import java.io.ByteArrayOutputStream;
027    import java.io.IOException;
028    import java.io.InputStream;
029    import java.util.HashMap;
030    import java.util.Map;
031    import java.util.Map.Entry;
032    
033    import org.apache.commons.compress.archivers.ArchiveEntry;
034    import org.apache.commons.compress.archivers.ArchiveInputStream;
035    import org.apache.commons.compress.archivers.zip.ZipEncoding;
036    import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
037    import org.apache.commons.compress.utils.ArchiveUtils;
038    import org.apache.commons.compress.utils.CharsetNames;
039    
/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position the stream at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 * @NotThreadSafe
 */
047    public class TarArchiveInputStream extends ArchiveInputStream {
048        private static final int SMALL_BUFFER_SIZE = 256;
049        private static final int BUFFER_SIZE = 8 * 1024;
050    
051        private final byte[] SKIP_BUF = new byte[BUFFER_SIZE];
052        private final byte[] SMALL_BUF = new byte[SMALL_BUFFER_SIZE];
053    
054        private boolean hasHitEOF;
055        private long entrySize;
056        private long entryOffset;
057        private byte[] readBuf;
058        protected final TarBuffer buffer;
059        private TarArchiveEntry currEntry;
060        private final ZipEncoding encoding;
061    
062        /**
063         * Constructor for TarInputStream.
064         * @param is the input stream to use
065         */
066        public TarArchiveInputStream(InputStream is) {
067            this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE);
068        }
069    
070        /**
071         * Constructor for TarInputStream.
072         * @param is the input stream to use
073         * @param encoding name of the encoding to use for file names
074         * @since Commons Compress 1.4
075         */
076        public TarArchiveInputStream(InputStream is, String encoding) {
077            this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE, encoding);
078        }
079    
080        /**
081         * Constructor for TarInputStream.
082         * @param is the input stream to use
083         * @param blockSize the block size to use
084         */
085        public TarArchiveInputStream(InputStream is, int blockSize) {
086            this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE);
087        }
088    
089        /**
090         * Constructor for TarInputStream.
091         * @param is the input stream to use
092         * @param blockSize the block size to use
093         * @param encoding name of the encoding to use for file names
094         * @since Commons Compress 1.4
095         */
096        public TarArchiveInputStream(InputStream is, int blockSize,
097                                     String encoding) {
098            this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE, encoding);
099        }
100    
101        /**
102         * Constructor for TarInputStream.
103         * @param is the input stream to use
104         * @param blockSize the block size to use
105         * @param recordSize the record size to use
106         */
107        public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
108            this(is, blockSize, recordSize, null);
109        }
110    
111        /**
112         * Constructor for TarInputStream.
113         * @param is the input stream to use
114         * @param blockSize the block size to use
115         * @param recordSize the record size to use
116         * @param encoding name of the encoding to use for file names
117         * @since Commons Compress 1.4
118         */
119        public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
120                                     String encoding) {
121            this.buffer = new TarBuffer(is, blockSize, recordSize);
122            this.readBuf = null;
123            this.hasHitEOF = false;
124            this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
125        }
126    
127        /**
128         * Closes this stream. Calls the TarBuffer's close() method.
129         * @throws IOException on error
130         */
131        @Override
132        public void close() throws IOException {
133            buffer.close();
134        }
135    
136        /**
137         * Get the record size being used by this stream's TarBuffer.
138         *
139         * @return The TarBuffer record size.
140         */
141        public int getRecordSize() {
142            return buffer.getRecordSize();
143        }
144    
145        /**
146         * Get the available data that can be read from the current
147         * entry in the archive. This does not indicate how much data
148         * is left in the entire archive, only in the current entry.
149         * This value is determined from the entry's size header field
150         * and the amount of data already read from the current entry.
151         * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
152         * bytes are left in the current entry in the archive.
153         *
154         * @return The number of available bytes for the current entry.
155         * @throws IOException for signature
156         */
157        @Override
158        public int available() throws IOException {
159            if (entrySize - entryOffset > Integer.MAX_VALUE) {
160                return Integer.MAX_VALUE;
161            }
162            return (int) (entrySize - entryOffset);
163        }
164    
165        /**
166         * Skip bytes in the input buffer. This skips bytes in the
167         * current entry's data, not the entire archive, and will
168         * stop at the end of the current entry's data if the number
169         * to skip extends beyond that point.
170         *
171         * @param numToSkip The number of bytes to skip.
172         * @return the number actually skipped
173         * @throws IOException on error
174         */
175        @Override
176        public long skip(long numToSkip) throws IOException {
177            // REVIEW
178            // This is horribly inefficient, but it ensures that we
179            // properly skip over bytes via the TarBuffer...
180            //
181            long skip = numToSkip;
182            while (skip > 0) {
183                int realSkip = (int) (skip > SKIP_BUF.length
184                                      ? SKIP_BUF.length : skip);
185                int numRead = read(SKIP_BUF, 0, realSkip);
186                if (numRead == -1) {
187                    break;
188                }
189                skip -= numRead;
190            }
191            return (numToSkip - skip);
192        }
193    
194        /**
195         * Since we do not support marking just yet, we do nothing.
196         */
197        @Override
198        public synchronized void reset() {
199        }
200    
201        /**
202         * Get the next entry in this tar archive. This will skip
203         * over any remaining data in the current entry, if there
204         * is one, and place the input stream at the header of the
205         * next entry, and read the header and instantiate a new
206         * TarEntry from the header bytes and return that entry.
207         * If there are no more entries in the archive, null will
208         * be returned to indicate that the end of the archive has
209         * been reached.
210         *
211         * @return The next TarEntry in the archive, or null.
212         * @throws IOException on error
213         */
214        public TarArchiveEntry getNextTarEntry() throws IOException {
215            if (hasHitEOF) {
216                return null;
217            }
218    
219            if (currEntry != null) {
220                long numToSkip = entrySize - entryOffset;
221    
222                while (numToSkip > 0) {
223                    long skipped = skip(numToSkip);
224                    if (skipped <= 0) {
225                        throw new RuntimeException("failed to skip current tar"
226                                                   + " entry");
227                    }
228                    numToSkip -= skipped;
229                }
230    
231                readBuf = null;
232            }
233    
234            byte[] headerBuf = getRecord();
235    
236            if (hasHitEOF) {
237                currEntry = null;
238                return null;
239            }
240    
241            try {
242                currEntry = new TarArchiveEntry(headerBuf, encoding);
243            } catch (IllegalArgumentException e) {
244                IOException ioe = new IOException("Error detected parsing the header");
245                ioe.initCause(e);
246                throw ioe;
247            }
248            entryOffset = 0;
249            entrySize = currEntry.getSize();
250    
251            if (currEntry.isGNULongNameEntry()) {
252                // read in the name
253                ByteArrayOutputStream longName = new ByteArrayOutputStream();
254                int length = 0;
255                while ((length = read(SMALL_BUF)) >= 0) {
256                    longName.write(SMALL_BUF, 0, length);
257                }
258                getNextEntry();
259                if (currEntry == null) {
260                    // Bugzilla: 40334
261                    // Malformed tar file - long entry name not followed by entry
262                    return null;
263                }
264                byte[] longNameData = longName.toByteArray();
265                // remove trailing null terminator(s)
266                length = longNameData.length;
267                while (length > 0 && longNameData[length - 1] == 0) {
268                    --length;
269                }
270                if (length != longNameData.length) {
271                    byte[] l = new byte[length];
272                    System.arraycopy(longNameData, 0, l, 0, length);
273                    longNameData = l;
274                }
275    
276                currEntry.setName(encoding.decode(longNameData));
277            }
278    
279            if (currEntry.isPaxHeader()){ // Process Pax headers
280                paxHeaders();
281            }
282    
283            if (currEntry.isGNUSparse()){ // Process sparse files
284                readGNUSparse();
285            }
286    
287            // If the size of the next element in the archive has changed
288            // due to a new size being reported in the posix header
289            // information, we update entrySize here so that it contains
290            // the correct value.
291            entrySize = currEntry.getSize();
292            return currEntry;
293        }
294    
295        /**
296         * Get the next record in this tar archive. This will skip
297         * over any remaining data in the current entry, if there
298         * is one, and place the input stream at the header of the
299         * next entry.
300         * If there are no more entries in the archive, null will
301         * be returned to indicate that the end of the archive has
302         * been reached.
303         *
304         * @return The next header in the archive, or null.
305         * @throws IOException on error
306         */
307        private byte[] getRecord() throws IOException {
308            if (hasHitEOF) {
309                return null;
310            }
311    
312            byte[] headerBuf = buffer.readRecord();
313    
314            if (headerBuf == null) {
315                hasHitEOF = true;
316            } else if (buffer.isEOFRecord(headerBuf)) {
317                hasHitEOF = true;
318                buffer.tryToConsumeSecondEOFRecord();
319            }
320    
321            return hasHitEOF ? null : headerBuf;
322        }
323    
324        private void paxHeaders() throws IOException{
325            Map<String, String> headers = parsePaxHeaders(this);
326            getNextEntry(); // Get the actual file entry
327            applyPaxHeadersToCurrentEntry(headers);
328        }
329    
330        Map<String, String> parsePaxHeaders(InputStream i) throws IOException {
331            Map<String, String> headers = new HashMap<String, String>();
332            // Format is "length keyword=value\n";
333            while(true){ // get length
334                int ch;
335                int len = 0;
336                int read = 0;
337                while((ch = i.read()) != -1) {
338                    read++;
339                    if (ch == ' '){ // End of length string
340                        // Get keyword
341                        ByteArrayOutputStream coll = new ByteArrayOutputStream();
342                        while((ch = i.read()) != -1) {
343                            read++;
344                            if (ch == '='){ // end of keyword
345                                String keyword = coll.toString(CharsetNames.UTF_8);
346                                // Get rest of entry
347                                byte[] rest = new byte[len - read];
348                                int got = i.read(rest);
349                                if (got != len - read){
350                                    throw new IOException("Failed to read "
351                                                          + "Paxheader. Expected "
352                                                          + (len - read)
353                                                          + " bytes, read "
354                                                          + got);
355                                }
356                                // Drop trailing NL
357                                String value = new String(rest, 0,
358                                                          len - read - 1, CharsetNames.UTF_8);
359                                headers.put(keyword, value);
360                                break;
361                            }
362                            coll.write((byte) ch);
363                        }
364                        break; // Processed single header
365                    }
366                    len *= 10;
367                    len += ch - '0';
368                }
369                if (ch == -1){ // EOF
370                    break;
371                }
372            }
373            return headers;
374        }
375    
376        private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
377            /*
378             * The following headers are defined for Pax.
379             * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
380             * mtime
381             * comment
382             * gid, gname
383             * linkpath
384             * size
385             * uid,uname
386             * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
387             */
388            for (Entry<String, String> ent : headers.entrySet()){
389                String key = ent.getKey();
390                String val = ent.getValue();
391                if ("path".equals(key)){
392                    currEntry.setName(val);
393                } else if ("linkpath".equals(key)){
394                    currEntry.setLinkName(val);
395                } else if ("gid".equals(key)){
396                    currEntry.setGroupId(Integer.parseInt(val));
397                } else if ("gname".equals(key)){
398                    currEntry.setGroupName(val);
399                } else if ("uid".equals(key)){
400                    currEntry.setUserId(Integer.parseInt(val));
401                } else if ("uname".equals(key)){
402                    currEntry.setUserName(val);
403                } else if ("size".equals(key)){
404                    currEntry.setSize(Long.parseLong(val));
405                } else if ("mtime".equals(key)){
406                    currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
407                } else if ("SCHILY.devminor".equals(key)){
408                    currEntry.setDevMinor(Integer.parseInt(val));
409                } else if ("SCHILY.devmajor".equals(key)){
410                    currEntry.setDevMajor(Integer.parseInt(val));
411                }
412            }
413        }
414    
415        /**
416         * Adds the sparse chunks from the current entry to the sparse chunks,
417         * including any additional sparse entries following the current entry.
418         *
419         * @throws IOException on error
420         *
421         * @todo Sparse files get not yet really processed.
422         */
423        private void readGNUSparse() throws IOException {
424            /* we do not really process sparse files yet
425            sparses = new ArrayList();
426            sparses.addAll(currEntry.getSparses());
427            */
428            if (currEntry.isExtended()) {
429                TarArchiveSparseEntry entry;
430                do {
431                    byte[] headerBuf = getRecord();
432                    if (hasHitEOF) {
433                        currEntry = null;
434                        break;
435                    }
436                    entry = new TarArchiveSparseEntry(headerBuf);
437                    /* we do not really process sparse files yet
438                    sparses.addAll(entry.getSparses());
439                    */
440                } while (entry.isExtended());
441            }
442        }
443    
444        @Override
445        public ArchiveEntry getNextEntry() throws IOException {
446            return getNextTarEntry();
447        }
448    
449        /**
450         * Reads bytes from the current tar archive entry.
451         *
452         * This method is aware of the boundaries of the current
453         * entry in the archive and will deal with them as if they
454         * were this stream's start and EOF.
455         *
456         * @param buf The buffer into which to place bytes read.
457         * @param offset The offset at which to place bytes read.
458         * @param numToRead The number of bytes to read.
459         * @return The number of bytes read, or -1 at EOF.
460         * @throws IOException on error
461         */
462        @Override
463        public int read(byte[] buf, int offset, int numToRead) throws IOException {
464            int totalRead = 0;
465    
466            if (entryOffset >= entrySize) {
467                return -1;
468            }
469    
470            if ((numToRead + entryOffset) > entrySize) {
471                numToRead = (int) (entrySize - entryOffset);
472            }
473    
474            if (readBuf != null) {
475                int sz = (numToRead > readBuf.length) ? readBuf.length
476                    : numToRead;
477    
478                System.arraycopy(readBuf, 0, buf, offset, sz);
479    
480                if (sz >= readBuf.length) {
481                    readBuf = null;
482                } else {
483                    int newLen = readBuf.length - sz;
484                    byte[] newBuf = new byte[newLen];
485    
486                    System.arraycopy(readBuf, sz, newBuf, 0, newLen);
487    
488                    readBuf = newBuf;
489                }
490    
491                totalRead += sz;
492                numToRead -= sz;
493                offset += sz;
494            }
495    
496            while (numToRead > 0) {
497                byte[] rec = buffer.readRecord();
498    
499                if (rec == null) {
500                    // Unexpected EOF!
501                    throw new IOException("unexpected EOF with " + numToRead
502                                          + " bytes unread. Occured at byte: " + getBytesRead());
503                }
504                count(rec.length);
505                int sz = numToRead;
506                int recLen = rec.length;
507    
508                if (recLen > sz) {
509                    System.arraycopy(rec, 0, buf, offset, sz);
510    
511                    readBuf = new byte[recLen - sz];
512    
513                    System.arraycopy(rec, sz, readBuf, 0, recLen - sz);
514                } else {
515                    sz = recLen;
516    
517                    System.arraycopy(rec, 0, buf, offset, recLen);
518                }
519    
520                totalRead += sz;
521                numToRead -= sz;
522                offset += sz;
523            }
524    
525            entryOffset += totalRead;
526    
527            return totalRead;
528        }
529    
530        /**
531         * Whether this class is able to read the given entry.
532         *
533         * <p>May return false if the current entry is a sparse file.</p>
534         */
535        @Override
536        public boolean canReadEntryData(ArchiveEntry ae) {
537            if (ae instanceof TarArchiveEntry) {
538                TarArchiveEntry te = (TarArchiveEntry) ae;
539                return !te.isGNUSparse();
540            }
541            return false;
542        }
543    
544        protected final TarArchiveEntry getCurrentEntry() {
545            return currEntry;
546        }
547    
548        protected final void setCurrentEntry(TarArchiveEntry e) {
549            currEntry = e;
550        }
551    
552        protected final boolean isAtEOF() {
553            return hasHitEOF;
554        }
555    
556        protected final void setAtEOF(boolean b) {
557            hasHitEOF = b;
558        }
559    
560        /**
561         * Checks if the signature matches what is expected for a tar file.
562         *
563         * @param signature
564         *            the bytes to check
565         * @param length
566         *            the number of bytes to check
567         * @return true, if this stream is a tar archive stream, false otherwise
568         */
569        public static boolean matches(byte[] signature, int length) {
570            if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
571                return false;
572            }
573    
574            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
575                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
576                &&
577                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
578                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
579                    ){
580                return true;
581            }
582            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
583                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
584                &&
585                (
586                 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
587                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
588                ||
589                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
590                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
591                )
592                    ){
593                return true;
594            }
595            // COMPRESS-107 - recognise Ant tar files
596            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
597                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
598                &&
599                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
600                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
601                    ){
602                return true;
603            }
604            return false;
605        }
606    
607    }