001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     * http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing,
013     * software distributed under the License is distributed on an
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015     * KIND, either express or implied.  See the License for the
016     * specific language governing permissions and limitations
017     * under the License.
018     */
019    package org.apache.commons.compress.compressors.gzip;
020    
021    import java.io.IOException;
022    import java.io.EOFException;
023    import java.io.InputStream;
024    import java.io.DataInputStream;
025    import java.io.BufferedInputStream;
026    import java.util.zip.DataFormatException;
027    import java.util.zip.Inflater;
028    import java.util.zip.CRC32;
029    
030    import org.apache.commons.compress.compressors.CompressorInputStream;
031    
032    /**
033     * Input stream that decompresses .gz files.
034     * This supports decompressing concatenated .gz files which is important
035     * when decompressing standalone .gz files.
036     * <p>
037     * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
038     * files: it stops after the first member and silently ignores the rest.
039     * It doesn't leave the read position to point to the beginning of the next
040     * member, which makes it difficult workaround the lack of concatenation
041     * support.
042     * <p>
043     * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
044     * container format decoder. The actual decompression is done with
045     * {@link java.util.zip.Inflater}.
046     */
047    public class GzipCompressorInputStream extends CompressorInputStream {
048        // Header flags
049        // private static final int FTEXT = 0x01; // Uninteresting for us
050        private static final int FHCRC = 0x02;
051        private static final int FEXTRA = 0x04;
052        private static final int FNAME = 0x08;
053        private static final int FCOMMENT = 0x10;
054        private static final int FRESERVED = 0xE0;
055    
056        // Compressed input stream, possibly wrapped in a BufferedInputStream
057        private final InputStream in;
058    
059        // True if decompressing multimember streams.
060        private final boolean decompressConcatenated;
061    
062        // Buffer to hold the input data
063        private final byte[] buf = new byte[8192];
064    
065        // Amount of data in buf.
066        private int bufUsed = 0;
067    
068        // Decompressor
069        private Inflater inf = new Inflater(true);
070    
071        // CRC32 from uncompressed data
072        private final CRC32 crc = new CRC32();
073    
074        private int memberSize;
075    
076        // True once everything has been decompressed
077        private boolean endReached = false;
078    
079        // used in no-arg read method
080        private final byte[] oneByte = new byte[1];
081    
082        /**
083         * Constructs a new input stream that decompresses gzip-compressed data
084         * from the specified input stream.
085         * <p>
086         * This is equivalent to
087         * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
088         * will not decompress concatenated .gz files.
089         *
090         * @param inputStream  the InputStream from which this object should
091         *                     be created of
092         *
093         * @throws IOException if the stream could not be created
094         */
095        public GzipCompressorInputStream(InputStream inputStream)
096                throws IOException {
097            this(inputStream, false);
098        }
099    
100        /**
101         * Constructs a new input stream that decompresses gzip-compressed data
102         * from the specified input stream.
103         * <p>
104         * If <code>decompressConcatenated</code> is {@code false}:
105         * This decompressor might read more input than it will actually use.
106         * If <code>inputStream</code> supports <code>mark</code> and
107         * <code>reset</code>, then the input position will be adjusted
108         * so that it is right after the last byte of the compressed stream.
109         * If <code>mark</code> isn't supported, the input position will be
110         * undefined.
111         *
112         * @param inputStream  the InputStream from which this object should
113         *                     be created of
114         * @param decompressConcatenated
115         *                     if true, decompress until the end of the input;
116         *                     if false, stop after the first .gz member
117         *
118         * @throws IOException if the stream could not be created
119         */
120        public GzipCompressorInputStream(InputStream inputStream,
121                                         boolean decompressConcatenated)
122                throws IOException {
123            // Mark support is strictly needed for concatenated files only,
124            // but it's simpler if it is always available.
125            if (inputStream.markSupported()) {
126                in = inputStream;
127            } else {
128                in = new BufferedInputStream(inputStream);
129            }
130    
131            this.decompressConcatenated = decompressConcatenated;
132            init(true);
133        }
134    
135        private boolean init(boolean isFirstMember) throws IOException {
136            assert isFirstMember || decompressConcatenated;
137    
138            // Check the magic bytes without a possibility of EOFException.
139            int magic0 = in.read();
140            int magic1 = in.read();
141    
142            // If end of input was reached after decompressing at least
143            // one .gz member, we have reached the end of the file successfully.
144            if (magic0 == -1 && !isFirstMember) {
145                return false;
146            }
147    
148            if (magic0 != 31 || magic1 != 139) {
149                throw new IOException(isFirstMember
150                                      ? "Input is not in the .gz format"
151                                      : "Garbage after a valid .gz stream");
152            }
153    
154            // Parsing the rest of the header may throw EOFException.
155            DataInputStream inData = new DataInputStream(in);
156            int method = inData.readUnsignedByte();
157            if (method != 8) {
158                throw new IOException("Unsupported compression method "
159                                      + method + " in the .gz header");
160            }
161    
162            int flg = inData.readUnsignedByte();
163            if ((flg & FRESERVED) != 0) {
164                throw new IOException(
165                        "Reserved flags are set in the .gz header");
166            }
167    
168            inData.readInt(); // mtime, ignored
169            inData.readUnsignedByte(); // extra flags, ignored
170            inData.readUnsignedByte(); // operating system, ignored
171    
172            // Extra field, ignored
173            if ((flg & FEXTRA) != 0) {
174                int xlen = inData.readUnsignedByte();
175                xlen |= inData.readUnsignedByte() << 8;
176    
177                // This isn't as efficient as calling in.skip would be,
178                // but it's lazier to handle unexpected end of input this way.
179                // Most files don't have an extra field anyway.
180                while (xlen-- > 0) {
181                    inData.readUnsignedByte();
182                }
183            }
184    
185            // Original file name, ignored
186            if ((flg & FNAME) != 0) {
187                readToNull(inData);
188            }
189    
190            // Comment, ignored
191            if ((flg & FCOMMENT) != 0) {
192                readToNull(inData);
193            }
194    
195            // Header "CRC16" which is actually a truncated CRC32 (which isn't
196            // as good as real CRC16). I don't know if any encoder implementation
197            // sets this, so it's not worth trying to verify it. GNU gzip 1.4
198            // doesn't support this field, but zlib seems to be able to at least
199            // skip over it.
200            if ((flg & FHCRC) != 0) {
201                inData.readShort();
202            }
203    
204            // Reset
205            inf.reset();
206            crc.reset();
207            memberSize = 0;
208    
209            return true;
210        }
211    
212        private void readToNull(DataInputStream inData) throws IOException {
213            while (inData.readUnsignedByte() != 0x00) {}
214        }
215    
216        /** {@inheritDoc} */
217        @Override
218        public int read() throws IOException {
219            return read(oneByte, 0, 1) == -1 ? -1 : (oneByte[0] & 0xFF);
220        }
221    
222        /**
223         * {@inheritDoc}
224         *
225         * @since 1.1
226         */
227        @Override
228        public int read(byte[] b, int off, int len) throws IOException {
229            if (endReached) {
230                return -1;
231            }
232    
233            int size = 0;
234    
235            while (len > 0) {
236                if (inf.needsInput()) {
237                    // Remember the current position because we may need to
238                    // rewind after reading too much input.
239                    in.mark(buf.length);
240    
241                    bufUsed = in.read(buf);
242                    if (bufUsed == -1) {
243                        throw new EOFException();
244                    }
245    
246                    inf.setInput(buf, 0, bufUsed);
247                }
248    
249                int ret;
250                try {
251                    ret = inf.inflate(b, off, len);
252                } catch (DataFormatException e) {
253                    throw new IOException("Gzip-compressed data is corrupt");
254                }
255    
256                crc.update(b, off, ret);
257                memberSize += ret;
258                off += ret;
259                len -= ret;
260                size += ret;
261                count(ret);
262    
263                if (inf.finished()) {
264                    // We may have read too many bytes. Rewind the read
265                    // position to match the actual amount used.
266                    //
267                    // NOTE: The "if" is there just in case. Since we used
268                    // in.mark earler, it should always skip enough.
269                    in.reset();
270    
271                    int skipAmount = bufUsed - inf.getRemaining();
272                    if (in.skip(skipAmount) != skipAmount) {
273                        throw new IOException();
274                    }
275    
276                    bufUsed = 0;
277    
278                    DataInputStream inData = new DataInputStream(in);
279    
280                    // CRC32
281                    long crcStored = 0;
282                    for (int i = 0; i < 4; ++i) {
283                        crcStored |= (long)inData.readUnsignedByte() << (i * 8);
284                    }
285    
286                    if (crcStored != crc.getValue()) {
287                        throw new IOException("Gzip-compressed data is corrupt "
288                                              + "(CRC32 error)");
289                    }
290    
291                    // Uncompressed size modulo 2^32 (ISIZE in the spec)
292                    int isize = 0;
293                    for (int i = 0; i < 4; ++i) {
294                        isize |= inData.readUnsignedByte() << (i * 8);
295                    }
296    
297                    if (isize != memberSize) {
298                        throw new IOException("Gzip-compressed data is corrupt"
299                                              + "(uncompressed size mismatch)");
300                    }
301    
302                    // See if this is the end of the file.
303                    if (!decompressConcatenated || !init(false)) {
304                        inf.end();
305                        inf = null;
306                        endReached = true;
307                        return size == 0 ? -1 : size;
308                    }
309                }
310            }
311    
312            return size;
313        }
314    
315        /**
316         * Checks if the signature matches what is expected for a .gz file.
317         *
318         * @param signature the bytes to check
319         * @param length    the number of bytes to check
320         * @return          true if this is a .gz stream, false otherwise
321         *
322         * @since 1.1
323         */
324        public static boolean matches(byte[] signature, int length) {
325    
326            if (length < 2) {
327                return false;
328            }
329    
330            if (signature[0] != 31) {
331                return false;
332            }
333    
334            if (signature[1] != -117) {
335                return false;
336            }
337    
338            return true;
339        }
340    
341        /**
342         * Closes the input stream (unless it is System.in).
343         *
344         * @since 1.2
345         */
346        @Override
347        public void close() throws IOException {
348            if (inf != null) {
349                inf.end();
350                inf = null;
351            }
352    
353            if (this.in != System.in) {
354                this.in.close();
355            }
356        }
357    }