001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019 package org.apache.commons.compress.compressors.gzip; 020 021 import java.io.IOException; 022 import java.io.EOFException; 023 import java.io.InputStream; 024 import java.io.DataInputStream; 025 import java.io.BufferedInputStream; 026 import java.util.zip.DataFormatException; 027 import java.util.zip.Inflater; 028 import java.util.zip.CRC32; 029 030 import org.apache.commons.compress.compressors.CompressorInputStream; 031 032 /** 033 * Input stream that decompresses .gz files. 034 * This supports decompressing concatenated .gz files which is important 035 * when decompressing standalone .gz files. 036 * <p> 037 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz 038 * files: it stops after the first member and silently ignores the rest. 039 * It doesn't leave the read position to point to the beginning of the next 040 * member, which makes it difficult workaround the lack of concatenation 041 * support. 042 * <p> 043 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz 044 * container format decoder. The actual decompression is done with 045 * {@link java.util.zip.Inflater}. 046 */ 047 public class GzipCompressorInputStream extends CompressorInputStream { 048 // Header flags 049 // private static final int FTEXT = 0x01; // Uninteresting for us 050 private static final int FHCRC = 0x02; 051 private static final int FEXTRA = 0x04; 052 private static final int FNAME = 0x08; 053 private static final int FCOMMENT = 0x10; 054 private static final int FRESERVED = 0xE0; 055 056 // Compressed input stream, possibly wrapped in a BufferedInputStream 057 private final InputStream in; 058 059 // True if decompressing multimember streams. 060 private final boolean decompressConcatenated; 061 062 // Buffer to hold the input data 063 private final byte[] buf = new byte[8192]; 064 065 // Amount of data in buf. 066 private int bufUsed = 0; 067 068 // Decompressor 069 private Inflater inf = new Inflater(true); 070 071 // CRC32 from uncompressed data 072 private final CRC32 crc = new CRC32(); 073 074 private int memberSize; 075 076 // True once everything has been decompressed 077 private boolean endReached = false; 078 079 // used in no-arg read method 080 private final byte[] oneByte = new byte[1]; 081 082 /** 083 * Constructs a new input stream that decompresses gzip-compressed data 084 * from the specified input stream. 085 * <p> 086 * This is equivalent to 087 * <code>GzipCompressorInputStream(inputStream, false)</code> and thus 088 * will not decompress concatenated .gz files. 089 * 090 * @param inputStream the InputStream from which this object should 091 * be created of 092 * 093 * @throws IOException if the stream could not be created 094 */ 095 public GzipCompressorInputStream(InputStream inputStream) 096 throws IOException { 097 this(inputStream, false); 098 } 099 100 /** 101 * Constructs a new input stream that decompresses gzip-compressed data 102 * from the specified input stream. 103 * <p> 104 * If <code>decompressConcatenated</code> is {@code false}: 105 * This decompressor might read more input than it will actually use. 106 * If <code>inputStream</code> supports <code>mark</code> and 107 * <code>reset</code>, then the input position will be adjusted 108 * so that it is right after the last byte of the compressed stream. 109 * If <code>mark</code> isn't supported, the input position will be 110 * undefined. 111 * 112 * @param inputStream the InputStream from which this object should 113 * be created of 114 * @param decompressConcatenated 115 * if true, decompress until the end of the input; 116 * if false, stop after the first .gz member 117 * 118 * @throws IOException if the stream could not be created 119 */ 120 public GzipCompressorInputStream(InputStream inputStream, 121 boolean decompressConcatenated) 122 throws IOException { 123 // Mark support is strictly needed for concatenated files only, 124 // but it's simpler if it is always available. 125 if (inputStream.markSupported()) { 126 in = inputStream; 127 } else { 128 in = new BufferedInputStream(inputStream); 129 } 130 131 this.decompressConcatenated = decompressConcatenated; 132 init(true); 133 } 134 135 private boolean init(boolean isFirstMember) throws IOException { 136 assert isFirstMember || decompressConcatenated; 137 138 // Check the magic bytes without a possibility of EOFException. 139 int magic0 = in.read(); 140 int magic1 = in.read(); 141 142 // If end of input was reached after decompressing at least 143 // one .gz member, we have reached the end of the file successfully. 144 if (magic0 == -1 && !isFirstMember) { 145 return false; 146 } 147 148 if (magic0 != 31 || magic1 != 139) { 149 throw new IOException(isFirstMember 150 ? "Input is not in the .gz format" 151 : "Garbage after a valid .gz stream"); 152 } 153 154 // Parsing the rest of the header may throw EOFException. 155 DataInputStream inData = new DataInputStream(in); 156 int method = inData.readUnsignedByte(); 157 if (method != 8) { 158 throw new IOException("Unsupported compression method " 159 + method + " in the .gz header"); 160 } 161 162 int flg = inData.readUnsignedByte(); 163 if ((flg & FRESERVED) != 0) { 164 throw new IOException( 165 "Reserved flags are set in the .gz header"); 166 } 167 168 inData.readInt(); // mtime, ignored 169 inData.readUnsignedByte(); // extra flags, ignored 170 inData.readUnsignedByte(); // operating system, ignored 171 172 // Extra field, ignored 173 if ((flg & FEXTRA) != 0) { 174 int xlen = inData.readUnsignedByte(); 175 xlen |= inData.readUnsignedByte() << 8; 176 177 // This isn't as efficient as calling in.skip would be, 178 // but it's lazier to handle unexpected end of input this way. 179 // Most files don't have an extra field anyway. 180 while (xlen-- > 0) { 181 inData.readUnsignedByte(); 182 } 183 } 184 185 // Original file name, ignored 186 if ((flg & FNAME) != 0) { 187 readToNull(inData); 188 } 189 190 // Comment, ignored 191 if ((flg & FCOMMENT) != 0) { 192 readToNull(inData); 193 } 194 195 // Header "CRC16" which is actually a truncated CRC32 (which isn't 196 // as good as real CRC16). I don't know if any encoder implementation 197 // sets this, so it's not worth trying to verify it. GNU gzip 1.4 198 // doesn't support this field, but zlib seems to be able to at least 199 // skip over it. 200 if ((flg & FHCRC) != 0) { 201 inData.readShort(); 202 } 203 204 // Reset 205 inf.reset(); 206 crc.reset(); 207 memberSize = 0; 208 209 return true; 210 } 211 212 private void readToNull(DataInputStream inData) throws IOException { 213 while (inData.readUnsignedByte() != 0x00) {} 214 } 215 216 /** {@inheritDoc} */ 217 @Override 218 public int read() throws IOException { 219 return read(oneByte, 0, 1) == -1 ? -1 : (oneByte[0] & 0xFF); 220 } 221 222 /** 223 * {@inheritDoc} 224 * 225 * @since 1.1 226 */ 227 @Override 228 public int read(byte[] b, int off, int len) throws IOException { 229 if (endReached) { 230 return -1; 231 } 232 233 int size = 0; 234 235 while (len > 0) { 236 if (inf.needsInput()) { 237 // Remember the current position because we may need to 238 // rewind after reading too much input. 239 in.mark(buf.length); 240 241 bufUsed = in.read(buf); 242 if (bufUsed == -1) { 243 throw new EOFException(); 244 } 245 246 inf.setInput(buf, 0, bufUsed); 247 } 248 249 int ret; 250 try { 251 ret = inf.inflate(b, off, len); 252 } catch (DataFormatException e) { 253 throw new IOException("Gzip-compressed data is corrupt"); 254 } 255 256 crc.update(b, off, ret); 257 memberSize += ret; 258 off += ret; 259 len -= ret; 260 size += ret; 261 count(ret); 262 263 if (inf.finished()) { 264 // We may have read too many bytes. Rewind the read 265 // position to match the actual amount used. 266 // 267 // NOTE: The "if" is there just in case. Since we used 268 // in.mark earler, it should always skip enough. 269 in.reset(); 270 271 int skipAmount = bufUsed - inf.getRemaining(); 272 if (in.skip(skipAmount) != skipAmount) { 273 throw new IOException(); 274 } 275 276 bufUsed = 0; 277 278 DataInputStream inData = new DataInputStream(in); 279 280 // CRC32 281 long crcStored = 0; 282 for (int i = 0; i < 4; ++i) { 283 crcStored |= (long)inData.readUnsignedByte() << (i * 8); 284 } 285 286 if (crcStored != crc.getValue()) { 287 throw new IOException("Gzip-compressed data is corrupt " 288 + "(CRC32 error)"); 289 } 290 291 // Uncompressed size modulo 2^32 (ISIZE in the spec) 292 int isize = 0; 293 for (int i = 0; i < 4; ++i) { 294 isize |= inData.readUnsignedByte() << (i * 8); 295 } 296 297 if (isize != memberSize) { 298 throw new IOException("Gzip-compressed data is corrupt" 299 + "(uncompressed size mismatch)"); 300 } 301 302 // See if this is the end of the file. 303 if (!decompressConcatenated || !init(false)) { 304 inf.end(); 305 inf = null; 306 endReached = true; 307 return size == 0 ? -1 : size; 308 } 309 } 310 } 311 312 return size; 313 } 314 315 /** 316 * Checks if the signature matches what is expected for a .gz file. 317 * 318 * @param signature the bytes to check 319 * @param length the number of bytes to check 320 * @return true if this is a .gz stream, false otherwise 321 * 322 * @since 1.1 323 */ 324 public static boolean matches(byte[] signature, int length) { 325 326 if (length < 2) { 327 return false; 328 } 329 330 if (signature[0] != 31) { 331 return false; 332 } 333 334 if (signature[1] != -117) { 335 return false; 336 } 337 338 return true; 339 } 340 341 /** 342 * Closes the input stream (unless it is System.in). 343 * 344 * @since 1.2 345 */ 346 @Override 347 public void close() throws IOException { 348 if (inf != null) { 349 inf.end(); 350 inf = null; 351 } 352 353 if (this.in != System.in) { 354 this.in.close(); 355 } 356 } 357 }