From 696eb15fe2c2be6004c6bd6b88e008807b1643e5 Mon Sep 17 00:00:00 2001 From: David Korczynski Date: Wed, 23 Nov 2022 07:38:33 -0800 Subject: [PATCH 01/31] Add CIFuzz Github action --- .github/workflows/cifuzz.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/cifuzz.yml diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml new file mode 100644 index 00000000..4288c55e --- /dev/null +++ b/.github/workflows/cifuzz.yml @@ -0,0 +1,26 @@ +name: CIFuzz +on: [pull_request] +jobs: + Fuzzing: + runs-on: ubuntu-latest + steps: + - name: Build Fuzzers + id: build + uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master + with: + oss-fuzz-project-name: 'flate2-rs' + dry-run: false + language: rust + - name: Run Fuzzers + uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master + with: + oss-fuzz-project-name: 'flate2-rs' + fuzz-seconds: 300 + dry-run: false + language: rust + - name: Upload Crash + uses: actions/upload-artifact@v3 + if: failure() && steps.build.outcome == 'success' + with: + name: artifacts + path: ./out/artifacts From ecb6838f4a11ca9926a40a954e7ebbc745004807 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Mon, 1 May 2023 08:30:28 +0100 Subject: [PATCH 02/31] Move GzHeader into GzState Reduces the size of `bufread::GzDecoder<&[u8]>` from 296 to 208 bytes. 
--- src/gz/bufread.rs | 106 +++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 54 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index c6ac5a98..266763e8 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -305,8 +305,7 @@ impl Write for GzEncoder { /// ``` #[derive(Debug)] pub struct GzDecoder { - inner: GzState, - header: Option, + state: GzState, reader: CrcReader>, multi: bool, } @@ -357,10 +356,10 @@ impl GzHeaderPartial { #[derive(Debug)] enum GzState { Header(GzHeaderPartial), - Body, - Finished(usize, [u8; 8]), + Body(GzHeader), + Finished(GzHeader, usize, [u8; 8]), Err(io::Error), - End, + End(Option), } /// A small adapter which reads data originally from `buf` and then reads all @@ -439,7 +438,6 @@ impl GzDecoder { /// gzip header. pub fn new(mut r: R) -> GzDecoder { let mut part = GzHeaderPartial::new(); - let mut header = None; let result = { let mut reader = Buffer::new(&mut part, &mut r); @@ -448,18 +446,17 @@ impl GzDecoder { let state = match result { Ok(()) => { - header = Some(part.take_header()); - GzState::Body + let header = part.take_header(); + GzState::Body(header) } Err(ref err) if io::ErrorKind::WouldBlock == err.kind() => GzState::Header(part), Err(err) => GzState::Err(err), }; GzDecoder { - inner: state, + state, reader: CrcReader::new(deflate::bufread::DeflateDecoder::new(r)), multi: false, - header, } } @@ -472,7 +469,11 @@ impl GzDecoder { impl GzDecoder { /// Returns the header associated with this stream, if it was valid pub fn header(&self) -> Option<&GzHeader> { - self.header.as_ref() + match &self.state { + GzState::Body(header) | GzState::Finished(header, _, _) => Some(header), + GzState::End(header) => header.as_ref(), + _ => None, + } } /// Acquires a reference to the underlying reader. 
@@ -497,14 +498,13 @@ impl GzDecoder { impl Read for GzDecoder { fn read(&mut self, into: &mut [u8]) -> io::Result { let GzDecoder { - inner, - header, + state, reader, multi, } = self; loop { - *inner = match mem::replace(inner, GzState::End) { + *state = match mem::replace(state, GzState::End(None)) { GzState::Header(mut part) => { let result = { let mut reader = Buffer::new(&mut part, reader.get_mut().get_mut()); @@ -512,94 +512,92 @@ impl Read for GzDecoder { }; match result { Ok(()) => { - *header = Some(part.take_header()); - GzState::Body + let header = part.take_header(); + GzState::Body(header) } Err(err) if io::ErrorKind::WouldBlock == err.kind() => { - *inner = GzState::Header(part); + *state = GzState::Header(part); return Err(err); } Err(err) => return Err(err), } } - GzState::Body => { + GzState::Body(header) => { if into.is_empty() { - *inner = GzState::Body; + *state = GzState::Body(header); return Ok(0); } - let n = reader.read(into).map_err(|err| { - if io::ErrorKind::WouldBlock == err.kind() { - *inner = GzState::Body; - } + let n = match reader.read(into) { + Ok(n) => n, + Err(err) => { + if io::ErrorKind::WouldBlock == err.kind() { + *state = GzState::Body(header); + } - err - })?; + return Err(err); + } + }; match n { - 0 => GzState::Finished(0, [0; 8]), + 0 => GzState::Finished(header, 0, [0; 8]), n => { - *inner = GzState::Body; + *state = GzState::Body(header); return Ok(n); } } } - GzState::Finished(pos, mut buf) => { + GzState::Finished(header, pos, mut buf) => { if pos < buf.len() { - let n = reader - .get_mut() - .get_mut() - .read(&mut buf[pos..]) - .and_then(|n| { + let n = match reader.get_mut().get_mut().read(&mut buf[pos..]) { + Ok(n) => { if n == 0 { - Err(io::ErrorKind::UnexpectedEof.into()) + return Err(io::ErrorKind::UnexpectedEof.into()); } else { - Ok(n) + n } - }) - .map_err(|err| { + } + Err(err) => { if io::ErrorKind::WouldBlock == err.kind() { - *inner = GzState::Finished(pos, buf); + *state = GzState::Finished(header, 
pos, buf); } - err - })?; + return Err(err); + } + }; - GzState::Finished(pos + n, buf) + GzState::Finished(header, pos + n, buf) } else { let (crc, amt) = finish(&buf); if crc != reader.crc().sum() || amt != reader.crc().amount() { return Err(corrupt()); } else if *multi { - let is_eof = reader - .get_mut() - .get_mut() - .fill_buf() - .map(|buf| buf.is_empty()) - .map_err(|err| { + let is_eof = match reader.get_mut().get_mut().fill_buf() { + Ok(buf) => buf.is_empty(), + Err(err) => { if io::ErrorKind::WouldBlock == err.kind() { - *inner = GzState::Finished(pos, buf); + *state = GzState::Finished(header, pos, buf); } - err - })?; + return Err(err); + } + }; if is_eof { - GzState::End + GzState::End(Some(header)) } else { reader.reset(); reader.get_mut().reset_data(); - header.take(); GzState::Header(GzHeaderPartial::new()) } } else { - GzState::End + GzState::End(Some(header)) } } } GzState::Err(err) => return Err(err), - GzState::End => return Ok(0), + GzState::End(_) => return Ok(0), }; } } From 4a622d9798357af9cbdbbb096b16953cdda6a0c1 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Mon, 1 May 2023 10:33:52 +0100 Subject: [PATCH 03/31] Move blocked_partial_header_read test to read module --- src/gz/mod.rs | 29 ------------------ src/gz/read.rs | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 29 deletions(-) diff --git a/src/gz/mod.rs b/src/gz/mod.rs index d31aa60b..aeb84162 100644 --- a/src/gz/mod.rs +++ b/src/gz/mod.rs @@ -353,33 +353,4 @@ mod tests { write!(f, "Hello world").unwrap(); f.flush().unwrap(); } - - use crate::gz::bufread::tests::BlockingCursor; - #[test] - // test function read_and_forget of Buffer - fn blocked_partial_header_read() { - // this is a reader which receives data afterwards - let mut r = BlockingCursor::new(); - let data = vec![1, 2, 3]; - - match r.write_all(&data) { - Ok(()) => {} - _ => { - panic!("Unexpected result for write_all"); - } - } - r.set_position(0); - - // this is unused 
except for the buffering - let mut decoder = read::GzDecoder::new(r); - let mut out = Vec::with_capacity(7); - match decoder.read(&mut out) { - Err(e) => { - assert_eq!(e.kind(), std::io::ErrorKind::WouldBlock); - } - _ => { - panic!("Unexpected result for decoder.read"); - } - } - } } diff --git a/src/gz/read.rs b/src/gz/read.rs index cfeb992e..2b0796b2 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -276,3 +276,84 @@ impl Write for MultiGzDecoder { self.get_mut().flush() } } + +#[cfg(test)] +mod tests { + use std::io::{Cursor, ErrorKind, Read, Result, Write}; + + use super::GzDecoder; + + //a cursor turning EOF into blocking errors + #[derive(Debug)] + pub struct BlockingCursor { + pub cursor: Cursor>, + } + + impl BlockingCursor { + pub fn new() -> BlockingCursor { + BlockingCursor { + cursor: Cursor::new(Vec::new()), + } + } + + pub fn set_position(&mut self, pos: u64) { + return self.cursor.set_position(pos); + } + } + + impl Write for BlockingCursor { + fn write(&mut self, buf: &[u8]) -> Result { + return self.cursor.write(buf); + } + fn flush(&mut self) -> Result<()> { + return self.cursor.flush(); + } + } + + impl Read for BlockingCursor { + fn read(&mut self, buf: &mut [u8]) -> Result { + //use the cursor, except it turns eof into blocking error + let r = self.cursor.read(buf); + match r { + Err(ref err) => { + if err.kind() == ErrorKind::UnexpectedEof { + return Err(ErrorKind::WouldBlock.into()); + } + } + Ok(0) => { + //regular EOF turned into blocking error + return Err(ErrorKind::WouldBlock.into()); + } + Ok(_n) => {} + } + return r; + } + } + + #[test] + fn blocked_partial_header_read() { + // this is a reader which receives data afterwards + let mut r = BlockingCursor::new(); + let data = vec![1, 2, 3]; + + match r.write_all(&data) { + Ok(()) => {} + _ => { + panic!("Unexpected result for write_all"); + } + } + r.set_position(0); + + // this is unused except for the buffering + let mut decoder = GzDecoder::new(r); + let mut out = 
Vec::with_capacity(7); + match decoder.read(&mut out) { + Err(e) => { + assert_eq!(e.kind(), ErrorKind::WouldBlock); + } + _ => { + panic!("Unexpected result for decoder.read"); + } + } + } +} From a5e2ebaac545df3b0b102ffa474306b6077c72a3 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Mon, 1 May 2023 18:41:57 +0100 Subject: [PATCH 04/31] Move gzip header parsing out of bufread module Header parsing is used by both bufread and write modules. --- src/gz/bufread.rs | 227 +--------------------------------------------- src/gz/mod.rs | 224 ++++++++++++++++++++++++++++++++++++++++++++- src/gz/write.rs | 3 +- 3 files changed, 228 insertions(+), 226 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index c6ac5a98..5de1e261 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -3,9 +3,12 @@ use std::io; use std::io::prelude::*; use std::mem; +use super::corrupt; +use super::read_gz_header_part; +use super::Buffer; +use super::GzHeaderPartial; use super::{GzBuilder, GzHeader}; -use super::{FCOMMENT, FEXTRA, FHCRC, FNAME}; -use crate::crc::{Crc, CrcReader}; +use crate::crc::CrcReader; use crate::deflate; use crate::Compression; @@ -18,112 +21,6 @@ fn copy(into: &mut [u8], from: &[u8], pos: &mut usize) -> usize { min } -pub(crate) fn corrupt() -> io::Error { - io::Error::new( - io::ErrorKind::InvalidInput, - "corrupt gzip stream does not have a matching checksum", - ) -} - -fn bad_header() -> io::Error { - io::Error::new(io::ErrorKind::InvalidInput, "invalid gzip header") -} - -fn read_le_u16(r: &mut Buffer) -> io::Result { - let mut b = [0; 2]; - r.read_and_forget(&mut b)?; - Ok((b[0] as u16) | ((b[1] as u16) << 8)) -} - -fn read_gz_header_part<'a, R: Read>(r: &'a mut Buffer<'a, R>) -> io::Result<()> { - loop { - match r.part.state { - GzHeaderParsingState::Start => { - let mut header = [0; 10]; - r.read_and_forget(&mut header)?; - - if header[0] != 0x1f || header[1] != 0x8b { - return Err(bad_header()); - } - if header[2] != 8 { - return 
Err(bad_header()); - } - - r.part.flg = header[3]; - r.part.header.mtime = ((header[4] as u32) << 0) - | ((header[5] as u32) << 8) - | ((header[6] as u32) << 16) - | ((header[7] as u32) << 24); - let _xfl = header[8]; - r.part.header.operating_system = header[9]; - r.part.state = GzHeaderParsingState::Xlen; - } - GzHeaderParsingState::Xlen => { - if r.part.flg & FEXTRA != 0 { - r.part.xlen = read_le_u16(r)?; - } - r.part.state = GzHeaderParsingState::Extra; - } - GzHeaderParsingState::Extra => { - if r.part.flg & FEXTRA != 0 { - let mut extra = vec![0; r.part.xlen as usize]; - r.read_and_forget(&mut extra)?; - r.part.header.extra = Some(extra); - } - r.part.state = GzHeaderParsingState::Filename; - } - GzHeaderParsingState::Filename => { - if r.part.flg & FNAME != 0 { - if r.part.header.filename.is_none() { - r.part.header.filename = Some(Vec::new()); - }; - for byte in r.bytes() { - let byte = byte?; - if byte == 0 { - break; - } - } - } - r.part.state = GzHeaderParsingState::Comment; - } - GzHeaderParsingState::Comment => { - if r.part.flg & FCOMMENT != 0 { - if r.part.header.comment.is_none() { - r.part.header.comment = Some(Vec::new()); - }; - for byte in r.bytes() { - let byte = byte?; - if byte == 0 { - break; - } - } - } - r.part.state = GzHeaderParsingState::Crc; - } - GzHeaderParsingState::Crc => { - if r.part.flg & FHCRC != 0 { - let stored_crc = read_le_u16(r)?; - let calced_crc = r.part.crc.sum() as u16; - if stored_crc != calced_crc { - return Err(corrupt()); - } - } - return Ok(()); - } - } - } -} - -pub(crate) fn read_gz_header(r: &mut R) -> io::Result { - let mut part = GzHeaderPartial::new(); - - let result = { - let mut reader = Buffer::new(&mut part, r); - read_gz_header_part(&mut reader) - }; - result.map(|()| part.take_header()) -} - /// A gzip streaming encoder /// /// This structure exposes a [`BufRead`] interface that will read uncompressed data @@ -311,49 +208,6 @@ pub struct GzDecoder { multi: bool, } -#[derive(Debug)] -pub enum 
GzHeaderParsingState { - Start, - Xlen, - Extra, - Filename, - Comment, - Crc, -} - -#[derive(Debug)] -pub struct GzHeaderPartial { - buf: Vec, - state: GzHeaderParsingState, - flg: u8, - xlen: u16, - crc: Crc, - header: GzHeader, -} - -impl GzHeaderPartial { - fn new() -> GzHeaderPartial { - GzHeaderPartial { - buf: Vec::with_capacity(10), // minimum header length - state: GzHeaderParsingState::Start, - flg: 0, - xlen: 0, - crc: Crc::new(), - header: GzHeader { - extra: None, - filename: None, - comment: None, - operating_system: 0, - mtime: 0, - }, - } - } - - pub fn take_header(self) -> GzHeader { - self.header - } -} - #[derive(Debug)] enum GzState { Header(GzHeaderPartial), @@ -363,77 +217,6 @@ enum GzState { End, } -/// A small adapter which reads data originally from `buf` and then reads all -/// further data from `reader`. This will also buffer all data read from -/// `reader` into `buf` for reuse on a further call. -struct Buffer<'a, T: 'a> { - part: &'a mut GzHeaderPartial, - buf_cur: usize, - buf_max: usize, - reader: &'a mut T, -} - -impl<'a, T> Buffer<'a, T> { - fn new(part: &'a mut GzHeaderPartial, reader: &'a mut T) -> Buffer<'a, T> { - Buffer { - reader, - buf_cur: 0, - buf_max: part.buf.len(), - part, - } - } -} - -impl<'a, T: Read> Read for Buffer<'a, T> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let mut bufref = match self.part.state { - GzHeaderParsingState::Filename => self.part.header.filename.as_mut(), - GzHeaderParsingState::Comment => self.part.header.comment.as_mut(), - _ => None, - }; - if let Some(ref mut b) = bufref { - // we have a direct reference to a buffer where to write - let len = self.reader.read(buf)?; - if len > 0 && buf[len - 1] == 0 { - // we do not append the final 0 - b.extend_from_slice(&buf[..len - 1]); - } else { - b.extend_from_slice(&buf[..len]); - } - self.part.crc.update(&buf[..len]); - Ok(len) - } else if self.buf_cur == self.buf_max { - // we read new bytes and also save them in self.part.buf - let 
len = self.reader.read(buf)?; - self.part.buf.extend_from_slice(&buf[..len]); - self.part.crc.update(&buf[..len]); - Ok(len) - } else { - // we first read the previously saved bytes - let len = (&self.part.buf[self.buf_cur..self.buf_max]).read(buf)?; - self.buf_cur += len; - Ok(len) - } - } -} - -impl<'a, T> Buffer<'a, T> -where - T: std::io::Read, -{ - // If we manage to read all the bytes, we reset the buffer - fn read_and_forget(&mut self, buf: &mut [u8]) -> io::Result { - self.read_exact(buf)?; - // we managed to read the whole buf - // we will no longer need the previously saved bytes in self.part.buf - let rlen = buf.len(); - self.part.buf.truncate(0); - self.buf_cur = 0; - self.buf_max = 0; - Ok(rlen) - } -} - impl GzDecoder { /// Creates a new decoder from the given reader, immediately parsing the /// gzip header. diff --git a/src/gz/mod.rs b/src/gz/mod.rs index d31aa60b..e72c43e0 100644 --- a/src/gz/mod.rs +++ b/src/gz/mod.rs @@ -1,9 +1,9 @@ use std::ffi::CString; -use std::io::prelude::*; +use std::io::{self, prelude::*}; use std::time; use crate::bufreader::BufReader; -use crate::Compression; +use crate::{Compression, Crc}; pub static FHCRC: u8 = 1 << 1; pub static FEXTRA: u8 = 1 << 2; @@ -82,6 +82,155 @@ impl GzHeader { } } +#[derive(Debug)] +pub enum GzHeaderParsingState { + Start, + Xlen, + Extra, + Filename, + Comment, + Crc, +} + +#[derive(Debug)] +pub struct GzHeaderPartial { + buf: Vec, + state: GzHeaderParsingState, + flg: u8, + xlen: u16, + crc: Crc, + header: GzHeader, +} + +impl GzHeaderPartial { + fn new() -> GzHeaderPartial { + GzHeaderPartial { + buf: Vec::with_capacity(10), // minimum header length + state: GzHeaderParsingState::Start, + flg: 0, + xlen: 0, + crc: Crc::new(), + header: GzHeader { + extra: None, + filename: None, + comment: None, + operating_system: 0, + mtime: 0, + }, + } + } + + pub fn take_header(self) -> GzHeader { + self.header + } +} + +fn read_gz_header_part<'a, R: Read>(r: &'a mut Buffer<'a, R>) -> io::Result<()> { + 
loop { + match r.part.state { + GzHeaderParsingState::Start => { + let mut header = [0; 10]; + r.read_and_forget(&mut header)?; + + if header[0] != 0x1f || header[1] != 0x8b { + return Err(bad_header()); + } + if header[2] != 8 { + return Err(bad_header()); + } + + r.part.flg = header[3]; + r.part.header.mtime = ((header[4] as u32) << 0) + | ((header[5] as u32) << 8) + | ((header[6] as u32) << 16) + | ((header[7] as u32) << 24); + let _xfl = header[8]; + r.part.header.operating_system = header[9]; + r.part.state = GzHeaderParsingState::Xlen; + } + GzHeaderParsingState::Xlen => { + if r.part.flg & FEXTRA != 0 { + r.part.xlen = read_le_u16(r)?; + } + r.part.state = GzHeaderParsingState::Extra; + } + GzHeaderParsingState::Extra => { + if r.part.flg & FEXTRA != 0 { + let mut extra = vec![0; r.part.xlen as usize]; + r.read_and_forget(&mut extra)?; + r.part.header.extra = Some(extra); + } + r.part.state = GzHeaderParsingState::Filename; + } + GzHeaderParsingState::Filename => { + if r.part.flg & FNAME != 0 { + if r.part.header.filename.is_none() { + r.part.header.filename = Some(Vec::new()); + }; + for byte in r.bytes() { + let byte = byte?; + if byte == 0 { + break; + } + } + } + r.part.state = GzHeaderParsingState::Comment; + } + GzHeaderParsingState::Comment => { + if r.part.flg & FCOMMENT != 0 { + if r.part.header.comment.is_none() { + r.part.header.comment = Some(Vec::new()); + }; + for byte in r.bytes() { + let byte = byte?; + if byte == 0 { + break; + } + } + } + r.part.state = GzHeaderParsingState::Crc; + } + GzHeaderParsingState::Crc => { + if r.part.flg & FHCRC != 0 { + let stored_crc = read_le_u16(r)?; + let calced_crc = r.part.crc.sum() as u16; + if stored_crc != calced_crc { + return Err(corrupt()); + } + } + return Ok(()); + } + } + } +} + +fn read_gz_header(r: &mut R) -> io::Result { + let mut part = GzHeaderPartial::new(); + + let result = { + let mut reader = Buffer::new(&mut part, r); + read_gz_header_part(&mut reader) + }; + result.map(|()| 
part.take_header()) +} + +fn read_le_u16(r: &mut Buffer) -> io::Result { + let mut b = [0; 2]; + r.read_and_forget(&mut b)?; + Ok((b[0] as u16) | ((b[1] as u16) << 8)) +} + +fn bad_header() -> io::Error { + io::Error::new(io::ErrorKind::InvalidInput, "invalid gzip header") +} + +fn corrupt() -> io::Error { + io::Error::new( + io::ErrorKind::InvalidInput, + "corrupt gzip stream does not have a matching checksum", + ) +} + /// A builder structure to create a new gzip Encoder. /// /// This structure controls header configuration options such as the filename. @@ -249,6 +398,77 @@ impl GzBuilder { } } +/// A small adapter which reads data originally from `buf` and then reads all +/// further data from `reader`. This will also buffer all data read from +/// `reader` into `buf` for reuse on a further call. +struct Buffer<'a, T: 'a> { + part: &'a mut GzHeaderPartial, + buf_cur: usize, + buf_max: usize, + reader: &'a mut T, +} + +impl<'a, T> Buffer<'a, T> { + fn new(part: &'a mut GzHeaderPartial, reader: &'a mut T) -> Buffer<'a, T> { + Buffer { + reader, + buf_cur: 0, + buf_max: part.buf.len(), + part, + } + } +} + +impl<'a, T: Read> Read for Buffer<'a, T> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let mut bufref = match self.part.state { + GzHeaderParsingState::Filename => self.part.header.filename.as_mut(), + GzHeaderParsingState::Comment => self.part.header.comment.as_mut(), + _ => None, + }; + if let Some(ref mut b) = bufref { + // we have a direct reference to a buffer where to write + let len = self.reader.read(buf)?; + if len > 0 && buf[len - 1] == 0 { + // we do not append the final 0 + b.extend_from_slice(&buf[..len - 1]); + } else { + b.extend_from_slice(&buf[..len]); + } + self.part.crc.update(&buf[..len]); + Ok(len) + } else if self.buf_cur == self.buf_max { + // we read new bytes and also save them in self.part.buf + let len = self.reader.read(buf)?; + self.part.buf.extend_from_slice(&buf[..len]); + self.part.crc.update(&buf[..len]); + Ok(len) + } 
else { + // we first read the previously saved bytes + let len = (&self.part.buf[self.buf_cur..self.buf_max]).read(buf)?; + self.buf_cur += len; + Ok(len) + } + } +} + +impl<'a, T> Buffer<'a, T> +where + T: std::io::Read, +{ + // If we manage to read all the bytes, we reset the buffer + fn read_and_forget(&mut self, buf: &mut [u8]) -> io::Result { + self.read_exact(buf)?; + // we managed to read the whole buf + // we will no longer need the previously saved bytes in self.part.buf + let rlen = buf.len(); + self.part.buf.truncate(0); + self.buf_cur = 0; + self.buf_max = 0; + Ok(rlen) + } +} + #[cfg(test)] mod tests { use std::io::prelude::*; diff --git a/src/gz/write.rs b/src/gz/write.rs index 83eebb75..5336a17e 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -2,8 +2,7 @@ use std::cmp; use std::io; use std::io::prelude::*; -use super::bufread::{corrupt, read_gz_header}; -use super::{GzBuilder, GzHeader}; +use super::{corrupt, read_gz_header, GzBuilder, GzHeader}; use crate::crc::{Crc, CrcWriter}; use crate::zio; use crate::{Compress, Compression, Decompress, Status}; From 6e111fe8643321a7b00bd96e6e385521e8431e11 Mon Sep 17 00:00:00 2001 From: Yuki Okushi Date: Sun, 7 May 2023 09:58:05 +0900 Subject: [PATCH 05/31] Add notes about multiple streams to `GzDecoder` Signed-off-by: Yuki Okushi --- src/gz/bufread.rs | 1 + src/gz/read.rs | 2 +- src/gz/write.rs | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 8db25605..2c707b4c 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -171,6 +171,7 @@ impl Write for GzEncoder { /// /// This structure consumes a [`BufRead`] interface, reading compressed data /// from the underlying reader, and emitting uncompressed data. +/// Use [`MultiGzDecoder`] if your file has multiple streams. 
/// /// [`BufRead`]: https://doc.rust-lang.org/std/io/trait.BufRead.html /// diff --git a/src/gz/read.rs b/src/gz/read.rs index 2b0796b2..5e9bcf65 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -94,13 +94,13 @@ impl Write for GzEncoder { /// /// This structure exposes a [`Read`] interface that will consume compressed /// data from the underlying reader and emit uncompressed data. +/// Use [`MultiGzDecoder`] if your file has multiple streams. /// /// [`Read`]: https://doc.rust-lang.org/std/io/trait.Read.html /// /// # Examples /// /// ``` -/// /// use std::io::prelude::*; /// use std::io; /// # use flate2::Compression; diff --git a/src/gz/write.rs b/src/gz/write.rs index 5336a17e..dd8a6bd0 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -170,6 +170,7 @@ impl Drop for GzEncoder { /// /// This structure exposes a [`Write`] interface that will emit uncompressed data /// to the underlying writer `W`. +/// Use [`MultiGzDecoder`] if your file has multiple streams. /// /// [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html /// From 3ea8c3dcdb3937fb6102c16627c52ce74ac63d13 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Sun, 7 May 2023 07:32:27 +0100 Subject: [PATCH 06/31] Fix GzDecoder Write partial filenames and comments If the gzip header contains an optional filename or comment but they are not completely contained in the buffer sent to a `write::GzDecoder`, then a valid header is created, missing data from these optional sections. A subsequent write call will treat the remaining header as encoded data and attempt to decode it, generally causing a panic. This change rewrites the header parsing code to handle partial headers correctly for both `Read` (where `WouldBlock` is handled specially) and `Write` (where `UnexpectedEof` is handled specially). 
--- src/gz/bufread.rs | 284 +++++------------------------------- src/gz/mod.rs | 356 ++++++++++++++++++++++------------------------ src/gz/write.rs | 105 ++++++++------ 3 files changed, 272 insertions(+), 473 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 8db25605..b59bf21c 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -3,11 +3,7 @@ use std::io; use std::io::prelude::*; use std::mem; -use super::corrupt; -use super::read_gz_header_part; -use super::Buffer; -use super::GzHeaderPartial; -use super::{GzBuilder, GzHeader}; +use super::{corrupt, read_into, GzBuilder, GzHeader, GzHeaderParser}; use crate::crc::CrcReader; use crate::deflate; use crate::Compression; @@ -209,7 +205,7 @@ pub struct GzDecoder { #[derive(Debug)] enum GzState { - Header(GzHeaderPartial), + Header(GzHeaderParser), Body(GzHeader), Finished(GzHeader, usize, [u8; 8]), Err(io::Error), @@ -220,19 +216,13 @@ impl GzDecoder { /// Creates a new decoder from the given reader, immediately parsing the /// gzip header. 
pub fn new(mut r: R) -> GzDecoder { - let mut part = GzHeaderPartial::new(); + let mut header_parser = GzHeaderParser::new(); - let result = { - let mut reader = Buffer::new(&mut part, &mut r); - read_gz_header_part(&mut reader) - }; - - let state = match result { - Ok(()) => { - let header = part.take_header(); - GzState::Body(header) + let state = match header_parser.parse(&mut r) { + Ok(_) => GzState::Body(GzHeader::from(header_parser)), + Err(ref err) if io::ErrorKind::WouldBlock == err.kind() => { + GzState::Header(header_parser) } - Err(ref err) if io::ErrorKind::WouldBlock == err.kind() => GzState::Header(part), Err(err) => GzState::Err(err), }; @@ -280,108 +270,61 @@ impl GzDecoder { impl Read for GzDecoder { fn read(&mut self, into: &mut [u8]) -> io::Result { - let GzDecoder { - state, - reader, - multi, - } = self; - loop { - *state = match mem::replace(state, GzState::End(None)) { - GzState::Header(mut part) => { - let result = { - let mut reader = Buffer::new(&mut part, reader.get_mut().get_mut()); - read_gz_header_part(&mut reader) - }; - match result { - Ok(()) => { - let header = part.take_header(); - GzState::Body(header) - } - Err(err) if io::ErrorKind::WouldBlock == err.kind() => { - *state = GzState::Header(part); - return Err(err); - } - Err(err) => return Err(err), - } + match &mut self.state { + GzState::Header(parser) => { + parser.parse(self.reader.get_mut().get_mut())?; + self.state = GzState::Body(GzHeader::from(mem::take(parser))); } GzState::Body(header) => { if into.is_empty() { - *state = GzState::Body(header); return Ok(0); } - - let n = match reader.read(into) { - Ok(n) => n, - Err(err) => { - if io::ErrorKind::WouldBlock == err.kind() { - *state = GzState::Body(header); - } - - return Err(err); + match self.reader.read(into)? 
{ + 0 => { + self.state = GzState::Finished(mem::take(header), 0, [0; 8]); } - }; - - match n { - 0 => GzState::Finished(header, 0, [0; 8]), n => { - *state = GzState::Body(header); return Ok(n); } } } - GzState::Finished(header, pos, mut buf) => { - if pos < buf.len() { - let n = match reader.get_mut().get_mut().read(&mut buf[pos..]) { - Ok(n) => { - if n == 0 { - return Err(io::ErrorKind::UnexpectedEof.into()); - } else { - n - } - } - Err(err) => { - if io::ErrorKind::WouldBlock == err.kind() { - *state = GzState::Finished(header, pos, buf); - } - - return Err(err); - } - }; - - GzState::Finished(header, pos + n, buf) + GzState::Finished(header, pos, buf) => { + if *pos < buf.len() { + *pos += read_into(self.reader.get_mut().get_mut(), &mut buf[*pos..])?; } else { let (crc, amt) = finish(&buf); - if crc != reader.crc().sum() || amt != reader.crc().amount() { + if crc != self.reader.crc().sum() || amt != self.reader.crc().amount() { + self.state = GzState::End(Some(mem::take(header))); return Err(corrupt()); - } else if *multi { - let is_eof = match reader.get_mut().get_mut().fill_buf() { - Ok(buf) => buf.is_empty(), - Err(err) => { - if io::ErrorKind::WouldBlock == err.kind() { - *state = GzState::Finished(header, pos, buf); - } - - return Err(err); - } - }; + } else if self.multi { + let is_eof = self + .reader + .get_mut() + .get_mut() + .fill_buf() + .map(|buf| buf.is_empty())?; if is_eof { - GzState::End(Some(header)) + self.state = GzState::End(Some(mem::take(header))); } else { - reader.reset(); - reader.get_mut().reset_data(); - GzState::Header(GzHeaderPartial::new()) + self.reader.reset(); + self.reader.get_mut().reset_data(); + self.state = GzState::Header(GzHeaderParser::new()) } } else { - GzState::End(Some(header)) + self.state = GzState::End(Some(mem::take(header))); } } } - GzState::Err(err) => return Err(err), + GzState::Err(err) => { + let result = Err(mem::replace(err, io::ErrorKind::Other.into())); + self.state = GzState::End(None); + return 
result; + } GzState::End(_) => return Ok(0), - }; + } } } } @@ -478,156 +421,3 @@ impl Read for MultiGzDecoder { self.0.read(into) } } - -#[cfg(test)] -pub mod tests { - use crate::gz::bufread::*; - use std::io; - use std::io::{Cursor, Read, Write}; - - //a cursor turning EOF into blocking errors - #[derive(Debug)] - pub struct BlockingCursor { - pub cursor: Cursor>, - } - - impl BlockingCursor { - pub fn new() -> BlockingCursor { - BlockingCursor { - cursor: Cursor::new(Vec::new()), - } - } - - pub fn set_position(&mut self, pos: u64) { - self.cursor.set_position(pos) - } - - pub fn position(&mut self) -> u64 { - self.cursor.position() - } - } - - impl Write for BlockingCursor { - fn write(&mut self, buf: &[u8]) -> io::Result { - self.cursor.write(buf) - } - fn flush(&mut self) -> io::Result<()> { - self.cursor.flush() - } - } - - impl Read for BlockingCursor { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - //use the cursor, except it turns eof into blocking error - let r = self.cursor.read(buf); - match r { - Err(ref err) => { - if err.kind() == io::ErrorKind::UnexpectedEof { - return Err(io::ErrorKind::WouldBlock.into()); - } - } - Ok(0) => { - //regular EOF turned into blocking error - return Err(io::ErrorKind::WouldBlock.into()); - } - Ok(_n) => {} - } - r - } - } - #[test] - // test function read_and_forget of Buffer - fn buffer_read_and_forget() { - // this is unused except for the buffering - let mut part = GzHeaderPartial::new(); - // this is a reader which receives data afterwards - let mut r = BlockingCursor::new(); - let data = vec![1, 2, 3]; - let mut out = Vec::with_capacity(7); - - match r.write_all(&data) { - Ok(()) => {} - _ => { - panic!("Unexpected result for write_all"); - } - } - r.set_position(0); - - // First read : successful for one byte - let mut reader = Buffer::new(&mut part, &mut r); - out.resize(1, 0); - match reader.read_and_forget(&mut out) { - Ok(1) => {} - _ => { - panic!("Unexpected result for read_and_forget with data"); 
- } - } - - // Second read : incomplete for 7 bytes (we have only 2) - out.resize(7, 0); - match reader.read_and_forget(&mut out) { - Err(ref err) => { - assert_eq!(io::ErrorKind::WouldBlock, err.kind()); - } - _ => { - panic!("Unexpected result for read_and_forget with incomplete"); - } - } - - // 3 more data bytes have arrived - let pos = r.position(); - let data2 = vec![4, 5, 6]; - match r.write_all(&data2) { - Ok(()) => {} - _ => { - panic!("Unexpected result for write_all"); - } - } - r.set_position(pos); - - // Third read : still incomplete for 7 bytes (we have 5) - let mut reader2 = Buffer::new(&mut part, &mut r); - match reader2.read_and_forget(&mut out) { - Err(ref err) => { - assert_eq!(io::ErrorKind::WouldBlock, err.kind()); - } - _ => { - panic!("Unexpected result for read_and_forget with more incomplete"); - } - } - - // 3 more data bytes have arrived again - let pos2 = r.position(); - let data3 = vec![7, 8, 9]; - match r.write_all(&data3) { - Ok(()) => {} - _ => { - panic!("Unexpected result for write_all"); - } - } - r.set_position(pos2); - - // Fourth read : now successful for 7 bytes - let mut reader3 = Buffer::new(&mut part, &mut r); - match reader3.read_and_forget(&mut out) { - Ok(7) => { - assert_eq!(out[0], 2); - assert_eq!(out[6], 8); - } - _ => { - panic!("Unexpected result for read_and_forget with data"); - } - } - - // Fifth read : successful for one more byte - out.resize(1, 0); - match reader3.read_and_forget(&mut out) { - Ok(1) => { - assert_eq!(out[0], 9); - } - _ => { - panic!("Unexpected result for read_and_forget with data"); - } - } - } -} diff --git a/src/gz/mod.rs b/src/gz/mod.rs index 26152c16..95ed8375 100644 --- a/src/gz/mod.rs +++ b/src/gz/mod.rs @@ -1,5 +1,5 @@ use std::ffi::CString; -use std::io::{self, prelude::*}; +use std::io::{BufRead, Error, ErrorKind, Read, Result, Write}; use std::time; use crate::bufreader::BufReader; @@ -9,11 +9,16 @@ pub static FHCRC: u8 = 1 << 1; pub static FEXTRA: u8 = 1 << 2; pub static FNAME: 
u8 = 1 << 3; pub static FCOMMENT: u8 = 1 << 4; +pub static FRESERVED: u8 = 1 << 5 | 1 << 6 | 1 << 7; pub mod bufread; pub mod read; pub mod write; +// The maximum length of the header filename and comment fields. More than +// enough for these fields in reasonable use, but prevents possible attacks. +const MAX_HEADER_BUF: usize = 65535; + /// A structure representing the header of a gzip stream. /// /// The header can contain metadata about the file that was compressed, if @@ -82,151 +87,201 @@ impl GzHeader { } } -#[derive(Debug)] -pub enum GzHeaderParsingState { - Start, - Xlen, - Extra, - Filename, - Comment, - Crc, +#[derive(Debug, Default)] +pub enum GzHeaderState { + Start(u8, [u8; 10]), + Xlen(Option>, u8, [u8; 2]), + Extra(Option>, u16), + Filename(Option>), + Comment(Option>), + Crc(Option>, u8, [u8; 2]), + #[default] + Complete, } -#[derive(Debug)] -pub struct GzHeaderPartial { - buf: Vec, - state: GzHeaderParsingState, - flg: u8, - xlen: u16, - crc: Crc, +#[derive(Debug, Default)] +pub struct GzHeaderParser { + state: GzHeaderState, + flags: u8, header: GzHeader, } -impl GzHeaderPartial { - fn new() -> GzHeaderPartial { - GzHeaderPartial { - buf: Vec::with_capacity(10), // minimum header length - state: GzHeaderParsingState::Start, - flg: 0, - xlen: 0, - crc: Crc::new(), - header: GzHeader { - extra: None, - filename: None, - comment: None, - operating_system: 0, - mtime: 0, - }, +impl GzHeaderParser { + fn new() -> Self { + GzHeaderParser { + state: GzHeaderState::Start(0, [0; 10]), + flags: 0, + header: GzHeader::default(), } } - pub fn take_header(self) -> GzHeader { - self.header - } -} - -fn read_gz_header_part<'a, R: Read>(r: &'a mut Buffer<'a, R>) -> io::Result<()> { - loop { - match r.part.state { - GzHeaderParsingState::Start => { - let mut header = [0; 10]; - r.read_and_forget(&mut header)?; - - if header[0] != 0x1f || header[1] != 0x8b { - return Err(bad_header()); - } - if header[2] != 8 { - return Err(bad_header()); + fn parse<'a, R: 
Read>(&mut self, r: &'a mut R) -> Result<()> { + loop { + match &mut self.state { + GzHeaderState::Start(count, buffer) => { + while (*count as usize) < buffer.len() { + *count += read_into(r, &mut buffer[*count as usize..])? as u8; + } + // Gzip identification bytes + if buffer[0] != 0x1f || buffer[1] != 0x8b { + return Err(bad_header()); + } + // Gzip compression method (8 = deflate) + if buffer[2] != 8 { + return Err(bad_header()); + } + self.flags = buffer[3]; + // RFC1952: "must give an error indication if any reserved bit is non-zero" + if self.flags & FRESERVED != 0 { + return Err(bad_header()); + } + self.header.mtime = ((buffer[4] as u32) << 0) + | ((buffer[5] as u32) << 8) + | ((buffer[6] as u32) << 16) + | ((buffer[7] as u32) << 24); + let _xfl = buffer[8]; + self.header.operating_system = buffer[9]; + let crc = if self.flags & FHCRC != 0 { + let mut crc = Box::new(Crc::new()); + crc.update(buffer); + Some(crc) + } else { + None + }; + self.state = GzHeaderState::Xlen(crc, 0, [0; 2]); } - - r.part.flg = header[3]; - r.part.header.mtime = ((header[4] as u32) << 0) - | ((header[5] as u32) << 8) - | ((header[6] as u32) << 16) - | ((header[7] as u32) << 24); - let _xfl = header[8]; - r.part.header.operating_system = header[9]; - r.part.state = GzHeaderParsingState::Xlen; - } - GzHeaderParsingState::Xlen => { - if r.part.flg & FEXTRA != 0 { - r.part.xlen = read_le_u16(r)?; + GzHeaderState::Xlen(crc, count, buffer) => { + if self.flags & FEXTRA != 0 { + while (*count as usize) < buffer.len() { + *count += read_into(r, &mut buffer[*count as usize..])? 
as u8; + } + if let Some(crc) = crc { + crc.update(buffer); + } + let xlen = parse_le_u16(&buffer); + self.header.extra = Some(vec![0; xlen as usize]); + self.state = GzHeaderState::Extra(crc.take(), 0); + } else { + self.state = GzHeaderState::Filename(crc.take()); + } } - r.part.state = GzHeaderParsingState::Extra; - } - GzHeaderParsingState::Extra => { - if r.part.flg & FEXTRA != 0 { - let mut extra = vec![0; r.part.xlen as usize]; - r.read_and_forget(&mut extra)?; - r.part.header.extra = Some(extra); + GzHeaderState::Extra(crc, count) => { + debug_assert!(self.header.extra.is_some()); + let extra = self.header.extra.as_mut().unwrap(); + while (*count as usize) < extra.len() { + *count += read_into(r, &mut extra[*count as usize..])? as u16; + } + if let Some(crc) = crc { + crc.update(extra); + } + self.state = GzHeaderState::Filename(crc.take()); } - r.part.state = GzHeaderParsingState::Filename; - } - GzHeaderParsingState::Filename => { - if r.part.flg & FNAME != 0 { - if r.part.header.filename.is_none() { - r.part.header.filename = Some(Vec::new()); - }; - for byte in r.bytes() { - let byte = byte?; - if byte == 0 { - break; + GzHeaderState::Filename(crc) => { + if self.flags & FNAME != 0 { + let filename = self.header.filename.get_or_insert_with(Vec::new); + read_to_nul(r, filename)?; + if let Some(crc) = crc { + crc.update(filename); + crc.update(b"0"); } } + self.state = GzHeaderState::Comment(crc.take()); } - r.part.state = GzHeaderParsingState::Comment; - } - GzHeaderParsingState::Comment => { - if r.part.flg & FCOMMENT != 0 { - if r.part.header.comment.is_none() { - r.part.header.comment = Some(Vec::new()); - }; - for byte in r.bytes() { - let byte = byte?; - if byte == 0 { - break; + GzHeaderState::Comment(crc) => { + if self.flags & FCOMMENT != 0 { + let comment = self.header.comment.get_or_insert_with(Vec::new); + read_to_nul(r, comment)?; + if let Some(crc) = crc { + crc.update(comment); + crc.update(b"0"); } } + self.state = 
GzHeaderState::Crc(crc.take(), 0, [0; 2]); } - r.part.state = GzHeaderParsingState::Crc; - } - GzHeaderParsingState::Crc => { - if r.part.flg & FHCRC != 0 { - let stored_crc = read_le_u16(r)?; - let calced_crc = r.part.crc.sum() as u16; - if stored_crc != calced_crc { - return Err(corrupt()); + GzHeaderState::Crc(crc, count, buffer) => { + if let Some(crc) = crc { + debug_assert!(self.flags & FHCRC != 0); + while (*count as usize) < buffer.len() { + *count += read_into(r, &mut buffer[*count as usize..])? as u8; + } + let stored_crc = parse_le_u16(&buffer); + let calced_crc = crc.sum() as u16; + if stored_crc != calced_crc { + return Err(corrupt()); + } } + self.state = GzHeaderState::Complete; + } + GzHeaderState::Complete => { + return Ok(()); } - return Ok(()); } } } + + fn header(&self) -> Option<&GzHeader> { + match self.state { + GzHeaderState::Complete => Some(&self.header), + _ => None, + } + } +} + +impl From for GzHeader { + fn from(parser: GzHeaderParser) -> Self { + debug_assert!(matches!(parser.state, GzHeaderState::Complete)); + parser.header + } } -fn read_gz_header(r: &mut R) -> io::Result { - let mut part = GzHeaderPartial::new(); +// Attempt to fill the `buffer` from `r`. Return the number of bytes read. +// Return an error if EOF is read before the buffer is full. This differs +// from `read` in that Ok(0) means that more data may be available. +fn read_into(r: &mut R, buffer: &mut [u8]) -> Result { + debug_assert!(!buffer.is_empty()); + match r.read(buffer) { + Ok(0) => Err(ErrorKind::UnexpectedEof.into()), + Ok(n) => Ok(n), + Err(ref e) if e.kind() == ErrorKind::Interrupted => Ok(0), + Err(e) => Err(e), + } +} - let result = { - let mut reader = Buffer::new(&mut part, r); - read_gz_header_part(&mut reader) - }; - result.map(|()| part.take_header()) +// Read `r` up to the first nul byte, pushing non-nul bytes to `buffer`. 
+fn read_to_nul(r: &mut R, buffer: &mut Vec) -> Result<()> { + let mut bytes = r.bytes(); + loop { + match bytes.next().transpose()? { + Some(byte) if byte == 0 => { + return Ok(()); + } + Some(_) if buffer.len() == MAX_HEADER_BUF => { + return Err(Error::new( + ErrorKind::InvalidInput, + "gzip header field too long", + )); + } + Some(byte) => { + buffer.push(byte); + } + None => { + return Err(ErrorKind::UnexpectedEof.into()); + } + } + } } -fn read_le_u16(r: &mut Buffer) -> io::Result { - let mut b = [0; 2]; - r.read_and_forget(&mut b)?; - Ok((b[0] as u16) | ((b[1] as u16) << 8)) +fn parse_le_u16(buffer: &[u8; 2]) -> u16 { + (buffer[0] as u16) | ((buffer[1] as u16) << 8) } -fn bad_header() -> io::Error { - io::Error::new(io::ErrorKind::InvalidInput, "invalid gzip header") +fn bad_header() -> Error { + Error::new(ErrorKind::InvalidInput, "invalid gzip header") } -fn corrupt() -> io::Error { - io::Error::new( - io::ErrorKind::InvalidInput, +fn corrupt() -> Error { + Error::new( + ErrorKind::InvalidInput, "corrupt gzip stream does not have a matching checksum", ) } @@ -398,77 +453,6 @@ impl GzBuilder { } } -/// A small adapter which reads data originally from `buf` and then reads all -/// further data from `reader`. This will also buffer all data read from -/// `reader` into `buf` for reuse on a further call. 
-struct Buffer<'a, T: 'a> { - part: &'a mut GzHeaderPartial, - buf_cur: usize, - buf_max: usize, - reader: &'a mut T, -} - -impl<'a, T> Buffer<'a, T> { - fn new(part: &'a mut GzHeaderPartial, reader: &'a mut T) -> Buffer<'a, T> { - Buffer { - reader, - buf_cur: 0, - buf_max: part.buf.len(), - part, - } - } -} - -impl<'a, T: Read> Read for Buffer<'a, T> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let mut bufref = match self.part.state { - GzHeaderParsingState::Filename => self.part.header.filename.as_mut(), - GzHeaderParsingState::Comment => self.part.header.comment.as_mut(), - _ => None, - }; - if let Some(ref mut b) = bufref { - // we have a direct reference to a buffer where to write - let len = self.reader.read(buf)?; - if len > 0 && buf[len - 1] == 0 { - // we do not append the final 0 - b.extend_from_slice(&buf[..len - 1]); - } else { - b.extend_from_slice(&buf[..len]); - } - self.part.crc.update(&buf[..len]); - Ok(len) - } else if self.buf_cur == self.buf_max { - // we read new bytes and also save them in self.part.buf - let len = self.reader.read(buf)?; - self.part.buf.extend_from_slice(&buf[..len]); - self.part.crc.update(&buf[..len]); - Ok(len) - } else { - // we first read the previously saved bytes - let len = (&self.part.buf[self.buf_cur..self.buf_max]).read(buf)?; - self.buf_cur += len; - Ok(len) - } - } -} - -impl<'a, T> Buffer<'a, T> -where - T: std::io::Read, -{ - // If we manage to read all the bytes, we reset the buffer - fn read_and_forget(&mut self, buf: &mut [u8]) -> io::Result { - self.read_exact(buf)?; - // we managed to read the whole buf - // we will no longer need the previously saved bytes in self.part.buf - let rlen = buf.len(); - self.part.buf.truncate(0); - self.buf_cur = 0; - self.buf_max = 0; - Ok(rlen) - } -} - #[cfg(test)] mod tests { use std::io::prelude::*; diff --git a/src/gz/write.rs b/src/gz/write.rs index 5336a17e..339914d0 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -2,7 +2,7 @@ use std::cmp; use 
std::io; use std::io::prelude::*; -use super::{corrupt, read_gz_header, GzBuilder, GzHeader}; +use super::{corrupt, GzBuilder, GzHeader, GzHeaderParser}; use crate::crc::{Crc, CrcWriter}; use crate::zio; use crate::{Compress, Compression, Decompress, Status}; @@ -202,8 +202,7 @@ impl Drop for GzEncoder { pub struct GzDecoder { inner: zio::Writer, Decompress>, crc_bytes: Vec, - header: Option, - header_buf: Vec, + header_parser: GzHeaderParser, } const CRC_BYTES_LEN: usize = 8; @@ -217,14 +216,13 @@ impl GzDecoder { GzDecoder { inner: zio::Writer::new(CrcWriter::new(w), Decompress::new(false)), crc_bytes: Vec::with_capacity(CRC_BYTES_LEN), - header: None, - header_buf: Vec::new(), + header_parser: GzHeaderParser::new(), } } /// Returns the header associated with this stream. pub fn header(&self) -> Option<&GzHeader> { - self.header.as_ref() + self.header_parser.header() } /// Acquires a reference to the underlying writer. @@ -305,47 +303,24 @@ impl GzDecoder { } } -struct Counter { - inner: T, - pos: usize, -} - -impl Read for Counter { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let pos = self.inner.read(buf)?; - self.pos += pos; - Ok(pos) - } -} - impl Write for GzDecoder { - fn write(&mut self, buf: &[u8]) -> io::Result { - if self.header.is_none() { - // trying to avoid buffer usage - let (res, pos) = { - let mut counter = Counter { - inner: self.header_buf.chain(buf), - pos: 0, - }; - let res = read_gz_header(&mut counter); - (res, counter.pos) - }; - - match res { + fn write(&mut self, mut buf: &[u8]) -> io::Result { + let buflen = buf.len(); + if self.header().is_none() { + match self.header_parser.parse(&mut buf) { Err(err) => { if err.kind() == io::ErrorKind::UnexpectedEof { - // not enough data for header, save to the buffer - self.header_buf.extend(buf); - Ok(buf.len()) + // all data read but header still not complete + Ok(buflen) } else { Err(err) } } - Ok(header) => { - self.header = Some(header); - let pos = pos - self.header_buf.len(); - 
self.header_buf.truncate(0); - Ok(pos) + Ok(_) => { + debug_assert!(self.header().is_some()); + // buf now contains the unread part of the original buf + let n = buflen - buf.len(); + Ok(n) } } } else { @@ -522,6 +497,56 @@ mod tests { assert_eq!(return_string, STR); } + #[test] + fn decode_writer_partial_header_filename() { + let filename = "test.txt"; + let mut e = GzBuilder::new() + .filename(filename) + .read(STR.as_bytes(), Compression::default()); + let mut bytes = Vec::new(); + e.read_to_end(&mut bytes).unwrap(); + + let mut writer = Vec::new(); + let mut decoder = GzDecoder::new(writer); + assert_eq!(decoder.write(&bytes[..12]).unwrap(), 12); + let n = decoder.write(&bytes[12..]).unwrap(); + if n < bytes.len() - 12 { + decoder.write(&bytes[n + 12..]).unwrap(); + } + assert_eq!( + decoder.header().unwrap().filename().unwrap(), + filename.as_bytes() + ); + writer = decoder.finish().unwrap(); + let return_string = String::from_utf8(writer).expect("String parsing error"); + assert_eq!(return_string, STR); + } + + #[test] + fn decode_writer_partial_header_comment() { + let comment = "test comment"; + let mut e = GzBuilder::new() + .comment(comment) + .read(STR.as_bytes(), Compression::default()); + let mut bytes = Vec::new(); + e.read_to_end(&mut bytes).unwrap(); + + let mut writer = Vec::new(); + let mut decoder = GzDecoder::new(writer); + assert_eq!(decoder.write(&bytes[..12]).unwrap(), 12); + let n = decoder.write(&bytes[12..]).unwrap(); + if n < bytes.len() - 12 { + decoder.write(&bytes[n + 12..]).unwrap(); + } + assert_eq!( + decoder.header().unwrap().comment().unwrap(), + comment.as_bytes() + ); + writer = decoder.finish().unwrap(); + let return_string = String::from_utf8(writer).expect("String parsing error"); + assert_eq!(return_string, STR); + } + #[test] fn decode_writer_exact_header() { let mut e = GzEncoder::new(Vec::new(), Compression::default()); From aedc7a692315c9f173d81bb90ce9be666c009149 Mon Sep 17 00:00:00 2001 From: Yuki Okushi Date: Wed, 17 
May 2023 23:56:38 +0900 Subject: [PATCH 07/31] Fix a comment on the `Compression` struct Signed-off-by: Yuki Okushi --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 6789c5b7..b84865e9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -154,7 +154,7 @@ fn _assert_send_sync() { } /// When compressing data, the compression level can be specified by a value in -/// this enum. +/// this struct. #[derive(Copy, Clone, PartialEq, Eq, Debug)] pub struct Compression(u32); From afbbf48bd4acbc2400c8adae9227b1d5890b4d42 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 May 2023 16:10:45 +0200 Subject: [PATCH 08/31] doc: Refer to `MultiGzDecoder` from `GzDecoder`. (#301) This may help dealing with multi-stream gzip files. `MultiGzDecoder` documentation was also improved to further clarify why such files would exist. --- src/gz/bufread.rs | 6 ++++-- src/gz/read.rs | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 2c707b4c..5b5061a3 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -402,9 +402,11 @@ impl Write for GzDecoder { /// A gzip member consists of a header, compressed data and a trailer. The [gzip /// specification](https://tools.ietf.org/html/rfc1952), however, allows multiple /// gzip members to be joined in a single stream. `MultiGzDecoder` will -/// decode all consecutive members while `GzDecoder` will only decompress +/// decode all consecutive members while [`GzDecoder`] will only decompress /// the first gzip member. The multistream format is commonly used in -/// bioinformatics, for example when using the BGZF compressed data. +/// bioinformatics, for example when using the BGZF compressed data. It's also useful +/// to compress large amounts of data in parallel where each thread produces one stream +/// for a chunk of input data. 
/// /// This structure exposes a [`BufRead`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. diff --git a/src/gz/read.rs b/src/gz/read.rs index 5e9bcf65..2a16a6ac 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -185,9 +185,11 @@ impl Write for GzDecoder { /// A gzip member consists of a header, compressed data and a trailer. The [gzip /// specification](https://tools.ietf.org/html/rfc1952), however, allows multiple /// gzip members to be joined in a single stream. `MultiGzDecoder` will -/// decode all consecutive members while `GzDecoder` will only decompress the +/// decode all consecutive members while [`GzDecoder`] will only decompress the /// first gzip member. The multistream format is commonly used in bioinformatics, -/// for example when using the BGZF compressed data. +/// for example when using the BGZF compressed data. It's also useful +/// to compress large amounts of data in parallel where each thread produces one stream +/// for a chunk of input data. /// /// This structure exposes a [`Read`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. From 3281693768a6e30587d822de49a68392b212b10d Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Tue, 27 Jun 2023 12:19:49 +0100 Subject: [PATCH 09/31] Fix Read encoder examples The examples for the Read encoders previously used the read method to read data. However, as pointed out in issue 158, this may only read part of the data. Instead the read_to_end method should be used. 
Fixes: #355 --- src/deflate/read.rs | 6 +++--- src/gz/read.rs | 6 +++--- src/zlib/read.rs | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/deflate/read.rs b/src/deflate/read.rs index e6af130a..8acc45f1 100644 --- a/src/deflate/read.rs +++ b/src/deflate/read.rs @@ -25,11 +25,11 @@ use crate::bufreader::BufReader; /// # /// // Return a vector containing the Deflate compressed version of hello world /// fn deflateencoder_read_hello_world() -> io::Result> { -/// let mut ret_vec = [0;100]; +/// let mut ret_vec = Vec::new(); /// let c = b"hello world"; /// let mut deflater = DeflateEncoder::new(&c[..], Compression::fast()); -/// let count = deflater.read(&mut ret_vec)?; -/// Ok(ret_vec[0..count].to_vec()) +/// let count = deflater.read_to_end(&mut ret_vec)?; +/// Ok(ret_vec) /// } /// ``` #[derive(Debug)] diff --git a/src/gz/read.rs b/src/gz/read.rs index 2a16a6ac..e3573200 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -25,11 +25,11 @@ use crate::Compression; /// // Return a vector containing the GZ compressed version of hello world /// /// fn gzencode_hello_world() -> io::Result> { -/// let mut ret_vec = [0;100]; +/// let mut ret_vec = Vec::new(); /// let bytestring = b"hello world"; /// let mut gz = GzEncoder::new(&bytestring[..], Compression::fast()); -/// let count = gz.read(&mut ret_vec)?; -/// Ok(ret_vec[0..count].to_vec()) +/// let count = gz.read_to_end(&mut ret_vec)?; +/// Ok(ret_vec) /// } /// ``` #[derive(Debug)] diff --git a/src/zlib/read.rs b/src/zlib/read.rs index 33021304..917cea40 100644 --- a/src/zlib/read.rs +++ b/src/zlib/read.rs @@ -24,9 +24,9 @@ use crate::bufreader::BufReader; /// # fn open_hello_world() -> std::io::Result> { /// let f = File::open("examples/hello_world.txt")?; /// let mut z = ZlibEncoder::new(f, Compression::fast()); -/// let mut buffer = [0;50]; -/// let byte_count = z.read(&mut buffer)?; -/// # Ok(buffer[0..byte_count].to_vec()) +/// let mut buffer = Vec::new(); +/// let byte_count = 
z.read_to_end(&mut buffer)?; +/// # Ok(buffer) /// # } /// ``` #[derive(Debug)] From 2684a56e218f02f8f0b4dce2d2a8515223974407 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 5 Jul 2023 08:44:23 +0200 Subject: [PATCH 10/31] Add test to show how `--no-default-features` should respond. (#359) --- .github/workflows/main.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9f04add4..5a50b64a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -38,12 +38,18 @@ jobs: - run: cargo test - run: cargo test --features zlib - run: cargo test --features zlib --no-default-features + - run: cargo test --features zlib-default --no-default-features - run: cargo test --features zlib-ng-compat --no-default-features if: matrix.build != 'mingw' - run: cargo test --features zlib-ng --no-default-features if: matrix.build != 'mingw' - run: cargo test --features cloudflare_zlib --no-default-features if: matrix.build != 'mingw' + - run: | + if ! cargo check --no-default-features 2>&1 | grep "You need to choose" ; then + echo "expected message stating a zlib backend must be chosen" + exit 1 + fi rustfmt: name: Rustfmt From c9cf23f929187a87f10f9658523ed4f80d57a5c2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 5 Jul 2023 08:48:17 +0200 Subject: [PATCH 11/31] feat: show clear compiler error when no backend is chosen. (#359) --- .github/workflows/main.yml | 3 ++- Cargo.toml | 5 +++-- src/ffi/mod.rs | 4 ++-- src/lib.rs | 3 +++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5a50b64a..ba72877f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -46,10 +46,11 @@ jobs: - run: cargo test --features cloudflare_zlib --no-default-features if: matrix.build != 'mingw' - run: | - if ! cargo check --no-default-features 2>&1 | grep "You need to choose" ; then + if ! 
cargo check --no-default-features 2>&1 | grep "You need to choose"; then echo "expected message stating a zlib backend must be chosen" exit 1 fi + if: matrix.build == 'stable' rustfmt: name: Rustfmt diff --git a/Cargo.toml b/Cargo.toml index 10cf92ec..0dbe5397 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,13 +32,14 @@ quickcheck = { version = "1.0", default-features = false } [features] default = ["rust_backend"] -any_zlib = [] # note: this is not a real user-facing feature +any_zlib = ["any_impl"] # note: this is not a real user-facing feature +any_impl = [] # note: this is not a real user-facing feature zlib = ["any_zlib", "libz-sys"] zlib-default = ["any_zlib", "libz-sys/default"] zlib-ng-compat = ["zlib", "libz-sys/zlib-ng"] zlib-ng = ["any_zlib", "libz-ng-sys"] cloudflare_zlib = ["any_zlib", "cloudflare-zlib-sys"] -rust_backend = ["miniz_oxide"] +rust_backend = ["miniz_oxide", "any_impl"] miniz-sys = ["rust_backend"] # For backwards compatibility [package.metadata.docs.rs] diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs index 8bac6e42..20b3cae6 100644 --- a/src/ffi/mod.rs +++ b/src/ffi/mod.rs @@ -40,9 +40,9 @@ mod c; #[cfg(feature = "any_zlib")] pub use self::c::*; -#[cfg(not(feature = "any_zlib"))] +#[cfg(all(not(feature = "any_zlib"), feature = "miniz_oxide"))] mod rust; -#[cfg(not(feature = "any_zlib"))] +#[cfg(all(not(feature = "any_zlib"), feature = "miniz_oxide"))] pub use self::rust::*; impl std::fmt::Debug for ErrorMessage { diff --git a/src/lib.rs b/src/lib.rs index b84865e9..738875c5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -78,6 +78,9 @@ #![cfg_attr(test, deny(warnings))] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +#[cfg(not(feature = "any_impl",))] +compile_error!("You need to choose a zlib backend"); + pub use crate::crc::{Crc, CrcReader, CrcWriter}; pub use crate::gz::GzBuilder; pub use crate::gz::GzHeader; From 7d5856d0bb724eb77a558c89a5bae878e1d8dc3c Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Sun, 30 Oct 2022 20:14:10 -0700 
Subject: [PATCH 12/31] Recommend MultiGzDecoder over GzDecoder in docs --- src/gz/bufread.rs | 18 +++++++----------- src/gz/read.rs | 21 +++++++-------------- src/gz/write.rs | 17 ++++++++--------- 3 files changed, 22 insertions(+), 34 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 5b5061a3..953cc569 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -167,11 +167,11 @@ impl Write for GzEncoder { } } -/// A gzip streaming decoder +/// A decoder for a single member of a gzip file. Prefer [MultiGzDecoder] for +/// most uses. /// /// This structure consumes a [`BufRead`] interface, reading compressed data /// from the underlying reader, and emitting uncompressed data. -/// Use [`MultiGzDecoder`] if your file has multiple streams. /// /// [`BufRead`]: https://doc.rust-lang.org/std/io/trait.BufRead.html /// @@ -397,20 +397,16 @@ impl Write for GzDecoder { } } -/// A gzip streaming decoder that decodes all members of a multistream +/// A gzip streaming decoder that decodes a full [gzip file]. /// -/// A gzip member consists of a header, compressed data and a trailer. The [gzip -/// specification](https://tools.ietf.org/html/rfc1952), however, allows multiple -/// gzip members to be joined in a single stream. `MultiGzDecoder` will -/// decode all consecutive members while [`GzDecoder`] will only decompress -/// the first gzip member. The multistream format is commonly used in -/// bioinformatics, for example when using the BGZF compressed data. It's also useful -/// to compress large amounts of data in parallel where each thread produces one stream -/// for a chunk of input data. +/// A gzip file consists of a series of "members" concatenated one after another. +/// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode +/// the first one member. MultiGzDecoder is preferable in most cases. 
/// /// This structure exposes a [`BufRead`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. /// +/// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 /// [`BufRead`]: https://doc.rust-lang.org/std/io/trait.BufRead.html /// /// # Examples diff --git a/src/gz/read.rs b/src/gz/read.rs index 2a16a6ac..4d10c4a8 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -90,13 +90,11 @@ impl Write for GzEncoder { } } -/// A gzip streaming decoder +/// A decoder for a single member of a gzip file. Prefer [MultiGzDecoder] for +/// most uses. /// /// This structure exposes a [`Read`] interface that will consume compressed /// data from the underlying reader and emit uncompressed data. -/// Use [`MultiGzDecoder`] if your file has multiple streams. -/// -/// [`Read`]: https://doc.rust-lang.org/std/io/trait.Read.html /// /// # Examples /// @@ -180,21 +178,16 @@ impl Write for GzDecoder { } } -/// A gzip streaming decoder that decodes all members of a multistream +/// A gzip streaming decoder that decodes a full [gzip file]. /// -/// A gzip member consists of a header, compressed data and a trailer. The [gzip -/// specification](https://tools.ietf.org/html/rfc1952), however, allows multiple -/// gzip members to be joined in a single stream. `MultiGzDecoder` will -/// decode all consecutive members while [`GzDecoder`] will only decompress the -/// first gzip member. The multistream format is commonly used in bioinformatics, -/// for example when using the BGZF compressed data. It's also useful -/// to compress large amounts of data in parallel where each thread produces one stream -/// for a chunk of input data. +/// A gzip file consists of a series of "members" concatenated one after another. +/// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode +/// the first one member. MultiGzDecoder is preferable in most cases. 
/// /// This structure exposes a [`Read`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. /// -/// [`Read`]: https://doc.rust-lang.org/std/io/trait.Read.html +/// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 /// /// # Examples /// diff --git a/src/gz/write.rs b/src/gz/write.rs index dd8a6bd0..8f84ee73 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -166,11 +166,11 @@ impl Drop for GzEncoder { } } -/// A gzip streaming decoder +/// A decoder for a single member of a gzip file. Prefer [MultiGzDecoder] for +/// most uses. /// /// This structure exposes a [`Write`] interface that will emit uncompressed data /// to the underlying writer `W`. -/// Use [`MultiGzDecoder`] if your file has multiple streams. /// /// [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html /// @@ -373,17 +373,16 @@ impl Read for GzDecoder { } } -/// A gzip streaming decoder that decodes all members of a multistream +/// A gzip streaming decoder that decodes a full [gzip file]. /// -/// A gzip member consists of a header, compressed data and a trailer. The [gzip -/// specification](https://tools.ietf.org/html/rfc1952), however, allows multiple -/// gzip members to be joined in a single stream. `MultiGzDecoder` will -/// decode all consecutive members while `GzDecoder` will only decompress -/// the first gzip member. The multistream format is commonly used in -/// bioinformatics, for example when using the BGZF compressed data. +/// A gzip file consists of a series of "members" concatenated one after another. +/// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode +/// the first one member. MultiGzDecoder is preferable in most cases. /// /// This structure exposes a [`Write`] interface that will consume all gzip members /// from the written buffers and write uncompressed data to the writer. 
+/// +/// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 #[derive(Debug)] pub struct MultiGzDecoder { inner: GzDecoder, From 27540301ca1035070bfb8f4d8ab32f06bd5ae2a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pi=C3=A8rre?= <77776198+PierreV23@users.noreply.github.com> Date: Sun, 16 Jul 2023 21:06:22 +0200 Subject: [PATCH 13/31] Add functions that allow (de)compress instances --- src/zlib/bufread.rs | 20 ++++++++++++++++++++ src/zlib/read.rs | 34 ++++++++++++++++++++++++++++++++++ src/zlib/write.rs | 15 +++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/src/zlib/bufread.rs b/src/zlib/bufread.rs index 61d12525..729264ff 100644 --- a/src/zlib/bufread.rs +++ b/src/zlib/bufread.rs @@ -47,6 +47,15 @@ impl ZlibEncoder { data: Compress::new(level, true), } } + + /// Same as `new` but instead of passing a `Compression` instance, + /// a `Compress` instance is passed. + pub fn new_with_compress(r: R, compress: crate::Compress) -> ZlibEncoder { + ZlibEncoder { + obj: r, + data: compress, + } + } } pub fn reset_encoder_data(zlib: &mut ZlibEncoder) { @@ -165,6 +174,17 @@ impl ZlibDecoder { data: Decompress::new(true), } } + + /// Creates a new decoder which will decompress data read from the given + /// stream. + /// + /// Also takes in a Decompress instance. + pub fn new_with_decompress(r: R, decompress: Decompress) -> ZlibDecoder { + ZlibDecoder { + obj: r, + data: decompress, + } + } } pub fn reset_decoder_data(zlib: &mut ZlibDecoder) { diff --git a/src/zlib/read.rs b/src/zlib/read.rs index 33021304..41d5d9f8 100644 --- a/src/zlib/read.rs +++ b/src/zlib/read.rs @@ -3,6 +3,7 @@ use std::io::prelude::*; use super::bufread; use crate::bufreader::BufReader; +use crate::Decompress; /// A ZLIB encoder, or compressor. /// @@ -42,6 +43,14 @@ impl ZlibEncoder { inner: bufread::ZlibEncoder::new(BufReader::new(r), level), } } + + /// Same as `new` but with the ability to add a `Compress` instance rather + /// than a `Compression` instance. 
+ pub fn new_with_compress(r: R, compress: crate::Compress) -> ZlibEncoder { + ZlibEncoder { + inner: bufread::ZlibEncoder::new_with_compress(BufReader::new(r), compress), + } + } } impl ZlibEncoder { @@ -169,6 +178,31 @@ impl ZlibDecoder { inner: bufread::ZlibDecoder::new(BufReader::with_buf(buf, r)), } } + + /// Creates a new decoder which will decompress data read from the given + /// stream. + /// + /// Also takes in a custom Decompress instance. + pub fn new_with_decompress(r: R, decompress: Decompress) -> ZlibDecoder { + ZlibDecoder::new_with_decompress_and_buf(r, vec![0; 32 * 1024], decompress) + } + + /// Same as `new_with_decompress`, but the intermediate buffer for data is specified. + /// + /// Note that the specified buffer will only be used up to its current + /// length. The buffer's capacity will also not grow over time. + pub fn new_with_decompress_and_buf( + r: R, + buf: Vec, + decompress: Decompress, + ) -> ZlibDecoder { + ZlibDecoder { + inner: bufread::ZlibDecoder::new_with_decompress( + BufReader::with_buf(buf, r), + decompress, + ), + } + } } impl ZlibDecoder { diff --git a/src/zlib/write.rs b/src/zlib/write.rs index c6718140..9cc27419 100644 --- a/src/zlib/write.rs +++ b/src/zlib/write.rs @@ -44,6 +44,14 @@ impl ZlibEncoder { } } + /// Same as `new` but with the ability to add a `Compress` instance rather + /// than a `Compression` instance. + pub fn new_with_compress(w: W, compress: crate::Compress) -> ZlibEncoder { + ZlibEncoder { + inner: zio::Writer::new(w, compress), + } + } + /// Acquires a reference to the underlying writer. pub fn get_ref(&self) -> &W { self.inner.get_ref() @@ -218,6 +226,13 @@ impl ZlibDecoder { } } + /// This is like `new` but with a supplied `Decompress` instance. + pub fn new_with_decompress(w: W, decomp: Decompress) -> ZlibDecoder { + ZlibDecoder { + inner: zio::Writer::new(w, decomp), + } + } + /// Acquires a reference to the underlying writer. 
pub fn get_ref(&self) -> &W { self.inner.get_ref() From 1e389c54f9ad53ac73df1fa207049d3aef23d0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pi=C3=A8rre?= <77776198+PierreV23@users.noreply.github.com> Date: Sun, 16 Jul 2023 21:23:00 +0200 Subject: [PATCH 14/31] Forgot to add grave accent's --- src/zlib/read.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zlib/read.rs b/src/zlib/read.rs index 41d5d9f8..bd6d1d2a 100644 --- a/src/zlib/read.rs +++ b/src/zlib/read.rs @@ -182,7 +182,7 @@ impl ZlibDecoder { /// Creates a new decoder which will decompress data read from the given /// stream. /// - /// Also takes in a custom Decompress instance. + /// Also takes in a custom `Decompress` instance. pub fn new_with_decompress(r: R, decompress: Decompress) -> ZlibDecoder { ZlibDecoder::new_with_decompress_and_buf(r, vec![0; 32 * 1024], decompress) } From acd2ab9c9000ef16fdaafbaeb338b66b8263222a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 17 Jul 2023 08:08:09 +0200 Subject: [PATCH 15/31] unify documentation style of newly added functions. It attempts to avoid referring to other functions, and instead repeats documentation from the function it is most similar to, with adjustments to show its differences. --- src/zlib/bufread.rs | 8 ++++---- src/zlib/read.rs | 27 ++++++++++++++------------- src/zlib/write.rs | 18 +++++++++++------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/zlib/bufread.rs b/src/zlib/bufread.rs index 729264ff..ff1f523d 100644 --- a/src/zlib/bufread.rs +++ b/src/zlib/bufread.rs @@ -48,12 +48,12 @@ impl ZlibEncoder { } } - /// Same as `new` but instead of passing a `Compression` instance, - /// a `Compress` instance is passed. - pub fn new_with_compress(r: R, compress: crate::Compress) -> ZlibEncoder { + /// Creates a new encoder with given `compression` settings which will + /// read uncompressed data from the given stream `r` and emit the compressed stream.
+ pub fn new_with_compress(r: R, compression: crate::Compress) -> ZlibEncoder { ZlibEncoder { obj: r, - data: compress, + data: compression, } } } diff --git a/src/zlib/read.rs b/src/zlib/read.rs index bd6d1d2a..6d21fb93 100644 --- a/src/zlib/read.rs +++ b/src/zlib/read.rs @@ -44,11 +44,11 @@ impl ZlibEncoder { } } - /// Same as `new` but with the ability to add a `Compress` instance rather - /// than a `Compression` instance. - pub fn new_with_compress(r: R, compress: crate::Compress) -> ZlibEncoder { + /// Creates a new encoder with given `compression` settings which will + /// read uncompressed data from the given stream `r` and emit the compressed stream. + pub fn new_with_compress(r: R, compression: crate::Compress) -> ZlibEncoder { ZlibEncoder { - inner: bufread::ZlibEncoder::new_with_compress(BufReader::new(r), compress), + inner: bufread::ZlibEncoder::new_with_compress(BufReader::new(r), compression), } } } @@ -169,7 +169,8 @@ impl ZlibDecoder { ZlibDecoder::new_with_buf(r, vec![0; 32 * 1024]) } - /// Same as `new`, but the intermediate buffer for data is specified. + /// Creates a new decoder along with `buf` for intermediate data, + /// which will decompress data read from the given stream `r`. /// /// Note that the specified buffer will only be used up to its current /// length. The buffer's capacity will also not grow over time. @@ -180,26 +181,26 @@ impl ZlibDecoder { } /// Creates a new decoder which will decompress data read from the given - /// stream. - /// - /// Also takes in a custom `Decompress` instance. 
- pub fn new_with_decompress(r: R, decompress: Decompress) -> ZlibDecoder { - ZlibDecoder::new_with_decompress_and_buf(r, vec![0; 32 * 1024], decompress) + /// stream `r`, along with `decompression` settings + pub fn new_with_decompress(r: R, decompression: Decompress) -> ZlibDecoder { + ZlibDecoder::new_with_decompress_and_buf(r, vec![0; 32 * 1024], decompression) } - /// Same as `new_with_decompress`, but the intermediate buffer for data is specified. + /// Creates a new decoder along with `buf` for intermediate data, + /// which will decompress data read from the given stream `r`, along with + /// `decompression` settings /// /// Note that the specified buffer will only be used up to its current /// length. The buffer's capacity will also not grow over time. pub fn new_with_decompress_and_buf( r: R, buf: Vec, - decompress: Decompress, + decompression: Decompress, ) -> ZlibDecoder { ZlibDecoder { inner: bufread::ZlibDecoder::new_with_decompress( BufReader::with_buf(buf, r), - decompress, + decompression, ), } } diff --git a/src/zlib/write.rs b/src/zlib/write.rs index 9cc27419..a3f9adef 100644 --- a/src/zlib/write.rs +++ b/src/zlib/write.rs @@ -44,11 +44,11 @@ impl ZlibEncoder { } } - /// Same as `new` but with the ability to add a `Compress` instance rather - /// than a `Compression` instance. - pub fn new_with_compress(w: W, compress: crate::Compress) -> ZlibEncoder { + /// Creates a new encoder which will write compressed data to the stream + /// `w` with the given `compression` settings. + pub fn new_with_compress(w: W, compression: crate::Compress) -> ZlibEncoder { ZlibEncoder { - inner: zio::Writer::new(w, compress), + inner: zio::Writer::new(w, compression), } } @@ -226,10 +226,14 @@ impl ZlibDecoder { } } - /// This is like `new` but with a supplied `Decompress` instance. 
- pub fn new_with_decompress(w: W, decomp: Decompress) -> ZlibDecoder { + /// Creates a new decoder which will write uncompressed data to the stream `w` + /// using the given `decompression` settings. + /// + /// When this decoder is dropped or unwrapped the final pieces of data will + /// be flushed. + pub fn new_with_decompress(w: W, decompression: Decompress) -> ZlibDecoder { ZlibDecoder { - inner: zio::Writer::new(w, decomp), + inner: zio::Writer::new(w, decompression), } } From 08f7d7391ba291c7106372adba258b7f6731d2b1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 17 Jul 2023 08:18:11 +0200 Subject: [PATCH 16/31] Simplify doc-tests Now that `read_to_end` is used, the amount of returned bytes is neatly encoded in the buffer's length. --- src/deflate/read.rs | 2 +- src/gz/read.rs | 2 +- src/zlib/read.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/deflate/read.rs b/src/deflate/read.rs index 8acc45f1..5937e6f6 100644 --- a/src/deflate/read.rs +++ b/src/deflate/read.rs @@ -28,7 +28,7 @@ use crate::bufreader::BufReader; /// let mut ret_vec = Vec::new(); /// let c = b"hello world"; /// let mut deflater = DeflateEncoder::new(&c[..], Compression::fast()); -/// let count = deflater.read_to_end(&mut ret_vec)?; +/// deflater.read_to_end(&mut ret_vec)?; /// Ok(ret_vec) /// } /// ``` diff --git a/src/gz/read.rs b/src/gz/read.rs index e3573200..196598c5 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -28,7 +28,7 @@ use crate::Compression; /// let mut ret_vec = Vec::new(); /// let bytestring = b"hello world"; /// let mut gz = GzEncoder::new(&bytestring[..], Compression::fast()); -/// let count = gz.read_to_end(&mut ret_vec)?; +/// gz.read_to_end(&mut ret_vec)?; /// Ok(ret_vec) /// } /// ``` diff --git a/src/zlib/read.rs b/src/zlib/read.rs index 917cea40..94ca805a 100644 --- a/src/zlib/read.rs +++ b/src/zlib/read.rs @@ -25,7 +25,7 @@ use crate::bufreader::BufReader; /// let f = File::open("examples/hello_world.txt")?; /// let mut z = 
ZlibEncoder::new(f, Compression::fast()); /// let mut buffer = Vec::new(); -/// let byte_count = z.read_to_end(&mut buffer)?; +/// z.read_to_end(&mut buffer)?; /// # Ok(buffer) /// # } /// ``` From b58db7f3c03167eff55bc38d8ce628cec65b0057 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 17 Jul 2023 08:26:10 +0200 Subject: [PATCH 17/31] Add MAINTENANCE.md This document describes common workflows which should help current and new maintainers alike. --- MAINTENANCE.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 MAINTENANCE.md diff --git a/MAINTENANCE.md b/MAINTENANCE.md new file mode 100644 index 00000000..af770344 --- /dev/null +++ b/MAINTENANCE.md @@ -0,0 +1,21 @@ +This document explains how to perform the project's maintenance tasks. + +### Creating a new release + +#### Artifacts + +* a tag of the version number +* a new [crate version](https://crates.io/crates/flate2/versions) + +#### Process + +To generate all the artifacts above, one proceeds as follows: + +1. `git checkout -b release-` - move to a branch to prepare making changes to the repository. *Changes cannot be made to `main` as it is protected.* +2. Edit `Cargo.toml` to the next package version. +3. `gh pr create` to create a new PR for the current branch and **get it merged**. +4. `cargo publish` to create a new release on `crates.io`. +5. `git tag ` to remember the commit. +6. `git push --tags` to push the new tag. +7. Go to the newly created release page on GitHub and edit it by pressing the "Generate Release Notes" and the `@` button. Save the release. + From 51ab99ac30dc86cda6a62cbcc3d593b0db7a2d00 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 17 Jul 2023 08:39:23 +0200 Subject: [PATCH 18/31] change the fuzz-time to 3 minutes to avoid waiting for fuzzing. At the time of writing, the longest CI jobs clocked in at 4 minutes, so 3 minutes should typically not be the longest job running.
--- .github/workflows/cifuzz.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml index 4288c55e..ea8cae60 100644 --- a/.github/workflows/cifuzz.yml +++ b/.github/workflows/cifuzz.yml @@ -15,7 +15,7 @@ jobs: uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master with: oss-fuzz-project-name: 'flate2-rs' - fuzz-seconds: 300 + fuzz-seconds: 180 dry-run: false language: rust - name: Upload Crash From 7cfdd4e93cfc42ec7c7ea6303087033746d26fde Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 17 Jul 2023 09:05:32 +0200 Subject: [PATCH 19/31] minor improvements to the MultiGzDecoder documentation --- src/gz/bufread.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 953cc569..25bd74ac 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -397,11 +397,11 @@ impl Write for GzDecoder { } } -/// A gzip streaming decoder that decodes a full [gzip file]. +/// A gzip streaming decoder that decodes a complete [gzip file]. /// /// A gzip file consists of a series of "members" concatenated one after another. /// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first one member. MultiGzDecoder is preferable in most cases. +/// the first member. MultiGzDecoder is preferable in most cases. /// /// This structure exposes a [`BufRead`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. 
From 230256e3ade2494710687d4fe3d8e686b4426566 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Mon, 17 Jul 2023 18:32:14 +0100 Subject: [PATCH 20/31] Fix header CRC calculation of trailing zeros --- src/gz/mod.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 4 deletions(-) diff --git a/src/gz/mod.rs b/src/gz/mod.rs index 95ed8375..e8e05c6e 100644 --- a/src/gz/mod.rs +++ b/src/gz/mod.rs @@ -182,7 +182,7 @@ impl GzHeaderParser { read_to_nul(r, filename)?; if let Some(crc) = crc { crc.update(filename); - crc.update(b"0"); + crc.update(b"\0"); } } self.state = GzHeaderState::Comment(crc.take()); @@ -193,7 +193,7 @@ impl GzHeaderParser { read_to_nul(r, comment)?; if let Some(crc) = crc { crc.update(comment); - crc.update(b"0"); + crc.update(b"\0"); } } self.state = GzHeaderState::Crc(crc.take(), 0, [0; 2]); @@ -457,8 +457,8 @@ impl GzBuilder { mod tests { use std::io::prelude::*; - use super::{read, write, GzBuilder}; - use crate::Compression; + use super::{read, write, GzBuilder, GzHeaderParser}; + use crate::{Compression, GzHeader}; use rand::{thread_rng, Rng}; #[test] @@ -508,6 +508,85 @@ mod tests { assert_eq!(res, v); } + // A Rust implementation of CRC that closely matches the C code in RFC1952. + // Only use this to create CRCs for tests. + struct Rfc1952Crc { + /* Table of CRCs of all 8-bit messages. */ + crc_table: [u32; 256], + } + + impl Rfc1952Crc { + fn new() -> Self { + let mut crc = Rfc1952Crc { + crc_table: [0; 256], + }; + /* Make the table for a fast CRC. */ + for n in 0usize..256 { + let mut c = n as u32; + for _k in 0..8 { + if c & 1 != 0 { + c = 0xedb88320 ^ (c >> 1); + } else { + c = c >> 1; + } + } + crc.crc_table[n] = c; + } + crc + } + + /* + Update a running crc with the bytes buf and return + the updated crc. The crc should be initialized to zero. Pre- and + post-conditioning (one's complement) is performed within this + function so it shouldn't be done by the caller. 
+ */ + fn update_crc(&self, crc: u32, buf: &[u8]) -> u32 { + let mut c = crc ^ 0xffffffff; + + for b in buf { + c = self.crc_table[(c as u8 ^ *b) as usize] ^ (c >> 8); + } + c ^ 0xffffffff + } + + /* Return the CRC of the bytes buf. */ + fn crc(&self, buf: &[u8]) -> u32 { + self.update_crc(0, buf) + } + } + + #[test] + fn roundtrip_header() { + let mut header = GzBuilder::new() + .mtime(1234) + .operating_system(57) + .filename("filename") + .comment("comment") + .into_header(Compression::fast()); + + // Add a CRC to the header + header[3] = header[3] ^ super::FHCRC; + let rfc1952_crc = Rfc1952Crc::new(); + let crc32 = rfc1952_crc.crc(&header); + let crc16 = crc32 as u16; + header.extend(&crc16.to_le_bytes()); + + let mut parser = GzHeaderParser::new(); + parser.parse(&mut header.as_slice()).unwrap(); + let actual = parser.header().unwrap(); + assert_eq!( + actual, + &GzHeader { + extra: None, + filename: Some("filename".as_bytes().to_vec()), + comment: Some("comment".as_bytes().to_vec()), + operating_system: 57, + mtime: 1234 + } + ) + } + #[test] fn fields() { let r = vec![0, 2, 4, 6]; From a2325748912d02e3e1d80d6529aa786297ab768e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 20 Jul 2023 08:31:20 +0200 Subject: [PATCH 21/31] applies copies of minor improvements --- src/gz/read.rs | 2 +- src/gz/write.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gz/read.rs b/src/gz/read.rs index 4d10c4a8..fbd34ccd 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -182,7 +182,7 @@ impl Write for GzDecoder { /// /// A gzip file consists of a series of "members" concatenated one after another. /// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first one member. MultiGzDecoder is preferable in most cases. +/// the first member. MultiGzDecoder is preferable in most cases. 
/// /// This structure exposes a [`Read`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. diff --git a/src/gz/write.rs b/src/gz/write.rs index 8f84ee73..b2b7be5e 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -377,7 +377,7 @@ impl Read for GzDecoder { /// /// A gzip file consists of a series of "members" concatenated one after another. /// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first one member. MultiGzDecoder is preferable in most cases. +/// the first member. MultiGzDecoder is preferable in most cases. /// /// This structure exposes a [`Write`] interface that will consume all gzip members /// from the written buffers and write uncompressed data to the writer. From e21986e28c728ceec2c53c16ca5dbbd8a5ccfd5b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 23 Jul 2023 15:21:21 +0200 Subject: [PATCH 22/31] Add top-level comparison between `GzDecoder` and `MultiGzDecoder` --- src/lib.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 738875c5..6cbb0063 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,12 +65,30 @@ //! `Write` trait if `T: Write`. That is, the "dual trait" is forwarded directly //! to the underlying object if available. //! +//! # About multi-member Gzip files +//! +//! While most `gzip` files one encounters will have a single *member* that can be read +//! with the [`GzDecoder`], there may be some files which have multiple members. +//! +//! If these are read with a [`GzDecoder`], only the first member will be consumed and +//! the rest will silently be left alone, which can be surprising. +//! +//! The [`MultiGzDecoder`] on the other hand will decode all *members* of `gzip` file +//! into one consecutive stream of bytes, which hides the underlying *members* entirely +//! while failing if the file does not contain solely `gzip` *members*. +//! +//! 
It's worth noting that major browser like Chrome, Firefox as well as tool like `curl` +//! will only decode the first member of a `gzip` encoded reply, so what's right to do +//! truly depends on the context, as well the expected input of the library or application. +//! //! [`read`]: read/index.html //! [`bufread`]: bufread/index.html //! [`write`]: write/index.html //! [read]: https://doc.rust-lang.org/std/io/trait.Read.html //! [write]: https://doc.rust-lang.org/std/io/trait.Write.html //! [bufread]: https://doc.rust-lang.org/std/io/trait.BufRead.html +//! [`GzDecoder`]: read/struct.GzDecoder.html +//! [`MultiGzDecoder`]: read/struct.MultiGzDecoder.html #![doc(html_root_url = "https://docs.rs/flate2/0.2")] #![deny(missing_docs)] #![deny(missing_debug_implementations)] From 1e095719b361f0a3e857fa6d539cef7cfad4166f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 23 Jul 2023 15:38:55 +0200 Subject: [PATCH 23/31] Apply suggestions to impartial to Gz and MultiGz implementations. I also added a reference to the general section about the differences in the crate documentation. Co-Authored-By: Josh Triplett --- src/gz/bufread.rs | 8 ++++---- src/gz/read.rs | 11 +++++++---- src/gz/write.rs | 13 ++++++++----- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 25bd74ac..6f7b514b 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -167,8 +167,7 @@ impl Write for GzEncoder { } } -/// A decoder for a single member of a gzip file. Prefer [MultiGzDecoder] for -/// most uses. +/// A decoder for a gzip file with a single member. /// /// This structure consumes a [`BufRead`] interface, reading compressed data /// from the underlying reader, and emitting uncompressed data. @@ -397,11 +396,12 @@ impl Write for GzDecoder { } } -/// A gzip streaming decoder that decodes a complete [gzip file]. +/// A gzip streaming decoder that decodes a [gzip file] with multiple members. 
/// /// A gzip file consists of a series of "members" concatenated one after another. /// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first member. MultiGzDecoder is preferable in most cases. +/// the first member. Learn more +/// [in the introduction](https://docs.rs/flate2/*/flate2/#About-multi-member-Gzip-files). /// /// This structure exposes a [`BufRead`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. diff --git a/src/gz/read.rs b/src/gz/read.rs index fbd34ccd..aa36ad2c 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -90,12 +90,14 @@ impl Write for GzEncoder { } } -/// A decoder for a single member of a gzip file. Prefer [MultiGzDecoder] for -/// most uses. +/// A decoder for a gzip file with a single member. /// /// This structure exposes a [`Read`] interface that will consume compressed /// data from the underlying reader and emit uncompressed data. /// +/// This decoder only handles gzipped data with a single stream. +/// Use [`MultiGzDecoder`] for gzipped data with multiple streams. +/// /// # Examples /// /// ``` @@ -178,11 +180,12 @@ impl Write for GzDecoder { } } -/// A gzip streaming decoder that decodes a full [gzip file]. +/// A gzip streaming decoder that decodes a [gzip file] with multiple members. /// /// A gzip file consists of a series of "members" concatenated one after another. /// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first member. MultiGzDecoder is preferable in most cases. +/// the first member. Learn more +/// [in the introduction](https://docs.rs/flate2/*/flate2/#About-multi-member-Gzip-files). /// /// This structure exposes a [`Read`] interface that will consume all gzip members /// from the underlying reader and emit uncompressed data. 
diff --git a/src/gz/write.rs b/src/gz/write.rs index b2b7be5e..d1d309dc 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -166,12 +166,14 @@ impl Drop for GzEncoder { } } -/// A decoder for a single member of a gzip file. Prefer [MultiGzDecoder] for -/// most uses. +/// A decoder for a gzip file with a single member. /// /// This structure exposes a [`Write`] interface that will emit uncompressed data /// to the underlying writer `W`. /// +/// This decoder only handles gzipped data with a single stream. +/// Use [`MultiGzDecoder`] for gzipped data with multiple streams. +/// /// [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html /// /// # Examples @@ -373,11 +375,12 @@ impl Read for GzDecoder { } } -/// A gzip streaming decoder that decodes a full [gzip file]. +/// A gzip streaming decoder that decodes a [gzip file] with multiple members. /// /// A gzip file consists of a series of "members" concatenated one after another. -/// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first member. MultiGzDecoder is preferable in most cases. +/// `MultiGzDecoder` decodes all members of a file, while [GzDecoder] will only decode +/// the first member. Learn more +/// [in the introduction](https://docs.rs/flate2/*/flate2/#About-multi-member-Gzip-files). /// /// This structure exposes a [`Write`] interface that will consume all gzip members /// from the written buffers and write uncompressed data to the writer. From ea0ad07bd32e471e49268f990eeba996ed7fe683 Mon Sep 17 00:00:00 2001 From: wcampbell Date: Mon, 24 Jul 2023 21:13:18 -0400 Subject: [PATCH 24/31] Fix broken link on README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7e7239a6..23ce043d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ A streaming compression/decompression library DEFLATE-based streams in Rust. 
This crate by default uses the `miniz_oxide` crate, a port of `miniz.c` to pure -Rust. This crate also supports other [backends](#Backends), such as the widely +Rust. This crate also supports other [backends](#backends), such as the widely available zlib library or the high-performance zlib-ng library. Supported formats: From 955728bb94b43dc8763c667e7c5d5c09edf3b7c8 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Mon, 24 Jul 2023 18:15:47 -0700 Subject: [PATCH 25/31] Tweak the {Gz,MultiGz}Decoder docs more - Use relative paths to link to the introduction. - Use consistent language across {Read,BufRead,Write}{Gz,MultiGz}Decoder. - Use `member` rather than `stream`. - Document what happens to unused data for `Gz` variants. --- src/gz/bufread.rs | 31 ++++++++++++++++++++++--------- src/gz/read.rs | 31 +++++++++++++++++++++---------- src/gz/write.rs | 29 ++++++++++++++++++----------- src/lib.rs | 17 ++++++++++------- 4 files changed, 71 insertions(+), 37 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 6f7b514b..3a0cda8f 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -167,11 +167,22 @@ impl Write for GzEncoder { } } -/// A decoder for a gzip file with a single member. +/// A decoder for the first member of a [gzip file]. /// -/// This structure consumes a [`BufRead`] interface, reading compressed data +/// This structure exposes a [`BufRead`] interface, reading compressed data /// from the underlying reader, and emitting uncompressed data. /// +/// After reading the first member of a gzip file (which is often, but not +/// always, the only member), this reader will return Ok(0) even if there +/// are more bytes available in the underlying reader. If you want to be sure +/// not to drop bytes on the floor, call `into_inner()` after Ok(0) to +/// recover the underlying reader. 
+/// +/// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] +/// or read more +/// [in the introduction](../index.html#about-multi-member-gzip-files). +/// +/// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 /// [`BufRead`]: https://doc.rust-lang.org/std/io/trait.BufRead.html /// /// # Examples @@ -396,15 +407,17 @@ impl Write for GzDecoder { } } -/// A gzip streaming decoder that decodes a [gzip file] with multiple members. +/// A gzip streaming decoder that decodes a [gzip file] that may have multiple members. +/// +/// This structure exposes a [`BufRead`] interface that will consume compressed +/// data from the underlying reader and emit uncompressed data. /// -/// A gzip file consists of a series of "members" concatenated one after another. -/// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first member. Learn more -/// [in the introduction](https://docs.rs/flate2/*/flate2/#About-multi-member-Gzip-files). +/// A gzip file consists of a series of *members* concatenated one after another. +/// MultiGzDecoder decodes all members of a file and returns Ok(0) once the +/// underlying reader does. /// -/// This structure exposes a [`BufRead`] interface that will consume all gzip members -/// from the underlying reader and emit uncompressed data. +/// To handle members separately, see [GzDecoder] or read more +/// [in the introduction](../index.html#about-multi-member-gzip-files). /// /// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 /// [`BufRead`]: https://doc.rust-lang.org/std/io/trait.BufRead.html diff --git a/src/gz/read.rs b/src/gz/read.rs index aa36ad2c..adc9cda6 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -90,13 +90,22 @@ impl Write for GzEncoder { } } -/// A decoder for a gzip file with a single member. +/// A decoder for the first member of a [gzip file].
/// /// This structure exposes a [`Read`] interface that will consume compressed /// data from the underlying reader and emit uncompressed data. /// -/// This decoder only handles gzipped data with a single stream. -/// Use [`MultiGzDecoder`] for gzipped data with multiple streams. +/// After reading the first member of a gzip file (which is often, but not +/// always, the only member), this reader will return Ok(0) even if there +/// are more bytes available in the underlying reader. If you want to be sure +/// not to drop bytes on the floor, call `into_inner()` after Ok(0) to +/// recover the underlying reader. +/// +/// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] +/// or read more +/// [in the introduction](../index.html#about-multi-member-gzip-files). +/// +/// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 /// /// # Examples /// @@ -180,15 +189,17 @@ impl Write for GzDecoder { } } -/// A gzip streaming decoder that decodes a [gzip file] with multiple members. +/// A gzip streaming decoder that decodes a [gzip file] that may have multiple members. +/// +/// This structure exposes a [`Read`] interface that will consume compressed +/// data from the underlying reader and emit uncompressed data. /// -/// A gzip file consists of a series of "members" concatenated one after another. -/// MultiGzDecoder decodes all members of a file, while [GzDecoder] will only decode -/// the first member. Learn more -/// [in the introduction](https://docs.rs/flate2/*/flate2/#About-multi-member-Gzip-files). +/// A gzip file consists of a series of *members* concatenated one after another. +/// MultiGzDecoder decodes all members of a file and returns Ok(0) once the +/// underlying reader does. /// -/// This structure exposes a [`Read`] interface that will consume all gzip members -/// from the underlying reader and emit uncompressed data. 
+/// To handle members separately, see [GzDecoder] or read more +/// [in the introduction](../index.html#about-multi-member-gzip-files). /// /// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 /// diff --git a/src/gz/write.rs b/src/gz/write.rs index d1d309dc..030b38e5 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -166,14 +166,19 @@ impl Drop for GzEncoder { } } -/// A decoder for a gzip file with a single member. +/// A decoder for the first member of a [gzip file]. /// -/// This structure exposes a [`Write`] interface that will emit uncompressed data -/// to the underlying writer `W`. +/// This structure exposes a [`Write`] interface, receiving compressed data and +/// writing uncompressed data to the underlying writer. +/// +/// After decoding the first member of a gzip file, this writer will return XXX +/// to all subsequent writes. /// -/// This decoder only handles gzipped data with a single stream. -/// Use [`MultiGzDecoder`] for gzipped data with multiple streams. +/// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] +/// or read more +/// [in the introduction](../index.html#about-multi-member-gzip-files). /// +/// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 /// [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html /// /// # Examples @@ -377,13 +382,15 @@ impl Read for GzDecoder { /// A gzip streaming decoder that decodes a [gzip file] with multiple members. /// -/// A gzip file consists of a series of "members" concatenated one after another. -/// `MultiGzDecoder` decodes all members of a file, while [GzDecoder] will only decode -/// the first member. Learn more -/// [in the introduction](https://docs.rs/flate2/*/flate2/#About-multi-member-Gzip-files). +/// This structure exposes a [`Write`] interface that will consume compressed data and +/// write uncompressed data to the underlying writer. +/// +/// A gzip file consists of a series of *members* concatenated one after another.
+/// `MultiGzDecoder` decodes all members of a file and writes them to the +/// underlying writer one after another. /// -/// This structure exposes a [`Write`] interface that will consume all gzip members -/// from the written buffers and write uncompressed data to the writer. +/// To handle members separately, see [GzDecoder] or read more +/// [in the introduction](../index.html#about-multi-member-gzip-files). /// /// [gzip file]: https://www.rfc-editor.org/rfc/rfc1952#page-5 #[derive(Debug)] diff --git a/src/lib.rs b/src/lib.rs index 6cbb0063..c9590b92 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,13 +73,16 @@ //! If these are read with a [`GzDecoder`], only the first member will be consumed and //! the rest will silently be left alone, which can be surprising. //! -//! The [`MultiGzDecoder`] on the other hand will decode all *members* of `gzip` file -//! into one consecutive stream of bytes, which hides the underlying *members* entirely -//! while failing if the file does not contain solely `gzip` *members*. -//! -//! It's worth noting that major browser like Chrome, Firefox as well as tool like `curl` -//! will only decode the first member of a `gzip` encoded reply, so what's right to do -//! truly depends on the context, as well the expected input of the library or application. +//! The [`MultiGzDecoder`] on the other hand will decode all members of a `gzip` file +//! into one consecutive stream of bytes, which hides the underlying *members* entirely. +//! If a file contains non-gzip data after the gzip data, MultiGzDecoder will +//! emit an error after decoding the gzip data. This behavior matches the `gzip`, +//! `gunzip`, and `zcat` command line tools. +//! +//! Chrome and Firefox appear to implement behavior like `GzDecoder`, ignoring data +//! after the first member. `curl` appears to implement behavior somewhat like +//! `GzDecoder`, only decoding the first member, but emitting an error if there is +//!
data after the first member, whether or not it is gzip data. //! //! [`read`]: read/index.html //! [`bufread`]: bufread/index.html From c9fe661d150b0750385c77796b351bd62760bcde Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 25 Jul 2023 13:03:04 +0200 Subject: [PATCH 26/31] further unify documentation, make sure sentences end with a period. --- src/zlib/bufread.rs | 12 +++++------- src/zlib/read.rs | 14 +++++++------- src/zlib/write.rs | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/zlib/bufread.rs b/src/zlib/bufread.rs index ff1f523d..aa8af64f 100644 --- a/src/zlib/bufread.rs +++ b/src/zlib/bufread.rs @@ -48,9 +48,9 @@ impl ZlibEncoder { } } - /// Creates a new encoder with given `compresson` settings which will + /// Creates a new encoder with the given `compression` settings which will /// read uncompressed data from the given stream `r` and emit the compressed stream. - pub fn new_with_compress(r: R, compression: crate::Compress) -> ZlibEncoder { + pub fn new_with_compress(r: R, compression: Compress) -> ZlibEncoder { ZlibEncoder { obj: r, data: compression, @@ -176,13 +176,11 @@ impl ZlibDecoder { } /// Creates a new decoder which will decompress data read from the given - /// stream. - /// - /// Also takes in a Decompress instance. - pub fn new_with_decompress(r: R, decompress: Decompress) -> ZlibDecoder { + /// stream, using the given `decompression` settings. + pub fn new_with_decompress(r: R, decompression: Decompress) -> ZlibDecoder { ZlibDecoder { obj: r, - data: decompress, + data: decompression, } } } diff --git a/src/zlib/read.rs b/src/zlib/read.rs index 6d21fb93..b65418be 100644 --- a/src/zlib/read.rs +++ b/src/zlib/read.rs @@ -44,7 +44,7 @@ impl ZlibEncoder { } } - /// Creates a new encoder with given `compression` settings which will + /// Creates a new encoder with the given `compression` settings which will /// read uncompressed data from the given stream `r` and emit the compressed stream. 
pub fn new_with_compress(r: R, compression: crate::Compress) -> ZlibEncoder { ZlibEncoder { @@ -169,8 +169,8 @@ impl ZlibDecoder { ZlibDecoder::new_with_buf(r, vec![0; 32 * 1024]) } - /// Creates a new decoder along with `buf` for intermediate data, - /// which will decompress data read from the given stream `r`. + /// Creates a new decoder which will decompress data read from the given + /// stream `r`, using `buf` as backing to speed up reading. /// /// Note that the specified buffer will only be used up to its current /// length. The buffer's capacity will also not grow over time. @@ -181,14 +181,14 @@ impl ZlibDecoder { } /// Creates a new decoder which will decompress data read from the given - /// stream `r`, along with `decompression` settings + /// stream `r`, along with `decompression` settings. pub fn new_with_decompress(r: R, decompression: Decompress) -> ZlibDecoder { ZlibDecoder::new_with_decompress_and_buf(r, vec![0; 32 * 1024], decompression) } - /// Creates a new decoder along with `buf` for intermediate data, - /// which will decompress data read from the given stream `r`, along with - /// `decompression` settings + /// Creates a new decoder which will decompress data read from the given + /// stream `r`, using `buf` as backing to speed up reading, + /// along with `decompression` settings to configure decoder. /// /// Note that the specified buffer will only be used up to its current /// length. The buffer's capacity will also not grow over time. diff --git a/src/zlib/write.rs b/src/zlib/write.rs index a3f9adef..d8ad2f26 100644 --- a/src/zlib/write.rs +++ b/src/zlib/write.rs @@ -46,7 +46,7 @@ impl ZlibEncoder { /// Creates a new encoder which will write compressed data to the stream /// `w` with the given `compression` settings. 
- pub fn new_with_compress(w: W, compression: crate::Compress) -> ZlibEncoder { + pub fn new_with_compress(w: W, compression: Compress) -> ZlibEncoder { ZlibEncoder { inner: zio::Writer::new(w, compression), } From f0bf8a6516936faf65b5a4ad856465d9c5ad9b95 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 30 Jul 2023 09:29:17 +0200 Subject: [PATCH 27/31] Apply suggestions from code review Co-authored-by: jongiddy --- src/gz/bufread.rs | 13 ++++++------- src/gz/read.rs | 10 +++++----- src/gz/write.rs | 7 ++++--- src/lib.rs | 6 ++++-- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 3a0cda8f..24634e30 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -167,15 +167,14 @@ impl Write for GzEncoder { } } -/// A decoder for the first member of a [gzip file]. +/// A decoder for a single member of a [gzip file]. /// /// This structure exposes a [`BufRead`] interface, reading compressed data /// from the underlying reader, and emitting uncompressed data. /// -/// After reading the first member of a gzip file (which is often, but not -/// always, the only member), this reader will return Ok(0) even if there -/// are more bytes available in the underlying reader. If you want to be sure -/// not to drop bytes on the floor, call `into_inner()` after Ok(0) to +/// After reading a single member of the gzip data this reader will return +/// Ok(0) even if there are more bytes available in the underlying reader. +/// If you need the following bytes, call `into_inner()` after Ok(0) to /// recover the underlying reader. /// /// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] @@ -413,8 +412,8 @@ impl Write for GzDecoder { /// data from the underlying reader and emit uncompressed data. /// /// A gzip file consists of a series of *members* concatenated one after another. -/// MultiGzDecoder decodes all members of a file and returns Ok(0) once the -/// underlying reader does. 
+/// MultiGzDecoder decodes all members from the data and only returns Ok(0) when the +/// underlying reader does. For a file, this reads to the end of the file. /// /// To handle members seperately, see [GzDecoder] or read more /// [in the introduction](../index.html#about-multi-member-gzip-files). diff --git a/src/gz/read.rs b/src/gz/read.rs index adc9cda6..6368509a 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -95,11 +95,11 @@ impl Write for GzEncoder { /// This structure exposes a [`Read`] interface that will consume compressed /// data from the underlying reader and emit uncompressed data. /// -/// After reading the first member of a gzip file (which is often, but not -/// always, the only member), this reader will return Ok(0) even if there -/// are more bytes available in the underlying reader. If you want to be sure -/// not to drop bytes on the floor, call `into_inner()` after Ok(0) to -/// recover the underlying reader. +/// After reading a single member of the gzip data this reader will return +/// Ok(0) even if there are more bytes available in the underlying reader. +/// `GzDecoder` may have read additional bytes past the end of the gzip data. +/// If you need the following bytes, wrap the `Reader` in a `std::io::BufReader` +/// and use `bufread::GzDecoder` instead. /// /// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] /// or read more diff --git a/src/gz/write.rs b/src/gz/write.rs index 030b38e5..4184c855 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -166,13 +166,14 @@ impl Drop for GzEncoder { } } -/// A decoder for the first member of a [gzip file]. +/// A decoder for a single member of a [gzip file]. /// /// This structure exposes a [`Write`] interface, receiving compressed data and /// writing uncompressed data to the underlying writer. /// -/// After decoding the first member of a gzip file, this writer will return XXX -/// to all subsequent writes. 
+/// After decoding a single member of the gzip data this writer will return the number of bytes up to +/// to the end of the gzip member and subsequent writes will return Ok(0) allowing the caller to +/// handle any data following the gzip member. /// /// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] /// or read more diff --git a/src/lib.rs b/src/lib.rs index c9590b92..018dc40a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -70,8 +70,10 @@ //! While most `gzip` files one encounters will have a single *member* that can be read //! with the [`GzDecoder`], there may be some files which have multiple members. //! -//! If these are read with a [`GzDecoder`], only the first member will be consumed and -//! the rest will silently be left alone, which can be surprising. +//! A [`GzDecoder`] will only read the first member of gzip data, which may unexpectedly +//! provide partial results when a multi-member gzip file is encountered. `GzDecoder` is appropriate +//! for data that is designed to be read as single members from a multi-member file. `bufread::GzDecoder` +//! and `write::GzDecoder` also allow non-gzip data following gzip data to be handled. //! //! The [`MultiGzDecoder`] on the other hand will decode all members of a `gzip` file //! into one consecutive stream of bytes, which hides the underlying *members* entirely. 
From fc30d9e24bffad84eba0d8bcc046e594126398a5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 30 Jul 2023 09:29:37 +0200 Subject: [PATCH 28/31] remove introductory paragraph that described other tools unrelated to `flate2` --- src/gz/bufread.rs | 4 ++-- src/gz/read.rs | 2 +- src/gz/write.rs | 4 ++-- src/lib.rs | 9 ++------- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index 24634e30..e65c2eb6 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -172,8 +172,8 @@ impl Write for GzEncoder { /// This structure exposes a [`BufRead`] interface, reading compressed data /// from the underlying reader, and emitting uncompressed data. /// -/// After reading a single member of the gzip data this reader will return -/// Ok(0) even if there are more bytes available in the underlying reader. +/// After reading a single member of the gzip data this reader will return +/// Ok(0) even if there are more bytes available in the underlying reader. /// If you need the following bytes, call `into_inner()` after Ok(0) to /// recover the underlying reader. /// diff --git a/src/gz/read.rs b/src/gz/read.rs index 6368509a..2f923731 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -98,7 +98,7 @@ impl Write for GzEncoder { /// After reading a single member of the gzip data this reader will return /// Ok(0) even if there are more bytes available in the underlying reader. /// `GzDecoder` may have read additional bytes past the end of the gzip data. -/// If you need the following bytes, wrap the `Reader` in a `std::io::BufReader` +/// If you need the following bytes, wrap the `Reader` in a `std::io::BufReader` /// and use `bufread::GzDecoder` instead. 
/// /// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] diff --git a/src/gz/write.rs b/src/gz/write.rs index 4184c855..feda221e 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -171,8 +171,8 @@ impl Drop for GzEncoder { /// This structure exposes a [`Write`] interface, receiving compressed data and /// writing uncompressed data to the underlying writer. /// -/// After decoding a single member of the gzip data this writer will return the number of bytes up to -/// to the end of the gzip member and subsequent writes will return Ok(0) allowing the caller to +/// After decoding a single member of the gzip data this writer will return the number of bytes up to +/// to the end of the gzip member and subsequent writes will return Ok(0) allowing the caller to /// handle any data following the gzip member. /// /// To handle gzip files that may have multiple members, see [`MultiGzDecoder`] diff --git a/src/lib.rs b/src/lib.rs index 018dc40a..127e2354 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -70,8 +70,8 @@ //! While most `gzip` files one encounters will have a single *member* that can be read //! with the [`GzDecoder`], there may be some files which have multiple members. //! -//! A [`GzDecoder`] will only read the first member of gzip data, which may unexpectedly -//! provide partial results when a multi-member gzip file is encountered. `GzDecoder` is appropriate +//! A [`GzDecoder`] will only read the first member of gzip data, which may unexpectedly +//! provide partial results when a multi-member gzip file is encountered. `GzDecoder` is appropriate //! for data that is designed to be read as single members from a multi-member file. `bufread::GzDecoder` //! and `write::GzDecoder` also allow non-gzip data following gzip data to be handled. //! @@ -81,11 +81,6 @@ //! emit an error after decoding the gzip data. This behavior matches the `gzip`, //! `gunzip`, and `zcat` command line tools. //! -//! 
Chrome and Firefox appear to implement behavior like `GzDecoder`, ignoring data -//! after the first member. `curl` appears to implement behavior somewhat like -//! `GzDecoder`, only decoding the first member, but emitting an error if there is -//! data after the first member, whether or not it is gzip data. -//! //! [`read`]: read/index.html //! [`bufread`]: bufread/index.html //! [`write`]: write/index.html From b2079e33f176bd62ac368a236f2f9e0ca44ed5b0 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Sun, 30 Jul 2023 12:12:41 +0100 Subject: [PATCH 29/31] Document that `read::GzDecoder` consumes bytes after end of gzip Add tests showing that the `GzDecoder`s in `bufread` and `write` support reading immediately after end of gzip data. Co-authored-by: Sebastian Thiel --- src/gz/bufread.rs | 47 +++++++++++++++++++++++++++++++++++++++++++++++ src/gz/read.rs | 14 ++++++++++++-- src/gz/write.rs | 28 ++++++++++++++++++++++++++++ src/lib.rs | 7 +++++++ 4 files changed, 94 insertions(+), 2 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index e59ebc0f..6fc48bcd 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -432,3 +432,50 @@ impl Read for MultiGzDecoder { self.0.read(into) } } + +#[cfg(test)] +mod test { + use crate::bufread::GzDecoder; + use crate::gz::write; + use crate::Compression; + use std::io::{Read, Write}; + + // GzDecoder consumes one gzip member and then returns 0 for subsequent reads, allowing any + // additional data to be consumed by the caller. 
+ #[test] + fn decode_extra_data() { + let expected = "Hello World"; + + let compressed = { + let mut e = write::GzEncoder::new(Vec::new(), Compression::default()); + e.write(expected.as_ref()).unwrap(); + let mut b = e.finish().unwrap(); + b.push(b'x'); + b + }; + + let mut output = Vec::new(); + let mut decoder = GzDecoder::new(compressed.as_slice()); + let decoded_bytes = decoder.read_to_end(&mut output).unwrap(); + assert_eq!(decoded_bytes, output.len()); + let actual = std::str::from_utf8(&output).expect("String parsing error"); + assert_eq!( + actual, expected, + "after decompression we obtain the original input" + ); + + output.clear(); + assert_eq!( + decoder.read(&mut output).unwrap(), + 0, + "subsequent read of decoder returns 0, but inner reader can return additional data" + ); + let mut reader = decoder.into_inner(); + assert_eq!( + reader.read_to_end(&mut output).unwrap(), + 1, + "extra data is accessible in underlying buf-read" + ); + assert_eq!(output, b"x"); + } +} diff --git a/src/gz/read.rs b/src/gz/read.rs index 8732fdc2..5a65526c 100644 --- a/src/gz/read.rs +++ b/src/gz/read.rs @@ -90,7 +90,7 @@ impl Write for GzEncoder { } } -/// A decoder for the first member of a [gzip file]. +/// A decoder for a single member of a [gzip file]. /// /// This structure exposes a [`Read`] interface that will consume compressed /// data from the underlying reader and emit uncompressed data. @@ -155,6 +155,9 @@ impl GzDecoder { } /// Acquires a reference to the underlying reader. + /// + /// Note that the decoder may have read past the end of the gzip data. + /// To prevent this use [`bufread::GzDecoder`] instead. pub fn get_ref(&self) -> &R { self.inner.get_ref().get_ref() } @@ -162,12 +165,19 @@ impl GzDecoder { /// Acquires a mutable reference to the underlying stream. /// /// Note that mutation of the stream may result in surprising results if - /// this decoder is continued to be used. + /// this decoder continues to be used. 
+ /// + /// Note that the decoder may have read past the end of the gzip data. + /// To prevent this use [`bufread::GzDecoder`] instead. pub fn get_mut(&mut self) -> &mut R { self.inner.get_mut().get_mut() } /// Consumes this decoder, returning the underlying reader. + /// + /// Note that the decoder may have read past the end of the gzip data. + /// Subsequent reads will skip those bytes. To prevent this use + /// [`bufread::GzDecoder`] instead. pub fn into_inner(self) -> R { self.inner.into_inner().into_inner() } diff --git a/src/gz/write.rs b/src/gz/write.rs index d5e8b8e5..74d6c5ac 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -610,4 +610,32 @@ mod tests { let expected = STR.repeat(2); assert_eq!(return_string, expected); } + + // GzDecoder consumes one gzip member and then returns 0 for subsequent writes, allowing any + // additional data to be consumed by the caller. + #[test] + fn decode_extra_data() { + let compressed = { + let mut e = GzEncoder::new(Vec::new(), Compression::default()); + e.write(STR.as_ref()).unwrap(); + let mut b = e.finish().unwrap(); + b.push(b'x'); + b + }; + + let mut writer = Vec::new(); + let mut decoder = GzDecoder::new(writer); + let mut consumed_bytes = 0; + loop { + let n = decoder.write(&compressed[consumed_bytes..]).unwrap(); + if n == 0 { + break; + } + consumed_bytes += n; + } + writer = decoder.finish().unwrap(); + let actual = String::from_utf8(writer).expect("String parsing error"); + assert_eq!(actual, STR); + assert_eq!(&compressed[consumed_bytes..], b"x"); + } } diff --git a/src/lib.rs b/src/lib.rs index 127e2354..8c000b03 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,7 +117,14 @@ mod zlib; /// Types which operate over [`Read`] streams, both encoders and decoders for /// various formats. /// +/// Note that the `read` decoder types may read past the end of the compressed +/// data while decoding. 
If the caller requires subsequent reads to start +/// immediately following the compressed data wrap the `Read` type in a +/// [`BufReader`] and use the `BufReader` with the equivalent decoder from the +/// `bufread` module and also for the subsequent reads. +/// /// [`Read`]: https://doc.rust-lang.org/std/io/trait.Read.html +/// [`BufReader`]: https://doc.rust-lang.org/std/io/struct.BufReader.html pub mod read { pub use crate::deflate::read::DeflateDecoder; pub use crate::deflate::read::DeflateEncoder; From 02cd317738df29252c23bdaa8e059a3bc57515f9 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Aug 2023 17:25:28 +0200 Subject: [PATCH 30/31] Fix typo Co-authored-by: Yuki Okushi --- MAINTENANCE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTENANCE.md b/MAINTENANCE.md index af770344..c032c254 100644 --- a/MAINTENANCE.md +++ b/MAINTENANCE.md @@ -9,7 +9,7 @@ This document explains how to perform the project's maintenance tasks. #### Process -To generated all the artifacts above, one proceeds as follows: +To generate all the artifacts above, one proceeds as follows: 1. `git checkout -b release-` - move to a branch to prepare making changes to the repository. *Changes cannot be made to `main` as it is protected.* 2. Edit `Cargo.toml` to the next package version. From ccd3d3a417585094aa2661a13cd54a384728dbfc Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Aug 2023 07:48:50 +0200 Subject: [PATCH 31/31] prepare 1.0.27 release --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0dbe5397..1a481fb7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "flate2" authors = ["Alex Crichton ", "Josh Triplett "] -version = "1.0.26" +version = "1.0.27" edition = "2018" license = "MIT OR Apache-2.0" readme = "README.md"