From 16c35fb5110720080121dd1e4a78b41ba2c23343 Mon Sep 17 00:00:00 2001 From: Andrea Giudiceandrea Date: Thu, 10 Dec 2020 20:34:30 +0100 Subject: [PATCH 1/6] bpo-41928: Add support for Unicode Path Extra Field in ZipFile --- Lib/zipfile.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index e1a50a3eb51d95..ece94fad7fb875 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -336,6 +336,7 @@ class ZipInfo (object): 'external_attr', 'header_offset', 'CRC', + 'orig_filename_crc', 'compress_size', 'file_size', '_raw_time', @@ -484,6 +485,13 @@ def _decodeExtra(self): except struct.error: raise BadZipFile(f"Corrupt zip64 extra field. " f"{field} not found.") from None + elif tp == 0x7075: + data = extra[4:ln+4] + # Unicode Path Extra Field + up_version, up_name_crc = unpack(' 2: print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) + orig_filename_crc = crc32(filename) flags = centdir[5] if flags & 0x800: # UTF-8 file names extension @@ -1378,6 +1387,7 @@ def _RealGetContents(self): x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) + x.orig_filename_crc = orig_filename_crc x._decodeExtra() x.header_offset = x.header_offset + concat self.filelist.append(x) From 98ccf3c693e343ac359c92cb4c5e5fd0dc89507b Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 10 Dec 2020 19:48:32 +0000 Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst diff --git a/Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst b/Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst new file mode 100644 index 00000000000000..25d944e6f66589 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst @@ -0,0 +1 @@ +Add support for Unicode Path Extra Field in ZipFile. \ No newline at end of file From d41ae56d72218a4fcb0656766cb1fe05d3280603 Mon Sep 17 00:00:00 2001 From: Andrea Giudiceandrea Date: Tue, 8 Jun 2021 22:45:37 +0200 Subject: [PATCH 3/6] Validate data in unicode path extra field --- Lib/zipfile.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index ece94fad7fb875..ae36f8fde3cce6 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -488,10 +488,18 @@ def _decodeExtra(self): elif tp == 0x7075: data = extra[4:ln+4] # Unicode Path Extra Field - up_version, up_name_crc = unpack(' Date: Tue, 8 Jun 2021 23:11:16 +0200 Subject: [PATCH 4/6] Sanitize filename in unicode path extra field --- Lib/zipfile.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index ae36f8fde3cce6..9d6ecc47168cc8 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -314,6 +314,22 @@ def _EndRecData(fpin): # Unable to find a valid end of central directory structure return None +def _sanitize_filename(filename): + """Terminate the file name at the first null byte and + ensure paths always use forward slashes as the directory separator.""" + + # Terminate the file name at the first null byte. Null bytes in file + # names are used as tricks by viruses in archives. + null_byte = filename.find(chr(0)) + if null_byte >= 0: + filename = filename[0:null_byte] + # This is used to ensure paths in generated ZIP files always use + # forward slashes as the directory separator, as required by the + # ZIP format specification. + if os.sep != "/" and os.sep in filename: + filename = filename.replace(os.sep, "/") + return filename + class ZipInfo (object): """Class with attributes describing each file in the ZIP archive.""" @@ -345,16 +361,9 @@ class ZipInfo (object): def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): self.orig_filename = filename # Original file name in archive - # Terminate the file name at the first null byte. Null bytes in file - # names are used as tricks by viruses in archives. - null_byte = filename.find(chr(0)) - if null_byte >= 0: - filename = filename[0:null_byte] - # This is used to ensure paths in generated ZIP files always use - # forward slashes as the directory separator, as required by the - # ZIP format specification. - if os.sep != "/" and os.sep in filename: - filename = filename.replace(os.sep, "/") + # Terminate the file name at the first null byte and + # ensure paths always use forward slashes as the directory separator. + filename = _sanitize_filename(filename) self.filename = filename # Normalized file name self.date_time = date_time # year, month, day, hour, min, sec @@ -493,7 +502,7 @@ def _decodeExtra(self): if up_version == 1 and up_name_crc == self.orig_filename_crc: up_unicode_name = data[5:].decode('utf-8') if up_unicode_name: - self.filename = up_unicode_name + self.filename = _sanitize_filename(up_unicode_name) else: raise BadZipFile("Empty unicode path extra field (0x7075)") except struct.error: From 726047845135483e3531bf87e765bb79ff470f7a Mon Sep 17 00:00:00 2001 From: Andrea Giudiceandrea Date: Tue, 8 Jun 2021 23:34:58 +0200 Subject: [PATCH 5/6] Fix white space --- Lib/zipfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 9d6ecc47168cc8..9113bf9e39f797 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -317,7 +317,7 @@ def _EndRecData(fpin): def _sanitize_filename(filename): """Terminate the file name at the first null byte and ensure paths always use forward slashes as the directory separator.""" - + # Terminate the file name at the first null byte. Null bytes in file # names are used as tricks by viruses in archives. null_byte = filename.find(chr(0)) From 392c1e2a41bc1ef7c2f06b89f1ad0d1befef204d Mon Sep 17 00:00:00 2001 From: Andrea Giudiceandrea Date: Wed, 9 Jun 2021 00:31:12 +0200 Subject: [PATCH 6/6] Fix warnings for unicode path extra field --- Lib/zipfile.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 9113bf9e39f797..cd486ec32dc3bc 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -504,11 +504,14 @@ def _decodeExtra(self): if up_unicode_name: self.filename = _sanitize_filename(up_unicode_name) else: - raise BadZipFile("Empty unicode path extra field (0x7075)") + import warnings + warnings.warn("Empty unicode path extra field (0x7075)", stacklevel=2) except struct.error: - raise BadZipFile("Corrupt unicode path extra field (0x7075)") + import warnings + warnings.warn("Corrupt unicode path extra field (0x7075)", stacklevel=2) except UnicodeDecodeError: - raise BadZipFile("Corrupt unicode path extra field (0x7075) - invalid unicode bytes") + import warnings + warnings.warn('Corrupt unicode path extra field (0x7075): invalid utf-8 bytes', stacklevel=2) extra = extra[ln+4:]