From 16c35fb5110720080121dd1e4a78b41ba2c23343 Mon Sep 17 00:00:00 2001
From: Andrea Giudiceandrea <andreaerdna@libero.it>
Date: Thu, 10 Dec 2020 20:34:30 +0100
Subject: [PATCH 1/6] bpo-41928: Add support for Unicode Path Extra Field in
 ZipFile

---
 Lib/zipfile.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index e1a50a3eb51d95..ece94fad7fb875 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -336,6 +336,7 @@ class ZipInfo (object):
         'external_attr',
         'header_offset',
         'CRC',
+        'orig_filename_crc',
         'compress_size',
         'file_size',
         '_raw_time',
@@ -484,6 +485,13 @@ def _decodeExtra(self):
                 except struct.error:
                     raise BadZipFile(f"Corrupt zip64 extra field. "
                                      f"{field} not found.") from None
+            elif tp == 0x7075:
+                data = extra[4:ln+4]
+                # Unicode Path Extra Field
+                up_version, up_name_crc = unpack('<BL', data[:5])
+                up_unicode_name = data[5:].decode('utf-8')
+                if up_version == 1 and up_name_crc == self.orig_filename_crc:
+                    self.filename = up_unicode_name
 
             extra = extra[ln+4:]
 
@@ -1354,6 +1362,7 @@ def _RealGetContents(self):
             if self.debug > 2:
                 print(centdir)
             filename = fp.read(centdir[_CD_FILENAME_LENGTH])
+            orig_filename_crc = crc32(filename)
             flags = centdir[5]
             if flags & 0x800:
                 # UTF-8 file names extension
@@ -1378,6 +1387,7 @@ def _RealGetContents(self):
             x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
                             t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
 
+            x.orig_filename_crc = orig_filename_crc
             x._decodeExtra()
             x.header_offset = x.header_offset + concat
             self.filelist.append(x)

From 98ccf3c693e343ac359c92cb4c5e5fd0dc89507b Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Thu, 10 Dec 2020 19:48:32 +0000
Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst

diff --git a/Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst b/Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst
new file mode 100644
index 00000000000000..25d944e6f66589
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-12-10-19-48-31.bpo-41928.9TH76z.rst
@@ -0,0 +1 @@
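+Add support in :mod:`zipfile` for reading the Unicode Path Extra Field (0x7075): when its version is 1 and its CRC matches the stored file name, the UTF-8 name it carries is used as the entry's ``filename``.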
\ No newline at end of file

From d41ae56d72218a4fcb0656766cb1fe05d3280603 Mon Sep 17 00:00:00 2001
From: Andrea Giudiceandrea <andreaerdna@libero.it>
Date: Tue, 8 Jun 2021 22:45:37 +0200
Subject: [PATCH 3/6] Validate data in unicode path extra field

---
 Lib/zipfile.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index ece94fad7fb875..ae36f8fde3cce6 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -488,10 +488,18 @@ def _decodeExtra(self):
             elif tp == 0x7075:
                 data = extra[4:ln+4]
                 # Unicode Path Extra Field
-                up_version, up_name_crc = unpack('<BL', data[:5])
-                up_unicode_name = data[5:].decode('utf-8')
-                if up_version == 1 and up_name_crc == self.orig_filename_crc:
-                    self.filename = up_unicode_name
+                try:
+                    up_version, up_name_crc = unpack('<BL', data[:5])
+                    if up_version == 1 and up_name_crc == self.orig_filename_crc:
+                        up_unicode_name = data[5:].decode('utf-8')
+                        if up_unicode_name:
+                            self.filename = up_unicode_name
+                        else:
+                            raise BadZipFile("Empty unicode path extra field (0x7075)")
+                except struct.error:
+                    raise BadZipFile("Corrupt unicode path extra field (0x7075)")
+                except UnicodeDecodeError:
+                    raise BadZipFile("Corrupt unicode path extra field (0x7075) - invalid unicode bytes")
 
             extra = extra[ln+4:]
 

From dc40233f14eb0213db3adcabf3f77d61e5b06e6f Mon Sep 17 00:00:00 2001
From: Andrea Giudiceandrea <andreaerdna@libero.it>
Date: Tue, 8 Jun 2021 23:11:16 +0200
Subject: [PATCH 4/6] Sanitize filename in unicode path extra field

---
 Lib/zipfile.py | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index ae36f8fde3cce6..9d6ecc47168cc8 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -314,6 +314,22 @@ def _EndRecData(fpin):
     # Unable to find a valid end of central directory structure
     return None
 
+def _sanitize_filename(filename):
+    """Terminate the file name at the first null byte and
+    ensure paths always use forward slashes as the directory separator."""
+    
+    # Terminate the file name at the first null byte.  Null bytes in file
+    # names are used as tricks by viruses in archives.
+    null_byte = filename.find(chr(0))
+    if null_byte >= 0:
+        filename = filename[0:null_byte]
+    # Normalize the platform's directory separator to a forward slash,
+    # the separator required by the ZIP format specification.  This is
+    # a no-op on platforms where os.sep is already "/".
+    if os.sep != "/" and os.sep in filename:
+        filename = filename.replace(os.sep, "/")
+    return filename
+
 
 class ZipInfo (object):
     """Class with attributes describing each file in the ZIP archive."""
@@ -345,16 +361,9 @@ class ZipInfo (object):
     def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
         self.orig_filename = filename   # Original file name in archive
 
-        # Terminate the file name at the first null byte.  Null bytes in file
-        # names are used as tricks by viruses in archives.
-        null_byte = filename.find(chr(0))
-        if null_byte >= 0:
-            filename = filename[0:null_byte]
-        # This is used to ensure paths in generated ZIP files always use
-        # forward slashes as the directory separator, as required by the
-        # ZIP format specification.
-        if os.sep != "/" and os.sep in filename:
-            filename = filename.replace(os.sep, "/")
+        # Terminate the file name at the first null byte and
+        # ensure paths always use forward slashes as the directory separator.
+        filename = _sanitize_filename(filename)
 
         self.filename = filename        # Normalized file name
         self.date_time = date_time      # year, month, day, hour, min, sec
@@ -493,7 +502,7 @@ def _decodeExtra(self):
                     if up_version == 1 and up_name_crc == self.orig_filename_crc:
                         up_unicode_name = data[5:].decode('utf-8')
                         if up_unicode_name:
-                            self.filename = up_unicode_name
+                            self.filename = _sanitize_filename(up_unicode_name)
                         else:
                             raise BadZipFile("Empty unicode path extra field (0x7075)")
                 except struct.error:

From 726047845135483e3531bf87e765bb79ff470f7a Mon Sep 17 00:00:00 2001
From: Andrea Giudiceandrea <andreaerdna@libero.it>
Date: Tue, 8 Jun 2021 23:34:58 +0200
Subject: [PATCH 5/6] Fix white space

---
 Lib/zipfile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index 9d6ecc47168cc8..9113bf9e39f797 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -317,7 +317,7 @@ def _EndRecData(fpin):
 def _sanitize_filename(filename):
     """Terminate the file name at the first null byte and
     ensure paths always use forward slashes as the directory separator."""
-    
+
     # Terminate the file name at the first null byte.  Null bytes in file
     # names are used as tricks by viruses in archives.
     null_byte = filename.find(chr(0))

From 392c1e2a41bc1ef7c2f06b89f1ad0d1befef204d Mon Sep 17 00:00:00 2001
From: Andrea Giudiceandrea <andreaerdna@libero.it>
Date: Wed, 9 Jun 2021 00:31:12 +0200
Subject: [PATCH 6/6] Fix warnings for unicode path extra field

---
 Lib/zipfile.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index 9113bf9e39f797..cd486ec32dc3bc 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -504,11 +504,14 @@ def _decodeExtra(self):
                         if up_unicode_name:
                             self.filename = _sanitize_filename(up_unicode_name)
                         else:
-                            raise BadZipFile("Empty unicode path extra field (0x7075)")
+                            import warnings
+                            warnings.warn("Empty unicode path extra field (0x7075)", stacklevel=2)
                 except struct.error:
-                    raise BadZipFile("Corrupt unicode path extra field (0x7075)")
+                    import warnings
+                    warnings.warn("Corrupt unicode path extra field (0x7075)", stacklevel=2)
                 except UnicodeDecodeError:
-                    raise BadZipFile("Corrupt unicode path extra field (0x7075) - invalid unicode bytes")
+                    import warnings
+                    warnings.warn('Corrupt unicode path extra field (0x7075): invalid utf-8 bytes', stacklevel=2)
 
             extra = extra[ln+4:]