Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9cbdd75

Browse files
committed
Add read support for all missing variants of the GNU sparse
extensions. Thus, in addition to GNUTYPE_SPARSE headers, sparse information in pax headers created by GNU tar can now be decoded. All three formats 0.0, 0.1 and 1.0 are supported. On filesystems that support this, holes in files are now restored whenever a sparse member is extracted.
1 parent 3122ce3 commit 9cbdd75

5 files changed

Lines changed: 180 additions & 127 deletions

File tree

Doc/library/tarfile.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ Some facts and figures:
2020
* read/write support for the POSIX.1-1988 (ustar) format.
2121

2222
* read/write support for the GNU tar format including *longname* and *longlink*
23-
extensions, read-only support for the *sparse* extension.
23+
extensions, read-only support for all variants of the *sparse* extension
24+
including restoration of sparse files.
2425

2526
* read/write support for the POSIX.1-2001 (pax) format.
2627

Lib/tarfile.py

Lines changed: 112 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -701,13 +701,29 @@ class _FileInFile(object):
701701
object.
702702
"""
703703

704-
def __init__(self, fileobj, offset, size, sparse=None):
704+
def __init__(self, fileobj, offset, size, blockinfo=None):
705705
self.fileobj = fileobj
706706
self.offset = offset
707707
self.size = size
708-
self.sparse = sparse
709708
self.position = 0
710709

710+
if blockinfo is None:
711+
blockinfo = [(0, size)]
712+
713+
# Construct a map with data and zero blocks.
714+
self.map_index = 0
715+
self.map = []
716+
lastpos = 0
717+
realpos = self.offset
718+
for offset, size in blockinfo:
719+
if offset > lastpos:
720+
self.map.append((False, lastpos, offset, None))
721+
self.map.append((True, offset, offset + size, realpos))
722+
realpos += size
723+
lastpos = offset + size
724+
if lastpos < self.size:
725+
self.map.append((False, lastpos, self.size, None))
726+
711727
def seekable(self):
712728
if not hasattr(self.fileobj, "seekable"):
713729
# XXX gzip.GzipFile and bz2.BZ2File
@@ -732,48 +748,26 @@ def read(self, size=None):
732748
else:
733749
size = min(size, self.size - self.position)
734750

735-
if self.sparse is None:
736-
return self.readnormal(size)
737-
else:
738-
return self.readsparse(size)
739-
740-
def readnormal(self, size):
741-
"""Read operation for regular files.
742-
"""
743-
self.fileobj.seek(self.offset + self.position)
744-
self.position += size
745-
return self.fileobj.read(size)
746-
747-
def readsparse(self, size):
748-
"""Read operation for sparse files.
749-
"""
750-
data = b""
751+
buf = b""
751752
while size > 0:
752-
buf = self.readsparsesection(size)
753-
if not buf:
754-
break
755-
size -= len(buf)
756-
data += buf
757-
return data
758-
759-
def readsparsesection(self, size):
760-
"""Read a single section of a sparse file.
761-
"""
762-
section = self.sparse.find(self.position)
763-
764-
if section is None:
765-
return b""
766-
767-
size = min(size, section.offset + section.size - self.position)
768-
769-
if isinstance(section, _data):
770-
realpos = section.realpos + self.position - section.offset
771-
self.fileobj.seek(self.offset + realpos)
772-
self.position += size
773-
return self.fileobj.read(size)
774-
else:
775-
self.position += size
776-
return NUL * size
753+
while True:
754+
data, start, stop, offset = self.map[self.map_index]
755+
if start <= self.position < stop:
756+
break
757+
else:
758+
self.map_index += 1
759+
if self.map_index == len(self.map):
760+
self.map_index = 0
761+
length = min(size, stop - self.position)
762+
if data:
763+
self.fileobj.seek(offset)
764+
block = self.fileobj.read(stop - start)
765+
buf += block[self.position - start:self.position + length]
766+
else:
767+
buf += NUL * length
768+
size -= length
769+
self.position += length
770+
return buf
777771
#class _FileInFile
778772

779773

@@ -1367,28 +1361,15 @@ def _proc_sparse(self, tarfile):
13671361
numbytes = nti(buf[pos + 12:pos + 24])
13681362
except ValueError:
13691363
break
1370-
structs.append((offset, numbytes))
1364+
if offset and numbytes:
1365+
structs.append((offset, numbytes))
13711366
pos += 24
13721367
isextended = bool(buf[504])
1373-
1374-
# Transform the sparse structures to something we can use
1375-
# in ExFileObject.
1376-
self.sparse = _ringbuffer()
1377-
lastpos = 0
1378-
realpos = 0
1379-
for offset, numbytes in structs:
1380-
if offset > lastpos:
1381-
self.sparse.append(_hole(lastpos, offset - lastpos))
1382-
self.sparse.append(_data(offset, numbytes, realpos))
1383-
realpos += numbytes
1384-
lastpos = offset + numbytes
1385-
if lastpos < origsize:
1386-
self.sparse.append(_hole(lastpos, origsize - lastpos))
1368+
self.sparse = structs
13871369

13881370
self.offset_data = tarfile.fileobj.tell()
13891371
tarfile.offset = self.offset_data + self._block(self.size)
13901372
self.size = origsize
1391-
13921373
return self
13931374

13941375
def _proc_pax(self, tarfile):
@@ -1464,6 +1445,19 @@ def _proc_pax(self, tarfile):
14641445
except HeaderError:
14651446
raise SubsequentHeaderError("missing or bad subsequent header")
14661447

1448+
# Process GNU sparse information.
1449+
if "GNU.sparse.map" in pax_headers:
1450+
# GNU extended sparse format version 0.1.
1451+
self._proc_gnusparse_01(next, pax_headers)
1452+
1453+
elif "GNU.sparse.size" in pax_headers:
1454+
# GNU extended sparse format version 0.0.
1455+
self._proc_gnusparse_00(next, pax_headers, buf)
1456+
1457+
elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1458+
# GNU extended sparse format version 1.0.
1459+
self._proc_gnusparse_10(next, pax_headers, tarfile)
1460+
14671461
if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
14681462
# Patch the TarInfo object with the extended header info.
14691463
next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
@@ -1480,24 +1474,59 @@ def _proc_pax(self, tarfile):
14801474

14811475
return next
14821476

1477+
def _proc_gnusparse_00(self, next, pax_headers, buf):
1478+
"""Process a GNU tar extended sparse header, version 0.0.
1479+
"""
1480+
offsets = []
1481+
for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1482+
offsets.append(int(match.group(1)))
1483+
numbytes = []
1484+
for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1485+
numbytes.append(int(match.group(1)))
1486+
next.sparse = list(zip(offsets, numbytes))
1487+
1488+
def _proc_gnusparse_01(self, next, pax_headers):
1489+
"""Process a GNU tar extended sparse header, version 0.1.
1490+
"""
1491+
sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1492+
next.sparse = list(zip(sparse[::2], sparse[1::2]))
1493+
1494+
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1495+
"""Process a GNU tar extended sparse header, version 1.0.
1496+
"""
1497+
fields = None
1498+
sparse = []
1499+
buf = tarfile.fileobj.read(BLOCKSIZE)
1500+
fields, buf = buf.split(b"\n", 1)
1501+
fields = int(fields)
1502+
while len(sparse) < fields * 2:
1503+
if b"\n" not in buf:
1504+
buf += tarfile.fileobj.read(BLOCKSIZE)
1505+
number, buf = buf.split(b"\n", 1)
1506+
sparse.append(int(number))
1507+
next.offset_data = tarfile.fileobj.tell()
1508+
next.sparse = list(zip(sparse[::2], sparse[1::2]))
1509+
14831510
def _apply_pax_info(self, pax_headers, encoding, errors):
14841511
"""Replace fields with supplemental information from a previous
14851512
pax extended or global header.
14861513
"""
14871514
for keyword, value in pax_headers.items():
1488-
if keyword not in PAX_FIELDS:
1489-
continue
1490-
1491-
if keyword == "path":
1492-
value = value.rstrip("/")
1493-
1494-
if keyword in PAX_NUMBER_FIELDS:
1495-
try:
1496-
value = PAX_NUMBER_FIELDS[keyword](value)
1497-
except ValueError:
1498-
value = 0
1499-
1500-
setattr(self, keyword, value)
1515+
if keyword == "GNU.sparse.name":
1516+
setattr(self, "path", value)
1517+
elif keyword == "GNU.sparse.size":
1518+
setattr(self, "size", int(value))
1519+
elif keyword == "GNU.sparse.realsize":
1520+
setattr(self, "size", int(value))
1521+
elif keyword in PAX_FIELDS:
1522+
if keyword in PAX_NUMBER_FIELDS:
1523+
try:
1524+
value = PAX_NUMBER_FIELDS[keyword](value)
1525+
except ValueError:
1526+
value = 0
1527+
if keyword == "path":
1528+
value = value.rstrip("/")
1529+
setattr(self, keyword, value)
15011530

15021531
self.pax_headers = pax_headers.copy()
15031532

@@ -1535,7 +1564,7 @@ def isblk(self):
15351564
def isfifo(self):
15361565
return self.type == FIFOTYPE
15371566
def issparse(self):
1538-
return self.type == GNUTYPE_SPARSE
1567+
return self.sparse is not None
15391568
def isdev(self):
15401569
return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
15411570
# class TarInfo
@@ -2255,10 +2284,17 @@ def makedir(self, tarinfo, targetpath):
22552284
def makefile(self, tarinfo, targetpath):
22562285
"""Make a file called targetpath.
22572286
"""
2258-
source = self.extractfile(tarinfo)
2287+
source = self.fileobj
2288+
source.seek(tarinfo.offset_data)
22592289
target = bltn_open(targetpath, "wb")
2260-
copyfileobj(source, target)
2261-
source.close()
2290+
if tarinfo.sparse is not None:
2291+
for offset, size in tarinfo.sparse:
2292+
target.seek(offset)
2293+
copyfileobj(source, target, size)
2294+
else:
2295+
copyfileobj(source, target, tarinfo.size)
2296+
target.seek(tarinfo.size)
2297+
target.truncate()
22622298
target.close()
22632299

22642300
def makeunknown(self, tarinfo, targetpath):
@@ -2544,49 +2580,6 @@ def __next__(self):
25442580
self.index += 1
25452581
return tarinfo
25462582

2547-
# Helper classes for sparse file support
2548-
class _section:
2549-
"""Base class for _data and _hole.
2550-
"""
2551-
def __init__(self, offset, size):
2552-
self.offset = offset
2553-
self.size = size
2554-
def __contains__(self, offset):
2555-
return self.offset <= offset < self.offset + self.size
2556-
2557-
class _data(_section):
2558-
"""Represent a data section in a sparse file.
2559-
"""
2560-
def __init__(self, offset, size, realpos):
2561-
_section.__init__(self, offset, size)
2562-
self.realpos = realpos
2563-
2564-
class _hole(_section):
2565-
"""Represent a hole section in a sparse file.
2566-
"""
2567-
pass
2568-
2569-
class _ringbuffer(list):
2570-
"""Ringbuffer class which increases performance
2571-
over a regular list.
2572-
"""
2573-
def __init__(self):
2574-
self.idx = 0
2575-
def find(self, offset):
2576-
idx = self.idx
2577-
while True:
2578-
item = self[idx]
2579-
if offset in item:
2580-
break
2581-
idx += 1
2582-
if idx == len(self):
2583-
idx = 0
2584-
if idx == self.idx:
2585-
# End of File
2586-
return None
2587-
self.idx = idx
2588-
return item
2589-
25902583
#--------------------
25912584
# exported functions
25922585
#--------------------

Lib/test/test_tarfile.py

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,22 @@ def test_find_sparse(self):
526526
tarinfo = self.tar.getmember("ustar/sparse")
527527
self._test_member(tarinfo, size=86016, chksum=md5_sparse)
528528

529+
def test_find_gnusparse(self):
530+
tarinfo = self.tar.getmember("gnu/sparse")
531+
self._test_member(tarinfo, size=86016, chksum=md5_sparse)
532+
533+
def test_find_gnusparse_00(self):
534+
tarinfo = self.tar.getmember("gnu/sparse-0.0")
535+
self._test_member(tarinfo, size=86016, chksum=md5_sparse)
536+
537+
def test_find_gnusparse_01(self):
538+
tarinfo = self.tar.getmember("gnu/sparse-0.1")
539+
self._test_member(tarinfo, size=86016, chksum=md5_sparse)
540+
541+
def test_find_gnusparse_10(self):
542+
tarinfo = self.tar.getmember("gnu/sparse-1.0")
543+
self._test_member(tarinfo, size=86016, chksum=md5_sparse)
544+
529545
def test_find_umlauts(self):
530546
tarinfo = self.tar.getmember("ustar/umlauts-\xc4\xd6\xdc\xe4\xf6\xfc\xdf")
531547
self._test_member(tarinfo, size=7011, chksum=md5_regtype)
@@ -589,13 +605,53 @@ class GNUReadTest(LongnameTest):
589605
subdir = "gnu"
590606
longnametype = tarfile.GNUTYPE_LONGNAME
591607

592-
def test_sparse_file(self):
593-
tarinfo1 = self.tar.getmember("ustar/sparse")
594-
fobj1 = self.tar.extractfile(tarinfo1)
595-
tarinfo2 = self.tar.getmember("gnu/sparse")
596-
fobj2 = self.tar.extractfile(tarinfo2)
597-
self.assertEqual(fobj1.read(), fobj2.read(),
598-
"sparse file extraction failed")
608+
# Since 3.2 tarfile is supposed to accurately restore sparse members and
609+
# produce files with holes. This is what we actually want to test here.
610+
# Unfortunately, not all platforms/filesystems support sparse files, and
611+
# even on platforms that do it is non-trivial to make reliable assertions
612+
# about holes in files. Therefore, we first do one basic test which works
613+
# an all platforms, and after that a test that will work only on
614+
# platforms/filesystems that prove to support sparse files.
615+
def _test_sparse_file(self, name):
616+
self.tar.extract(name, TEMPDIR)
617+
filename = os.path.join(TEMPDIR, name)
618+
with open(filename, "rb") as fobj:
619+
data = fobj.read()
620+
self.assertEqual(md5sum(data), md5_sparse,
621+
"wrong md5sum for %s" % name)
622+
623+
if self._fs_supports_holes():
624+
s = os.stat(filename)
625+
self.assertTrue(s.st_blocks * 512 < s.st_size)
626+
627+
def test_sparse_file_old(self):
628+
self._test_sparse_file("gnu/sparse")
629+
630+
def test_sparse_file_00(self):
631+
self._test_sparse_file("gnu/sparse-0.0")
632+
633+
def test_sparse_file_01(self):
634+
self._test_sparse_file("gnu/sparse-0.1")
635+
636+
def test_sparse_file_10(self):
637+
self._test_sparse_file("gnu/sparse-1.0")
638+
639+
@staticmethod
640+
def _fs_supports_holes():
641+
# Return True if the platform knows the st_blocks stat attribute and
642+
# uses st_blocks units of 512 bytes, and if the filesystem is able to
643+
# store holes in files.
644+
if sys.platform == "linux2":
645+
# Linux evidently has 512-byte st_blocks units.
646+
name = os.path.join(TEMPDIR, "sparse-test")
647+
with open(name, "wb") as fobj:
648+
fobj.seek(4096)
649+
fobj.truncate()
650+
s = os.stat(name)
651+
os.remove(name)
652+
return s.st_blocks == 0
653+
else:
654+
return False
599655

600656

601657
class PaxReadTest(LongnameTest):

Lib/test/testtar.tar

126 KB
Binary file not shown.

0 commit comments

Comments
 (0)