@@ -701,13 +701,29 @@ class _FileInFile(object):
701701 object.
702702 """
703703
704- def __init__ (self , fileobj , offset , size , sparse = None ):
704+ def __init__ (self , fileobj , offset , size , blockinfo = None ):
705705 self .fileobj = fileobj
706706 self .offset = offset
707707 self .size = size
708- self .sparse = sparse
709708 self .position = 0
710709
710+ if blockinfo is None :
711+ blockinfo = [(0 , size )]
712+
713+ # Construct a map with data and zero blocks.
714+ self .map_index = 0
715+ self .map = []
716+ lastpos = 0
717+ realpos = self .offset
718+ for offset , size in blockinfo :
719+ if offset > lastpos :
720+ self .map .append ((False , lastpos , offset , None ))
721+ self .map .append ((True , offset , offset + size , realpos ))
722+ realpos += size
723+ lastpos = offset + size
724+ if lastpos < self .size :
725+ self .map .append ((False , lastpos , self .size , None ))
726+
711727 def seekable (self ):
712728 if not hasattr (self .fileobj , "seekable" ):
713729 # XXX gzip.GzipFile and bz2.BZ2File
@@ -732,48 +748,26 @@ def read(self, size=None):
732748 else :
733749 size = min (size , self .size - self .position )
734750
735- if self .sparse is None :
736- return self .readnormal (size )
737- else :
738- return self .readsparse (size )
739-
740- def readnormal (self , size ):
741- """Read operation for regular files.
742- """
743- self .fileobj .seek (self .offset + self .position )
744- self .position += size
745- return self .fileobj .read (size )
746-
747- def readsparse (self , size ):
748- """Read operation for sparse files.
749- """
750- data = b""
751+ buf = b""
751752 while size > 0 :
752- buf = self .readsparsesection (size )
753- if not buf :
754- break
755- size -= len (buf )
756- data += buf
757- return data
758-
759- def readsparsesection (self , size ):
760- """Read a single section of a sparse file.
761- """
762- section = self .sparse .find (self .position )
763-
764- if section is None :
765- return b""
766-
767- size = min (size , section .offset + section .size - self .position )
768-
769- if isinstance (section , _data ):
770- realpos = section .realpos + self .position - section .offset
771- self .fileobj .seek (self .offset + realpos )
772- self .position += size
773- return self .fileobj .read (size )
774- else :
775- self .position += size
776- return NUL * size
753+ while True :
754+ data , start , stop , offset = self .map [self .map_index ]
755+ if start <= self .position < stop :
756+ break
757+ else :
758+ self .map_index += 1
759+ if self .map_index == len (self .map ):
760+ self .map_index = 0
761+ length = min (size , stop - self .position )
762+ if data :
763+ self .fileobj .seek (offset )
764+ block = self .fileobj .read (stop - start )
765+ buf += block [self .position - start :self .position + length ]
766+ else :
767+ buf += NUL * length
768+ size -= length
769+ self .position += length
770+ return buf
777771#class _FileInFile
778772
779773
@@ -1367,28 +1361,15 @@ def _proc_sparse(self, tarfile):
13671361 numbytes = nti (buf [pos + 12 :pos + 24 ])
13681362 except ValueError :
13691363 break
1370- structs .append ((offset , numbytes ))
1364+ if offset and numbytes :
1365+ structs .append ((offset , numbytes ))
13711366 pos += 24
13721367 isextended = bool (buf [504 ])
1373-
1374- # Transform the sparse structures to something we can use
1375- # in ExFileObject.
1376- self .sparse = _ringbuffer ()
1377- lastpos = 0
1378- realpos = 0
1379- for offset , numbytes in structs :
1380- if offset > lastpos :
1381- self .sparse .append (_hole (lastpos , offset - lastpos ))
1382- self .sparse .append (_data (offset , numbytes , realpos ))
1383- realpos += numbytes
1384- lastpos = offset + numbytes
1385- if lastpos < origsize :
1386- self .sparse .append (_hole (lastpos , origsize - lastpos ))
1368+ self .sparse = structs
13871369
13881370 self .offset_data = tarfile .fileobj .tell ()
13891371 tarfile .offset = self .offset_data + self ._block (self .size )
13901372 self .size = origsize
1391-
13921373 return self
13931374
13941375 def _proc_pax (self , tarfile ):
@@ -1464,6 +1445,19 @@ def _proc_pax(self, tarfile):
14641445 except HeaderError :
14651446 raise SubsequentHeaderError ("missing or bad subsequent header" )
14661447
1448+ # Process GNU sparse information.
1449+ if "GNU.sparse.map" in pax_headers :
1450+ # GNU extended sparse format version 0.1.
1451+ self ._proc_gnusparse_01 (next , pax_headers )
1452+
1453+ elif "GNU.sparse.size" in pax_headers :
1454+ # GNU extended sparse format version 0.0.
1455+ self ._proc_gnusparse_00 (next , pax_headers , buf )
1456+
1457+ elif pax_headers .get ("GNU.sparse.major" ) == "1" and pax_headers .get ("GNU.sparse.minor" ) == "0" :
1458+ # GNU extended sparse format version 1.0.
1459+ self ._proc_gnusparse_10 (next , pax_headers , tarfile )
1460+
14671461 if self .type in (XHDTYPE , SOLARIS_XHDTYPE ):
14681462 # Patch the TarInfo object with the extended header info.
14691463 next ._apply_pax_info (pax_headers , tarfile .encoding , tarfile .errors )
@@ -1480,24 +1474,59 @@ def _proc_pax(self, tarfile):
14801474
14811475 return next
14821476
1477+ def _proc_gnusparse_00 (self , next , pax_headers , buf ):
1478+ """Process a GNU tar extended sparse header, version 0.0.
1479+ """
1480+ offsets = []
1481+ for match in re .finditer (br"\d+ GNU.sparse.offset=(\d+)\n" , buf ):
1482+ offsets .append (int (match .group (1 )))
1483+ numbytes = []
1484+ for match in re .finditer (br"\d+ GNU.sparse.numbytes=(\d+)\n" , buf ):
1485+ numbytes .append (int (match .group (1 )))
1486+ next .sparse = list (zip (offsets , numbytes ))
1487+
1488+ def _proc_gnusparse_01 (self , next , pax_headers ):
1489+ """Process a GNU tar extended sparse header, version 0.1.
1490+ """
1491+ sparse = [int (x ) for x in pax_headers ["GNU.sparse.map" ].split ("," )]
1492+ next .sparse = list (zip (sparse [::2 ], sparse [1 ::2 ]))
1493+
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from tarfile.fileobj in BLOCKSIZE chunks,
    starting at its current position. The first line holds the number
    of map entries; each entry consists of two further lines with one
    decimal number each. The pairs are stored in next.sparse, and
    next.offset_data is set to the position of fileobj after the last
    chunk that was read. `pax_headers` is not consulted.
    """
    sparse = []
    buf = tarfile.fileobj.read(BLOCKSIZE)
    # Numbers are terminated by a single newline character.
    fields, buf = buf.split(b"\n", 1)
    fields = int(fields)
    while len(sparse) < fields * 2:
        if b"\n" not in buf:
            # The next number is not complete yet, fetch another chunk.
            buf += tarfile.fileobj.read(BLOCKSIZE)
        number, buf = buf.split(b"\n", 1)
        sparse.append(int(number))
    next.offset_data = tarfile.fileobj.tell()
    next.sparse = list(zip(sparse[::2], sparse[1::2]))
1509+
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
    pax extended or global header.

    GNU.sparse.name is stored as the path and GNU.sparse.size /
    GNU.sparse.realsize as the size. Keywords listed in PAX_FIELDS
    are stored under their own name; all other keywords are ignored.
    A copy of pax_headers is kept in self.pax_headers.
    """
    for key, raw in pax_headers.items():
        if key == "GNU.sparse.name":
            self.path = raw
            continue
        if key in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(raw)
            continue
        if key not in PAX_FIELDS:
            continue

        converted = raw
        if key in PAX_NUMBER_FIELDS:
            # Numbers that cannot be converted fall back to 0.
            try:
                converted = PAX_NUMBER_FIELDS[key](raw)
            except ValueError:
                converted = 0
        if key == "path":
            converted = converted.rstrip("/")
        setattr(self, key, converted)

    self.pax_headers = pax_headers.copy()
15031532
@@ -1535,7 +1564,7 @@ def isblk(self):
def isfifo(self):
    """Return True if the type of the member is FIFOTYPE.
    """
    return self.type == FIFOTYPE
def issparse(self):
    """Return True if sparse block information is set on the member,
    i.e. its sparse attribute is not None.
    """
    return self.sparse is not None
def isdev(self):
    """Return True if the type of the member is one of CHRTYPE,
    BLKTYPE or FIFOTYPE.
    """
    device_types = (CHRTYPE, BLKTYPE, FIFOTYPE)
    return self.type in device_types
15411570# class TarInfo
@@ -2255,10 +2284,17 @@ def makedir(self, tarinfo, targetpath):
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath.

    The data is copied straight from self.fileobj, starting at
    tarinfo.offset_data. If tarinfo.sparse is set, every
    (offset, size) block is written at its offset in the target and
    the ranges in between are not written. Afterwards the target is
    truncated at tarinfo.size.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with statement closes the target even if seeking or copying
    # raises, so the file handle is never leaked.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Give the file its final length, which matters when the last
        # part of a sparse file was not written.
        target.seek(tarinfo.size)
        target.truncate()
22632299
22642300 def makeunknown (self , tarinfo , targetpath ):
@@ -2544,49 +2580,6 @@ def __next__(self):
25442580 self .index += 1
25452581 return tarinfo
25462582
2547- # Helper classes for sparse file support
2548- class _section :
2549- """Base class for _data and _hole.
2550- """
2551- def __init__ (self , offset , size ):
2552- self .offset = offset
2553- self .size = size
2554- def __contains__ (self , offset ):
2555- return self .offset <= offset < self .offset + self .size
2556-
2557- class _data (_section ):
2558- """Represent a data section in a sparse file.
2559- """
2560- def __init__ (self , offset , size , realpos ):
2561- _section .__init__ (self , offset , size )
2562- self .realpos = realpos
2563-
2564- class _hole (_section ):
2565- """Represent a hole section in a sparse file.
2566- """
2567- pass
2568-
2569- class _ringbuffer (list ):
2570- """Ringbuffer class which increases performance
2571- over a regular list.
2572- """
2573- def __init__ (self ):
2574- self .idx = 0
2575- def find (self , offset ):
2576- idx = self .idx
2577- while True :
2578- item = self [idx ]
2579- if offset in item :
2580- break
2581- idx += 1
2582- if idx == len (self ):
2583- idx = 0
2584- if idx == self .idx :
2585- # End of File
2586- return None
2587- self .idx = idx
2588- return item
2589-
25902583#--------------------
25912584# exported functions
25922585#--------------------
0 commit comments