1#-------------------------------------------------------------------
2# tarfile.py
3#-------------------------------------------------------------------
4# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
5# All rights reserved.
6#
7# Permission is hereby granted, free of charge, to any person
8# obtaining a copy of this software and associated documentation
9# files (the "Software"), to deal in the Software without
10# restriction, including without limitation the rights to use,
11# copy, modify, merge, publish, distribute, sublicense, and/or sell
12# copies of the Software, and to permit persons to whom the
13# Software is furnished to do so, subject to the following
14# conditions:
15#
16# The above copyright notice and this permission notice shall be
17# included in all copies or substantial portions of the Software.
18#
19# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
21# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26# OTHER DEALINGS IN THE SOFTWARE.
27#
28"""Read from and write to tar format archives.
29"""
30
31version = "0.9.0"
32__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
33__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
34
35#---------
36# Imports
37#---------
38from builtins import open as bltn_open
39import sys
40import os
41import io
42import shutil
43import stat
44import time
45import struct
46import copy
47import re
48
49try:
50 import pwd
51except ImportError:
52 pwd = None
53try:
54 import grp
55except ImportError:
56 grp = None
57
58
def _backportszstd_os_path_realpath_allow_missing(filename):
    """Resolve *filename* with os.path.realpath, using strict=ALLOW_MISSING
    when this interpreter provides it.

    Specific patch versions of Python introduced:
      - the ``strict`` parameter of os.path.realpath
      - os.path.ALLOW_MISSING
    On older interpreters fall back to a plain realpath() call.
    """
    allow_missing = getattr(os.path, "ALLOW_MISSING", None)
    if allow_missing is None:
        # os.path.ALLOW_MISSING not available on this interpreter.
        return os.path.realpath(filename)
    try:
        return os.path.realpath(filename, strict=allow_missing)
    except TypeError:
        # realpath() exists but predates the strict parameter.
        return os.path.realpath(filename)
67
68
# os.symlink on Windows prior to 6.0 raises NotImplementedError.
# OSError (winerror=1314) will be raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege.
symlink_exception = (AttributeError, NotImplementedError, OSError)

# Public names exported by "from tarfile import *".
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError",
           "CompressionError", "StreamError", "ExtractError", "HeaderError",
           "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT",
           "DEFAULT_FORMAT", "open", "fully_trusted_filter", "data_filter",
           "tar_filter", "FilterError", "AbsoluteLinkError",
           "OutsideDestinationError", "SpecialFileError", "AbsolutePathError",
           "LinkOutsideDestinationError", "LinkFallbackError"]
82
83
84#---------------------------------------------------------
85# tar constants
86#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
# pax is the most expressive of the three and the default for writing.
DEFAULT_FORMAT = PAX_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Default encoding for header text.  On Windows the filesystem encoding
# may be a legacy code page, so UTF-8 is used there instead.
if os.name == "nt":
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
163
164#---------------------------------------------------------
165# Some useful functions
166#---------------------------------------------------------
167
def stn(s, length, encoding, errors):
    """Encode string *s* into a NUL-padded bytes field of exactly *length* bytes.

    Overlong values are silently truncated to the field size.
    Raises ValueError if *s* is None (tar metadata may not be missing).
    """
    if s is None:
        raise ValueError("metadata cannot contain None")
    encoded = s.encode(encoding, errors)
    # Truncate to the field size, then pad the remainder with NUL bytes.
    return encoded[:length].ljust(length, NUL)
175
def nts(s, encoding, errors):
    """Decode bytes *s* up to (and not including) the first NUL byte.

    If no NUL terminator is present, the whole input is decoded.
    """
    # partition() conveniently yields the full input as the head
    # when the separator is absent.
    head, _sep, _rest = s.partition(b"\0")
    return head.decode(encoding, errors)
183
def nti(s):
    """Convert a tar number field to a python number.

    Two encodings are possible (see itn()): GNU base-256, marked by a
    leading 0o200 (positive) or 0o377 (negative) byte, or plain
    NUL/space-terminated octal text.
    """
    marker = s[0]
    if marker == 0o200 or marker == 0o377:
        # Big-endian base-256 over the remaining len(s)-1 bytes.
        n = int.from_bytes(s[1:], "big")
        if marker == 0o377:
            # Negative values are stored in two's complement.
            n -= 256 ** (len(s) - 1)
    else:
        try:
            text = nts(s, "ascii", "strict").strip()
            n = int(text or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
203
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a tar number field of *digits* bytes.

    POSIX 1003.1-1988 requires numbers to be encoded as a string of
    octal digits followed by a null-byte, which allows values up to
    (8**(digits-1))-1.  GNU tar allows storing numbers greater than
    that if necessary: a leading 0o200 (positive) or 0o377 (negative)
    byte marks the encoding, and the following digits-1 bytes are a
    big-endian base-256 (two's complement) representation, allowing
    values up to (256**(digits-1))-1.

    Raises ValueError when the value does not fit the field for the
    requested *format*.
    """
    # Fix: the previous version kept a dead local (original_n = n) that
    # was never read; it has been removed.
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        # Plain octal, NUL terminated.
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # Two's complement; the byte loop below keeps only the
            # low digits-1 bytes, discarding anything above.
            n = 256 ** digits + n

        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
233
def calc_chksums(buf):
    """Calculate the (unsigned, signed) checksums for a member's header.

    All header bytes are summed except the 8-byte chksum field itself,
    which is treated as if it were filled with spaces (hence the +256:
    8 * ord(' ')).  According to the GNU tar sources, some tars (Sun and
    NeXT) calculate chksum with signed char, which differs whenever the
    high bit is set, so both variants are returned.
    """
    def field_sum(fmt):
        # "148?8x356?" skips the chksum field at offset 148.
        return 256 + sum(struct.unpack_from(fmt, buf))

    return field_sum("148B8x356B"), field_sum("148b8x356b")
246
def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.
    If length is None, copy the entire content.

    Raises *exception* if src runs out of data before *length* bytes
    were copied.
    """
    bufsize = bufsize or 16 * 1024
    if length == 0:
        return
    if length is None:
        # Unbounded copy: delegate to the standard library.
        shutil.copyfileobj(src, dst, bufsize)
        return

    # Bounded copy: full buffers first, then the remainder, verifying
    # that the source supplied every byte it promised.
    full, remainder = divmod(length, bufsize)
    chunk_sizes = [bufsize] * full
    if remainder:
        chunk_sizes.append(remainder)
    for want in chunk_sizes:
        data = src.read(want)
        if len(data) < want:
            raise exception("unexpected end of data")
        dst.write(data)
271
def _safe_print(s):
    """Print *s* followed by a space, never raising UnicodeEncodeError.

    Characters sys.stdout cannot represent are shown as backslash escapes.
    """
    encoding = getattr(sys.stdout, "encoding", None)
    if encoding is not None:
        # Round-trip through the target encoding, escaping what it
        # cannot hold, so print() below cannot fail on encoding.
        s = s.encode(encoding, "backslashreplace").decode(encoding)
    print(s, end=" ")
277
278
# Exception hierarchy: TarError is the root; HeaderError and its
# subclasses are raised while parsing individual 512-byte headers and
# let the reader distinguish "end of archive" from real corruption.
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
312
313#---------------------------
314# internal stream interface
315#---------------------------
class _LowLevelFile:
    """Low-level file object built directly on file descriptors.

    Supports reading and writing; used instead of a regular file
    object for streaming access.
    """

    def __init__(self, name, mode):
        # Translate the one-letter mode into os.open() flags.
        # An unsupported mode raises KeyError, as before.
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY exists only on Windows; elsewhere add nothing.
        flags |= getattr(os, "O_BINARY", 0)
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        os.close(self.fd)

    def read(self, size):
        return os.read(self.fd, size)

    def write(self, s):
        os.write(self.fd, s)
339
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method that works with bytes,
       and the method is accessed blockwise.
       Use of gzip, bzip2, lzma (xz) or zstd compression is possible.
       A stream-like object could be for example: sys.stdin.buffer,
       sys.stdout.buffer, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 compresslevel, preset):
        """Construct a _Stream object.

        mode is "r" or "w"; comptype is one of "tar", "gz", "bz2",
        "xz", "zst" or "*" (auto-detect on read).
        """
        # If no fileobj is given we open the file ourselves and are
        # then responsible for closing it (_extfileobj == False).
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = os.fspath(name) if name is not None else ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""      # raw (compressed) byte buffer
        self.pos = 0        # position in the uncompressed stream
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available") from None
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                else:
                    self._init_write_gz(compresslevel)

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available") from None
                if mode == "r":
                    self.dbuf = b""     # decompressed byte buffer
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor(compresslevel)

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available") from None
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor(preset=preset)
            elif comptype == "zst":
                # This backport assumes backports.zstd is installed;
                # an ImportError propagates to the caller unchanged.
                from backports import zstd
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = zstd.ZstdDecompressor()
                    self.exception = zstd.ZstdError
                else:
                    self.cmp = zstd.ZstdCompressor()
            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Never leak a file descriptor we opened ourselves.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # hasattr() guards against partially constructed instances.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self, compresslevel):
        """Initialize for writing with gzip compression.

        Writes the gzip member header (magic, deflate, FNAME flag,
        timestamp) followed by the NUL-terminated file name.
        """
        self.cmp = self.zlib.compressobj(compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # Honor "directory components removed" from RFC1952
        self.name = os.path.basename(self.name)
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            # Maintain the running CRC for the gzip trailer.
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
        is ready to be written.

        Data is accumulated in self.buf and flushed to the underlying
        file object in bufsize-sized pieces.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
        done on it afterwards.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                # Flush whatever the compressor is still holding.
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # Gzip trailer: CRC32 and uncompressed size mod 2**32.
                    self.fileobj.write(struct.pack("<L", self.crc))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses (and discards) the gzip member header so that _read()
        can feed raw deflate data to the decompressor.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        if flag & 4:
            # FEXTRA field: 2-byte little-endian length, then data.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & 8:
            # FNAME: NUL-terminated original file name.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: NUL-terminated comment.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # FHCRC: 2-byte header CRC.
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
        is forbidden.

        Forward seeking is emulated by reading and discarding data.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size):
        """Return the next size number of bytes from the stream."""
        assert size is not None
        buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.

        Decompressed data is buffered in self.dbuf between calls.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        t = [self.dbuf]
        while c < size:
            # Skip underlying buffer to avoid unaligned double buffering.
            if self.buf:
                buf = self.buf
                self.buf = b""
            else:
                buf = self.fileobj.read(self.bufsize)
                if not buf:
                    break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception as e:
                raise ReadError("invalid compressed data") from e
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.dbuf = t[size:]
        return t[:size]

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
        read another block from the stream.
        """
        c = len(self.buf)
        t = [self.buf]
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.buf = t[size:]
        return t[:size]
# class _Stream
591
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # Pre-read one block so the magic bytes are available for
        # sniffing in getcomptype().
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        # The first call returns the buffered block; the method then
        # rebinds itself (on the instance) to the raw read so the
        # buffer is served exactly once.  NOTE(review): the requested
        # size is ignored for that first call — callers are expected
        # to read in BLOCKSIZE units.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        # Identify the compression scheme from its magic number.
        if self.buf.startswith(b"\x1f\x8b\x08"):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            # "BZh" + level digit (skipped) + pi-based block magic.
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        elif self.buf.startswith(b"\x28\xb5\x2f\xfd"):
            return "zst"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
620
621#------------------------
622# Extraction file object
623#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
    provides a part of its data as an individual file
    object.

    Supports GNU sparse members: *blockinfo* is a list of
    (offset, size) data blocks; gaps between them read back as NULs.
    """

    def __init__(self, fileobj, offset, size, name, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset        # start of member data in fileobj
        self.size = size            # logical size of the member
        self.position = 0           # logical read position
        self.name = name
        self.closed = False

        if blockinfo is None:
            # Non-sparse member: a single contiguous data block.
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        # Each entry: (is_data, logical start, logical stop, real offset).
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                # Hole before this data block: reads as zeros.
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            # Trailing hole after the last data block.
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        pass

    @property
    def mode(self):
        return 'rb'

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.

        The position is clamped to [0, self.size].
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.

        Data blocks are read from the underlying file object; holes
        are synthesized as NUL bytes.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # Find the map entry covering the current position; the
            # search wraps around since reads may seek backwards.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                b = self.fileobj.read(length)
                if len(b) != length:
                    raise ReadError("unexpected end of data")
                buf += b
            else:
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        # Only marks this wrapper closed; the underlying archive file
        # object stays open for other members.
        self.closed = True
#class _FileInFile
731
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object returned for archive members.

    Wraps a _FileInFile restricted to the member's byte range (and
    sparse map, if any) inside the archive's file object.
    """

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                tarinfo.size, tarinfo.name, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject
739
740
741#-----------------------------
742# extraction filters (PEP 706)
743#-----------------------------
744
# PEP 706 extraction-filter errors.  Each subclass stores the offending
# TarInfo on self.tarinfo (and the resolved path on self._path where
# applicable) so callers can inspect what was rejected.
class FilterError(TarError):
    pass

class AbsolutePathError(FilterError):
    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        super().__init__(f'member {tarinfo.name!r} has an absolute path')

class OutsideDestinationError(FilterError):
    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        super().__init__(f'{tarinfo.name!r} would be extracted to {path!r}, '
                         + 'which is outside the destination')

class SpecialFileError(FilterError):
    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        super().__init__(f'{tarinfo.name!r} is a special file')

class AbsoluteLinkError(FilterError):
    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        super().__init__(f'{tarinfo.name!r} is a link to an absolute path')

class LinkOutsideDestinationError(FilterError):
    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        super().__init__(f'{tarinfo.name!r} would link to {path!r}, '
                         + 'which is outside the destination')

class LinkFallbackError(FilterError):
    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        super().__init__(f'link {tarinfo.name!r} would be extracted as a '
                         + f'copy of {path!r}, which was rejected')

# Errors caused by filters -- both "fatal" and "non-fatal" -- that
# we consider to be issues with the argument, rather than a bug in the
# filter function
_FILTER_ERRORS = (FilterError, OSError, ExtractError)
788
def _get_filtered_attrs(member, dest_path, for_data=True):
    """Return the attribute overrides the 'tar'/'data' filters apply to *member*.

    Returns a dict suitable for TarInfo.replace(**...); an empty dict
    means the member is acceptable as-is.  With for_data=True the
    stricter 'data' filter rules are applied (ownership cleared,
    special files rejected, link targets confined to *dest_path*).
    Raises the appropriate FilterError subclass for members that must
    not be extracted at all.
    """
    new_attrs = {}
    name = member.name
    dest_path = _backportszstd_os_path_realpath_allow_missing(dest_path)
    # Strip leading / (tar's directory separator) from filenames.
    # Include os.sep (target OS directory separator) as well.
    if name.startswith(('/', os.sep)):
        name = new_attrs['name'] = member.path.lstrip('/' + os.sep)
    if os.path.isabs(name):
        # Path is absolute even after stripping.
        # For example, 'C:/foo' on Windows.
        raise AbsolutePathError(member)
    # Ensure we stay in the destination
    target_path = _backportszstd_os_path_realpath_allow_missing(os.path.join(dest_path, name))
    if os.path.commonpath([target_path, dest_path]) != dest_path:
        raise OutsideDestinationError(member, target_path)
    # Limit permissions (no high bits, and go-w)
    mode = member.mode
    if mode is not None:
        # Strip high bits & group/other write bits
        mode = mode & 0o755
        if for_data:
            # For data, handle permissions & file types
            if member.isreg() or member.islnk():
                if not mode & 0o100:
                    # Clear executable bits if not executable by user
                    mode &= ~0o111
                # Ensure owner can read & write
                mode |= 0o600
            elif member.isdir() or member.issym():
                # Ignore mode for directories & symlinks
                mode = None
            else:
                # Reject special files
                raise SpecialFileError(member)
        if mode != member.mode:
            new_attrs['mode'] = mode
    if for_data:
        # Ignore ownership for 'data'
        if member.uid is not None:
            new_attrs['uid'] = None
        if member.gid is not None:
            new_attrs['gid'] = None
        if member.uname is not None:
            new_attrs['uname'] = None
        if member.gname is not None:
            new_attrs['gname'] = None
        # Check link destination for 'data'
        if member.islnk() or member.issym():
            if os.path.isabs(member.linkname):
                raise AbsoluteLinkError(member)
            normalized = os.path.normpath(member.linkname)
            if normalized != member.linkname:
                new_attrs['linkname'] = normalized
            if member.issym():
                # Symlink targets are relative to the member's directory;
                # hard link targets are relative to the archive root.
                target_path = os.path.join(dest_path,
                                           os.path.dirname(name),
                                           member.linkname)
            else:
                target_path = os.path.join(dest_path,
                                           member.linkname)
            target_path = _backportszstd_os_path_realpath_allow_missing(target_path)
            if os.path.commonpath([target_path, dest_path]) != dest_path:
                raise LinkOutsideDestinationError(member, target_path)
    return new_attrs
854
def fully_trusted_filter(member, dest_path):
    """PEP 706 'fully_trusted' filter: accept *member* unchanged."""
    return member
857
def tar_filter(member, dest_path):
    """PEP 706 'tar' filter: strip absolute paths, confine members to
    *dest_path* and limit permission bits; otherwise keep metadata."""
    new_attrs = _get_filtered_attrs(member, dest_path, False)
    if not new_attrs:
        return member
    return member.replace(**new_attrs, deep=False)
863
def data_filter(member, dest_path):
    """PEP 706 'data' filter: like 'tar', but additionally clear
    ownership, reject special files and confine link targets."""
    new_attrs = _get_filtered_attrs(member, dest_path, True)
    if not new_attrs:
        return member
    return member.replace(**new_attrs, deep=False)
869
# String names accepted for TarFile's extraction `filter` argument.
_NAMED_FILTERS = {
    "fully_trusted": fully_trusted_filter,
    "tar": tar_filter,
    "data": data_filter,
}

#------------------
# Exported Classes
#------------------

# Sentinel for replace() defaults, meaning "don't change the attribute"
_KEEP = object()

# Header length is digits followed by a space.
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")
885
886class TarInfo(object):
887 """Informational class which holds the details about an
888 archive member given by a tar header block.
889 TarInfo objects are returned by TarFile.getmember(),
890 TarFile.getmembers() and TarFile.gettarinfo() and are
891 usually created internally.
892 """
893
894 __slots__ = dict(
895 name = 'Name of the archive member.',
896 mode = 'Permission bits.',
897 uid = 'User ID of the user who originally stored this member.',
898 gid = 'Group ID of the user who originally stored this member.',
899 size = 'Size in bytes.',
900 mtime = 'Time of last modification.',
901 chksum = 'Header checksum.',
902 type = ('File type. type is usually one of these constants: '
903 'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, '
904 'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'),
905 linkname = ('Name of the target file name, which is only present '
906 'in TarInfo objects of type LNKTYPE and SYMTYPE.'),
907 uname = 'User name.',
908 gname = 'Group name.',
909 devmajor = 'Device major number.',
910 devminor = 'Device minor number.',
911 offset = 'The tar header starts here.',
912 offset_data = "The file's data starts here.",
913 pax_headers = ('A dictionary containing key-value pairs of an '
914 'associated pax extended header.'),
915 sparse = 'Sparse member information.',
916 _tarfile = None,
917 _sparse_structs = None,
918 _link_target = None,
919 )
920
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
        of the member.

        All other attributes get tar's conventional defaults: a
        regular file, mode 0o644, owned by uid/gid 0, size 0.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information
944
    # Deprecated accessor pair: the owning archive is stored on the
    # private _tarfile slot; this public alias warns on every use.
    @property
    def tarfile(self):
        import warnings
        warnings.warn(
            'The undocumented "tarfile" attribute of TarInfo objects '
            + 'is deprecated and will be removed in Python 3.16',
            DeprecationWarning, stacklevel=2)
        return self._tarfile

    @tarfile.setter
    def tarfile(self, tarfile):
        import warnings
        warnings.warn(
            'The undocumented "tarfile" attribute of TarInfo objects '
            + 'is deprecated and will be removed in Python 3.16',
            DeprecationWarning, stacklevel=2)
        self._tarfile = tarfile
962
    # pax spells "name"/"linkname" as "path"/"linkpath"; these aliases
    # let pax header processing use either spelling interchangeably.
    @property
    def path(self):
        'In pax headers, "name" is called "path".'
        return self.name

    @path.setter
    def path(self, name):
        self.name = name

    @property
    def linkpath(self):
        'In pax headers, "linkname" is called "linkpath".'
        return self.linkname

    @linkpath.setter
    def linkpath(self, linkname):
        self.linkname = linkname
980
981 def __repr__(self):
982 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
983
984 def replace(self, *,
985 name=_KEEP, mtime=_KEEP, mode=_KEEP, linkname=_KEEP,
986 uid=_KEEP, gid=_KEEP, uname=_KEEP, gname=_KEEP,
987 deep=True, _KEEP=_KEEP):
988 """Return a deep copy of self with the given attributes replaced.
989 """
990 if deep:
991 result = copy.deepcopy(self)
992 else:
993 result = copy.copy(self)
994 if name is not _KEEP:
995 result.name = name
996 if mtime is not _KEEP:
997 result.mtime = mtime
998 if mode is not _KEEP:
999 result.mode = mode
1000 if linkname is not _KEEP:
1001 result.linkname = linkname
1002 if uid is not _KEEP:
1003 result.uid = uid
1004 if gid is not _KEEP:
1005 result.gid = gid
1006 if uname is not _KEEP:
1007 result.uname = uname
1008 if gname is not _KEEP:
1009 result.gname = gname
1010 return result
1011
1012 def get_info(self):
1013 """Return the TarInfo's attributes as a dictionary.
1014 """
1015 if self.mode is None:
1016 mode = None
1017 else:
1018 mode = self.mode & 0o7777
1019 info = {
1020 "name": self.name,
1021 "mode": mode,
1022 "uid": self.uid,
1023 "gid": self.gid,
1024 "size": self.size,
1025 "mtime": self.mtime,
1026 "chksum": self.chksum,
1027 "type": self.type,
1028 "linkname": self.linkname,
1029 "uname": self.uname,
1030 "gname": self.gname,
1031 "devmajor": self.devmajor,
1032 "devminor": self.devminor
1033 }
1034
1035 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1036 info["name"] += "/"
1037
1038 return info
1039
1040 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
1041 """Return a tar header as a string of 512 byte blocks.
1042 """
1043 info = self.get_info()
1044 for name, value in info.items():
1045 if value is None:
1046 raise ValueError("%s may not be None" % name)
1047
1048 if format == USTAR_FORMAT:
1049 return self.create_ustar_header(info, encoding, errors)
1050 elif format == GNU_FORMAT:
1051 return self.create_gnu_header(info, encoding, errors)
1052 elif format == PAX_FORMAT:
1053 return self.create_pax_header(info, encoding)
1054 else:
1055 raise ValueError("invalid format")
1056
1057 def create_ustar_header(self, info, encoding, errors):
1058 """Return the object as a ustar header block.
1059 """
1060 info["magic"] = POSIX_MAGIC
1061
1062 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
1063 raise ValueError("linkname is too long")
1064
1065 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
1066 info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)
1067
1068 return self._create_header(info, USTAR_FORMAT, encoding, errors)
1069
1070 def create_gnu_header(self, info, encoding, errors):
1071 """Return the object as a GNU header block sequence.
1072 """
1073 info["magic"] = GNU_MAGIC
1074
1075 buf = b""
1076 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
1077 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
1078
1079 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
1080 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
1081
1082 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
1083
    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                # Non-ASCII value: it must go into the pax records.
                pax_headers[hname] = info[name]
                continue

            # ASCII but too long for the fixed-width ustar field.
            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            needs_pax = False

            val = info[name]
            val_is_float = isinstance(val, float)
            val_int = round(val) if val_is_float else val
            # ustar numbers are octal with one byte reserved (hence digits - 1).
            if not 0 <= val_int < 8 ** (digits - 1):
                # Avoid overflow.
                info[name] = 0
                needs_pax = True
            elif val_is_float:
                # Put rounded value in ustar header, and full
                # precision value in pax header.
                info[name] = val_int
                needs_pax = True

            # The existing pax header has priority.
            if needs_pax and name not in pax_headers:
                pax_headers[name] = str(val)

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # The ustar header itself can use plain ASCII: anything that did not
        # fit or encode went into the pax records above.
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
1141
1142 @classmethod
1143 def create_pax_global_header(cls, pax_headers):
1144 """Return the object as a pax global header block sequence.
1145 """
1146 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1147
1148 def _posix_split_name(self, name, encoding, errors):
1149 """Split a name longer than 100 chars into a prefix
1150 and a name part.
1151 """
1152 components = name.split("/")
1153 for i in range(1, len(components)):
1154 prefix = "/".join(components[:i])
1155 name = "/".join(components[i:])
1156 if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
1157 len(name.encode(encoding, errors)) <= LENGTH_NAME:
1158 break
1159 else:
1160 raise ValueError("name is too long")
1161
1162 return prefix, name
1163
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.
        """
        # Device numbers are only meaningful for character/block devices;
        # for all other types the fields are stored as empty strings.
        has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE)
        if has_device_fields:
            devmajor = itn(info.get("devmajor", 0), 8, format)
            devminor = itn(info.get("devminor", 0), 8, format)
        else:
            devmajor = stn("", 8, encoding, errors)
            devminor = stn("", 8, encoding, errors)

        # None values in metadata should cause ValueError.
        # itn()/stn() do this for all fields except type.
        filetype = info.get("type", REGTYPE)
        if filetype is None:
            raise ValueError("TarInfo.type must not be None")

        # Fixed-width ustar field layout; widths per POSIX.1 ustar.
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field (filled with spaces while summing)
            filetype,
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            devmajor,
            devminor,
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # Pad the joined fields out to one 512-byte block.
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Splice the real checksum into its field: six octal digits plus NUL.
        # With the standard 512-byte block, 512-364 == 148 and 512-357 == 155,
        # i.e. bytes 148..154 of the header, leaving the trailing space intact.
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1205
1206 @staticmethod
1207 def _create_payload(payload):
1208 """Return the string payload filled with zero bytes
1209 up to the next 512 byte border.
1210 """
1211 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1212 if remainder > 0:
1213 payload += (BLOCKSIZE - remainder) * NUL
1214 return payload
1215
1216 @classmethod
1217 def _create_gnu_long_header(cls, name, type, encoding, errors):
1218 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1219 for name.
1220 """
1221 name = name.encode(encoding, errors) + NUL
1222
1223 info = {}
1224 info["name"] = "././@LongLink"
1225 info["type"] = type
1226 info["size"] = len(name)
1227 info["magic"] = GNU_MAGIC
1228
1229 # create extended header + name blocks.
1230 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
1231 cls._create_payload(name)
1232
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of 'value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # A record is "%d %s=%s\n" % (length, keyword, value), where
            # length counts the record *including* its own decimal digits.
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            # Iterate to a fixed point: adding the digits of the length can
            # itself change the number of digits (e.g. 99 -> 101).
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1283
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError (all subclasses of HeaderError) on bad input.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            # An all-zero block marks the end of the archive.
            raise EOFHeaderError("end of file header")

        # Checksum field occupies bytes 148..155 of the ustar header.
        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Decode the fixed-width ustar fields at their standard offsets.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            # Byte 482 flags that more sparse structures follow in
            # extension blocks; bytes 483..494 hold the real file size.
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj
1346
1347 @classmethod
1348 def fromtarfile(cls, tarfile):
1349 """Return the next TarInfo object from TarFile object
1350 tarfile.
1351 """
1352 buf = tarfile.fileobj.read(BLOCKSIZE)
1353 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1354 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1355 return obj._proc_member(tarfile)
1356
1357 #--------------------------------------------------------------------------
1358 # The following are methods that are called depending on the type of a
1359 # member. The entry point is _proc_member() which can be overridden in a
1360 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following operations:
1363 # 1. Set self.offset_data to the position where the data blocks begin,
1364 # if there is data that follows.
1365 # 2. Set tarfile.offset to the position where the next member's header will
1366 # begin.
1367 # 3. Return self or another valid TarInfo object.
1368 def _proc_member(self, tarfile):
1369 """Choose the right processing method depending on
1370 the type and call it.
1371 """
1372 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1373 return self._proc_gnulong(tarfile)
1374 elif self.type == GNUTYPE_SPARSE:
1375 return self._proc_sparse(tarfile)
1376 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1377 return self._proc_pax(tarfile)
1378 else:
1379 return self._proc_builtin(tarfile)
1380
1381 def _proc_builtin(self, tarfile):
1382 """Process a builtin type or an unknown type which
1383 will be treated as a regular file.
1384 """
1385 self.offset_data = tarfile.fileobj.tell()
1386 offset = self.offset_data
1387 if self.isreg() or self.type not in SUPPORTED_TYPES:
1388 # Skip the following data blocks.
1389 offset += self._block(self.size)
1390 tarfile.offset = offset
1391
1392 # Patch the TarInfo object with saved global
1393 # header information.
1394 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1395
1396 # Remove redundant slashes from directories. This is to be consistent
1397 # with frombuf().
1398 if self.isdir():
1399 self.name = self.name.rstrip("/")
1400
1401 return self
1402
1403 def _proc_gnulong(self, tarfile):
1404 """Process the blocks that hold a GNU longname
1405 or longlink member.
1406 """
1407 buf = tarfile.fileobj.read(self._block(self.size))
1408
1409 # Fetch the next header and process it.
1410 try:
1411 next = self.fromtarfile(tarfile)
1412 except HeaderError as e:
1413 raise SubsequentHeaderError(str(e)) from None
1414
1415 # Patch the TarInfo object from the next header with
1416 # the longname information.
1417 next.offset = self.offset
1418 if self.type == GNUTYPE_LONGNAME:
1419 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1420 elif self.type == GNUTYPE_LONGLINK:
1421 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1422
1423 # Remove redundant slashes from directories. This is to be consistent
1424 # with frombuf().
1425 if next.isdir():
1426 next.name = next.name.removesuffix("/")
1427
1428 return next
1429
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.

        Combines the sparse structures saved by frombuf() with any that
        follow in extension blocks, then restores the member's original
        (unsparsified) size.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each 512-byte extension block holds up to 21 (offset, numbytes)
            # pairs of 24 bytes each, plus an "is extended" flag at byte 504.
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        # The stored size counts only the data actually present; skip it,
        # then report the real file size to the caller.
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize
        return self
1457
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
        POSIX.1-2008.

        Parses the keyword/value records, fetches the next real member
        and applies the extended information to it (for XHDTYPE), or
        stores it for all following members (for XGLTYPE).
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline.
        pos = 0
        encoding = None
        raw_headers = []
        while len(buf) > pos and buf[pos] != 0x00:
            if not (match := _header_length_prefix_re.match(buf, pos)):
                raise InvalidHeaderError("invalid header")
            try:
                length = int(match.group(1))
            except ValueError:
                raise InvalidHeaderError("invalid header")
            # Headers must be at least 5 bytes, shortest being '5 x=\n'.
            # Value is allowed to be empty.
            if length < 5:
                raise InvalidHeaderError("invalid header")
            if pos + length > len(buf):
                raise InvalidHeaderError("invalid header")

            header_value_end_offset = match.start(1) + length - 1 # Last byte of the header
            keyword_and_value = buf[match.end(1) + 1:header_value_end_offset]
            raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")

            # Check the framing of the header. The last character must be '\n' (0x0A)
            if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A:
                raise InvalidHeaderError("invalid header")
            raw_headers.append((length, raw_keyword, raw_value))

            # Check if the pax header contains a hdrcharset field. This tells us
            # the encoding of the path, linkpath, uname and gname fields. Normally,
            # these fields are UTF-8 encoded but since POSIX.1-2008 tar
            # implementations are allowed to store them as raw binary strings if
            # the translation to UTF-8 fails. For the time being, we don't care about
            # anything other than "BINARY". The only other value that is currently
            # allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
            # Note that we only follow the initial 'hdrcharset' setting to preserve
            # the initial behavior of the 'tarfile' module.
            if raw_keyword == b"hdrcharset" and encoding is None:
                if raw_value == b"BINARY":
                    encoding = tarfile.encoding
                else: # This branch ensures only the first 'hdrcharset' header is used.
                    encoding = "utf-8"

            pos += length

        # If no explicit hdrcharset is set, we use UTF-8 as a default.
        if encoding is None:
            encoding = "utf-8"

        # After parsing the raw headers we can decode them to text.
        for length, raw_keyword, raw_value in raw_headers:
            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                # Name-like fields honor the hdrcharset declared above.
                value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError as e:
            raise SubsequentHeaderError(str(e)) from None

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, raw_headers)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next
1578
1579 def _proc_gnusparse_00(self, next, raw_headers):
1580 """Process a GNU tar extended sparse header, version 0.0.
1581 """
1582 offsets = []
1583 numbytes = []
1584 for _, keyword, value in raw_headers:
1585 if keyword == b"GNU.sparse.offset":
1586 try:
1587 offsets.append(int(value.decode()))
1588 except ValueError:
1589 raise InvalidHeaderError("invalid header")
1590
1591 elif keyword == b"GNU.sparse.numbytes":
1592 try:
1593 numbytes.append(int(value.decode()))
1594 except ValueError:
1595 raise InvalidHeaderError("invalid header")
1596
1597 next.sparse = list(zip(offsets, numbytes))
1598
1599 def _proc_gnusparse_01(self, next, pax_headers):
1600 """Process a GNU tar extended sparse header, version 0.1.
1601 """
1602 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1603 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1604
1605 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1606 """Process a GNU tar extended sparse header, version 1.0.
1607 """
1608 fields = None
1609 sparse = []
1610 buf = tarfile.fileobj.read(BLOCKSIZE)
1611 fields, buf = buf.split(b"\n", 1)
1612 fields = int(fields)
1613 while len(sparse) < fields * 2:
1614 if b"\n" not in buf:
1615 buf += tarfile.fileobj.read(BLOCKSIZE)
1616 number, buf = buf.split(b"\n", 1)
1617 sparse.append(int(number))
1618 next.offset_data = tarfile.fileobj.tell()
1619 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1620
1621 def _apply_pax_info(self, pax_headers, encoding, errors):
1622 """Replace fields with supplemental information from a previous
1623 pax extended or global header.
1624 """
1625 for keyword, value in pax_headers.items():
1626 if keyword == "GNU.sparse.name":
1627 setattr(self, "path", value)
1628 elif keyword == "GNU.sparse.size":
1629 setattr(self, "size", int(value))
1630 elif keyword == "GNU.sparse.realsize":
1631 setattr(self, "size", int(value))
1632 elif keyword in PAX_FIELDS:
1633 if keyword in PAX_NUMBER_FIELDS:
1634 try:
1635 value = PAX_NUMBER_FIELDS[keyword](value)
1636 except ValueError:
1637 value = 0
1638 if keyword == "path":
1639 value = value.rstrip("/")
1640 setattr(self, keyword, value)
1641
1642 self.pax_headers = pax_headers.copy()
1643
1644 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1645 """Decode a single field from a pax record.
1646 """
1647 try:
1648 return value.decode(encoding, "strict")
1649 except UnicodeDecodeError:
1650 return value.decode(fallback_encoding, fallback_errors)
1651
1652 def _block(self, count):
1653 """Round up a byte count by BLOCKSIZE and return it,
1654 e.g. _block(834) => 1024.
1655 """
1656 # Only non-negative offsets are allowed
1657 if count < 0:
1658 raise InvalidHeaderError("invalid offset")
1659 blocks, remainder = divmod(count, BLOCKSIZE)
1660 if remainder:
1661 blocks += 1
1662 return blocks * BLOCKSIZE
1663
1664 def isreg(self):
1665 'Return True if the Tarinfo object is a regular file.'
1666 return self.type in REGULAR_TYPES
1667
1668 def isfile(self):
1669 'Return True if the Tarinfo object is a regular file.'
1670 return self.isreg()
1671
1672 def isdir(self):
1673 'Return True if it is a directory.'
1674 return self.type == DIRTYPE
1675
1676 def issym(self):
1677 'Return True if it is a symbolic link.'
1678 return self.type == SYMTYPE
1679
1680 def islnk(self):
1681 'Return True if it is a hard link.'
1682 return self.type == LNKTYPE
1683
1684 def ischr(self):
1685 'Return True if it is a character device.'
1686 return self.type == CHRTYPE
1687
1688 def isblk(self):
1689 'Return True if it is a block device.'
1690 return self.type == BLKTYPE
1691
1692 def isfifo(self):
1693 'Return True if it is a FIFO.'
1694 return self.type == FIFOTYPE
1695
1696 def issparse(self):
1697 return self.sparse is not None
1698
1699 def isdev(self):
1700 'Return True if it is one of character device, block device or FIFO.'
1701 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1702# class TarInfo
1703
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.

    The class attributes below are defaults; each can be overridden
    per instance through the matching __init__() keyword argument.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    extraction_filter = None    # The default filter for extraction.
1731
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, copybufsize=None, stream=False):
        """Open an (uncompressed) tar archive 'name'. 'mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. 'mode'
        defaults to 'r'.
        If 'fileobj' is given, it is used for reading or writing data. If it
        can be determined, 'mode' is overridden by 'fileobj's mode.
        'fileobj' is not closed, when TarFile is closed.

        The remaining keyword arguments override the class-level defaults
        of the same name when they are not None; 'pax_headers' is only
        honored when the effective format is PAX_FORMAT.
        """
        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
        if mode not in modes:
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        self.mode = mode
        # self._mode is the underlying binary file mode.
        self._mode = modes[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            # We opened the file ourselves, so we must close it later.
            self._extfileobj = False
        else:
            if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            # A caller-supplied fileobj is never closed by TarFile.
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        self.stream = stream

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.copybufsize = copybufsize
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added
        self._unames = {}       # Cached mappings of uid -> uname
        self._gnames = {}       # Cached mappings of gid -> gname

        try:
            if self.mode == "r":
                # Reading the first member eagerly validates the archive.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e)) from None

            if self.mode in ("a", "w", "x"):
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any setup failure, close the file only if we own it,
            # then re-raise for the caller.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
1835
1836 #--------------------------------------------------------------------------
1837 # Below are the classmethods which act as alternate constructors to the
1838 # TarFile class. The open() method is the only one that is needed for
1839 # public use; it is the "super"-constructor and is able to select an
1840 # adequate "sub"-constructor for a particular compression using the mapping
1841 # from OPEN_METH.
1842 #
1843 # This concept allows one to subclass TarFile without losing the comfort of
1844 # the super-constructor. A sub-constructor is registered and made available
1845 # by adding it to the mapping in OPEN_METH.
1846
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'r:zst'      open for reading with zstd compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression
        'w:zst'      open for writing with zstd compression

        'x' or 'x:'  create a tarfile exclusively without compression, raise
                     an exception if the file is already created
        'x:gz'       create a gzip compressed tarfile, raise an exception
                     if the file is already created
        'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                     if the file is already created
        'x:xz'       create an lzma compressed tarfile, raise an exception
                     if the file is already created
        'x:zst'      create a zstd compressed tarfile, raise an exception
                     if the file is already created

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'r|zst'      open a zstd compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing
        'w|zst'      open a zstd compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            # Sorting by not_compressed() tries the compressed openers
            # first and plain taropen last.
            def not_compressed(comptype):
                return cls.OPEN_METH[comptype] == 'taropen'
            error_msgs = []
            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    error_msgs.append(f'- method {comptype}: {e!r}')
                    if fileobj is not None:
                        # Rewind so the next opener sees the file start.
                        fileobj.seek(saved_pos)
                    continue
            error_msgs_summary = '\n'.join(error_msgs)
            raise ReadError(f"file could not be opened successfully:\n{error_msgs_summary}")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in ("r", "w"):
                raise ValueError("mode must be 'r' or 'w'")
            if "compresslevel" in kwargs and comptype not in ("gz", "bz2"):
                raise ValueError(
                    "compresslevel is only valid for w|gz and w|bz2 modes"
                )
            if "preset" in kwargs and comptype not in ("xz",):
                raise ValueError("preset is only valid for w|xz mode")

            compresslevel = kwargs.pop("compresslevel", 9)
            preset = kwargs.pop("preset", None)
            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             compresslevel, preset)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
            # The TarFile owns the _Stream wrapper and must close it.
            t._extfileobj = False
            return t

        elif mode in ("a", "w", "x"):
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")
1955
1956 @classmethod
1957 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1958 """Open uncompressed tar archive name for reading or writing.
1959 """
1960 if mode not in ("r", "a", "w", "x"):
1961 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
1962 return cls(name, mode, fileobj, **kwargs)
1963
1964 @classmethod
1965 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1966 """Open gzip compressed tar archive name for reading or writing.
1967 Appending is not allowed.
1968 """
1969 if mode not in ("r", "w", "x"):
1970 raise ValueError("mode must be 'r', 'w' or 'x'")
1971
1972 try:
1973 from gzip import GzipFile
1974 except ImportError:
1975 raise CompressionError("gzip module is not available") from None
1976
1977 try:
1978 fileobj = GzipFile(name, mode + "b", compresslevel, fileobj)
1979 except OSError as e:
1980 if fileobj is not None and mode == 'r':
1981 raise ReadError("not a gzip file") from e
1982 raise
1983
1984 try:
1985 t = cls.taropen(name, mode, fileobj, **kwargs)
1986 except OSError as e:
1987 fileobj.close()
1988 if mode == 'r':
1989 raise ReadError("not a gzip file") from e
1990 raise
1991 except:
1992 fileobj.close()
1993 raise
1994 t._extfileobj = False
1995 return t
1996
1997 @classmethod
1998 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1999 """Open bzip2 compressed tar archive name for reading or writing.
2000 Appending is not allowed.
2001 """
2002 if mode not in ("r", "w", "x"):
2003 raise ValueError("mode must be 'r', 'w' or 'x'")
2004
2005 try:
2006 from bz2 import BZ2File
2007 except ImportError:
2008 raise CompressionError("bz2 module is not available") from None
2009
2010 fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel)
2011
2012 try:
2013 t = cls.taropen(name, mode, fileobj, **kwargs)
2014 except (OSError, EOFError) as e:
2015 fileobj.close()
2016 if mode == 'r':
2017 raise ReadError("not a bzip2 file") from e
2018 raise
2019 except:
2020 fileobj.close()
2021 raise
2022 t._extfileobj = False
2023 return t
2024
2025 @classmethod
2026 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2027 """Open lzma compressed tar archive name for reading or writing.
2028 Appending is not allowed.
2029 """
2030 if mode not in ("r", "w", "x"):
2031 raise ValueError("mode must be 'r', 'w' or 'x'")
2032
2033 try:
2034 from lzma import LZMAFile, LZMAError
2035 except ImportError:
2036 raise CompressionError("lzma module is not available") from None
2037
2038 fileobj = LZMAFile(fileobj or name, mode, preset=preset)
2039
2040 try:
2041 t = cls.taropen(name, mode, fileobj, **kwargs)
2042 except (LZMAError, EOFError) as e:
2043 fileobj.close()
2044 if mode == 'r':
2045 raise ReadError("not an lzma file") from e
2046 raise
2047 except:
2048 fileobj.close()
2049 raise
2050 t._extfileobj = False
2051 return t
2052
2053 @classmethod
2054 def zstopen(cls, name, mode="r", fileobj=None, level=None, options=None,
2055 zstd_dict=None, **kwargs):
2056 """Open zstd compressed tar archive name for reading or writing.
2057 Appending is not allowed.
2058 """
2059 if mode not in ("r", "w", "x"):
2060 raise ValueError("mode must be 'r', 'w' or 'x'")
2061
2062 from backports.zstd import ZstdFile, ZstdError
2063
2064 fileobj = ZstdFile(
2065 fileobj or name,
2066 mode,
2067 level=level,
2068 options=options,
2069 zstd_dict=zstd_dict
2070 )
2071
2072 try:
2073 t = cls.taropen(name, mode, fileobj, **kwargs)
2074 except (ZstdError, EOFError) as e:
2075 fileobj.close()
2076 if mode == 'r':
2077 raise ReadError("not a zstd file") from e
2078 raise
2079 except Exception:
2080 fileobj.close()
2081 raise
2082 t._extfileobj = False
2083 return t
2084
    # All *open() methods are registered here.
    # Maps the compression suffix used in mode strings (e.g. "r:gz",
    # "w|xz") to the name of the classmethod implementing it; open()
    # resolves the method via getattr(cls, ...).
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz": "gzopen",    # gzip compressed tar
        "bz2": "bz2open",  # bzip2 compressed tar
        "xz": "xzopen",    # lzma compressed tar
        "zst": "zstopen",  # zstd compressed tar
    }
2093
2094 #--------------------------------------------------------------------------
2095 # The public methods which TarFile provides:
2096
2097 def close(self):
2098 """Close the TarFile. In write-mode, two finishing zero blocks are
2099 appended to the archive.
2100 """
2101 if self.closed:
2102 return
2103
2104 self.closed = True
2105 try:
2106 if self.mode in ("a", "w", "x"):
2107 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2108 self.offset += (BLOCKSIZE * 2)
2109 # fill up the end with zero-blocks
2110 # (like option -b20 for tar does)
2111 blocks, remainder = divmod(self.offset, RECORDSIZE)
2112 if remainder > 0:
2113 self.fileobj.write(NUL * (RECORDSIZE - remainder))
2114 finally:
2115 if not self._extfileobj:
2116 self.fileobj.close()
2117
2118 def getmember(self, name):
2119 """Return a TarInfo object for member 'name'. If 'name' can not be
2120 found in the archive, KeyError is raised. If a member occurs more
2121 than once in the archive, its last occurrence is assumed to be the
2122 most up-to-date version.
2123 """
2124 tarinfo = self._getmember(name.rstrip('/'))
2125 if tarinfo is None:
2126 raise KeyError("filename %r not found" % name)
2127 return tarinfo
2128
2129 def getmembers(self):
2130 """Return the members of the archive as a list of TarInfo objects. The
2131 list has the same order as the members in the archive.
2132 """
2133 self._check()
2134 if not self._loaded: # if we want to obtain a list of
2135 self._load() # all members, we first have to
2136 # scan the whole archive.
2137 return self.members
2138
2139 def getnames(self):
2140 """Return the members of the archive as a list of their names. It has
2141 the same order as the list returned by getmembers().
2142 """
2143 return [tarinfo.name for tarinfo in self.getmembers()]
2144
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object from the result of os.stat or equivalent
        on an existing file. The file is either named by 'name', or
        specified as a file object 'fileobj' with a file descriptor. If
        given, 'arcname' specifies an alternative name for the file in the
        archive, otherwise, the name is taken from the 'name' attribute of
        'fileobj', or the 'name' argument. The name should be a text
        string.

        Returns None for file types that cannot be represented in a tar
        archive (anything that is not a regular file, directory, fifo,
        symlink or char/block device).
        """
        self._check("awx")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo._tarfile = self  # To be removed in 3.16.

        # Use os.stat or os.lstat, depending on if symlinks shall be resolved.
        if fileobj is None:
            if not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        # Map the stat mode to a tar member type constant.  Note that
        # 'type' deliberately shadows the builtin here, matching the rest
        # of this module's style.
        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Sockets and other exotic file types are not representable.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            # Only regular files carry payload data in the archive.
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname

        # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
        # speed things up, cache the resolved usernames and group names.
        if pwd:
            if tarinfo.uid not in self._unames:
                try:
                    self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
                except KeyError:
                    self._unames[tarinfo.uid] = ''
            tarinfo.uname = self._unames[tarinfo.uid]
        if grp:
            if tarinfo.gid not in self._gnames:
                try:
                    self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
                except KeyError:
                    self._gnames[tarinfo.gid] = ''
            tarinfo.gname = self._gnames[tarinfo.gid]

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2250
    def list(self, verbose=True, *, members=None):
        """Print a table of contents to sys.stdout. If 'verbose' is False, only
        the names of the members are printed. If it is True, an 'ls -l'-like
        output is produced. 'members' is optional and must be a subset of the
        list returned by getmembers().
        """
        # Convert tarinfo type to stat type.
        type2mode = {REGTYPE: stat.S_IFREG, SYMTYPE: stat.S_IFLNK,
                     FIFOTYPE: stat.S_IFIFO, CHRTYPE: stat.S_IFCHR,
                     DIRTYPE: stat.S_IFDIR, BLKTYPE: stat.S_IFBLK}
        self._check()

        if members is None:
            members = self
        for tarinfo in members:
            if verbose:
                # Headers may carry None for mode/mtime (e.g. after a
                # filter rewrote them); print placeholders in that case.
                if tarinfo.mode is None:
                    _safe_print("??????????")
                else:
                    modetype = type2mode.get(tarinfo.type, 0)
                    _safe_print(stat.filemode(modetype | tarinfo.mode))
                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                       tarinfo.gname or tarinfo.gid))
                if tarinfo.ischr() or tarinfo.isblk():
                    # Devices show major,minor instead of a size.
                    _safe_print("%10s" %
                                ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
                else:
                    _safe_print("%10d" % tarinfo.size)
                if tarinfo.mtime is None:
                    _safe_print("????-??-?? ??:??:??")
                else:
                    _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
                                % time.localtime(tarinfo.mtime)[:6])

            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))

            if verbose:
                if tarinfo.issym():
                    _safe_print("-> " + tarinfo.linkname)
                if tarinfo.islnk():
                    _safe_print("link to " + tarinfo.linkname)
            print()
2293
2294 def add(self, name, arcname=None, recursive=True, *, filter=None):
2295 """Add the file 'name' to the archive. 'name' may be any type of file
2296 (directory, fifo, symbolic link, etc.). If given, 'arcname'
2297 specifies an alternative name for the file in the archive.
2298 Directories are added recursively by default. This can be avoided by
2299 setting 'recursive' to False. 'filter' is a function
2300 that expects a TarInfo object argument and returns the changed
2301 TarInfo object, if it returns None the TarInfo object will be
2302 excluded from the archive.
2303 """
2304 self._check("awx")
2305
2306 if arcname is None:
2307 arcname = name
2308
2309 # Skip if somebody tries to archive the archive...
2310 if self.name is not None and os.path.abspath(name) == self.name:
2311 self._dbg(2, "tarfile: Skipped %r" % name)
2312 return
2313
2314 self._dbg(1, name)
2315
2316 # Create a TarInfo object from the file.
2317 tarinfo = self.gettarinfo(name, arcname)
2318
2319 if tarinfo is None:
2320 self._dbg(1, "tarfile: Unsupported type %r" % name)
2321 return
2322
2323 # Change or exclude the TarInfo object.
2324 if filter is not None:
2325 tarinfo = filter(tarinfo)
2326 if tarinfo is None:
2327 self._dbg(2, "tarfile: Excluded %r" % name)
2328 return
2329
2330 # Append the tar header and data to the archive.
2331 if tarinfo.isreg():
2332 with bltn_open(name, "rb") as f:
2333 self.addfile(tarinfo, f)
2334
2335 elif tarinfo.isdir():
2336 self.addfile(tarinfo)
2337 if recursive:
2338 for f in sorted(os.listdir(name)):
2339 self.add(os.path.join(name, f), os.path.join(arcname, f),
2340 recursive, filter=filter)
2341
2342 else:
2343 self.addfile(tarinfo)
2344
2345 def addfile(self, tarinfo, fileobj=None):
2346 """Add the TarInfo object 'tarinfo' to the archive. If 'tarinfo' represents
2347 a non zero-size regular file, the 'fileobj' argument should be a binary file,
2348 and tarinfo.size bytes are read from it and added to the archive.
2349 You can create TarInfo objects directly, or by using gettarinfo().
2350 """
2351 self._check("awx")
2352
2353 if fileobj is None and tarinfo.isreg() and tarinfo.size != 0:
2354 raise ValueError("fileobj not provided for non zero-size regular file")
2355
2356 tarinfo = copy.copy(tarinfo)
2357
2358 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2359 self.fileobj.write(buf)
2360 self.offset += len(buf)
2361 bufsize=self.copybufsize
2362 # If there's data to follow, append it.
2363 if fileobj is not None:
2364 copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
2365 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
2366 if remainder > 0:
2367 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2368 blocks += 1
2369 self.offset += blocks * BLOCKSIZE
2370
2371 self.members.append(tarinfo)
2372
2373 def _get_filter_function(self, filter):
2374 if filter is None:
2375 filter = self.extraction_filter
2376 if filter is None:
2377 return data_filter
2378 if isinstance(filter, str):
2379 raise TypeError(
2380 'String names are not supported for '
2381 + 'TarFile.extraction_filter. Use a function such as '
2382 + 'tarfile.data_filter directly.')
2383 return filter
2384 if callable(filter):
2385 return filter
2386 try:
2387 return _NAMED_FILTERS[filter]
2388 except KeyError:
2389 raise ValueError(f"filter {filter!r} not found") from None
2390
    def extractall(self, path=".", members=None, *, numeric_owner=False,
                   filter=None):
        """Extract all members from the archive to the current working
        directory and set owner, modification time and permissions on
        directories afterwards. 'path' specifies a different directory
        to extract to. 'members' is optional and must be a subset of the
        list returned by getmembers(). If 'numeric_owner' is True, only
        the numbers for user/group names are used and not the names.

        The 'filter' function will be called on each member just
        before extraction.
        It can return a changed TarInfo or None to skip the member.
        String names of common filters are accepted.
        """
        directories = []

        filter_function = self._get_filter_function(filter)
        if members is None:
            members = self

        # First pass: extract everything; directory attributes are
        # deferred and fixed up in a second pass below.
        for member in members:
            tarinfo, unfiltered = self._get_extract_tarinfo(
                member, filter_function, path)
            if tarinfo is None:
                continue
            if tarinfo.isdir():
                # For directories, delay setting attributes until later,
                # since permissions can interfere with extraction and
                # extracting contents can reset mtime.
                directories.append(unfiltered)
            self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(),
                              numeric_owner=numeric_owner,
                              filter_function=filter_function)

        # Reverse sort directories.
        # (Deepest paths come first, so children are fixed up before
        # their parents.)
        directories.sort(key=lambda a: a.name, reverse=True)


        # Set correct owner, mtime and filemode on directories.
        for unfiltered in directories:
            try:
                # Need to re-apply any filter, to take the *current* filesystem
                # state into account.
                try:
                    tarinfo = filter_function(unfiltered, path)
                except _FILTER_ERRORS as exc:
                    self._log_no_directory_fixup(unfiltered, repr(exc))
                    continue
                if tarinfo is None:
                    self._log_no_directory_fixup(unfiltered,
                                                 'excluded by filter')
                    continue
                dirpath = os.path.join(path, tarinfo.name)
                try:
                    lstat = os.lstat(dirpath)
                except FileNotFoundError:
                    self._log_no_directory_fixup(tarinfo, 'missing')
                    continue
                if not stat.S_ISDIR(lstat.st_mode):
                    # This is no longer a directory; presumably a later
                    # member overwrote the entry.
                    self._log_no_directory_fixup(tarinfo, 'not a directory')
                    continue
                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                self._handle_nonfatal_error(e)
2459
2460 def _log_no_directory_fixup(self, member, reason):
2461 self._dbg(2, "tarfile: Not fixing up directory %r (%s)" %
2462 (member.name, reason))
2463
2464 def extract(self, member, path="", set_attrs=True, *, numeric_owner=False,
2465 filter=None):
2466 """Extract a member from the archive to the current working directory,
2467 using its full name. Its file information is extracted as accurately
2468 as possible. 'member' may be a filename or a TarInfo object. You can
2469 specify a different directory using 'path'. File attributes (owner,
2470 mtime, mode) are set unless 'set_attrs' is False. If 'numeric_owner'
2471 is True, only the numbers for user/group names are used and not
2472 the names.
2473
2474 The 'filter' function will be called before extraction.
2475 It can return a changed TarInfo or None to skip the member.
2476 String names of common filters are accepted.
2477 """
2478 filter_function = self._get_filter_function(filter)
2479 tarinfo, unfiltered = self._get_extract_tarinfo(
2480 member, filter_function, path)
2481 if tarinfo is not None:
2482 self._extract_one(tarinfo, path, set_attrs, numeric_owner)
2483
    def _get_extract_tarinfo(self, member, filter_function, path):
        """Get (filtered, unfiltered) TarInfos from *member*

        *member* might be a string.

        Return (None, None) if not found.
        """

        if isinstance(member, str):
            unfiltered = self.getmember(member)
        else:
            unfiltered = member

        filtered = None
        try:
            filtered = filter_function(unfiltered, path)
        except (OSError, UnicodeEncodeError, FilterError) as e:
            self._handle_fatal_error(e)
        except ExtractError as e:
            self._handle_nonfatal_error(e)
        if filtered is None:
            # Either the filter returned None (member excluded), or it
            # raised and the error handler above chose not to re-raise.
            self._dbg(2, "tarfile: Excluded %r" % unfiltered.name)
            return None, None

        # Prepare the link target for makelink().
        # Work on a copy: _link_target is per-extraction state that must
        # not leak into the cached member.
        if filtered.islnk():
            filtered = copy.copy(filtered)
            filtered._link_target = os.path.join(path, filtered.linkname)
        return filtered, unfiltered
2513
2514 def _extract_one(self, tarinfo, path, set_attrs, numeric_owner,
2515 filter_function=None):
2516 """Extract from filtered tarinfo to disk.
2517
2518 filter_function is only used when extracting a *different*
2519 member (e.g. as fallback to creating a symlink)
2520 """
2521 self._check("r")
2522
2523 try:
2524 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2525 set_attrs=set_attrs,
2526 numeric_owner=numeric_owner,
2527 filter_function=filter_function,
2528 extraction_root=path)
2529 except (OSError, UnicodeEncodeError) as e:
2530 self._handle_fatal_error(e)
2531 except ExtractError as e:
2532 self._handle_nonfatal_error(e)
2533
2534 def _handle_nonfatal_error(self, e):
2535 """Handle non-fatal error (ExtractError) according to errorlevel"""
2536 if self.errorlevel > 1:
2537 raise
2538 else:
2539 self._dbg(1, "tarfile: %s" % e)
2540
2541 def _handle_fatal_error(self, e):
2542 """Handle "fatal" error according to self.errorlevel"""
2543 if self.errorlevel > 0:
2544 raise
2545 elif isinstance(e, OSError):
2546 if e.filename is None:
2547 self._dbg(1, "tarfile: %s" % e.strerror)
2548 else:
2549 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2550 else:
2551 self._dbg(1, "tarfile: %s %s" % (type(e).__name__, e))
2552
    def extractfile(self, member):
        """Extract a member from the archive as a file object. 'member' may be
        a filename or a TarInfo object. If 'member' is a regular file or
        a link, an io.BufferedReader object is returned. For all other
        existing members, None is returned. If 'member' does not appear
        in the archive, KeyError is raised.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
            # Members with unknown types are treated as regular files.
            # self.fileobject is the file-object factory configured on
            # this TarFile.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None
2584
    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                        numeric_owner=False, *, filter_function=None,
                        extraction_root=None):
        """Extract the filtered TarInfo object tarinfo to a physical
        file called targetpath.

        filter_function is only used when extracting a *different*
        member (e.g. as fallback to creating a symlink)
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs, exist_ok=True)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        # Dispatch on member type to the appropriate make*() method;
        # subclasses may override these to change extraction behavior.
        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink_with_filter(
                tarinfo, targetpath,
                filter_function=filter_function,
                extraction_root=extraction_root)
        elif tarinfo.type not in SUPPORTED_TYPES:
            # Unknown types are extracted as regular files (with a debug
            # message) by makeunknown().
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath, numeric_owner)
            # Symlink members get no mode/mtime fixup here.
            if not tarinfo.issym():
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)
2635
2636 #--------------------------------------------------------------------------
2637 # Below are the different file methods. They are called via
2638 # _extract_member() when extract() is called. They can be replaced in a
2639 # subclass to implement other functionality.
2640
2641 def makedir(self, tarinfo, targetpath):
2642 """Make a directory called targetpath.
2643 """
2644 try:
2645 if tarinfo.mode is None:
2646 # Use the system's default mode
2647 os.mkdir(targetpath)
2648 else:
2649 # Use a safe mode for the directory, the real mode is set
2650 # later in _extract_member().
2651 os.mkdir(targetpath, 0o700)
2652 except FileExistsError:
2653 if not os.path.isdir(targetpath):
2654 raise
2655
2656 def makefile(self, tarinfo, targetpath):
2657 """Make a file called targetpath.
2658 """
2659 source = self.fileobj
2660 source.seek(tarinfo.offset_data)
2661 bufsize = self.copybufsize
2662 with bltn_open(targetpath, "wb") as target:
2663 if tarinfo.sparse is not None:
2664 for offset, size in tarinfo.sparse:
2665 target.seek(offset)
2666 copyfileobj(source, target, size, ReadError, bufsize)
2667 target.seek(tarinfo.size)
2668 target.truncate()
2669 else:
2670 copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
2671
2672 def makeunknown(self, tarinfo, targetpath):
2673 """Make a file from a TarInfo object with an unknown type
2674 at targetpath.
2675 """
2676 self.makefile(tarinfo, targetpath)
2677 self._dbg(1, "tarfile: Unknown file type %r, " \
2678 "extracted as regular file." % tarinfo.type)
2679
2680 def makefifo(self, tarinfo, targetpath):
2681 """Make a fifo called targetpath.
2682 """
2683 if hasattr(os, "mkfifo"):
2684 os.mkfifo(targetpath)
2685 else:
2686 raise ExtractError("fifo not supported by system")
2687
    def makedev(self, tarinfo, targetpath):
        """Make a character or block device called targetpath.

        Requires os.mknod and os.makedev; note that os.mknod for device
        nodes typically requires elevated privileges at the OS level.
        """
        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
            raise ExtractError("special devices not supported by system")

        mode = tarinfo.mode
        if mode is None:
            # Use mknod's default
            mode = 0o600
        # Combine the permission bits with the device-type flag.
        if tarinfo.isblk():
            mode |= stat.S_IFBLK
        else:
            mode |= stat.S_IFCHR

        os.mknod(targetpath, mode,
                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
2705
    def makelink(self, tarinfo, targetpath):
        # Backward-compatible wrapper: delegate to makelink_with_filter()
        # with no filter function and no extraction root.
        return self.makelink_with_filter(tarinfo, targetpath, None, None)
2708
    def makelink_with_filter(self, tarinfo, targetpath,
                             filter_function, extraction_root):
        """Make a (symbolic) link called targetpath. If it cannot be created
        (platform limitation), we try to make a copy of the referenced file
        instead of a link.

        filter_function is only used when extracting a *different*
        member (e.g. as fallback to creating a link).
        """
        keyerror_to_extracterror = False
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                if os.path.lexists(targetpath):
                    # Avoid FileExistsError on following os.symlink.
                    os.unlink(targetpath)
                os.symlink(tarinfo.linkname, targetpath)
                return
            else:
                # Hard link: only possible if the target already exists on
                # disk; otherwise fall through to the extraction fallback.
                if os.path.exists(tarinfo._link_target):
                    if os.path.lexists(targetpath):
                        # Avoid FileExistsError on following os.link.
                        os.unlink(targetpath)
                    os.link(tarinfo._link_target, targetpath)
                    return
        except symlink_exception:
            # Link creation failed (platform limitation): fall back to
            # extracting the link's target member instead. If that member
            # cannot be found, report it as ExtractError below.
            keyerror_to_extracterror = True

        try:
            unfiltered = self._find_link_target(tarinfo)
        except KeyError:
            if keyerror_to_extracterror:
                raise ExtractError(
                    "unable to resolve link inside archive") from None
            else:
                raise

        if filter_function is None:
            filtered = unfiltered
        else:
            if extraction_root is None:
                raise ExtractError(
                    "makelink_with_filter: if filter_function is not None, "
                    + "extraction_root must also not be None")
            try:
                # Re-apply the extraction filter to the fallback member.
                filtered = filter_function(unfiltered, extraction_root)
            except _FILTER_ERRORS as cause:
                raise LinkFallbackError(tarinfo, unfiltered.name) from cause
        if filtered is not None:
            # Extract the (filtered) target member in place of the link.
            self._extract_member(filtered, targetpath,
                                 filter_function=filter_function,
                                 extraction_root=extraction_root)
2761
    def chown(self, tarinfo, targetpath, numeric_owner):
        """Set owner of targetpath according to tarinfo. If numeric_owner
        is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
        is False, fall back to .gid/.uid when the search based on name
        fails.
        """
        if hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            g = tarinfo.gid
            u = tarinfo.uid
            if not numeric_owner:
                try:
                    if grp and tarinfo.gname:
                        g = grp.getgrnam(tarinfo.gname)[2]
                except KeyError:
                    pass
                try:
                    if pwd and tarinfo.uname:
                        u = pwd.getpwnam(tarinfo.uname)[2]
                except KeyError:
                    pass
            # -1 tells (l)chown to leave that ID unchanged.
            if g is None:
                g = -1
            if u is None:
                u = -1
            try:
                if tarinfo.issym() and hasattr(os, "lchown"):
                    # Change the link itself, not the file it points to.
                    os.lchown(targetpath, u, g)
                else:
                    os.chown(targetpath, u, g)
            except (OSError, OverflowError) as e:
                # OverflowError can be raised if an ID doesn't fit in 'id_t'
                raise ExtractError("could not change owner") from e
2795
2796 def chmod(self, tarinfo, targetpath):
2797 """Set file permissions of targetpath according to tarinfo.
2798 """
2799 if tarinfo.mode is None:
2800 return
2801 try:
2802 os.chmod(targetpath, tarinfo.mode)
2803 except OSError as e:
2804 raise ExtractError("could not change mode") from e
2805
2806 def utime(self, tarinfo, targetpath):
2807 """Set modification time of targetpath according to tarinfo.
2808 """
2809 mtime = tarinfo.mtime
2810 if mtime is None:
2811 return
2812 if not hasattr(os, 'utime'):
2813 return
2814 try:
2815 os.utime(targetpath, (mtime, mtime))
2816 except OSError as e:
2817 raise ExtractError("could not change modification time") from e
2818
2819 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        available.

        Raises ReadError if the archive is corrupt or truncated (unless
        ignore_zeros makes the problem skippable).
        """
        self._check("ra")
        # A member may already have been read ahead (e.g. while probing
        # the archive type); hand that one out first.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            if self.offset == 0:
                return None
            # Seek to one byte before the next expected header and read
            # it back to detect unexpected end of data early.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # All-zero block: with ignore_zeros, skip it and keep
                # scanning for further members.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # An invalid header right at the start means this is
                    # not a tar file at all.
                    raise ReadError(str(e)) from None
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file") from None
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e)) from None
            except SubsequentHeaderError as e:
                raise ReadError(str(e)) from None
            except Exception as e:
                # A corrupt compressed stream can surface zlib.error here;
                # translate it to ReadError when zlib is importable,
                # otherwise re-raise the original exception.
                try:
                    import zlib
                    if isinstance(e, zlib.error):
                        raise ReadError(f'zlib error: {e}') from None
                    else:
                        raise e
                except ImportError:
                    raise e
            break

        if tarinfo is not None:
            # if streaming the file we do not want to cache the tarinfo
            if not self.stream:
                self.members.append(tarinfo)
        else:
            self._loaded = True

        return tarinfo
2883
2884 #--------------------------------------------------------------------------
2885 # Little helper methods:
2886
2887 def _getmember(self, name, tarinfo=None, normalize=False):
2888 """Find an archive member by name from bottom to top.
2889 If tarinfo is given, it is used as the starting point.
2890 """
2891 # Ensure that all members have been loaded.
2892 members = self.getmembers()
2893
2894 # Limit the member search list up to tarinfo.
2895 skipping = False
2896 if tarinfo is not None:
2897 try:
2898 index = members.index(tarinfo)
2899 except ValueError:
2900 # The given starting point might be a (modified) copy.
2901 # We'll later skip members until we find an equivalent.
2902 skipping = True
2903 else:
2904 # Happy fast path
2905 members = members[:index]
2906
2907 if normalize:
2908 name = os.path.normpath(name)
2909
2910 for member in reversed(members):
2911 if skipping:
2912 if tarinfo.offset == member.offset:
2913 skipping = False
2914 continue
2915 if normalize:
2916 member_name = os.path.normpath(member.name)
2917 else:
2918 member_name = member.name
2919
2920 if name == member_name:
2921 return member
2922
2923 if skipping:
2924 # Starting point was not found
2925 raise ValueError(tarinfo)
2926
2927 def _load(self):
2928 """Read through the entire archive file and look for readable
2929 members. This should not run if the file is set to stream.
2930 """
2931 if not self.stream:
2932 while self.next() is not None:
2933 pass
2934 self._loaded = True
2935
2936 def _check(self, mode=None):
2937 """Check if TarFile is still open, and if the operation's mode
2938 corresponds to TarFile's mode.
2939 """
2940 if self.closed:
2941 raise OSError("%s is closed" % self.__class__.__name__)
2942 if mode is not None and self.mode not in mode:
2943 raise OSError("bad operation for mode %r" % self.mode)
2944
2945 def _find_link_target(self, tarinfo):
2946 """Find the target member of a symlink or hardlink member in the
2947 archive.
2948 """
2949 if tarinfo.issym():
2950 # Always search the entire archive.
2951 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
2952 limit = None
2953 else:
2954 # Search the archive before the link, because a hard link is
2955 # just a reference to an already archived file.
2956 linkname = tarinfo.linkname
2957 limit = tarinfo
2958
2959 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2960 if member is None:
2961 raise KeyError("linkname %r not found" % linkname)
2962 return member
2963
    def __iter__(self):
        """Provide an iterator object.

        Yields the archive's TarInfo members. If the archive has already
        been fully loaded, iterate the cached member list; otherwise read
        members lazily via next(), marking the archive as loaded once
        next() is exhausted.
        """
        if self._loaded:
            yield from self.members
            return

        # Yield items using TarFile's next() method.
        # When all members have been read, set TarFile as _loaded.
        index = 0
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will have already exhausted the next() method.
        if self.firstmember is not None:
            tarinfo = self.next()
            index += 1
            yield tarinfo

        while True:
            if index < len(self.members):
                # Members cached ahead of this iterator (e.g. by a
                # getmembers() call during iteration) are served from
                # the cache rather than re-read from the file.
                tarinfo = self.members[index]
            elif not self._loaded:
                tarinfo = self.next()
                if not tarinfo:
                    # next() returned None: the archive end was reached.
                    self._loaded = True
                    return
            else:
                return
            index += 1
            yield tarinfo
2994
2995 def _dbg(self, level, msg):
2996 """Write debugging output to sys.stderr.
2997 """
2998 if level <= self.debug:
2999 print(msg, file=sys.stderr)
3000
3001 def __enter__(self):
3002 self._check()
3003 return self
3004
3005 def __exit__(self, type, value, traceback):
3006 if type is None:
3007 self.close()
3008 else:
3009 # An exception occurred. We must not call close() because
3010 # it would try to write end-of-archive blocks and padding.
3011 if not self._extfileobj:
3012 self.fileobj.close()
3013 self.closed = True
3014
3015#--------------------
3016# exported functions
3017#--------------------
3018
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.

    'name' should be a string, file, or file-like object.
    """
    try:
        if hasattr(name, "read"):
            # File-like object: remember the stream position and restore
            # it after the header check so the caller can reuse the file.
            saved_pos = name.tell()
            archive = open(fileobj=name)
            name.seek(saved_pos)
        else:
            archive = open(name)
        archive.close()
        return True
    except TarError:
        # open() raises a TarError subclass for anything it cannot parse.
        return False
3036
# Module-level convenience alias so callers can write tarfile.open(...).
# This shadows the builtin open() inside this module; the builtin was
# saved as bltn_open at the top of the file.
open = TarFile.open
3038
3039
def main():
    """Command-line entry point for the tarfile module.

    Supports exactly one of: --list, --extract, --create, --test
    (enforced via a required mutually exclusive argparse group), plus
    the --verbose and --filter options.
    """
    import argparse

    description = 'A simple command-line interface for tarfile module.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    parser.add_argument('--filter', metavar='<filtername>',
                        choices=_NAMED_FILTERS,
                        help='Filter for extraction')

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-l', '--list', metavar='<tarfile>',
                       help='Show listing of a tarfile')
    group.add_argument('-e', '--extract', nargs='+',
                       metavar=('<tarfile>', '<output_dir>'),
                       help='Extract tarfile into target dir')
    group.add_argument('-c', '--create', nargs='+',
                       metavar=('<name>', '<file>'),
                       help='Create tarfile from sources')
    group.add_argument('-t', '--test', metavar='<tarfile>',
                       help='Test if a tarfile is valid')

    args = parser.parse_args()

    # --filter only makes sense together with --extract.
    if args.filter and args.extract is None:
        parser.exit(1, '--filter is only valid for extraction\n')

    if args.test is not None:
        src = args.test
        if is_tarfile(src):
            with open(src, 'r') as tar:
                # A single getmembers() call both validates the archive
                # (every header is read) and yields the listing to print.
                print(tar.getmembers(), file=sys.stderr)
            if args.verbose:
                print('{!r} is a tar archive.'.format(src))
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.list is not None:
        src = args.list
        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.list(verbose=args.verbose)
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.extract is not None:
        # One positional: extract into the current directory.
        # Two positionals: the second is the target directory.
        if len(args.extract) == 1:
            src = args.extract[0]
            curdir = os.curdir
        elif len(args.extract) == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())

        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.extractall(path=curdir, filter=args.filter)
            if args.verbose:
                if curdir == '.':
                    msg = '{!r} file is extracted.'.format(src)
                else:
                    msg = ('{!r} file is extracted '
                           'into {!r} directory.').format(src, curdir)
                print(msg)
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.create is not None:
        tar_name = args.create.pop(0)
        _, ext = os.path.splitext(tar_name)
        # Map archive-name extensions to tarfile compression suffixes.
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
            # zstd
            '.zst': 'zst',
            '.tzst': 'zst',
        }
        # Unknown extensions fall back to an uncompressed archive.
        compression = compressions.get(ext)
        tar_mode = 'w:' + compression if compression else 'w'
        tar_files = args.create

        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in tar_files:
                tf.add(file_name)

        if args.verbose:
            print('{!r} file created.'.format(tar_name))
3137
# Standard entry-point guard: run the CLI only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()