1#-------------------------------------------------------------------
2# tarfile.py
3#-------------------------------------------------------------------
4# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
5# All rights reserved.
6#
7# Permission is hereby granted, free of charge, to any person
8# obtaining a copy of this software and associated documentation
9# files (the "Software"), to deal in the Software without
10# restriction, including without limitation the rights to use,
11# copy, modify, merge, publish, distribute, sublicense, and/or sell
12# copies of the Software, and to permit persons to whom the
13# Software is furnished to do so, subject to the following
14# conditions:
15#
16# The above copyright notice and this permission notice shall be
17# included in all copies or substantial portions of the Software.
18#
19# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
21# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26# OTHER DEALINGS IN THE SOFTWARE.
27#
28"""Read from and write to tar format archives.
29"""
30
# Module metadata: implementation version string, author and credits.
version     = "0.9.0"
__author__  = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
34
35#---------
36# Imports
37#---------
38from builtins import open as bltn_open
39import sys
40import os
41import io
42import shutil
43import stat
44import time
45import struct
46import copy
47import re
48
49try:
50 import pwd
51except ImportError:
52 pwd = None
53try:
54 import grp
55except ImportError:
56 grp = None
57
58
59def _backportszstd_os_path_realpath_allow_missing(filename):
60 # specific patch versions of Python introduced:
61 # - strict parameter of os.path.realpath
62 # - os.path.ALLOW_MISSING
63 try:
64 return os.path.realpath(filename, strict=os.path.ALLOW_MISSING)
65 except (AttributeError, TypeError):
66 return os.path.realpath(filename)
67
68
# Exceptions that signal "a symlink cannot be created here":
# os.symlink on Windows prior to 6.0 raises NotImplementedError, and
# OSError (winerror=1314) is raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege.
symlink_exception = (AttributeError, NotImplementedError, OSError)

# Names exported by "from tarfile import *".
__all__ = [
    "TarFile",
    "TarInfo",
    "is_tarfile",
    "TarError",
    "ReadError",
    "CompressionError",
    "StreamError",
    "ExtractError",
    "HeaderError",
    "ENCODING",
    "USTAR_FORMAT",
    "GNU_FORMAT",
    "PAX_FORMAT",
    "DEFAULT_FORMAT",
    "open",
    "fully_trusted_filter",
    "data_filter",
    "tar_filter",
    "FilterError",
    "AbsoluteLinkError",
    "OutsideDestinationError",
    "SpecialFileError",
    "AbsolutePathError",
    "LinkOutsideDestinationError",
    "LinkFallbackError",
]
82
83
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# The magic field is 8 bytes wide in both flavours.  The GNU variant is
# "ustar" followed by TWO spaces and a NUL; with a single space the value
# would be only 7 bytes long and would not be a valid GNU magic.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string ("ustar", NUL, "00")

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file (NUL type flag)
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = PAX_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Maps the field name to its converter.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Default encoding for member names: fixed to UTF-8 on Windows ("nt"),
# the filesystem encoding everywhere else.
if os.name == "nt":
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
163
164#---------------------------------------------------------
165# Some useful functions
166#---------------------------------------------------------
167
def stn(s, length, encoding, errors):
    """Encode the string *s* into a NUL-padded bytes field.

    The encoded value is cut off after *length* bytes; a shorter value is
    padded on the right with NUL bytes up to *length*.  Raise ValueError
    when *s* is None.
    """
    if s is None:
        raise ValueError("metadata cannot contain None")
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
175
def nts(s, encoding, errors):
    """Decode a null-terminated bytes object into a string.

    Only the part before the first NUL byte is decoded; without a NUL byte
    the whole value is used.
    """
    head, _sep, _tail = s.partition(b"\0")
    return head.decode(encoding, errors)
183
def nti(s):
    """Convert a number field to a python number.

    Two encodings are understood (see itn()): a leading 0o200 / 0o377 byte
    followed by a big-endian base-256 value (positive / negative), or a
    string of octal digits.  Raise InvalidHeaderError when the octal form
    cannot be parsed.
    """
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Octal text; an all-blank field counts as zero.
        try:
            return int(nts(s, "ascii", "strict").strip() or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    payload = s[1:]
    value = int.from_bytes(payload, "big")
    if marker == 0o377:
        # Negative numbers are stored as 256**len(payload) + value.
        value = -(256 ** len(payload) - value)
    return value
203
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    n is converted with int().  The result is *digits* bytes long:

    * POSIX 1003.1-1988 requires numbers to be encoded as a string of
      octal digits followed by a null-byte; this allows values up to
      (8**(digits-1))-1 and yields a bytes object.
    * GNU tar allows storing numbers greater than that if necessary.  A
      leading 0o200 or 0o377 byte indicates this particular encoding, and
      the following digits-1 bytes are a big-endian base-256
      representation.  A 0o200 byte indicates a positive number, a 0o377
      byte a negative number.  This form is only produced when format is
      GNU_FORMAT and yields a bytearray.

    Raise ValueError("overflow in number field") when n fits neither form.
    """
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        return bytes("%0*o" % (digits - 1, n), "ascii") + NUL

    if format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        marker = 0o200 if n >= 0 else 0o377
        # Reducing modulo 256**(digits-1) yields the two's-complement
        # payload for negative values and leaves positive values untouched.
        payload = (n % 256 ** (digits - 1)).to_bytes(digits - 1, "big")
        return bytearray([marker]) + payload

    raise ValueError("overflow in number field")
233
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All bytes of the header are summed up except for the chksum field,
    which is treated as if it was filled with spaces.  According to the
    GNU tar sources, some tars (Sun and NeXT) calculate chksum with signed
    char, which gives a different result if there are bytes in the buffer
    with the high bit set, so both variants are calculated.
    """
    # Eight spaces standing in for the chksum field: 8 * 0x20 == 256.
    chksum_as_spaces = 256
    # 148 bytes before the chksum field, 8 skipped bytes, 356 bytes after.
    layouts = ("148B8x356B", "148b8x356b")
    return tuple(
        chksum_as_spaces + sum(struct.unpack_from(layout, buf))
        for layout in layouts
    )
246
def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.

    If length is None, the entire content is copied via
    shutil.copyfileobj().  Otherwise the data is moved in pieces of at most
    bufsize bytes (16 KiB when bufsize is falsy), and *exception* is raised
    with "unexpected end of data" as soon as src delivers fewer bytes than
    requested.
    """
    chunk_size = bufsize or 16 * 1024
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst, chunk_size)
        return

    def transfer(count):
        # Move exactly *count* bytes; a short read means truncated input.
        data = src.read(count)
        if len(data) < count:
            raise exception("unexpected end of data")
        dst.write(data)

    full_chunks, tail = divmod(length, chunk_size)
    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail != 0:
        transfer(tail)
    return
271
272def _safe_print(s):
273 encoding = getattr(sys.stdout, 'encoding', None)
274 if encoding is not None:
275 s = s.encode(encoding, 'backslashreplace').decode(encoding)
276 print(s, end=' ')
277
278
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
312
313#---------------------------
314# internal stream interface
315#---------------------------
316class _LowLevelFile:
317 """Low-level file object. Supports reading and writing.
318 It is used instead of a regular file object for streaming
319 access.
320 """
321
322 def __init__(self, name, mode):
323 mode = {
324 "r": os.O_RDONLY,
325 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
326 }[mode]
327 if hasattr(os, "O_BINARY"):
328 mode |= os.O_BINARY
329 self.fd = os.open(name, mode, 0o666)
330
331 def close(self):
332 os.close(self.fd)
333
334 def read(self, size):
335 return os.read(self.fd, size)
336
337 def write(self, s):
338 os.write(self.fd, s)
339
340class _Stream:
341 """Class that serves as an adapter between TarFile and
342 a stream-like object. The stream-like object only
343 needs to have a read() or write() method that works with bytes,
344 and the method is accessed blockwise.
345 Use of gzip or bzip2 compression is possible.
346 A stream-like object could be for example: sys.stdin.buffer,
347 sys.stdout.buffer, a socket, a tape device etc.
348
349 _Stream is intended to be used only internally.
350 """
351
352 def __init__(self, name, mode, comptype, fileobj, bufsize,
353 compresslevel, preset):
354 """Construct a _Stream object.
355 """
356 self._extfileobj = True
357 if fileobj is None:
358 fileobj = _LowLevelFile(name, mode)
359 self._extfileobj = False
360
361 if comptype == '*':
362 # Enable transparent compression detection for the
363 # stream interface
364 fileobj = _StreamProxy(fileobj)
365 comptype = fileobj.getcomptype()
366
367 self.name = os.fspath(name) if name is not None else ""
368 self.mode = mode
369 self.comptype = comptype
370 self.fileobj = fileobj
371 self.bufsize = bufsize
372 self.buf = b""
373 self.pos = 0
374 self.closed = False
375
376 try:
377 if comptype == "gz":
378 try:
379 import zlib
380 except ImportError:
381 raise CompressionError("zlib module is not available") from None
382 self.zlib = zlib
383 self.crc = zlib.crc32(b"")
384 if mode == "r":
385 self.exception = zlib.error
386 self._init_read_gz()
387 else:
388 self._init_write_gz(compresslevel)
389
390 elif comptype == "bz2":
391 try:
392 import bz2
393 except ImportError:
394 raise CompressionError("bz2 module is not available") from None
395 if mode == "r":
396 self.dbuf = b""
397 self.cmp = bz2.BZ2Decompressor()
398 self.exception = OSError
399 else:
400 self.cmp = bz2.BZ2Compressor(compresslevel)
401
402 elif comptype == "xz":
403 try:
404 import lzma
405 except ImportError:
406 raise CompressionError("lzma module is not available") from None
407 if mode == "r":
408 self.dbuf = b""
409 self.cmp = lzma.LZMADecompressor()
410 self.exception = lzma.LZMAError
411 else:
412 self.cmp = lzma.LZMACompressor(preset=preset)
413 elif comptype == "zst":
414 from backports import zstd
415 if mode == "r":
416 self.dbuf = b""
417 self.cmp = zstd.ZstdDecompressor()
418 self.exception = zstd.ZstdError
419 else:
420 self.cmp = zstd.ZstdCompressor()
421 elif comptype != "tar":
422 raise CompressionError("unknown compression type %r" % comptype)
423
424 except:
425 if not self._extfileobj:
426 self.fileobj.close()
427 self.closed = True
428 raise
429
430 def __del__(self):
431 if hasattr(self, "closed") and not self.closed:
432 self.close()
433
434 def _init_write_gz(self, compresslevel):
435 """Initialize for writing with gzip compression.
436 """
437 self.cmp = self.zlib.compressobj(compresslevel,
438 self.zlib.DEFLATED,
439 -self.zlib.MAX_WBITS,
440 self.zlib.DEF_MEM_LEVEL,
441 0)
442 timestamp = struct.pack("<L", int(time.time()))
443 self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
444 if self.name.endswith(".gz"):
445 self.name = self.name[:-3]
446 # Honor "directory components removed" from RFC1952
447 self.name = os.path.basename(self.name)
448 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
449 self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
450
451 def write(self, s):
452 """Write string s to the stream.
453 """
454 if self.comptype == "gz":
455 self.crc = self.zlib.crc32(s, self.crc)
456 self.pos += len(s)
457 if self.comptype != "tar":
458 s = self.cmp.compress(s)
459 self.__write(s)
460
461 def __write(self, s):
462 """Write string s to the stream if a whole new block
463 is ready to be written.
464 """
465 self.buf += s
466 while len(self.buf) > self.bufsize:
467 self.fileobj.write(self.buf[:self.bufsize])
468 self.buf = self.buf[self.bufsize:]
469
470 def close(self):
471 """Close the _Stream object. No operation should be
472 done on it afterwards.
473 """
474 if self.closed:
475 return
476
477 self.closed = True
478 try:
479 if self.mode == "w" and self.comptype != "tar":
480 self.buf += self.cmp.flush()
481
482 if self.mode == "w" and self.buf:
483 self.fileobj.write(self.buf)
484 self.buf = b""
485 if self.comptype == "gz":
486 self.fileobj.write(struct.pack("<L", self.crc))
487 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
488 finally:
489 if not self._extfileobj:
490 self.fileobj.close()
491
492 def _init_read_gz(self):
493 """Initialize for reading a gzip compressed fileobj.
494 """
495 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
496 self.dbuf = b""
497
498 # taken from gzip.GzipFile with some alterations
499 if self.__read(2) != b"\037\213":
500 raise ReadError("not a gzip file")
501 if self.__read(1) != b"\010":
502 raise CompressionError("unsupported compression method")
503
504 flag = ord(self.__read(1))
505 self.__read(6)
506
507 if flag & 4:
508 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
509 self.read(xlen)
510 if flag & 8:
511 while True:
512 s = self.__read(1)
513 if not s or s == NUL:
514 break
515 if flag & 16:
516 while True:
517 s = self.__read(1)
518 if not s or s == NUL:
519 break
520 if flag & 2:
521 self.__read(2)
522
523 def tell(self):
524 """Return the stream's file pointer position.
525 """
526 return self.pos
527
528 def seek(self, pos=0):
529 """Set the stream's file pointer to pos. Negative seeking
530 is forbidden.
531 """
532 if pos - self.pos >= 0:
533 blocks, remainder = divmod(pos - self.pos, self.bufsize)
534 for i in range(blocks):
535 self.read(self.bufsize)
536 self.read(remainder)
537 else:
538 raise StreamError("seeking backwards is not allowed")
539 return self.pos
540
541 def read(self, size):
542 """Return the next size number of bytes from the stream."""
543 assert size is not None
544 buf = self._read(size)
545 self.pos += len(buf)
546 return buf
547
548 def _read(self, size):
549 """Return size bytes from the stream.
550 """
551 if self.comptype == "tar":
552 return self.__read(size)
553
554 c = len(self.dbuf)
555 t = [self.dbuf]
556 while c < size:
557 # Skip underlying buffer to avoid unaligned double buffering.
558 if self.buf:
559 buf = self.buf
560 self.buf = b""
561 else:
562 buf = self.fileobj.read(self.bufsize)
563 if not buf:
564 break
565 try:
566 buf = self.cmp.decompress(buf)
567 except self.exception as e:
568 raise ReadError("invalid compressed data") from e
569 t.append(buf)
570 c += len(buf)
571 t = b"".join(t)
572 self.dbuf = t[size:]
573 return t[:size]
574
575 def __read(self, size):
576 """Return size bytes from stream. If internal buffer is empty,
577 read another block from the stream.
578 """
579 c = len(self.buf)
580 t = [self.buf]
581 while c < size:
582 buf = self.fileobj.read(self.bufsize)
583 if not buf:
584 break
585 t.append(buf)
586 c += len(buf)
587 t = b"".join(t)
588 self.buf = t[size:]
589 return t[:size]
590# class _Stream
591
592class _StreamProxy(object):
593 """Small proxy class that enables transparent compression
594 detection for the Stream interface (mode 'r|*').
595 """
596
597 def __init__(self, fileobj):
598 self.fileobj = fileobj
599 self.buf = self.fileobj.read(BLOCKSIZE)
600
601 def read(self, size):
602 self.read = self.fileobj.read
603 return self.buf
604
605 def getcomptype(self):
606 if self.buf.startswith(b"\x1f\x8b\x08"):
607 return "gz"
608 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
609 return "bz2"
610 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
611 return "xz"
612 elif self.buf.startswith(b"\x28\xb5\x2f\xfd"):
613 return "zst"
614 else:
615 return "tar"
616
617 def close(self):
618 self.fileobj.close()
619# class StreamProxy
620
621#------------------------
622# Extraction file object
623#------------------------
624class _FileInFile(object):
625 """A thin wrapper around an existing file object that
626 provides a part of its data as an individual file
627 object.
628 """
629
630 def __init__(self, fileobj, offset, size, name, blockinfo=None):
631 self.fileobj = fileobj
632 self.offset = offset
633 self.size = size
634 self.position = 0
635 self.name = name
636 self.closed = False
637
638 if blockinfo is None:
639 blockinfo = [(0, size)]
640
641 # Construct a map with data and zero blocks.
642 self.map_index = 0
643 self.map = []
644 lastpos = 0
645 realpos = self.offset
646 for offset, size in blockinfo:
647 if offset > lastpos:
648 self.map.append((False, lastpos, offset, None))
649 self.map.append((True, offset, offset + size, realpos))
650 realpos += size
651 lastpos = offset + size
652 if lastpos < self.size:
653 self.map.append((False, lastpos, self.size, None))
654
655 def flush(self):
656 pass
657
658 @property
659 def mode(self):
660 return 'rb'
661
662 def readable(self):
663 return True
664
665 def writable(self):
666 return False
667
668 def seekable(self):
669 return self.fileobj.seekable()
670
671 def tell(self):
672 """Return the current file position.
673 """
674 return self.position
675
676 def seek(self, position, whence=io.SEEK_SET):
677 """Seek to a position in the file.
678 """
679 if whence == io.SEEK_SET:
680 self.position = min(max(position, 0), self.size)
681 elif whence == io.SEEK_CUR:
682 if position < 0:
683 self.position = max(self.position + position, 0)
684 else:
685 self.position = min(self.position + position, self.size)
686 elif whence == io.SEEK_END:
687 self.position = max(min(self.size + position, self.size), 0)
688 else:
689 raise ValueError("Invalid argument")
690 return self.position
691
692 def read(self, size=None):
693 """Read data from the file.
694 """
695 if size is None:
696 size = self.size - self.position
697 else:
698 size = min(size, self.size - self.position)
699
700 buf = b""
701 while size > 0:
702 while True:
703 data, start, stop, offset = self.map[self.map_index]
704 if start <= self.position < stop:
705 break
706 else:
707 self.map_index += 1
708 if self.map_index == len(self.map):
709 self.map_index = 0
710 length = min(size, stop - self.position)
711 if data:
712 self.fileobj.seek(offset + (self.position - start))
713 b = self.fileobj.read(length)
714 if len(b) != length:
715 raise ReadError("unexpected end of data")
716 buf += b
717 else:
718 buf += NUL * length
719 size -= length
720 self.position += length
721 return buf
722
723 def readinto(self, b):
724 buf = self.read(len(b))
725 b[:len(buf)] = buf
726 return len(buf)
727
728 def close(self):
729 self.closed = True
730#class _FileInFile
731
class ExFileObject(io.BufferedReader):
    """Buffered reader over the data of a single archive member."""

    def __init__(self, tarfile, tarinfo):
        # Expose only the member's data region of the archive's file object.
        member_view = _FileInFile(
            tarfile.fileobj,
            tarinfo.offset_data,
            tarinfo.size,
            tarinfo.name,
            tarinfo.sparse,
        )
        super().__init__(member_view)
#class ExFileObject
739
740
741#-----------------------------
742# extraction filters (PEP 706)
743#-----------------------------
744
class FilterError(TarError):
    """Base class for errors raised by the extraction filters."""


class AbsolutePathError(FilterError):
    """The member name is an absolute path."""

    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        message = f'member {tarinfo.name!r} has an absolute path'
        super().__init__(message)


class OutsideDestinationError(FilterError):
    """The member would be extracted outside the destination."""

    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        message = (f'{tarinfo.name!r} would be extracted to {path!r}, '
                   'which is outside the destination')
        super().__init__(message)


class SpecialFileError(FilterError):
    """The member is a special file."""

    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        message = f'{tarinfo.name!r} is a special file'
        super().__init__(message)


class AbsoluteLinkError(FilterError):
    """The member is a link to an absolute path."""

    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        message = f'{tarinfo.name!r} is a link to an absolute path'
        super().__init__(message)


class LinkOutsideDestinationError(FilterError):
    """The member would link to a path outside the destination."""

    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        message = (f'{tarinfo.name!r} would link to {path!r}, '
                   'which is outside the destination')
        super().__init__(message)


class LinkFallbackError(FilterError):
    """The copy made in place of a link was rejected."""

    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        message = (f'link {tarinfo.name!r} would be extracted as a '
                   f'copy of {path!r}, which was rejected')
        super().__init__(message)


# Errors caused by filters -- both "fatal" and "non-fatal" -- that
# we consider to be issues with the argument, rather than a bug in the
# filter function
_FILTER_ERRORS = (FilterError, OSError, ExtractError)
788
def _get_filtered_attrs(member, dest_path, for_data=True):
    """Return the attribute changes a filter has to apply to *member*.

    member    -- the TarInfo-like object to inspect (it is not modified).
    dest_path -- the extraction destination; it is resolved with realpath.
    for_data  -- when true, additionally apply the stricter rules used by
                 data_filter (file type, ownership and link checks).

    The result is a dict mapping attribute names to their new values; it
    is empty when nothing has to change.

    Raises AbsolutePathError, OutsideDestinationError, SpecialFileError,
    AbsoluteLinkError or LinkOutsideDestinationError when the member is
    rejected.
    """
    new_attrs = {}
    name = member.name
    dest_path = _backportszstd_os_path_realpath_allow_missing(dest_path)
    # Strip leading / (tar's directory separator) from filenames.
    # Include os.sep (target OS directory separator) as well.
    if name.startswith(('/', os.sep)):
        name = new_attrs['name'] = member.path.lstrip('/' + os.sep)
    if os.path.isabs(name):
        # Path is absolute even after stripping.
        # For example, 'C:/foo' on Windows.
        raise AbsolutePathError(member)
    # Ensure we stay in the destination
    target_path = _backportszstd_os_path_realpath_allow_missing(os.path.join(dest_path, name))
    if os.path.commonpath([target_path, dest_path]) != dest_path:
        raise OutsideDestinationError(member, target_path)
    # Limit permissions (no high bits, and go-w)
    mode = member.mode
    if mode is not None:
        # Strip high bits & group/other write bits
        mode = mode & 0o755
        if for_data:
            # For data, handle permissions & file types
            if member.isreg() or member.islnk():
                if not mode & 0o100:
                    # Clear executable bits if not executable by user
                    mode &= ~0o111
                # Ensure owner can read & write
                mode |= 0o600
            elif member.isdir() or member.issym():
                # Ignore mode for directories & symlinks
                mode = None
            else:
                # Reject special files
                raise SpecialFileError(member)
        # Only report the mode when filtering actually changed it.
        if mode != member.mode:
            new_attrs['mode'] = mode
    if for_data:
        # Ignore ownership for 'data'
        if member.uid is not None:
            new_attrs['uid'] = None
        if member.gid is not None:
            new_attrs['gid'] = None
        if member.uname is not None:
            new_attrs['uname'] = None
        if member.gname is not None:
            new_attrs['gname'] = None
        # Check link destination for 'data'
        if member.islnk() or member.issym():
            if os.path.isabs(member.linkname):
                raise AbsoluteLinkError(member)
            normalized = os.path.normpath(member.linkname)
            if normalized != member.linkname:
                new_attrs['linkname'] = normalized
            if member.issym():
                # A symlink target is joined onto the directory that
                # contains the link itself.
                target_path = os.path.join(dest_path,
                                           os.path.dirname(name),
                                           member.linkname)
            else:
                # Any other link target is joined directly onto the
                # destination.
                target_path = os.path.join(dest_path,
                                           member.linkname)
            target_path = _backportszstd_os_path_realpath_allow_missing(target_path)
            if os.path.commonpath([target_path, dest_path]) != dest_path:
                raise LinkOutsideDestinationError(member, target_path)
    return new_attrs
854
def fully_trusted_filter(member, dest_path):
    """Extraction filter that returns *member* unchanged.

    dest_path is accepted for signature compatibility with the other
    filters and is not used.
    """
    return member
857
def tar_filter(member, dest_path):
    """Extraction filter applying the checks of _get_filtered_attrs()
    without the data-specific rules (for_data=False).

    Returns *member* itself when nothing has to change, otherwise a shallow
    copy with the filtered attributes replaced.
    """
    changes = _get_filtered_attrs(member, dest_path, False)
    if not changes:
        return member
    return member.replace(**changes, deep=False)
863
def data_filter(member, dest_path):
    """Extraction filter applying the checks of _get_filtered_attrs()
    including the data-specific rules (for_data=True).

    Returns *member* itself when nothing has to change, otherwise a shallow
    copy with the filtered attributes replaced.
    """
    changes = _get_filtered_attrs(member, dest_path, True)
    if not changes:
        return member
    return member.replace(**changes, deep=False)
869
# Maps a filter name to the filter function implementing it.
_NAMED_FILTERS = {
    "fully_trusted": fully_trusted_filter,
    "tar": tar_filter,
    "data": data_filter,
}

#------------------
# Exported Classes
#------------------

# Sentinel for replace() defaults, meaning "don't change the attribute"
_KEEP = object()

# Header length is digits followed by a space.
# The pattern accepts between 1 and 20 decimal digits.
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")
885
886class TarInfo(object):
887 """Informational class which holds the details about an
888 archive member given by a tar header block.
889 TarInfo objects are returned by TarFile.getmember(),
890 TarFile.getmembers() and TarFile.gettarinfo() and are
891 usually created internally.
892 """
893
894 __slots__ = dict(
895 name = 'Name of the archive member.',
896 mode = 'Permission bits.',
897 uid = 'User ID of the user who originally stored this member.',
898 gid = 'Group ID of the user who originally stored this member.',
899 size = 'Size in bytes.',
900 mtime = 'Time of last modification.',
901 chksum = 'Header checksum.',
902 type = ('File type. type is usually one of these constants: '
903 'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, '
904 'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'),
905 linkname = ('Name of the target file name, which is only present '
906 'in TarInfo objects of type LNKTYPE and SYMTYPE.'),
907 uname = 'User name.',
908 gname = 'Group name.',
909 devmajor = 'Device major number.',
910 devminor = 'Device minor number.',
911 offset = 'The tar header starts here.',
912 offset_data = "The file's data starts here.",
913 pax_headers = ('A dictionary containing key-value pairs of an '
914 'associated pax extended header.'),
915 sparse = 'Sparse member information.',
916 _tarfile = None,
917 _sparse_structs = None,
918 _link_target = None,
919 )
920
921 def __init__(self, name=""):
922 """Construct a TarInfo object. name is the optional name
923 of the member.
924 """
925 self.name = name # member name
926 self.mode = 0o644 # file permissions
927 self.uid = 0 # user id
928 self.gid = 0 # group id
929 self.size = 0 # file size
930 self.mtime = 0 # modification time
931 self.chksum = 0 # header checksum
932 self.type = REGTYPE # member type
933 self.linkname = "" # link name
934 self.uname = "" # user name
935 self.gname = "" # group name
936 self.devmajor = 0 # device major number
937 self.devminor = 0 # device minor number
938
939 self.offset = 0 # the tar header starts here
940 self.offset_data = 0 # the file's data starts here
941
942 self.sparse = None # sparse member information
943 self.pax_headers = {} # pax header information
944
945 @property
946 def tarfile(self):
947 import warnings
948 warnings.warn(
949 'The undocumented "tarfile" attribute of TarInfo objects '
950 + 'is deprecated and will be removed in Python 3.16',
951 DeprecationWarning, stacklevel=2)
952 return self._tarfile
953
954 @tarfile.setter
955 def tarfile(self, tarfile):
956 import warnings
957 warnings.warn(
958 'The undocumented "tarfile" attribute of TarInfo objects '
959 + 'is deprecated and will be removed in Python 3.16',
960 DeprecationWarning, stacklevel=2)
961 self._tarfile = tarfile
962
    @property
    def path(self):
        """In pax headers, "name" is called "path".

        Reading and writing this property is the same as using ``name``.
        """
        return self.name

    @path.setter
    def path(self, name):
        self.name = name

    @property
    def linkpath(self):
        """In pax headers, "linkname" is called "linkpath".

        Reading and writing this property is the same as using
        ``linkname``.
        """
        return self.linkname

    @linkpath.setter
    def linkpath(self, linkname):
        self.linkname = linkname
980
981 def __repr__(self):
982 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
983
984 def replace(self, *,
985 name=_KEEP, mtime=_KEEP, mode=_KEEP, linkname=_KEEP,
986 uid=_KEEP, gid=_KEEP, uname=_KEEP, gname=_KEEP,
987 deep=True, _KEEP=_KEEP):
988 """Return a deep copy of self with the given attributes replaced.
989 """
990 if deep:
991 result = copy.deepcopy(self)
992 else:
993 result = copy.copy(self)
994 if name is not _KEEP:
995 result.name = name
996 if mtime is not _KEEP:
997 result.mtime = mtime
998 if mode is not _KEEP:
999 result.mode = mode
1000 if linkname is not _KEEP:
1001 result.linkname = linkname
1002 if uid is not _KEEP:
1003 result.uid = uid
1004 if gid is not _KEEP:
1005 result.gid = gid
1006 if uname is not _KEEP:
1007 result.uname = uname
1008 if gname is not _KEEP:
1009 result.gname = gname
1010 return result
1011
1012 def get_info(self):
1013 """Return the TarInfo's attributes as a dictionary.
1014 """
1015 if self.mode is None:
1016 mode = None
1017 else:
1018 mode = self.mode & 0o7777
1019 info = {
1020 "name": self.name,
1021 "mode": mode,
1022 "uid": self.uid,
1023 "gid": self.gid,
1024 "size": self.size,
1025 "mtime": self.mtime,
1026 "chksum": self.chksum,
1027 "type": self.type,
1028 "linkname": self.linkname,
1029 "uname": self.uname,
1030 "gname": self.gname,
1031 "devmajor": self.devmajor,
1032 "devminor": self.devminor
1033 }
1034
1035 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1036 info["name"] += "/"
1037
1038 return info
1039
1040 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
1041 """Return a tar header as a string of 512 byte blocks.
1042 """
1043 info = self.get_info()
1044 for name, value in info.items():
1045 if value is None:
1046 raise ValueError("%s may not be None" % name)
1047
1048 if format == USTAR_FORMAT:
1049 return self.create_ustar_header(info, encoding, errors)
1050 elif format == GNU_FORMAT:
1051 return self.create_gnu_header(info, encoding, errors)
1052 elif format == PAX_FORMAT:
1053 return self.create_pax_header(info, encoding)
1054 else:
1055 raise ValueError("invalid format")
1056
1057 def create_ustar_header(self, info, encoding, errors):
1058 """Return the object as a ustar header block.
1059 """
1060 info["magic"] = POSIX_MAGIC
1061
1062 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
1063 raise ValueError("linkname is too long")
1064
1065 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
1066 info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)
1067
1068 return self._create_header(info, USTAR_FORMAT, encoding, errors)
1069
1070 def create_gnu_header(self, info, encoding, errors):
1071 """Return the object as a GNU header block sequence.
1072 """
1073 info["magic"] = GNU_MAGIC
1074
1075 buf = b""
1076 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
1077 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
1078
1079 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
1080 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
1081
1082 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
1083
1084 def create_pax_header(self, info, encoding):
1085 """Return the object as a ustar header block. If it cannot be
1086 represented this way, prepend a pax extended header sequence
1087 with supplement information.
1088 """
1089 info["magic"] = POSIX_MAGIC
1090 pax_headers = self.pax_headers.copy()
1091
1092 # Test string fields for values that exceed the field length or cannot
1093 # be represented in ASCII encoding.
1094 for name, hname, length in (
1095 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
1096 ("uname", "uname", 32), ("gname", "gname", 32)):
1097
1098 if hname in pax_headers:
1099 # The pax header has priority.
1100 continue
1101
1102 # Try to encode the string as ASCII.
1103 try:
1104 info[name].encode("ascii", "strict")
1105 except UnicodeEncodeError:
1106 pax_headers[hname] = info[name]
1107 continue
1108
1109 if len(info[name]) > length:
1110 pax_headers[hname] = info[name]
1111
1112 # Test number fields for values that exceed the field limit or values
1113 # that like to be stored as float.
1114 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1115 needs_pax = False
1116
1117 val = info[name]
1118 val_is_float = isinstance(val, float)
1119 val_int = round(val) if val_is_float else val
1120 if not 0 <= val_int < 8 ** (digits - 1):
1121 # Avoid overflow.
1122 info[name] = 0
1123 needs_pax = True
1124 elif val_is_float:
1125 # Put rounded value in ustar header, and full
1126 # precision value in pax header.
1127 info[name] = val_int
1128 needs_pax = True
1129
1130 # The existing pax header has priority.
1131 if needs_pax and name not in pax_headers:
1132 pax_headers[name] = str(val)
1133
1134 # Create a pax extended header if necessary.
1135 if pax_headers:
1136 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
1137 else:
1138 buf = b""
1139
1140 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
1141
1142 @classmethod
1143 def create_pax_global_header(cls, pax_headers):
1144 """Return the object as a pax global header block sequence.
1145 """
1146 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1147
1148 def _posix_split_name(self, name, encoding, errors):
1149 """Split a name longer than 100 chars into a prefix
1150 and a name part.
1151 """
1152 components = name.split("/")
1153 for i in range(1, len(components)):
1154 prefix = "/".join(components[:i])
1155 name = "/".join(components[i:])
1156 if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
1157 len(name.encode(encoding, errors)) <= LENGTH_NAME:
1158 break
1159 else:
1160 raise ValueError("name is too long")
1161
1162 return prefix, name
1163
1164 @staticmethod
1165 def _create_header(info, format, encoding, errors):
1166 """Return a header block. info is a dictionary with file
1167 information, format must be one of the *_FORMAT constants.
1168 """
1169 has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE)
1170 if has_device_fields:
1171 devmajor = itn(info.get("devmajor", 0), 8, format)
1172 devminor = itn(info.get("devminor", 0), 8, format)
1173 else:
1174 devmajor = stn("", 8, encoding, errors)
1175 devminor = stn("", 8, encoding, errors)
1176
1177 # None values in metadata should cause ValueError.
1178 # itn()/stn() do this for all fields except type.
1179 filetype = info.get("type", REGTYPE)
1180 if filetype is None:
1181 raise ValueError("TarInfo.type must not be None")
1182
1183 parts = [
1184 stn(info.get("name", ""), 100, encoding, errors),
1185 itn(info.get("mode", 0) & 0o7777, 8, format),
1186 itn(info.get("uid", 0), 8, format),
1187 itn(info.get("gid", 0), 8, format),
1188 itn(info.get("size", 0), 12, format),
1189 itn(info.get("mtime", 0), 12, format),
1190 b" ", # checksum field
1191 filetype,
1192 stn(info.get("linkname", ""), 100, encoding, errors),
1193 info.get("magic", POSIX_MAGIC),
1194 stn(info.get("uname", ""), 32, encoding, errors),
1195 stn(info.get("gname", ""), 32, encoding, errors),
1196 devmajor,
1197 devminor,
1198 stn(info.get("prefix", ""), 155, encoding, errors)
1199 ]
1200
1201 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
1202 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
1203 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
1204 return buf
1205
1206 @staticmethod
1207 def _create_payload(payload):
1208 """Return the string payload filled with zero bytes
1209 up to the next 512 byte border.
1210 """
1211 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1212 if remainder > 0:
1213 payload += (BLOCKSIZE - remainder) * NUL
1214 return payload
1215
1216 @classmethod
1217 def _create_gnu_long_header(cls, name, type, encoding, errors):
1218 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1219 for name.
1220 """
1221 name = name.encode(encoding, errors) + NUL
1222
1223 info = {}
1224 info["name"] = "././@LongLink"
1225 info["type"] = type
1226 info["size"] = len(name)
1227 info["magic"] = GNU_MAGIC
1228
1229 # create extended header + name blocks.
1230 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
1231 cls._create_payload(name)
1232
1233 @classmethod
1234 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1235 """Return a POSIX.1-2008 extended or global header sequence
1236 that contains a list of keyword, value pairs. The values
1237 must be strings.
1238 """
1239 # Check if one of the fields contains surrogate characters and thereby
1240 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1241 binary = False
1242 for keyword, value in pax_headers.items():
1243 try:
1244 value.encode("utf-8", "strict")
1245 except UnicodeEncodeError:
1246 binary = True
1247 break
1248
1249 records = b""
1250 if binary:
1251 # Put the hdrcharset field at the beginning of the header.
1252 records += b"21 hdrcharset=BINARY\n"
1253
1254 for keyword, value in pax_headers.items():
1255 keyword = keyword.encode("utf-8")
1256 if binary:
1257 # Try to restore the original byte representation of 'value'.
1258 # Needless to say, that the encoding must match the string.
1259 value = value.encode(encoding, "surrogateescape")
1260 else:
1261 value = value.encode("utf-8")
1262
1263 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1264 n = p = 0
1265 while True:
1266 n = l + len(str(p))
1267 if n == p:
1268 break
1269 p = n
1270 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
1271
1272 # We use a hardcoded "././@PaxHeader" name like star does
1273 # instead of the one that POSIX recommends.
1274 info = {}
1275 info["name"] = "././@PaxHeader"
1276 info["type"] = type
1277 info["size"] = len(records)
1278 info["magic"] = POSIX_MAGIC
1279
1280 # Create pax header + record blocks.
1281 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
1282 cls._create_payload(records)
1283
1284 @classmethod
1285 def frombuf(cls, buf, encoding, errors):
1286 """Construct a TarInfo object from a 512 byte bytes object.
1287
1288 To support the old v7 tar format AREGTYPE headers are
1289 transformed to DIRTYPE headers if their name ends in '/'.
1290 """
1291 return cls._frombuf(buf, encoding, errors)
1292
    @classmethod
    def _frombuf(cls, buf, encoding, errors, *, dircheck=True):
        """Construct a TarInfo object from a 512 byte bytes object.

        If ``dircheck`` is set to ``True`` then ``AREGTYPE`` headers will
        be normalized to ``DIRTYPE`` if the name ends in a trailing slash.
        ``dircheck`` must be set to ``False`` if this function is called
        on a follow-up header such as ``GNUTYPE_LONGNAME``.

        String fields are decoded with nts() using ``encoding`` and
        ``errors``; number fields are decoded with nti().

        Raises EmptyHeaderError for an empty buffer, TruncatedHeaderError
        when the buffer is not exactly BLOCKSIZE bytes long,
        EOFHeaderError when the block consists only of NUL bytes and
        InvalidHeaderError when the stored checksum is not among the
        values calc_chksums() computes for the block.
        """
        # Validate the block before touching any field.
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        # The checksum field occupies bytes 148-155.
        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Fixed-offset fields. Bytes 257-264 (the magic written by
        # _create_header()) are not read here.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if dircheck and obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                # Each structure is a 12 byte offset followed by a
                # 12 byte length; stop at the first undecodable entry.
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            # Byte 482 flags that extension blocks follow; bytes 483-494
            # hold the original (expanded) file size.
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj
1360
1361 @classmethod
1362 def fromtarfile(cls, tarfile):
1363 """Return the next TarInfo object from TarFile object
1364 tarfile.
1365 """
1366 return cls._fromtarfile(tarfile)
1367
1368 @classmethod
1369 def _fromtarfile(cls, tarfile, *, dircheck=True):
1370 """
1371 See dircheck documentation in _frombuf().
1372 """
1373 buf = tarfile.fileobj.read(BLOCKSIZE)
1374 obj = cls._frombuf(buf, tarfile.encoding, tarfile.errors, dircheck=dircheck)
1375 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1376 return obj._proc_member(tarfile)
1377
1378 #--------------------------------------------------------------------------
1379 # The following are methods that are called depending on the type of a
1380 # member. The entry point is _proc_member() which can be overridden in a
1381 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1382 # implement the following
1383 # operations:
1384 # 1. Set self.offset_data to the position where the data blocks begin,
1385 # if there is data that follows.
1386 # 2. Set tarfile.offset to the position where the next member's header will
1387 # begin.
1388 # 3. Return self or another valid TarInfo object.
1389 def _proc_member(self, tarfile):
1390 """Choose the right processing method depending on
1391 the type and call it.
1392 """
1393 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1394 return self._proc_gnulong(tarfile)
1395 elif self.type == GNUTYPE_SPARSE:
1396 return self._proc_sparse(tarfile)
1397 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1398 return self._proc_pax(tarfile)
1399 else:
1400 return self._proc_builtin(tarfile)
1401
1402 def _proc_builtin(self, tarfile):
1403 """Process a builtin type or an unknown type which
1404 will be treated as a regular file.
1405 """
1406 self.offset_data = tarfile.fileobj.tell()
1407 offset = self.offset_data
1408 if self.isreg() or self.type not in SUPPORTED_TYPES:
1409 # Skip the following data blocks.
1410 offset += self._block(self.size)
1411 tarfile.offset = offset
1412
1413 # Patch the TarInfo object with saved global
1414 # header information.
1415 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1416
1417 # Remove redundant slashes from directories. This is to be consistent
1418 # with frombuf().
1419 if self.isdir():
1420 self.name = self.name.rstrip("/")
1421
1422 return self
1423
1424 def _proc_gnulong(self, tarfile):
1425 """Process the blocks that hold a GNU longname
1426 or longlink member.
1427 """
1428 buf = tarfile.fileobj.read(self._block(self.size))
1429
1430 # Fetch the next header and process it.
1431 try:
1432 next = self._fromtarfile(tarfile, dircheck=False)
1433 except HeaderError as e:
1434 raise SubsequentHeaderError(str(e)) from None
1435
1436 # Patch the TarInfo object from the next header with
1437 # the longname information.
1438 next.offset = self.offset
1439 if self.type == GNUTYPE_LONGNAME:
1440 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1441 elif self.type == GNUTYPE_LONGLINK:
1442 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1443
1444 # Remove redundant slashes from directories. This is to be consistent
1445 # with frombuf().
1446 if next.isdir():
1447 next.name = next.name.removesuffix("/")
1448
1449 return next
1450
1451 def _proc_sparse(self, tarfile):
1452 """Process a GNU sparse header plus extra headers.
1453 """
1454 # We already collected some sparse structures in frombuf().
1455 structs, isextended, origsize = self._sparse_structs
1456 del self._sparse_structs
1457
1458 # Collect sparse structures from extended header blocks.
1459 while isextended:
1460 buf = tarfile.fileobj.read(BLOCKSIZE)
1461 pos = 0
1462 for i in range(21):
1463 try:
1464 offset = nti(buf[pos:pos + 12])
1465 numbytes = nti(buf[pos + 12:pos + 24])
1466 except ValueError:
1467 break
1468 if offset and numbytes:
1469 structs.append((offset, numbytes))
1470 pos += 24
1471 isextended = bool(buf[504])
1472 self.sparse = structs
1473
1474 self.offset_data = tarfile.fileobj.tell()
1475 tarfile.offset = self.offset_data + self._block(self.size)
1476 self.size = origsize
1477 return self
1478
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.

           Reads the record blocks of this header, parses them into
           keyword/value pairs, then reads the member that follows and
           returns it. For an extended header (XHDTYPE or
           SOLARIS_XHDTYPE) the parsed values are applied to that member;
           for a global header (XGLTYPE) they are merged into
           tarfile.pax_headers instead.

           Raises InvalidHeaderError for a malformed record and
           SubsequentHeaderError when the following header cannot be
           read.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            # Global: update the archive-wide dictionary in place.
            pax_headers = tarfile.pax_headers
        else:
            # Extended: work on a private copy seeded with the globals.
            pax_headers = tarfile.pax_headers.copy()

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline.
        pos = 0
        encoding = None
        raw_headers = []
        # Stop at the end of the data or at the first NUL (padding).
        while len(buf) > pos and buf[pos] != 0x00:
            if not (match := _header_length_prefix_re.match(buf, pos)):
                raise InvalidHeaderError("invalid header")
            try:
                length = int(match.group(1))
            except ValueError:
                raise InvalidHeaderError("invalid header")
            # Headers must be at least 5 bytes, shortest being '5 x=\n'.
            # Value is allowed to be empty.
            if length < 5:
                raise InvalidHeaderError("invalid header")
            # The record must not extend past the data that was read.
            if pos + length > len(buf):
                raise InvalidHeaderError("invalid header")

            header_value_end_offset = match.start(1) + length - 1  # Last byte of the header
            # Skip the separator after the length digits and stop before
            # the final byte of the record.
            keyword_and_value = buf[match.end(1) + 1:header_value_end_offset]
            raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")

            # Check the framing of the header. The last character must be '\n' (0x0A)
            if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A:
                raise InvalidHeaderError("invalid header")
            raw_headers.append((length, raw_keyword, raw_value))

            # Check if the pax header contains a hdrcharset field. This tells us
            # the encoding of the path, linkpath, uname and gname fields. Normally,
            # these fields are UTF-8 encoded but since POSIX.1-2008 tar
            # implementations are allowed to store them as raw binary strings if
            # the translation to UTF-8 fails. For the time being, we don't care about
            # anything other than "BINARY". The only other value that is currently
            # allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
            # Note that we only follow the initial 'hdrcharset' setting to preserve
            # the initial behavior of the 'tarfile' module.
            if raw_keyword == b"hdrcharset" and encoding is None:
                if raw_value == b"BINARY":
                    encoding = tarfile.encoding
                else:  # This branch ensures only the first 'hdrcharset' header is used.
                    encoding = "utf-8"

            pos += length

        # If no explicit hdrcharset is set, we use UTF-8 as a default.
        if encoding is None:
            encoding = "utf-8"

        # After parsing the raw headers we can decode them to text.
        for length, raw_keyword, raw_value in raw_headers:
            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
                        tarfile.errors)

            # A later record with the same keyword overrides an earlier one.
            pax_headers[keyword] = value

        # Fetch the next header.
        try:
            next = self._fromtarfile(tarfile, dircheck=False)
        except HeaderError as e:
            raise SubsequentHeaderError(str(e)) from None

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            # Uses the raw records because the offset/numbytes keywords
            # repeat and only the last value survives in pax_headers.
            self._proc_gnusparse_00(next, raw_headers)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next
1599
1600 def _proc_gnusparse_00(self, next, raw_headers):
1601 """Process a GNU tar extended sparse header, version 0.0.
1602 """
1603 offsets = []
1604 numbytes = []
1605 for _, keyword, value in raw_headers:
1606 if keyword == b"GNU.sparse.offset":
1607 try:
1608 offsets.append(int(value.decode()))
1609 except ValueError:
1610 raise InvalidHeaderError("invalid header")
1611
1612 elif keyword == b"GNU.sparse.numbytes":
1613 try:
1614 numbytes.append(int(value.decode()))
1615 except ValueError:
1616 raise InvalidHeaderError("invalid header")
1617
1618 next.sparse = list(zip(offsets, numbytes))
1619
1620 def _proc_gnusparse_01(self, next, pax_headers):
1621 """Process a GNU tar extended sparse header, version 0.1.
1622 """
1623 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1624 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1625
1626 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1627 """Process a GNU tar extended sparse header, version 1.0.
1628 """
1629 fields = None
1630 sparse = []
1631 buf = tarfile.fileobj.read(BLOCKSIZE)
1632 fields, buf = buf.split(b"\n", 1)
1633 fields = int(fields)
1634 while len(sparse) < fields * 2:
1635 if b"\n" not in buf:
1636 buf += tarfile.fileobj.read(BLOCKSIZE)
1637 number, buf = buf.split(b"\n", 1)
1638 sparse.append(int(number))
1639 next.offset_data = tarfile.fileobj.tell()
1640 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1641
1642 def _apply_pax_info(self, pax_headers, encoding, errors):
1643 """Replace fields with supplemental information from a previous
1644 pax extended or global header.
1645 """
1646 for keyword, value in pax_headers.items():
1647 if keyword == "GNU.sparse.name":
1648 setattr(self, "path", value)
1649 elif keyword == "GNU.sparse.size":
1650 setattr(self, "size", int(value))
1651 elif keyword == "GNU.sparse.realsize":
1652 setattr(self, "size", int(value))
1653 elif keyword in PAX_FIELDS:
1654 if keyword in PAX_NUMBER_FIELDS:
1655 try:
1656 value = PAX_NUMBER_FIELDS[keyword](value)
1657 except ValueError:
1658 value = 0
1659 if keyword == "path":
1660 value = value.rstrip("/")
1661 setattr(self, keyword, value)
1662
1663 self.pax_headers = pax_headers.copy()
1664
1665 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1666 """Decode a single field from a pax record.
1667 """
1668 try:
1669 return value.decode(encoding, "strict")
1670 except UnicodeDecodeError:
1671 return value.decode(fallback_encoding, fallback_errors)
1672
1673 def _block(self, count):
1674 """Round up a byte count by BLOCKSIZE and return it,
1675 e.g. _block(834) => 1024.
1676 """
1677 # Only non-negative offsets are allowed
1678 if count < 0:
1679 raise InvalidHeaderError("invalid offset")
1680 blocks, remainder = divmod(count, BLOCKSIZE)
1681 if remainder:
1682 blocks += 1
1683 return blocks * BLOCKSIZE
1684
    def isreg(self):
        """Return True if the TarInfo object is a regular file, i.e. its
           type is one of REGULAR_TYPES.
        """
        return self.type in REGULAR_TYPES

    def isfile(self):
        """Return True if the TarInfo object is a regular file.

           This simply delegates to isreg().
        """
        return self.isreg()

    def isdir(self):
        """Return True if it is a directory (type DIRTYPE)."""
        return self.type == DIRTYPE

    def issym(self):
        """Return True if it is a symbolic link (type SYMTYPE)."""
        return self.type == SYMTYPE

    def islnk(self):
        """Return True if it is a hard link (type LNKTYPE)."""
        return self.type == LNKTYPE

    def ischr(self):
        """Return True if it is a character device (type CHRTYPE)."""
        return self.type == CHRTYPE

    def isblk(self):
        """Return True if it is a block device (type BLKTYPE)."""
        return self.type == BLKTYPE

    def isfifo(self):
        """Return True if it is a FIFO (type FIFOTYPE)."""
        return self.type == FIFOTYPE

    def issparse(self):
        """Return True if sparse information is attached to the member,
           i.e. self.sparse is not None.
        """
        return self.sparse is not None

    def isdev(self):
        """Return True if it is one of character device, block device or FIFO."""
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1723# class TarInfo
1724
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.

       The class attributes below act as defaults. __init__() replaces
       debug, dereference, ignore_zeros, errorlevel, format, encoding
       and tarinfo on the instance when the matching argument is not
       None, and always sets errors from its argument.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    extraction_filter = None    # The default filter for extraction.
1752
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, copybufsize=None, stream=False):
        """Open an (uncompressed) tar archive 'name'. 'mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file, 'w' to create a new file overwriting an existing one or 'x'
           to create a new file exclusively. 'mode' defaults to 'r'.
           If 'fileobj' is given, it is used for reading or writing data. If it
           can be determined, 'mode' is overridden by 'fileobj's mode.
           'fileobj' is not closed, when TarFile is closed.

           'format', 'tarinfo', 'dereference', 'ignore_zeros', 'encoding',
           'debug' and 'errorlevel' replace the class-level defaults when
           they are not None; 'errors' is always stored. 'pax_headers' is
           only used when the effective format is PAX_FORMAT. 'copybufsize'
           and 'stream' are stored on the instance unchanged.

           Raises ValueError for an unknown 'mode' and ReadError when a
           broken header is found while scanning an archive opened for
           appending. If anything fails during setup, a file opened here
           is closed again before the exception propagates.
        """
        # Map the tar mode to the mode used for the underlying file.
        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
        if mode not in modes:
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        self.mode = mode
        self._mode = modes[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            # We opened the file ourselves, so we are responsible for it.
            self._extfileobj = False
        else:
            # Borrow name and mode from the caller's file object where
            # they are available.
            if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        self.stream = stream

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        # Global pax headers only make sense for the pax format.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.copybufsize = copybufsize
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added
        self._unames = {}       # Cached mappings of uid -> uname
        self._gnames = {}       # Cached mappings of gid -> gname

        try:
            if self.mode == "r":
                # Read the first member right away; firstmember is set to
                # None first so that it exists while next() runs.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        # Reposition at the end-of-archive block so that new
                        # data is written from there.
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e)) from None

            if self.mode in ("a", "w", "x"):
                # Nothing further to discover in an archive being written.
                self._loaded = True

                if self.pax_headers:
                    # Emit the global pax header up front.
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Deliberately bare: clean up on any failure, then re-raise.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
1856
1857 #--------------------------------------------------------------------------
1858 # Below are the classmethods which act as alternate constructors to the
1859 # TarFile class. The open() method is the only one that is needed for
1860 # public use; it is the "super"-constructor and is able to select an
1861 # adequate "sub"-constructor for a particular compression using the mapping
1862 # from OPEN_METH.
1863 #
1864 # This concept allows one to subclass TarFile without losing the comfort of
1865 # the super-constructor. A sub-constructor is registered and made available
1866 # by adding it to the mapping in OPEN_METH.
1867
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'r:xz'       open for reading with lzma compression
           'r:zst'      open for reading with zstd compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
           'w:xz'       open for writing with lzma compression
           'w:zst'      open for writing with zstd compression

           'x' or 'x:'  create a tarfile exclusively without compression, raise
                        an exception if the file is already created
           'x:gz'       create a gzip compressed tarfile, raise an exception
                        if the file is already created
           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                        if the file is already created
           'x:xz'       create an lzma compressed tarfile, raise an exception
                        if the file is already created
           'x:zst'      create a zstd compressed tarfile, raise an exception
                        if the file is already created

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'r|xz'       open an lzma compressed stream of tar blocks
           'r|zst'      open a zstd compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
           'w|xz'       open an lzma compressed stream for writing
           'w|zst'      open a zstd compressed stream for writing

           'bufsize' is only used by the stream ('|') modes. Remaining
           keyword arguments are passed on to the selected constructor.

           Raises ValueError when neither name nor fileobj is given, when
           the mode cannot be interpreted, or when 'compresslevel' or
           'preset' is combined with a stream compression type that does
           not accept it; CompressionError for an unknown compression
           type in a ':' mode; ReadError when no opener can read the file
           in transparent mode.
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            def not_compressed(comptype):
                return cls.OPEN_METH[comptype] == 'taropen'
            error_msgs = []
            # False sorts before True, so the openers other than 'taropen'
            # are tried first and 'taropen' last.
            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    # Remember the position so a failed attempt can be undone.
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    error_msgs.append(f'- method {comptype}: {e!r}')
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            # Every opener failed: report all collected reasons.
            error_msgs_summary = '\n'.join(error_msgs)
            raise ReadError(f"file could not be opened successfully:\n{error_msgs_summary}")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            # An empty part falls back to reading / no compression.
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in ("r", "w"):
                raise ValueError("mode must be 'r' or 'w'")
            if "compresslevel" in kwargs and comptype not in ("gz", "bz2"):
                raise ValueError(
                    "compresslevel is only valid for w|gz and w|bz2 modes"
                )
            if "preset" in kwargs and comptype not in ("xz",):
                raise ValueError("preset is only valid for w|xz mode")

            # These two options belong to the stream wrapper, not to the
            # TarFile constructor, so remove them from kwargs.
            compresslevel = kwargs.pop("compresslevel", 9)
            preset = kwargs.pop("preset", None)
            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             compresslevel, preset)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                # Do not leave the stream open when construction fails.
                stream.close()
                raise
            # The stream was created here, so the TarFile owns it.
            t._extfileobj = False
            return t

        elif mode in ("a", "w", "x"):
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")
1976
1977 @classmethod
1978 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1979 """Open uncompressed tar archive name for reading or writing.
1980 """
1981 if mode not in ("r", "a", "w", "x"):
1982 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
1983 return cls(name, mode, fileobj, **kwargs)
1984
1985 @classmethod
1986 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1987 """Open gzip compressed tar archive name for reading or writing.
1988 Appending is not allowed.
1989 """
1990 if mode not in ("r", "w", "x"):
1991 raise ValueError("mode must be 'r', 'w' or 'x'")
1992
1993 try:
1994 from gzip import GzipFile
1995 except ImportError:
1996 raise CompressionError("gzip module is not available") from None
1997
1998 try:
1999 fileobj = GzipFile(name, mode + "b", compresslevel, fileobj)
2000 except OSError as e:
2001 if fileobj is not None and mode == 'r':
2002 raise ReadError("not a gzip file") from e
2003 raise
2004
2005 try:
2006 t = cls.taropen(name, mode, fileobj, **kwargs)
2007 except OSError as e:
2008 fileobj.close()
2009 if mode == 'r':
2010 raise ReadError("not a gzip file") from e
2011 raise
2012 except:
2013 fileobj.close()
2014 raise
2015 t._extfileobj = False
2016 return t
2017
2018 @classmethod
2019 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2020 """Open bzip2 compressed tar archive name for reading or writing.
2021 Appending is not allowed.
2022 """
2023 if mode not in ("r", "w", "x"):
2024 raise ValueError("mode must be 'r', 'w' or 'x'")
2025
2026 try:
2027 from bz2 import BZ2File
2028 except ImportError:
2029 raise CompressionError("bz2 module is not available") from None
2030
2031 fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel)
2032
2033 try:
2034 t = cls.taropen(name, mode, fileobj, **kwargs)
2035 except (OSError, EOFError) as e:
2036 fileobj.close()
2037 if mode == 'r':
2038 raise ReadError("not a bzip2 file") from e
2039 raise
2040 except:
2041 fileobj.close()
2042 raise
2043 t._extfileobj = False
2044 return t
2045
2046 @classmethod
2047 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2048 """Open lzma compressed tar archive name for reading or writing.
2049 Appending is not allowed.
2050 """
2051 if mode not in ("r", "w", "x"):
2052 raise ValueError("mode must be 'r', 'w' or 'x'")
2053
2054 try:
2055 from lzma import LZMAFile, LZMAError
2056 except ImportError:
2057 raise CompressionError("lzma module is not available") from None
2058
2059 fileobj = LZMAFile(fileobj or name, mode, preset=preset)
2060
2061 try:
2062 t = cls.taropen(name, mode, fileobj, **kwargs)
2063 except (LZMAError, EOFError) as e:
2064 fileobj.close()
2065 if mode == 'r':
2066 raise ReadError("not an lzma file") from e
2067 raise
2068 except:
2069 fileobj.close()
2070 raise
2071 t._extfileobj = False
2072 return t
2073
2074 @classmethod
2075 def zstopen(cls, name, mode="r", fileobj=None, level=None, options=None,
2076 zstd_dict=None, **kwargs):
2077 """Open zstd compressed tar archive name for reading or writing.
2078 Appending is not allowed.
2079 """
2080 if mode not in ("r", "w", "x"):
2081 raise ValueError("mode must be 'r', 'w' or 'x'")
2082
2083 from backports.zstd import ZstdFile, ZstdError
2084
2085 fileobj = ZstdFile(
2086 fileobj or name,
2087 mode,
2088 level=level,
2089 options=options,
2090 zstd_dict=zstd_dict
2091 )
2092
2093 try:
2094 t = cls.taropen(name, mode, fileobj, **kwargs)
2095 except (ZstdError, EOFError) as e:
2096 fileobj.close()
2097 if mode == 'r':
2098 raise ReadError("not a zstd file") from e
2099 raise
2100 except Exception:
2101 fileobj.close()
2102 raise
2103 t._extfileobj = False
2104 return t
2105
2106 # All *open() methods are registered here.
2107 OPEN_METH = {
2108 "tar": "taropen", # uncompressed tar
2109 "gz": "gzopen", # gzip compressed tar
2110 "bz2": "bz2open", # bzip2 compressed tar
2111 "xz": "xzopen", # lzma compressed tar
2112 "zst": "zstopen", # zstd compressed tar
2113 }
2114
2115 #--------------------------------------------------------------------------
2116 # The public methods which TarFile provides:
2117
2118 def close(self):
2119 """Close the TarFile. In write-mode, two finishing zero blocks are
2120 appended to the archive.
2121 """
2122 if self.closed:
2123 return
2124
2125 self.closed = True
2126 try:
2127 if self.mode in ("a", "w", "x"):
2128 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2129 self.offset += (BLOCKSIZE * 2)
2130 # fill up the end with zero-blocks
2131 # (like option -b20 for tar does)
2132 blocks, remainder = divmod(self.offset, RECORDSIZE)
2133 if remainder > 0:
2134 self.fileobj.write(NUL * (RECORDSIZE - remainder))
2135 finally:
2136 if not self._extfileobj:
2137 self.fileobj.close()
2138
2139 def getmember(self, name):
2140 """Return a TarInfo object for member 'name'. If 'name' can not be
2141 found in the archive, KeyError is raised. If a member occurs more
2142 than once in the archive, its last occurrence is assumed to be the
2143 most up-to-date version.
2144 """
2145 tarinfo = self._getmember(name.rstrip('/'))
2146 if tarinfo is None:
2147 raise KeyError("filename %r not found" % name)
2148 return tarinfo
2149
2150 def getmembers(self):
2151 """Return the members of the archive as a list of TarInfo objects. The
2152 list has the same order as the members in the archive.
2153 """
2154 self._check()
2155 if not self._loaded: # if we want to obtain a list of
2156 self._load() # all members, we first have to
2157 # scan the whole archive.
2158 return self.members
2159
2160 def getnames(self):
2161 """Return the members of the archive as a list of their names. It has
2162 the same order as the list returned by getmembers().
2163 """
2164 return [tarinfo.name for tarinfo in self.getmembers()]
2165
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object from the result of os.stat or equivalent
           on an existing file. The file is either named by 'name', or
           specified as a file object 'fileobj' with a file descriptor. If
           given, 'arcname' specifies an alternative name for the file in the
           archive, otherwise, the name is taken from the 'name' attribute of
           'fileobj', or the 'name' argument. The name should be a text
           string.

           Returns None if the file is neither a regular file, directory,
           fifo, symbolic link, character device nor block device.
           The archive must be open in mode 'a', 'w' or 'x'.
        """
        self._check("awx")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo._tarfile = self  # To be removed in 3.16.

        # Use os.stat or os.lstat, depending on if symlinks shall be resolved.
        # With a file object, its descriptor is stat'ed instead of the path.
        if fileobj is None:
            if not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        # Map the stat file type onto the tar member type.
        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Any other file type cannot be represented.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        # Only regular files carry data; everything else (including
        # hardlinks to already archived files) is stored with size 0.
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname

        # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
        # speed things up, cache the resolved usernames and group names.
        # Ids without a database entry are cached as the empty string.
        if pwd:
            if tarinfo.uid not in self._unames:
                try:
                    self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
                except KeyError:
                    self._unames[tarinfo.uid] = ''
            tarinfo.uname = self._unames[tarinfo.uid]
        if grp:
            if tarinfo.gid not in self._gnames:
                try:
                    self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
                except KeyError:
                    self._gnames[tarinfo.gid] = ''
            tarinfo.gname = self._gnames[tarinfo.gid]

        # Device numbers are only filled in where os.major/os.minor exist.
        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2271
2272 def list(self, verbose=True, *, members=None):
2273 """Print a table of contents to sys.stdout. If 'verbose' is False, only
2274 the names of the members are printed. If it is True, an 'ls -l'-like
2275 output is produced. 'members' is optional and must be a subset of the
2276 list returned by getmembers().
2277 """
2278 # Convert tarinfo type to stat type.
2279 type2mode = {REGTYPE: stat.S_IFREG, SYMTYPE: stat.S_IFLNK,
2280 FIFOTYPE: stat.S_IFIFO, CHRTYPE: stat.S_IFCHR,
2281 DIRTYPE: stat.S_IFDIR, BLKTYPE: stat.S_IFBLK}
2282 self._check()
2283
2284 if members is None:
2285 members = self
2286 for tarinfo in members:
2287 if verbose:
2288 if tarinfo.mode is None:
2289 _safe_print("??????????")
2290 else:
2291 modetype = type2mode.get(tarinfo.type, 0)
2292 _safe_print(stat.filemode(modetype | tarinfo.mode))
2293 _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2294 tarinfo.gname or tarinfo.gid))
2295 if tarinfo.ischr() or tarinfo.isblk():
2296 _safe_print("%10s" %
2297 ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
2298 else:
2299 _safe_print("%10d" % tarinfo.size)
2300 if tarinfo.mtime is None:
2301 _safe_print("????-??-?? ??:??:??")
2302 else:
2303 _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
2304 % time.localtime(tarinfo.mtime)[:6])
2305
2306 _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))
2307
2308 if verbose:
2309 if tarinfo.issym():
2310 _safe_print("-> " + tarinfo.linkname)
2311 if tarinfo.islnk():
2312 _safe_print("link to " + tarinfo.linkname)
2313 print()
2314
2315 def add(self, name, arcname=None, recursive=True, *, filter=None):
2316 """Add the file 'name' to the archive. 'name' may be any type of file
2317 (directory, fifo, symbolic link, etc.). If given, 'arcname'
2318 specifies an alternative name for the file in the archive.
2319 Directories are added recursively by default. This can be avoided by
2320 setting 'recursive' to False. 'filter' is a function
2321 that expects a TarInfo object argument and returns the changed
2322 TarInfo object, if it returns None the TarInfo object will be
2323 excluded from the archive.
2324 """
2325 self._check("awx")
2326
2327 if arcname is None:
2328 arcname = name
2329
2330 # Skip if somebody tries to archive the archive...
2331 if self.name is not None and os.path.abspath(name) == self.name:
2332 self._dbg(2, "tarfile: Skipped %r" % name)
2333 return
2334
2335 self._dbg(1, name)
2336
2337 # Create a TarInfo object from the file.
2338 tarinfo = self.gettarinfo(name, arcname)
2339
2340 if tarinfo is None:
2341 self._dbg(1, "tarfile: Unsupported type %r" % name)
2342 return
2343
2344 # Change or exclude the TarInfo object.
2345 if filter is not None:
2346 tarinfo = filter(tarinfo)
2347 if tarinfo is None:
2348 self._dbg(2, "tarfile: Excluded %r" % name)
2349 return
2350
2351 # Append the tar header and data to the archive.
2352 if tarinfo.isreg():
2353 with bltn_open(name, "rb") as f:
2354 self.addfile(tarinfo, f)
2355
2356 elif tarinfo.isdir():
2357 self.addfile(tarinfo)
2358 if recursive:
2359 for f in sorted(os.listdir(name)):
2360 self.add(os.path.join(name, f), os.path.join(arcname, f),
2361 recursive, filter=filter)
2362
2363 else:
2364 self.addfile(tarinfo)
2365
2366 def addfile(self, tarinfo, fileobj=None):
2367 """Add the TarInfo object 'tarinfo' to the archive. If 'tarinfo' represents
2368 a non zero-size regular file, the 'fileobj' argument should be a binary file,
2369 and tarinfo.size bytes are read from it and added to the archive.
2370 You can create TarInfo objects directly, or by using gettarinfo().
2371 """
2372 self._check("awx")
2373
2374 if fileobj is None and tarinfo.isreg() and tarinfo.size != 0:
2375 raise ValueError("fileobj not provided for non zero-size regular file")
2376
2377 tarinfo = copy.copy(tarinfo)
2378
2379 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2380 self.fileobj.write(buf)
2381 self.offset += len(buf)
2382 bufsize=self.copybufsize
2383 # If there's data to follow, append it.
2384 if fileobj is not None:
2385 copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
2386 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
2387 if remainder > 0:
2388 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2389 blocks += 1
2390 self.offset += blocks * BLOCKSIZE
2391
2392 self.members.append(tarinfo)
2393
2394 def _get_filter_function(self, filter):
2395 if filter is None:
2396 filter = self.extraction_filter
2397 if filter is None:
2398 return data_filter
2399 if isinstance(filter, str):
2400 raise TypeError(
2401 'String names are not supported for '
2402 + 'TarFile.extraction_filter. Use a function such as '
2403 + 'tarfile.data_filter directly.')
2404 return filter
2405 if callable(filter):
2406 return filter
2407 try:
2408 return _NAMED_FILTERS[filter]
2409 except KeyError:
2410 raise ValueError(f"filter {filter!r} not found") from None
2411
2412 def extractall(self, path=".", members=None, *, numeric_owner=False,
2413 filter=None):
2414 """Extract all members from the archive to the current working
2415 directory and set owner, modification time and permissions on
2416 directories afterwards. 'path' specifies a different directory
2417 to extract to. 'members' is optional and must be a subset of the
2418 list returned by getmembers(). If 'numeric_owner' is True, only
2419 the numbers for user/group names are used and not the names.
2420
2421 The 'filter' function will be called on each member just
2422 before extraction.
2423 It can return a changed TarInfo or None to skip the member.
2424 String names of common filters are accepted.
2425 """
2426 directories = []
2427
2428 filter_function = self._get_filter_function(filter)
2429 if members is None:
2430 members = self
2431
2432 for member in members:
2433 tarinfo, unfiltered = self._get_extract_tarinfo(
2434 member, filter_function, path)
2435 if tarinfo is None:
2436 continue
2437 if tarinfo.isdir():
2438 # For directories, delay setting attributes until later,
2439 # since permissions can interfere with extraction and
2440 # extracting contents can reset mtime.
2441 directories.append(unfiltered)
2442 self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(),
2443 numeric_owner=numeric_owner,
2444 filter_function=filter_function)
2445
2446 # Reverse sort directories.
2447 directories.sort(key=lambda a: a.name, reverse=True)
2448
2449
2450 # Set correct owner, mtime and filemode on directories.
2451 for unfiltered in directories:
2452 try:
2453 # Need to re-apply any filter, to take the *current* filesystem
2454 # state into account.
2455 try:
2456 tarinfo = filter_function(unfiltered, path)
2457 except _FILTER_ERRORS as exc:
2458 self._log_no_directory_fixup(unfiltered, repr(exc))
2459 continue
2460 if tarinfo is None:
2461 self._log_no_directory_fixup(unfiltered,
2462 'excluded by filter')
2463 continue
2464 dirpath = os.path.join(path, tarinfo.name)
2465 try:
2466 lstat = os.lstat(dirpath)
2467 except FileNotFoundError:
2468 self._log_no_directory_fixup(tarinfo, 'missing')
2469 continue
2470 if not stat.S_ISDIR(lstat.st_mode):
2471 # This is no longer a directory; presumably a later
2472 # member overwrote the entry.
2473 self._log_no_directory_fixup(tarinfo, 'not a directory')
2474 continue
2475 self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
2476 self.utime(tarinfo, dirpath)
2477 self.chmod(tarinfo, dirpath)
2478 except ExtractError as e:
2479 self._handle_nonfatal_error(e)
2480
2481 def _log_no_directory_fixup(self, member, reason):
2482 self._dbg(2, "tarfile: Not fixing up directory %r (%s)" %
2483 (member.name, reason))
2484
2485 def extract(self, member, path="", set_attrs=True, *, numeric_owner=False,
2486 filter=None):
2487 """Extract a member from the archive to the current working directory,
2488 using its full name. Its file information is extracted as accurately
2489 as possible. 'member' may be a filename or a TarInfo object. You can
2490 specify a different directory using 'path'. File attributes (owner,
2491 mtime, mode) are set unless 'set_attrs' is False. If 'numeric_owner'
2492 is True, only the numbers for user/group names are used and not
2493 the names.
2494
2495 The 'filter' function will be called before extraction.
2496 It can return a changed TarInfo or None to skip the member.
2497 String names of common filters are accepted.
2498 """
2499 filter_function = self._get_filter_function(filter)
2500 tarinfo, unfiltered = self._get_extract_tarinfo(
2501 member, filter_function, path)
2502 if tarinfo is not None:
2503 self._extract_one(tarinfo, path, set_attrs, numeric_owner)
2504
2505 def _get_extract_tarinfo(self, member, filter_function, path):
2506 """Get (filtered, unfiltered) TarInfos from *member*
2507
2508 *member* might be a string.
2509
2510 Return (None, None) if not found.
2511 """
2512
2513 if isinstance(member, str):
2514 unfiltered = self.getmember(member)
2515 else:
2516 unfiltered = member
2517
2518 filtered = None
2519 try:
2520 filtered = filter_function(unfiltered, path)
2521 except (OSError, UnicodeEncodeError, FilterError) as e:
2522 self._handle_fatal_error(e)
2523 except ExtractError as e:
2524 self._handle_nonfatal_error(e)
2525 if filtered is None:
2526 self._dbg(2, "tarfile: Excluded %r" % unfiltered.name)
2527 return None, None
2528
2529 # Prepare the link target for makelink().
2530 if filtered.islnk():
2531 filtered = copy.copy(filtered)
2532 filtered._link_target = os.path.join(path, filtered.linkname)
2533 return filtered, unfiltered
2534
2535 def _extract_one(self, tarinfo, path, set_attrs, numeric_owner,
2536 filter_function=None):
2537 """Extract from filtered tarinfo to disk.
2538
2539 filter_function is only used when extracting a *different*
2540 member (e.g. as fallback to creating a symlink)
2541 """
2542 self._check("r")
2543
2544 try:
2545 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2546 set_attrs=set_attrs,
2547 numeric_owner=numeric_owner,
2548 filter_function=filter_function,
2549 extraction_root=path)
2550 except (OSError, UnicodeEncodeError) as e:
2551 self._handle_fatal_error(e)
2552 except ExtractError as e:
2553 self._handle_nonfatal_error(e)
2554
2555 def _handle_nonfatal_error(self, e):
2556 """Handle non-fatal error (ExtractError) according to errorlevel"""
2557 if self.errorlevel > 1:
2558 raise
2559 else:
2560 self._dbg(1, "tarfile: %s" % e)
2561
2562 def _handle_fatal_error(self, e):
2563 """Handle "fatal" error according to self.errorlevel"""
2564 if self.errorlevel > 0:
2565 raise
2566 elif isinstance(e, OSError):
2567 if e.filename is None:
2568 self._dbg(1, "tarfile: %s" % e.strerror)
2569 else:
2570 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2571 else:
2572 self._dbg(1, "tarfile: %s %s" % (type(e).__name__, e))
2573
2574 def extractfile(self, member):
2575 """Extract a member from the archive as a file object. 'member' may be
2576 a filename or a TarInfo object. If 'member' is a regular file or
2577 a link, an io.BufferedReader object is returned. For all other
2578 existing members, None is returned. If 'member' does not appear
2579 in the archive, KeyError is raised.
2580 """
2581 self._check("r")
2582
2583 if isinstance(member, str):
2584 tarinfo = self.getmember(member)
2585 else:
2586 tarinfo = member
2587
2588 if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
2589 # Members with unknown types are treated as regular files.
2590 return self.fileobject(self, tarinfo)
2591
2592 elif tarinfo.islnk() or tarinfo.issym():
2593 if isinstance(self.fileobj, _Stream):
2594 # A small but ugly workaround for the case that someone tries
2595 # to extract a (sym)link as a file-object from a non-seekable
2596 # stream of tar blocks.
2597 raise StreamError("cannot extract (sym)link as file object")
2598 else:
2599 # A (sym)link's file object is its target's file object.
2600 return self.extractfile(self._find_link_target(tarinfo))
2601 else:
2602 # If there's no data associated with the member (directory, chrdev,
2603 # blkdev, etc.), return None instead of a file object.
2604 return None
2605
2606 def _extract_member(self, tarinfo, targetpath, set_attrs=True,
2607 numeric_owner=False, *, filter_function=None,
2608 extraction_root=None):
2609 """Extract the filtered TarInfo object tarinfo to a physical
2610 file called targetpath.
2611
2612 filter_function is only used when extracting a *different*
2613 member (e.g. as fallback to creating a symlink)
2614 """
2615 # Fetch the TarInfo object for the given name
2616 # and build the destination pathname, replacing
2617 # forward slashes to platform specific separators.
2618 targetpath = targetpath.rstrip("/")
2619 targetpath = targetpath.replace("/", os.sep)
2620
2621 # Create all upper directories.
2622 upperdirs = os.path.dirname(targetpath)
2623 if upperdirs and not os.path.exists(upperdirs):
2624 # Create directories that are not part of the archive with
2625 # default permissions.
2626 os.makedirs(upperdirs, exist_ok=True)
2627
2628 if tarinfo.islnk() or tarinfo.issym():
2629 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2630 else:
2631 self._dbg(1, tarinfo.name)
2632
2633 if tarinfo.isreg():
2634 self.makefile(tarinfo, targetpath)
2635 elif tarinfo.isdir():
2636 self.makedir(tarinfo, targetpath)
2637 elif tarinfo.isfifo():
2638 self.makefifo(tarinfo, targetpath)
2639 elif tarinfo.ischr() or tarinfo.isblk():
2640 self.makedev(tarinfo, targetpath)
2641 elif tarinfo.islnk() or tarinfo.issym():
2642 self.makelink_with_filter(
2643 tarinfo, targetpath,
2644 filter_function=filter_function,
2645 extraction_root=extraction_root)
2646 elif tarinfo.type not in SUPPORTED_TYPES:
2647 self.makeunknown(tarinfo, targetpath)
2648 else:
2649 self.makefile(tarinfo, targetpath)
2650
2651 if set_attrs:
2652 self.chown(tarinfo, targetpath, numeric_owner)
2653 if not tarinfo.issym():
2654 self.chmod(tarinfo, targetpath)
2655 self.utime(tarinfo, targetpath)
2656
2657 #--------------------------------------------------------------------------
2658 # Below are the different file methods. They are called via
2659 # _extract_member() when extract() is called. They can be replaced in a
2660 # subclass to implement other functionality.
2661
2662 def makedir(self, tarinfo, targetpath):
2663 """Make a directory called targetpath.
2664 """
2665 try:
2666 if tarinfo.mode is None:
2667 # Use the system's default mode
2668 os.mkdir(targetpath)
2669 else:
2670 # Use a safe mode for the directory, the real mode is set
2671 # later in _extract_member().
2672 os.mkdir(targetpath, 0o700)
2673 except FileExistsError:
2674 if not os.path.isdir(targetpath):
2675 raise
2676
2677 def makefile(self, tarinfo, targetpath):
2678 """Make a file called targetpath.
2679 """
2680 source = self.fileobj
2681 source.seek(tarinfo.offset_data)
2682 bufsize = self.copybufsize
2683 with bltn_open(targetpath, "wb") as target:
2684 if tarinfo.sparse is not None:
2685 for offset, size in tarinfo.sparse:
2686 target.seek(offset)
2687 copyfileobj(source, target, size, ReadError, bufsize)
2688 target.seek(tarinfo.size)
2689 target.truncate()
2690 else:
2691 copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
2692
2693 def makeunknown(self, tarinfo, targetpath):
2694 """Make a file from a TarInfo object with an unknown type
2695 at targetpath.
2696 """
2697 self.makefile(tarinfo, targetpath)
2698 self._dbg(1, "tarfile: Unknown file type %r, " \
2699 "extracted as regular file." % tarinfo.type)
2700
2701 def makefifo(self, tarinfo, targetpath):
2702 """Make a fifo called targetpath.
2703 """
2704 if hasattr(os, "mkfifo"):
2705 os.mkfifo(targetpath)
2706 else:
2707 raise ExtractError("fifo not supported by system")
2708
2709 def makedev(self, tarinfo, targetpath):
2710 """Make a character or block device called targetpath.
2711 """
2712 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
2713 raise ExtractError("special devices not supported by system")
2714
2715 mode = tarinfo.mode
2716 if mode is None:
2717 # Use mknod's default
2718 mode = 0o600
2719 if tarinfo.isblk():
2720 mode |= stat.S_IFBLK
2721 else:
2722 mode |= stat.S_IFCHR
2723
2724 os.mknod(targetpath, mode,
2725 os.makedev(tarinfo.devmajor, tarinfo.devminor))
2726
2727 def makelink(self, tarinfo, targetpath):
2728 return self.makelink_with_filter(tarinfo, targetpath, None, None)
2729
    def makelink_with_filter(self, tarinfo, targetpath,
                             filter_function, extraction_root):
        """Make a (symbolic) link called targetpath. If it cannot be created
          (platform limitation), we try to make a copy of the referenced file
          instead of a link.

          filter_function is only used when extracting a *different*
          member (e.g. as fallback to creating a link).

          If filter_function is given, extraction_root must be given as
          well, otherwise ExtractError is raised once the fallback is
          needed. A filter error for the link target is re-raised as
          LinkFallbackError.
        """
        # Tracks whether link creation itself failed, which decides how a
        # missing link target inside the archive is reported below.
        keyerror_to_extracterror = False
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                if os.path.lexists(targetpath):
                    # Avoid FileExistsError on following os.symlink.
                    os.unlink(targetpath)
                os.symlink(tarinfo.linkname, targetpath)
                return
            else:
                # A hard link is only created if its target already exists
                # on disk; otherwise fall through to the copy fallback.
                if os.path.exists(tarinfo._link_target):
                    if os.path.lexists(targetpath):
                        # Avoid FileExistsError on following os.link.
                        os.unlink(targetpath)
                    os.link(tarinfo._link_target, targetpath)
                    return
        except symlink_exception:
            keyerror_to_extracterror = True

        # Fallback: extract the member the link points to in its place.
        try:
            unfiltered = self._find_link_target(tarinfo)
        except KeyError:
            if keyerror_to_extracterror:
                raise ExtractError(
                    "unable to resolve link inside archive") from None
            else:
                raise

        if filter_function is None:
            filtered = unfiltered
        else:
            if extraction_root is None:
                raise ExtractError(
                    "makelink_with_filter: if filter_function is not None, "
                    + "extraction_root must also not be None")
            # The target is a different member, so it has to pass the
            # filter on its own.
            try:
                filtered = filter_function(unfiltered, extraction_root)
            except _FILTER_ERRORS as cause:
                raise LinkFallbackError(tarinfo, unfiltered.name) from cause
        # A target that the filter dropped (None) is silently not extracted.
        if filtered is not None:
            self._extract_member(filtered, targetpath,
                                 filter_function=filter_function,
                                 extraction_root=extraction_root)
2782
2783 def chown(self, tarinfo, targetpath, numeric_owner):
2784 """Set owner of targetpath according to tarinfo. If numeric_owner
2785 is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
2786 is False, fall back to .gid/.uid when the search based on name
2787 fails.
2788 """
2789 if hasattr(os, "geteuid") and os.geteuid() == 0:
2790 # We have to be root to do so.
2791 g = tarinfo.gid
2792 u = tarinfo.uid
2793 if not numeric_owner:
2794 try:
2795 if grp and tarinfo.gname:
2796 g = grp.getgrnam(tarinfo.gname)[2]
2797 except KeyError:
2798 pass
2799 try:
2800 if pwd and tarinfo.uname:
2801 u = pwd.getpwnam(tarinfo.uname)[2]
2802 except KeyError:
2803 pass
2804 if g is None:
2805 g = -1
2806 if u is None:
2807 u = -1
2808 try:
2809 if tarinfo.issym() and hasattr(os, "lchown"):
2810 os.lchown(targetpath, u, g)
2811 else:
2812 os.chown(targetpath, u, g)
2813 except (OSError, OverflowError) as e:
2814 # OverflowError can be raised if an ID doesn't fit in 'id_t'
2815 raise ExtractError("could not change owner") from e
2816
2817 def chmod(self, tarinfo, targetpath):
2818 """Set file permissions of targetpath according to tarinfo.
2819 """
2820 if tarinfo.mode is None:
2821 return
2822 try:
2823 os.chmod(targetpath, tarinfo.mode)
2824 except OSError as e:
2825 raise ExtractError("could not change mode") from e
2826
2827 def utime(self, tarinfo, targetpath):
2828 """Set modification time of targetpath according to tarinfo.
2829 """
2830 mtime = tarinfo.mtime
2831 if mtime is None:
2832 return
2833 if not hasattr(os, 'utime'):
2834 return
2835 try:
2836 os.utime(targetpath, (mtime, mtime))
2837 except OSError as e:
2838 raise ExtractError("could not change modification time") from e
2839
2840 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.

           Header problems at offset 0 are reported as ReadError; at later
           offsets most of them simply end the iteration (see the handlers
           below).
        """
        self._check("ra")
        # A member stored in 'firstmember' is handed out first, exactly once.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            if self.offset == 0:
                return None
            # Seek to the byte right before the expected header and read
            # it, so that a too short file is detected here.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        # A handler that neither raises nor continues falls through to the
        # 'break' with tarinfo still None, i.e. it ends the archive.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                if self.ignore_zeros:
                    # Skip this block and try the next one.
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    # Skip this block and try the next one.
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    raise ReadError(str(e)) from None
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file") from None
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e)) from None
            except SubsequentHeaderError as e:
                raise ReadError(str(e)) from None
            except Exception as e:
                # zlib is imported lazily here; zlib errors are turned into
                # ReadError, any other exception is re-raised unchanged
                # (also when zlib itself cannot be imported).
                try:
                    import zlib
                    if isinstance(e, zlib.error):
                        raise ReadError(f'zlib error: {e}') from None
                    else:
                        raise e
                except ImportError:
                    raise e
            break

        if tarinfo is not None:
            # if streaming the file we do not want to cache the tarinfo
            if not self.stream:
                self.members.append(tarinfo)
        else:
            # No further member: the member list is complete.
            self._loaded = True

        return tarinfo
2904
2905 #--------------------------------------------------------------------------
2906 # Little helper methods:
2907
2908 def _getmember(self, name, tarinfo=None, normalize=False):
2909 """Find an archive member by name from bottom to top.
2910 If tarinfo is given, it is used as the starting point.
2911 """
2912 # Ensure that all members have been loaded.
2913 members = self.getmembers()
2914
2915 # Limit the member search list up to tarinfo.
2916 skipping = False
2917 if tarinfo is not None:
2918 try:
2919 index = members.index(tarinfo)
2920 except ValueError:
2921 # The given starting point might be a (modified) copy.
2922 # We'll later skip members until we find an equivalent.
2923 skipping = True
2924 else:
2925 # Happy fast path
2926 members = members[:index]
2927
2928 if normalize:
2929 name = os.path.normpath(name)
2930
2931 for member in reversed(members):
2932 if skipping:
2933 if tarinfo.offset == member.offset:
2934 skipping = False
2935 continue
2936 if normalize:
2937 member_name = os.path.normpath(member.name)
2938 else:
2939 member_name = member.name
2940
2941 if name == member_name:
2942 return member
2943
2944 if skipping:
2945 # Starting point was not found
2946 raise ValueError(tarinfo)
2947
2948 def _load(self):
2949 """Read through the entire archive file and look for readable
2950 members. This should not run if the file is set to stream.
2951 """
2952 if not self.stream:
2953 while self.next() is not None:
2954 pass
2955 self._loaded = True
2956
2957 def _check(self, mode=None):
2958 """Check if TarFile is still open, and if the operation's mode
2959 corresponds to TarFile's mode.
2960 """
2961 if self.closed:
2962 raise OSError("%s is closed" % self.__class__.__name__)
2963 if mode is not None and self.mode not in mode:
2964 raise OSError("bad operation for mode %r" % self.mode)
2965
2966 def _find_link_target(self, tarinfo):
2967 """Find the target member of a symlink or hardlink member in the
2968 archive.
2969 """
2970 if tarinfo.issym():
2971 # Always search the entire archive.
2972 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
2973 limit = None
2974 else:
2975 # Search the archive before the link, because a hard link is
2976 # just a reference to an already archived file.
2977 linkname = tarinfo.linkname
2978 limit = tarinfo
2979
2980 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2981 if member is None:
2982 raise KeyError("linkname %r not found" % linkname)
2983 return member
2984
2985 def __iter__(self):
2986 """Provide an iterator object.
2987 """
2988 if self._loaded:
2989 yield from self.members
2990 return
2991
2992 # Yield items using TarFile's next() method.
2993 # When all members have been read, set TarFile as _loaded.
2994 index = 0
2995 # Fix for SF #1100429: Under rare circumstances it can
2996 # happen that getmembers() is called during iteration,
2997 # which will have already exhausted the next() method.
2998 if self.firstmember is not None:
2999 tarinfo = self.next()
3000 index += 1
3001 yield tarinfo
3002
3003 while True:
3004 if index < len(self.members):
3005 tarinfo = self.members[index]
3006 elif not self._loaded:
3007 tarinfo = self.next()
3008 if not tarinfo:
3009 self._loaded = True
3010 return
3011 else:
3012 return
3013 index += 1
3014 yield tarinfo
3015
3016 def _dbg(self, level, msg):
3017 """Write debugging output to sys.stderr.
3018 """
3019 if level <= self.debug:
3020 print(msg, file=sys.stderr)
3021
def __enter__(self):
    """Enter the runtime context: verify through _check() that the
       TarFile is still open (OSError is raised otherwise) and return
       the TarFile itself.
    """
    self._check()
    return self
3025
3026 def __exit__(self, type, value, traceback):
3027 if type is None:
3028 self.close()
3029 else:
3030 # An exception occurred. We must not call close() because
3031 # it would try to write end-of-archive blocks and padding.
3032 if not self._extfileobj:
3033 self.fileobj.close()
3034 self.closed = True
3035
3036#--------------------
3037# exported functions
3038#--------------------
3039
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.

       'name' should be a string, file, or file-like object.

       For a file or file-like object (anything with a read() method) the
       stream position found on entry is restored before returning,
       whether or not the content turned out to be a tar archive.
       Exceptions other than TarError raised while opening are propagated.
    """
    try:
        if hasattr(name, "read"):
            pos = name.tell()
            try:
                t = open(fileobj=name)
            finally:
                # Probing consumes data from the stream.  Rewind even when
                # the probe fails, so that a caller receiving False does
                # not get its stream back at an arbitrary position.
                name.seek(pos)
        else:
            t = open(name)
        t.close()
        return True
    except TarError:
        return False
3057
# Module-level shortcut for TarFile.open.  Inside this module the global
# name "open" therefore refers to TarFile.open and not to the builtin,
# which is imported at the top of the file as bltn_open.
open = TarFile.open
3059
3060
def main():
    """Command-line entry point of the module.

       Exactly one action is required: -l/--list, -e/--extract,
       -c/--create or -t/--test.  -v/--verbose adds status messages and
       --filter selects a named extraction filter; it is accepted only
       together with --extract.  A file that is not a tar archive, or a
       misuse of the options, ends the program through parser.exit()
       with status 1.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='A simple command-line interface for tarfile module.')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    parser.add_argument('--filter', metavar='<filtername>',
                        choices=_NAMED_FILTERS,
                        help='Filter for extraction')

    # The four actions exclude each other and one of them is mandatory.
    actions = parser.add_mutually_exclusive_group(required=True)
    actions.add_argument('-l', '--list', metavar='<tarfile>',
                         help='Show listing of a tarfile')
    actions.add_argument('-e', '--extract', nargs='+',
                         metavar=('<tarfile>', '<output_dir>'),
                         help='Extract tarfile into target dir')
    actions.add_argument('-c', '--create', nargs='+',
                         metavar=('<name>', '<file>'),
                         help='Create tarfile from sources')
    actions.add_argument('-t', '--test', metavar='<tarfile>',
                         help='Test if a tarfile is valid')

    args = parser.parse_args()
    verbose = args.verbose

    if args.filter and args.extract is None:
        parser.exit(1, '--filter is only valid for extraction\n')

    if args.test is not None:
        # --test: read every member of the archive and dump the list.
        src = args.test
        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        else:
            with open(src, 'r') as tar:
                tar.getmembers()
                print(tar.getmembers(), file=sys.stderr)
            if verbose:
                print('{!r} is a tar archive.'.format(src))

    elif args.list is not None:
        # --list: print the table of contents.
        src = args.list
        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        else:
            with TarFile.open(src, 'r:*') as tf:
                tf.list(verbose=verbose)

    elif args.extract is not None:
        # --extract: one value is the archive, an optional second one is
        # the output directory; anything longer is a usage error.
        given = len(args.extract)
        if given == 1:
            src, curdir = args.extract[0], os.curdir
        elif given == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())

        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        else:
            with TarFile.open(src, 'r:*') as tf:
                tf.extractall(path=curdir, filter=args.filter)
            if verbose:
                if curdir == '.':
                    msg = '{!r} file is extracted.'.format(src)
                else:
                    msg = ('{!r} file is extracted '
                           'into {!r} directory.').format(src, curdir)
                print(msg)

    elif args.create is not None:
        # --create: the first value names the archive, the remaining
        # values are the files to add.
        tar_name, *tar_files = args.create
        suffix = os.path.splitext(tar_name)[1]
        # Maps a file extension to the compression part of the open mode.
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
            # zstd
            '.zst': 'zst',
            '.tzst': 'zst',
        }
        method = compressions.get(suffix)
        # Unknown extensions produce an uncompressed archive.
        tar_mode = 'w' if method is None else 'w:' + method

        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in tar_files:
                tf.add(file_name)

        if verbose:
            print('{!r} file created.'.format(tar_name))
3158
# Run the command-line interface when the module is executed as a script.
if __name__ == '__main__':
    main()