1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6# General Public License as published by the Free Software Foundation; version 2.0
7# or (at your option) any later version. You can redistribute it and/or
8# modify it under the terms of either of these two licenses.
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16# You should have received a copy of the licenses; if not, see
17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19# License, Version 2.0.
20#
22"""Classes for dealing with packed git objects.
24A pack is a compact representation of a collection of objects, stored
25using deltas where possible.
27A pack has two parts: the pack file, which stores the data, and an index
28that tells you where the data is.
30To find an object you look in each of the index files until you find a
31match for the object name. The offset obtained from the index is then
32used to read the object from the corresponding pack file.
33"""
35import binascii
36from collections import defaultdict, deque
37from contextlib import suppress
38from io import BytesIO, UnsupportedOperation
40try:
41 from cdifflib import CSequenceMatcher as SequenceMatcher
42except ModuleNotFoundError:
43 from difflib import SequenceMatcher
45import os
46import struct
47import sys
48import warnings
49import zlib
50from hashlib import sha1
51from itertools import chain
52from os import SEEK_CUR, SEEK_END
53from struct import unpack_from
54from typing import (
55 BinaryIO,
56 Callable,
57 Deque,
58 Dict,
59 Generic,
60 Iterable,
61 Iterator,
62 List,
63 Optional,
64 Protocol,
65 Sequence,
66 Set,
67 Tuple,
68 TypeVar,
69 Union,
70)
72try:
73 import mmap
74except ImportError:
75 has_mmap = False
76else:
77 has_mmap = True
79# For some reason the above try/except fails to set has_mmap = False for Plan9
80if sys.platform == "Plan9":
81 has_mmap = False
83from .errors import ApplyDeltaError, ChecksumMismatch
84from .file import GitFile
85from .lru_cache import LRUSizeCache
86from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
88OFS_DELTA = 6
89REF_DELTA = 7
91DELTA_TYPES = (OFS_DELTA, REF_DELTA)
94DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
96# Keep pack files under 16 MB in memory; otherwise write them out to disk
97PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
100OldUnpackedObject = Union[Tuple[Union[bytes, int], List[bytes]], List[bytes]]
101ResolveExtRefFn = Callable[[bytes], Tuple[int, OldUnpackedObject]]
102ProgressFn = Callable[[int, str], None]
103PackHint = Tuple[int, Optional[bytes]]
106class UnresolvedDeltas(Exception):
107 """Delta objects could not be resolved."""
109 def __init__(self, shas):
110 self.shas = shas
113class ObjectContainer(Protocol):
114 def add_object(self, obj: ShaFile) -> None:
115 """Add a single object to this object store."""
117 def add_objects(
118 self,
119 objects: Sequence[Tuple[ShaFile, Optional[str]]],
120 progress: Optional[Callable[[str], None]] = None,
121 ) -> None:
122 """Add a set of objects to this object store.
124 Args:
125 objects: Iterable of (object, path) tuples
126 """
128 def __contains__(self, sha1: bytes) -> bool:
129 """Check if a hex sha is present."""
131 def __getitem__(self, sha1: bytes) -> ShaFile:
132 """Retrieve an object."""
135class PackedObjectContainer(ObjectContainer):
136 def get_unpacked_object(
137 self, sha1: bytes, *, include_comp: bool = False
138 ) -> "UnpackedObject":
139 """Get a raw unresolved object."""
140 raise NotImplementedError(self.get_unpacked_object)
142 def iterobjects_subset(
143 self, shas: Iterable[bytes], *, allow_missing: bool = False
144 ) -> Iterator[ShaFile]:
145 raise NotImplementedError(self.iterobjects_subset)
147 def iter_unpacked_subset(
148 self,
149 shas: Set[bytes],
150 include_comp: bool = False,
151 allow_missing: bool = False,
152 convert_ofs_delta: bool = True,
153 ) -> Iterator["UnpackedObject"]:
154 raise NotImplementedError(self.iter_unpacked_subset)
157class UnpackedObjectStream:
158 def __iter__(self) -> Iterator["UnpackedObject"]:
159 raise NotImplementedError(self.__iter__)
161 def __len__(self) -> int:
162 raise NotImplementedError(self.__len__)
165def take_msb_bytes(
166 read: Callable[[int], bytes], crc32: Optional[int] = None
167) -> Tuple[List[int], Optional[int]]:
168 """Read bytes marked with most significant bit.
170 Args:
171 read: Read function
172 """
173 ret: List[int] = []
174 while len(ret) == 0 or ret[-1] & 0x80:
175 b = read(1)
176 if crc32 is not None:
177 crc32 = binascii.crc32(b, crc32)
178 ret.append(ord(b[:1]))
179 return ret, crc32
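# Illustrative sketch (not part of dulwich's API): pack headers encode
# variable-length integers by setting the most significant bit of every byte
# except the last; take_msb_bytes() reads bytes until that terminating byte.
# The input bytes below are made up for the example.
def _example_take_msb_bytes() -> None:
    buf = BytesIO(bytes([0x91, 0x2E, 0xFF]))  # 0x91 has the MSB set, 0x2E does not
    ret, crc = take_msb_bytes(buf.read)
    assert ret == [0x91, 0x2E]  # reading stops after the first MSB-clear byte
    assert crc is None  # no starting CRC32 was supplied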
182class PackFileDisappeared(Exception):
183 def __init__(self, obj) -> None:
184 self.obj = obj
187class UnpackedObject:
188 """Class encapsulating an object unpacked from a pack file.
190 These objects should only be created from within unpack_object. Most
191 members start out as empty and are filled in at various points by
192 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
194 End users of this object should take care that the function they're getting
195 this object from is guaranteed to set the members they need.
196 """
198 __slots__ = [
199 "offset", # Offset in its pack.
200 "_sha", # Cached binary SHA.
201 "obj_type_num", # Type of this object.
202 "obj_chunks", # Decompressed and delta-resolved chunks.
203 "pack_type_num", # Type of this object in the pack (may be a delta).
204 "delta_base", # Delta base offset or SHA.
205 "comp_chunks", # Compressed object chunks.
206 "decomp_chunks", # Decompressed object chunks.
207 "decomp_len", # Decompressed length of this object.
208 "crc32", # CRC32.
209 ]
211 obj_type_num: Optional[int]
212 obj_chunks: Optional[List[bytes]]
213 delta_base: Union[None, bytes, int]
214 decomp_chunks: List[bytes]
215 comp_chunks: Optional[List[bytes]]
217 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
218 # methods of this object.
219 def __init__(
220 self,
221 pack_type_num,
222 *,
223 delta_base=None,
224 decomp_len=None,
225 crc32=None,
226 sha=None,
227 decomp_chunks=None,
228 offset=None,
229 ) -> None:
230 self.offset = offset
231 self._sha = sha
232 self.pack_type_num = pack_type_num
233 self.delta_base = delta_base
234 self.comp_chunks = None
235 self.decomp_chunks: List[bytes] = decomp_chunks or []
236 if decomp_chunks is not None and decomp_len is None:
237 self.decomp_len = sum(map(len, decomp_chunks))
238 else:
239 self.decomp_len = decomp_len
240 self.crc32 = crc32
242 if pack_type_num in DELTA_TYPES:
243 self.obj_type_num = None
244 self.obj_chunks = None
245 else:
246 self.obj_type_num = pack_type_num
247 self.obj_chunks = self.decomp_chunks
248 self.delta_base = delta_base
250 def sha(self):
251 """Return the binary SHA of this object."""
252 if self._sha is None:
253 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
254 return self._sha
256 def sha_file(self):
257 """Return a ShaFile from this object."""
258 assert self.obj_type_num is not None and self.obj_chunks is not None
259 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
261 # Only provided for backwards compatibility with code that expects either
262 # chunks or a delta tuple.
263 def _obj(self) -> OldUnpackedObject:
264 """Return the decompressed chunks, or (delta base, delta chunks)."""
265 if self.pack_type_num in DELTA_TYPES:
266 assert isinstance(self.delta_base, (bytes, int))
267 return (self.delta_base, self.decomp_chunks)
268 else:
269 return self.decomp_chunks
271 def __eq__(self, other):
272 if not isinstance(other, UnpackedObject):
273 return False
274 for slot in self.__slots__:
275 if getattr(self, slot) != getattr(other, slot):
276 return False
277 return True
279 def __ne__(self, other):
280 return not (self == other)
282 def __repr__(self) -> str:
283 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
284 return "{}({})".format(self.__class__.__name__, ", ".join(data))
287_ZLIB_BUFSIZE = 4096
290def read_zlib_chunks(
291 read_some: Callable[[int], bytes],
292 unpacked: UnpackedObject,
293 include_comp: bool = False,
294 buffer_size: int = _ZLIB_BUFSIZE,
295) -> bytes:
296 """Read zlib data from a buffer.
298 This function requires that the buffer have additional data following the
299 compressed data, which is guaranteed to be the case for git pack files.
301 Args:
302 read_some: Read function that returns at least one byte, but may
303 return less than the requested size.
304 unpacked: An UnpackedObject to write result data to. If its crc32
305 attr is not None, the CRC32 of the compressed bytes will be computed
306 using this starting CRC32.
307 After this function, will have the following attrs set:
308 * comp_chunks (if include_comp is True)
309 * decomp_chunks
310 * decomp_len
311 * crc32
312 include_comp: If True, include compressed data in the result.
313 buffer_size: Size of the read buffer.
314 Returns: Leftover unused data from the decompression.
316 Raises:
317 zlib.error: if a decompression error occurred.
318 """
319 if unpacked.decomp_len <= -1:
320 raise ValueError("non-negative zlib data stream size expected")
321 decomp_obj = zlib.decompressobj()
323 comp_chunks = []
324 decomp_chunks = unpacked.decomp_chunks
325 decomp_len = 0
326 crc32 = unpacked.crc32
328 while True:
329 add = read_some(buffer_size)
330 if not add:
331 raise zlib.error("EOF before end of zlib stream")
332 comp_chunks.append(add)
333 decomp = decomp_obj.decompress(add)
334 decomp_len += len(decomp)
335 decomp_chunks.append(decomp)
336 unused = decomp_obj.unused_data
337 if unused:
338 left = len(unused)
339 if crc32 is not None:
340 crc32 = binascii.crc32(add[:-left], crc32)
341 if include_comp:
342 comp_chunks[-1] = add[:-left]
343 break
344 elif crc32 is not None:
345 crc32 = binascii.crc32(add, crc32)
346 if crc32 is not None:
347 crc32 &= 0xFFFFFFFF
349 if decomp_len != unpacked.decomp_len:
350 raise zlib.error("decompressed data does not match expected size")
352 unpacked.crc32 = crc32
353 if include_comp:
354 unpacked.comp_chunks = comp_chunks
355 return unused
358def iter_sha1(iter):
359 """Return the hexdigest of the SHA1 over a set of names.
361 Args:
362 iter: Iterator over bytestrings
363 Returns: 40-byte hex SHA1 digest
364 """
365 sha = sha1()
366 for name in iter:
367 sha.update(name)
368 return sha.hexdigest().encode("ascii")
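# Illustrative sketch: iter_sha1() hashes the concatenation of the given
# bytestrings; PackIndex.objects_sha1() below uses it to derive the pack's
# file name.
def _example_iter_sha1() -> bytes:
    return iter_sha1(iter([b"abc", b"def"]))  # hex digest of sha1(b"abcdef"), as ASCII bytes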
371def load_pack_index(path):
372 """Load an index file by path.
374 Args:
375 path: Path to the index file
376 Returns: A PackIndex loaded from the given path
377 """
378 with GitFile(path, "rb") as f:
379 return load_pack_index_file(path, f)
382def _load_file_contents(f, size=None):
383 try:
384 fd = f.fileno()
385 except (UnsupportedOperation, AttributeError):
386 fd = None
387 # Attempt to use mmap if possible
388 if fd is not None:
389 if size is None:
390 size = os.fstat(fd).st_size
391 if has_mmap:
392 try:
393 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
394 except OSError:
395 # Perhaps a socket?
396 pass
397 else:
398 return contents, size
399 contents = f.read()
400 size = len(contents)
401 return contents, size
404def load_pack_index_file(path, f):
405 """Load an index file from a file-like object.
407 Args:
408 path: Path for the index file
409 f: File-like object
410 Returns: A PackIndex loaded from the given file
411 """
412 contents, size = _load_file_contents(f)
413 if contents[:4] == b"\377tOc":
414 version = struct.unpack(b">L", contents[4:8])[0]
415 if version == 2:
416 return PackIndex2(path, file=f, contents=contents, size=size)
417 else:
418 raise KeyError("Unknown pack index format %d" % version)
419 else:
420 return PackIndex1(path, file=f, contents=contents, size=size)
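# Illustrative sketch: load_pack_index() returns a PackIndex for an existing
# "pack-<sha>.idx" file; object_offset() then maps a (hex or binary) object
# SHA to its offset in the matching ".pack" file. The path argument is
# hypothetical.
def _example_lookup_offset(idx_path, hex_sha: bytes) -> int:
    index = load_pack_index(idx_path)
    return index.object_offset(hex_sha)  # raises KeyError if not in this pack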
423def bisect_find_sha(start, end, sha, unpack_name):
424 """Find a SHA in a data blob with sorted SHAs.
426 Args:
427 start: Start index of range to search
428 end: End index of range to search
429 sha: Sha to find
430 unpack_name: Callback to retrieve SHA by index
431 Returns: Index of the SHA, or None if it wasn't found
432 """
433 assert start <= end
434 while start <= end:
435 i = (start + end) // 2
436 file_sha = unpack_name(i)
437 if file_sha < sha:
438 start = i + 1
439 elif file_sha > sha:
440 end = i - 1
441 else:
442 return i
443 return None
446PackIndexEntry = Tuple[bytes, int, Optional[int]]
449class PackIndex:
450 """An index in to a packfile.
452 Given a sha id of an object a pack index can tell you the location in the
453 packfile of that object if it has it.
454 """
456 def __eq__(self, other):
457 if not isinstance(other, PackIndex):
458 return False
460 for (name1, _, _), (name2, _, _) in zip(
461 self.iterentries(), other.iterentries()
462 ):
463 if name1 != name2:
464 return False
465 return True
467 def __ne__(self, other):
468 return not self.__eq__(other)
470 def __len__(self) -> int:
471 """Return the number of entries in this pack index."""
472 raise NotImplementedError(self.__len__)
474 def __iter__(self) -> Iterator[bytes]:
475 """Iterate over the SHAs in this pack."""
476 return map(sha_to_hex, self._itersha())
478 def iterentries(self) -> Iterator[PackIndexEntry]:
479 """Iterate over the entries in this pack index.
481 Returns: iterator over tuples with object name, offset in packfile and
482 crc32 checksum.
483 """
484 raise NotImplementedError(self.iterentries)
486 def get_pack_checksum(self) -> bytes:
487 """Return the SHA1 checksum stored for the corresponding packfile.
489 Returns: 20-byte binary digest
490 """
491 raise NotImplementedError(self.get_pack_checksum)
493 def object_index(self, sha: bytes) -> int:
494 warnings.warn(
495 "Please use object_offset instead", DeprecationWarning, stacklevel=2
496 )
497 return self.object_offset(sha)
499 def object_offset(self, sha: bytes) -> int:
500 """Return the offset in to the corresponding packfile for the object.
502 Given the name of an object it will return the offset that object
503 lives at within the corresponding pack file. If the pack file doesn't
504 have the object then None will be returned.
505 """
506 raise NotImplementedError(self.object_offset)
508 def object_sha1(self, index: int) -> bytes:
509 """Return the SHA1 corresponding to the index in the pack file."""
510 for name, offset, crc32 in self.iterentries():
511 if offset == index:
512 return name
513 else:
514 raise KeyError(index)
516 def _object_offset(self, sha: bytes) -> int:
517 """See object_offset.
519 Args:
520 sha: A *binary* SHA string. (20 characters long)_
521 """
522 raise NotImplementedError(self._object_offset)
524 def objects_sha1(self) -> bytes:
525 """Return the hex SHA1 over all the shas of all objects in this pack.
527 Note: This is used for the filename of the pack.
528 """
529 return iter_sha1(self._itersha())
531 def _itersha(self) -> Iterator[bytes]:
532 """Yield all the SHA1's of the objects in the index, sorted."""
533 raise NotImplementedError(self._itersha)
535 def close(self):
536 pass
538 def check(self) -> None:
539 pass
542class MemoryPackIndex(PackIndex):
543 """Pack index that is stored entirely in memory."""
545 def __init__(self, entries, pack_checksum=None) -> None:
546 """Create a new MemoryPackIndex.
548 Args:
549 entries: Sequence of (name, offset, crc32) tuples, sorted by name
550 pack_checksum: Optional pack checksum
551 """
552 self._by_sha = {}
553 self._by_offset = {}
554 for name, offset, crc32 in entries:
555 self._by_sha[name] = offset
556 self._by_offset[offset] = name
557 self._entries = entries
558 self._pack_checksum = pack_checksum
560 def get_pack_checksum(self):
561 return self._pack_checksum
563 def __len__(self) -> int:
564 return len(self._entries)
566 def object_offset(self, sha):
567 if len(sha) == 40:
568 sha = hex_to_sha(sha)
569 return self._by_sha[sha]
571 def object_sha1(self, offset):
572 return self._by_offset[offset]
574 def _itersha(self):
575 return iter(self._by_sha)
577 def iterentries(self):
578 return iter(self._entries)
580 @classmethod
581 def for_pack(cls, pack):
582 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
584 @classmethod
585 def clone(cls, other_index):
586 return cls(other_index.iterentries(), other_index.get_pack_checksum())
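# Illustrative sketch: a MemoryPackIndex can be built directly from
# (name, offset, crc32) entries sorted by name. The SHAs here are fabricated
# placeholders rather than real object ids.
def _example_memory_pack_index() -> int:
    entries = [(b"\x01" * 20, 12, None), (b"\x02" * 20, 345, None)]
    index = MemoryPackIndex(entries)
    return index.object_offset(b"\x02" * 20)  # -> 345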
589class FilePackIndex(PackIndex):
590 """Pack index that is based on a file.
592 To do the loop it opens the file, and indexes first 256 4 byte groups
593 with the first byte of the sha id. The value in the four byte group indexed
594 is the end of the group that shares the same starting byte. Subtract one
595 from the starting byte and index again to find the start of the group.
596 The values are sorted by sha id within the group, so do the math to find
597 the start and end offset and then bisect in to find if the value is
598 present.
599 """
601 _fan_out_table: List[int]
603 def __init__(self, filename, file=None, contents=None, size=None) -> None:
604 """Create a pack index object.
606 Provide it with the name of the index file to consider, and it will map
607 it whenever required.
608 """
609 self._filename = filename
610 # Take the size now, so it can be checked each time we map the file to
611 # ensure that it hasn't changed.
612 if file is None:
613 self._file = GitFile(filename, "rb")
614 else:
615 self._file = file
616 if contents is None:
617 self._contents, self._size = _load_file_contents(self._file, size)
618 else:
619 self._contents, self._size = (contents, size)
621 @property
622 def path(self) -> str:
623 return self._filename
625 def __eq__(self, other):
626 # Quick optimization:
627 if (
628 isinstance(other, FilePackIndex)
629 and self._fan_out_table != other._fan_out_table
630 ):
631 return False
633 return super().__eq__(other)
635 def close(self) -> None:
636 self._file.close()
637 if getattr(self._contents, "close", None) is not None:
638 self._contents.close()
640 def __len__(self) -> int:
641 """Return the number of entries in this pack index."""
642 return self._fan_out_table[-1]
644 def _unpack_entry(self, i: int) -> PackIndexEntry:
645 """Unpack the i-th entry in the index file.
647 Returns: Tuple with object name (SHA), offset in pack file and CRC32
648 checksum (if known).
649 """
650 raise NotImplementedError(self._unpack_entry)
652 def _unpack_name(self, i):
653 """Unpack the i-th name from the index file."""
654 raise NotImplementedError(self._unpack_name)
656 def _unpack_offset(self, i):
657 """Unpack the i-th object offset from the index file."""
658 raise NotImplementedError(self._unpack_offset)
660 def _unpack_crc32_checksum(self, i):
661 """Unpack the crc32 checksum for the ith object from the index file."""
662 raise NotImplementedError(self._unpack_crc32_checksum)
664 def _itersha(self) -> Iterator[bytes]:
665 for i in range(len(self)):
666 yield self._unpack_name(i)
668 def iterentries(self) -> Iterator[PackIndexEntry]:
669 """Iterate over the entries in this pack index.
671 Returns: iterator over tuples with object name, offset in packfile and
672 crc32 checksum.
673 """
674 for i in range(len(self)):
675 yield self._unpack_entry(i)
677 def _read_fan_out_table(self, start_offset: int):
678 ret = []
679 for i in range(0x100):
680 fanout_entry = self._contents[
681 start_offset + i * 4 : start_offset + (i + 1) * 4
682 ]
683 ret.append(struct.unpack(">L", fanout_entry)[0])
684 return ret
686 def check(self) -> None:
687 """Check that the stored checksum matches the actual checksum."""
688 actual = self.calculate_checksum()
689 stored = self.get_stored_checksum()
690 if actual != stored:
691 raise ChecksumMismatch(stored, actual)
693 def calculate_checksum(self) -> bytes:
694 """Calculate the SHA1 checksum over this pack index.
696 Returns: 20-byte binary digest
697 """
698 return sha1(self._contents[:-20]).digest()
700 def get_pack_checksum(self) -> bytes:
701 """Return the SHA1 checksum stored for the corresponding packfile.
703 Returns: 20-byte binary digest
704 """
705 return bytes(self._contents[-40:-20])
707 def get_stored_checksum(self) -> bytes:
708 """Return the SHA1 checksum stored for this index.
710 Returns: 20-byte binary digest
711 """
712 return bytes(self._contents[-20:])
714 def object_offset(self, sha: bytes) -> int:
715 """Return the offset in to the corresponding packfile for the object.
717 Given the name of an object it will return the offset that object
718 lives at within the corresponding pack file. If the pack file doesn't
719 have the object then None will be returned.
720 """
721 if len(sha) == 40:
722 sha = hex_to_sha(sha)
723 try:
724 return self._object_offset(sha)
725 except ValueError as exc:
726 closed = getattr(self._contents, "closed", None)
727 if closed in (None, True):
728 raise PackFileDisappeared(self) from exc
729 raise
731 def _object_offset(self, sha: bytes) -> int:
732 """See object_offset.
734 Args:
735 sha: A *binary* SHA string (20 bytes long).
736 """
737 assert len(sha) == 20
738 idx = ord(sha[:1])
739 if idx == 0:
740 start = 0
741 else:
742 start = self._fan_out_table[idx - 1]
743 end = self._fan_out_table[idx]
744 i = bisect_find_sha(start, end, sha, self._unpack_name)
745 if i is None:
746 raise KeyError(sha)
747 return self._unpack_offset(i)
750class PackIndex1(FilePackIndex):
751 """Version 1 Pack Index file."""
753 def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
754 super().__init__(filename, file, contents, size)
755 self.version = 1
756 self._fan_out_table = self._read_fan_out_table(0)
758 def _unpack_entry(self, i):
759 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
760 return (name, offset, None)
762 def _unpack_name(self, i):
763 offset = (0x100 * 4) + (i * 24) + 4
764 return self._contents[offset : offset + 20]
766 def _unpack_offset(self, i):
767 offset = (0x100 * 4) + (i * 24)
768 return unpack_from(">L", self._contents, offset)[0]
770 def _unpack_crc32_checksum(self, i):
771 # Not stored in v1 index files
772 return None
775class PackIndex2(FilePackIndex):
776 """Version 2 Pack Index file."""
778 def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
779 super().__init__(filename, file, contents, size)
780 if self._contents[:4] != b"\377tOc":
781 raise AssertionError("Not a v2 pack index file")
782 (self.version,) = unpack_from(b">L", self._contents, 4)
783 if self.version != 2:
784 raise AssertionError("Version was %d" % self.version)
785 self._fan_out_table = self._read_fan_out_table(8)
786 self._name_table_offset = 8 + 0x100 * 4
787 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
788 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
789 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
790 self
791 )
793 def _unpack_entry(self, i):
794 return (
795 self._unpack_name(i),
796 self._unpack_offset(i),
797 self._unpack_crc32_checksum(i),
798 )
800 def _unpack_name(self, i):
801 offset = self._name_table_offset + i * 20
802 return self._contents[offset : offset + 20]
804 def _unpack_offset(self, i):
805 offset = self._pack_offset_table_offset + i * 4
806 offset = unpack_from(">L", self._contents, offset)[0]
807 if offset & (2**31):
808 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
809 offset = unpack_from(">Q", self._contents, offset)[0]
810 return offset
812 def _unpack_crc32_checksum(self, i):
813 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
816def read_pack_header(read) -> Tuple[int, int]:
817 """Read the header of a pack file.
819 Args:
820 read: Read function
821 Returns: Tuple of (pack version, number of objects). Raises an
822 AssertionError if the header is missing or not a valid pack header.
823 """
824 header = read(12)
825 if not header:
826 raise AssertionError("file too short to contain pack")
827 if header[:4] != b"PACK":
828 raise AssertionError(f"Invalid pack header {header!r}")
829 (version,) = unpack_from(b">L", header, 4)
830 if version not in (2, 3):
831 raise AssertionError("Version was %d" % version)
832 (num_objects,) = unpack_from(b">L", header, 8)
833 return (version, num_objects)
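# Illustrative sketch: a pack starts with b"PACK", a 4-byte big-endian version
# (2 or 3) and a 4-byte big-endian object count; read_pack_header() parses
# exactly those 12 bytes. The header below is constructed just for the example.
def _example_read_pack_header() -> None:
    header = b"PACK" + struct.pack(">L", 2) + struct.pack(">L", 3)
    version, num_objects = read_pack_header(BytesIO(header).read)
    assert (version, num_objects) == (2, 3)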
836def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
837 if isinstance(chunks, bytes):
838 return len(chunks)
839 else:
840 return sum(map(len, chunks))
843def unpack_object(
844 read_all: Callable[[int], bytes],
845 read_some: Optional[Callable[[int], bytes]] = None,
846 compute_crc32=False,
847 include_comp=False,
848 zlib_bufsize=_ZLIB_BUFSIZE,
849) -> Tuple[UnpackedObject, bytes]:
850 """Unpack a Git object.
852 Args:
853 read_all: Read function that blocks until the number of requested
854 bytes are read.
855 read_some: Read function that returns at least one byte, but may not
856 return the number of bytes requested.
857 compute_crc32: If True, compute the CRC32 of the compressed data. If
858 False, the returned CRC32 will be None.
859 include_comp: If True, include compressed data in the result.
860 zlib_bufsize: An optional buffer size for zlib operations.
861 Returns: A tuple of (unpacked, unused), where unused is the unused data
862 leftover from decompression, and unpacked in an UnpackedObject with
863 the following attrs set:
865 * obj_chunks (for non-delta types)
866 * pack_type_num
867 * delta_base (for delta types)
868 * comp_chunks (if include_comp is True)
869 * decomp_chunks
870 * decomp_len
871 * crc32 (if compute_crc32 is True)
872 """
873 if read_some is None:
874 read_some = read_all
875 if compute_crc32:
876 crc32 = 0
877 else:
878 crc32 = None
880 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
881 type_num = (raw[0] >> 4) & 0x07
882 size = raw[0] & 0x0F
883 for i, byte in enumerate(raw[1:]):
884 size += (byte & 0x7F) << ((i * 7) + 4)
886 delta_base: Union[int, bytes, None]
887 raw_base = len(raw)
888 if type_num == OFS_DELTA:
889 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
890 raw_base += len(raw)
891 if raw[-1] & 0x80:
892 raise AssertionError
893 delta_base_offset = raw[0] & 0x7F
894 for byte in raw[1:]:
895 delta_base_offset += 1
896 delta_base_offset <<= 7
897 delta_base_offset += byte & 0x7F
898 delta_base = delta_base_offset
899 elif type_num == REF_DELTA:
900 delta_base_obj = read_all(20)
901 if crc32 is not None:
902 crc32 = binascii.crc32(delta_base_obj, crc32)
903 delta_base = delta_base_obj
904 raw_base += 20
905 else:
906 delta_base = None
908 unpacked = UnpackedObject(
909 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
910 )
911 unused = read_zlib_chunks(
912 read_some,
913 unpacked,
914 buffer_size=zlib_bufsize,
915 include_comp=include_comp,
916 )
917 return unpacked, unused
920def _compute_object_size(value):
921 """Compute the size of a unresolved object for use with LRUSizeCache."""
922 (num, obj) = value
923 if num in DELTA_TYPES:
924 return chunks_length(obj[1])
925 return chunks_length(obj)
928class PackStreamReader:
929 """Class to read a pack stream.
931 The pack is read from a ReceivableProtocol using read() or recv() as
932 appropriate.
933 """
935 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
936 self.read_all = read_all
937 if read_some is None:
938 self.read_some = read_all
939 else:
940 self.read_some = read_some
941 self.sha = sha1()
942 self._offset = 0
943 self._rbuf = BytesIO()
944 # trailer is a deque to avoid memory allocation on small reads
945 self._trailer: Deque[bytes] = deque()
946 self._zlib_bufsize = zlib_bufsize
948 def _read(self, read, size):
949 """Read up to size bytes using the given callback.
951 As a side effect, update the verifier's hash (excluding the last 20
952 bytes read).
954 Args:
955 read: The read callback to read from.
956 size: The maximum number of bytes to read; the particular
957 behavior is callback-specific.
958 """
959 data = read(size)
961 # maintain a trailer of the last 20 bytes we've read
962 n = len(data)
963 self._offset += n
964 tn = len(self._trailer)
965 if n >= 20:
966 to_pop = tn
967 to_add = 20
968 else:
969 to_pop = max(n + tn - 20, 0)
970 to_add = n
971 self.sha.update(
972 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
973 )
974 self._trailer.extend(data[-to_add:])
976 # hash everything but the trailer
977 self.sha.update(data[:-to_add])
978 return data
980 def _buf_len(self):
981 buf = self._rbuf
982 start = buf.tell()
983 buf.seek(0, SEEK_END)
984 end = buf.tell()
985 buf.seek(start)
986 return end - start
988 @property
989 def offset(self):
990 return self._offset - self._buf_len()
992 def read(self, size):
993 """Read, blocking until size bytes are read."""
994 buf_len = self._buf_len()
995 if buf_len >= size:
996 return self._rbuf.read(size)
997 buf_data = self._rbuf.read()
998 self._rbuf = BytesIO()
999 return buf_data + self._read(self.read_all, size - buf_len)
1001 def recv(self, size):
1002 """Read up to size bytes, blocking until one byte is read."""
1003 buf_len = self._buf_len()
1004 if buf_len:
1005 data = self._rbuf.read(size)
1006 if size >= buf_len:
1007 self._rbuf = BytesIO()
1008 return data
1009 return self._read(self.read_some, size)
1011 def __len__(self) -> int:
1012 return self._num_objects
1014 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
1015 """Read the objects in this pack file.
1017 Args:
1018 compute_crc32: If True, compute the CRC32 of the compressed
1019 data. If False, the returned CRC32 will be None.
1020 Returns: Iterator over UnpackedObjects with the following members set:
1021 offset
1022 obj_type_num
1023 obj_chunks (for non-delta types)
1024 delta_base (for delta types)
1025 decomp_chunks
1026 decomp_len
1027 crc32 (if compute_crc32 is True)
1029 Raises:
1030 ChecksumMismatch: if the checksum of the pack contents does not
1031 match the checksum in the pack trailer.
1032 zlib.error: if an error occurred during zlib decompression.
1033 IOError: if an error occurred writing to the output file.
1034 """
1035 pack_version, self._num_objects = read_pack_header(self.read)
1037 for i in range(self._num_objects):
1038 offset = self.offset
1039 unpacked, unused = unpack_object(
1040 self.read,
1041 read_some=self.recv,
1042 compute_crc32=compute_crc32,
1043 zlib_bufsize=self._zlib_bufsize,
1044 )
1045 unpacked.offset = offset
1047 # prepend any unused data to current read buffer
1048 buf = BytesIO()
1049 buf.write(unused)
1050 buf.write(self._rbuf.read())
1051 buf.seek(0)
1052 self._rbuf = buf
1054 yield unpacked
1056 if self._buf_len() < 20:
1057 # If the read buffer is full, then the last read() got the whole
1058 # trailer off the wire. If not, it means there is still some of the
1059 # trailer to read. We need to read() all 20 bytes; N come from the
1060 # read buffer and (20 - N) come from the wire.
1061 self.read(20)
1063 pack_sha = bytearray(self._trailer) # type: ignore
1064 if pack_sha != self.sha.digest():
1065 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
1068class PackStreamCopier(PackStreamReader):
1069 """Class to verify a pack stream as it is being read.
1071 The pack is read from a ReceivableProtocol using read() or recv() as
1072 appropriate and written out to the given file-like object.
1073 """
1075 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
1076 """Initialize the copier.
1078 Args:
1079 read_all: Read function that blocks until the number of
1080 requested bytes are read.
1081 read_some: Read function that returns at least one byte, but may
1082 not return the number of bytes requested.
1083 outfile: File-like object to write output through.
1084 delta_iter: Optional DeltaChainIterator to record deltas as we
1085 read them.
1086 """
1087 super().__init__(read_all, read_some=read_some)
1088 self.outfile = outfile
1089 self._delta_iter = delta_iter
1091 def _read(self, read, size):
1092 """Read data from the read callback and write it to the file."""
1093 data = super()._read(read, size)
1094 self.outfile.write(data)
1095 return data
1097 def verify(self, progress=None):
1098 """Verify a pack stream and write it to the output file.
1100 See PackStreamReader.read_objects for a list of exceptions this may
1101 throw.
1102 """
1103 i = 0 # default count of entries if read_objects() is empty
1104 for i, unpacked in enumerate(self.read_objects()):
1105 if self._delta_iter:
1106 self._delta_iter.record(unpacked)
1107 if progress is not None:
1108 progress(
1109 ("copying pack entries: %d/%d\r" % (i, len(self))).encode("ascii")
1110 )
1111 if progress is not None:
1112 progress(("copied %d pack entries\n" % i).encode("ascii"))
1115def obj_sha(type, chunks):
1116 """Compute the SHA for a numeric type and object chunks."""
1117 sha = sha1()
1118 sha.update(object_header(type, chunks_length(chunks)))
1119 if isinstance(chunks, bytes):
1120 sha.update(chunks)
1121 else:
1122 for chunk in chunks:
1123 sha.update(chunk)
1124 return sha.digest()
1127def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
1128 """Hash a portion of a file into a new SHA.
1130 Args:
1131 f: A file-like object to read from that supports seek().
1132 start_ofs: The offset in the file to start reading at.
1133 end_ofs: The offset in the file to end reading at, relative to the
1134 end of the file.
1135 buffer_size: A buffer size for reading.
1136 Returns: A new SHA object updated with data read from the file.
1137 """
1138 sha = sha1()
1139 f.seek(0, SEEK_END)
1140 length = f.tell()
1141 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1142 raise AssertionError(
1143 "Attempt to read beyond file length. "
1144 "start_ofs: %d, end_ofs: %d, file length: %d" % (start_ofs, end_ofs, length)
1145 )
1146 todo = length + end_ofs - start_ofs
1147 f.seek(start_ofs)
1148 while todo:
1149 data = f.read(min(todo, buffer_size))
1150 sha.update(data)
1151 todo -= len(data)
1152 return sha
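# Illustrative sketch: a negative end_ofs excludes trailing bytes from the
# hash, which is how pack checksums skip the 20-byte SHA-1 trailer (see
# PackData.calculate_checksum below). The payload and trailer are made up.
def _example_compute_file_sha() -> bytes:
    f = BytesIO(b"payload" + b"\x00" * 20)  # pretend the zeros are a trailer
    return compute_file_sha(f, end_ofs=-20).digest()  # SHA-1 of b"payload"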
1155class PackData:
1156 """The data contained in a packfile.
1158 Pack files can be accessed both sequentially for exploding a pack, and
1159 directly with the help of an index to retrieve a specific object.
1161 The objects within are either complete or a delta against another.
1163 The header is variable length. If the MSB of a byte is set, the next
1164 byte is also part of the header.
1165 In the first byte the next three bits give the type, which tells you the
1166 kind of object and whether it is a delta; the low four bits are the lowest
1167 bits of the size. In each subsequent byte the low 7 bits are the next more
1168 significant bits of the size, so the last header byte holds the MS bits.
1170 For the complete objects the data is stored as zlib deflated data.
1171 The size in the header is the uncompressed object size, so to uncompress
1172 you need to just keep feeding data to zlib until you get an object back,
1173 or it errors on bad data. This is done here by just giving the complete
1174 buffer from the start of the deflated object on. This is bad, but until I
1175 get mmap sorted out it will have to do.
1177 Currently there are no integrity checks done. Also no attempt is made to
1178 try and detect the delta case, or a request for an object at the wrong
1179 position. It will all just throw a zlib or KeyError.
1180 """
1182 def __init__(self, filename, file=None, size=None) -> None:
1183 """Create a PackData object representing the pack in the given filename.
1185 The file must exist and stay readable until the object is disposed of.
1186 It must also stay the same size. It will be mapped whenever needed.
1188 Currently there is a restriction on the size of the pack as the python
1189 mmap implementation is flawed.
1190 """
1191 self._filename = filename
1192 self._size = size
1193 self._header_size = 12
1194 if file is None:
1195 self._file = GitFile(self._filename, "rb")
1196 else:
1197 self._file = file
1198 (version, self._num_objects) = read_pack_header(self._file.read)
1199 self._offset_cache = LRUSizeCache[int, Tuple[int, OldUnpackedObject]](
1200 1024 * 1024 * 20, compute_size=_compute_object_size
1201 )
1203 @property
1204 def filename(self):
1205 return os.path.basename(self._filename)
1207 @property
1208 def path(self):
1209 return self._filename
1211 @classmethod
1212 def from_file(cls, file, size=None):
1213 return cls(str(file), file=file, size=size)
1215 @classmethod
1216 def from_path(cls, path):
1217 return cls(filename=path)
1219 def close(self):
1220 self._file.close()
1222 def __enter__(self):
1223 return self
1225 def __exit__(self, exc_type, exc_val, exc_tb):
1226 self.close()
1228 def __eq__(self, other):
1229 if isinstance(other, PackData):
1230 return self.get_stored_checksum() == other.get_stored_checksum()
1231 return False
1233 def _get_size(self):
1234 if self._size is not None:
1235 return self._size
1236 self._size = os.path.getsize(self._filename)
1237 if self._size < self._header_size:
1238 errmsg = "%s is too small for a packfile (%d < %d)" % (
1239 self._filename,
1240 self._size,
1241 self._header_size,
1242 )
1243 raise AssertionError(errmsg)
1244 return self._size
1246 def __len__(self) -> int:
1247 """Returns the number of objects in this pack."""
1248 return self._num_objects
1250 def calculate_checksum(self):
1251 """Calculate the checksum for this pack.
1253 Returns: 20-byte binary SHA1 digest
1254 """
1255 return compute_file_sha(self._file, end_ofs=-20).digest()
1257 def iter_unpacked(self, *, include_comp: bool = False):
1258 self._file.seek(self._header_size)
1260 if self._num_objects is None:
1261 return
1263 for _ in range(self._num_objects):
1264 offset = self._file.tell()
1265 unpacked, unused = unpack_object(
1266 self._file.read, compute_crc32=False, include_comp=include_comp
1267 )
1268 unpacked.offset = offset
1269 yield unpacked
1270 # Back up over unused data.
1271 self._file.seek(-len(unused), SEEK_CUR)
1273 def iterentries(
1274 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
1275 ):
1276 """Yield entries summarizing the contents of this pack.
1278 Args:
1279 progress: Progress function, called with current and total
1280 object count.
1281 Returns: iterator of tuples with (sha, offset, crc32)
1282 """
1283 num_objects = self._num_objects
1284 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1285 for i, result in enumerate(indexer):
1286 if progress is not None:
1287 progress(i, num_objects)
1288 yield result
1290 def sorted_entries(
1291 self,
1292 progress: Optional[ProgressFn] = None,
1293 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1294 ):
1295 """Return entries in this pack, sorted by SHA.
1297 Args:
1298 progress: Progress function, called with current and total
1299 object count
1300 Returns: Iterator of tuples with (sha, offset, crc32)
1301 """
1302 return sorted(
1303 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
1304 )
1306 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
1307 """Create a version 1 file for this data file.
1309 Args:
1310 filename: Index filename.
1311 progress: Progress report function
1312 Returns: Checksum of index file
1313 """
1314 entries = self.sorted_entries(
1315 progress=progress, resolve_ext_ref=resolve_ext_ref
1316 )
1317 with GitFile(filename, "wb") as f:
1318 return write_pack_index_v1(f, entries, self.calculate_checksum())
1320 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
1321 """Create a version 2 index file for this data file.
1323 Args:
1324 filename: Index filename.
1325 progress: Progress report function
1326 Returns: Checksum of index file
1327 """
1328 entries = self.sorted_entries(
1329 progress=progress, resolve_ext_ref=resolve_ext_ref
1330 )
1331 with GitFile(filename, "wb") as f:
1332 return write_pack_index_v2(f, entries, self.calculate_checksum())
1334 def create_index(self, filename, progress=None, version=2, resolve_ext_ref=None):
1335 """Create an index file for this data file.
1337 Args:
1338 filename: Index filename.
1339 progress: Progress report function
1340 Returns: Checksum of index file
1341 """
1342 if version == 1:
1343 return self.create_index_v1(
1344 filename, progress, resolve_ext_ref=resolve_ext_ref
1345 )
1346 elif version == 2:
1347 return self.create_index_v2(
1348 filename, progress, resolve_ext_ref=resolve_ext_ref
1349 )
1350 else:
1351 raise ValueError("unknown index format %d" % version)
1353 def get_stored_checksum(self):
1354 """Return the expected checksum stored in this pack."""
1355 self._file.seek(-20, SEEK_END)
1356 return self._file.read(20)
1358 def check(self):
1359 """Check the consistency of this pack."""
1360 actual = self.calculate_checksum()
1361 stored = self.get_stored_checksum()
1362 if actual != stored:
1363 raise ChecksumMismatch(stored, actual)
1365 def get_unpacked_object_at(
1366 self, offset: int, *, include_comp: bool = False
1367 ) -> UnpackedObject:
1368 """Given offset in the packfile return a UnpackedObject."""
1369 assert offset >= self._header_size
1370 self._file.seek(offset)
1371 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1372 unpacked.offset = offset
1373 return unpacked
1375 def get_object_at(self, offset: int) -> Tuple[int, OldUnpackedObject]:
1376 """Given an offset in to the packfile return the object that is there.
1378 Using the associated index the location of an object can be looked up,
1379 and then the packfile can be asked directly for that object using this
1380 function.
1381 """
1382 try:
1383 return self._offset_cache[offset]
1384 except KeyError:
1385 pass
1386 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1387 return (unpacked.pack_type_num, unpacked._obj())
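# Illustrative sketch: combining a pack index with PackData to fetch the raw
# contents of one object. The paths are hypothetical, and the object is
# assumed not to be stored as a delta (deltas need their base resolved first,
# e.g. via the DeltaChainIterator subclasses below).
def _example_read_object(idx_path, pack_path, hex_sha: bytes) -> bytes:
    index = load_pack_index(idx_path)
    with PackData(pack_path) as data:
        type_num, chunks = data.get_object_at(index.object_offset(hex_sha))
        assert type_num not in DELTA_TYPES
        return b"".join(chunks)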
1390T = TypeVar("T")
1393class DeltaChainIterator(Generic[T]):
1394 """Abstract iterator over pack data based on delta chains.
1396 Each object in the pack is guaranteed to be inflated exactly once,
1397 regardless of how many objects reference it as a delta base. As a result,
1398 memory usage is proportional to the length of the longest delta chain.
1400 Subclasses can override _result to define the result type of the iterator.
1401 By default, results are UnpackedObjects with the following members set:
1403 * offset
1404 * obj_type_num
1405 * obj_chunks
1406 * pack_type_num
1407 * delta_base (for delta types)
1408 * comp_chunks (if _include_comp is True)
1409 * decomp_chunks
1410 * decomp_len
1411 * crc32 (if _compute_crc32 is True)
1412 """
1414 _compute_crc32 = False
1415 _include_comp = False
1417 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
1418 self._file = file_obj
1419 self._resolve_ext_ref = resolve_ext_ref
1420 self._pending_ofs: Dict[int, List[int]] = defaultdict(list)
1421 self._pending_ref: Dict[bytes, List[int]] = defaultdict(list)
1422 self._full_ofs: List[Tuple[int, int]] = []
1423 self._ext_refs: List[bytes] = []
1425 @classmethod
1426 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
1427 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1428 walker.set_pack_data(pack_data)
1429 for unpacked in pack_data.iter_unpacked(include_comp=False):
1430 walker.record(unpacked)
1431 return walker
1433 @classmethod
1434 def for_pack_subset(
1435 cls,
1436 pack: "Pack",
1437 shas: Iterable[bytes],
1438 *,
1439 allow_missing: bool = False,
1440 resolve_ext_ref=None,
1441 ):
1442 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1443 walker.set_pack_data(pack.data)
1444 todo = set()
1445 for sha in shas:
1446 assert isinstance(sha, bytes)
1447 try:
1448 off = pack.index.object_offset(sha)
1449 except KeyError:
1450 if not allow_missing:
1451 raise
1452 else:
1453 todo.add(off)
1454 done = set()
1455 while todo:
1456 off = todo.pop()
1457 unpacked = pack.data.get_unpacked_object_at(off)
1458 walker.record(unpacked)
1459 done.add(off)
1460 base_ofs = None
1461 if unpacked.pack_type_num == OFS_DELTA:
1462 base_ofs = unpacked.offset - unpacked.delta_base
1463 elif unpacked.pack_type_num == REF_DELTA:
1464 with suppress(KeyError):
1465 assert isinstance(unpacked.delta_base, bytes)
1466 base_ofs = pack.index.object_index(unpacked.delta_base)
1467 if base_ofs is not None and base_ofs not in done:
1468 todo.add(base_ofs)
1469 return walker
1471 def record(self, unpacked: UnpackedObject) -> None:
1472 type_num = unpacked.pack_type_num
1473 offset = unpacked.offset
1474 if type_num == OFS_DELTA:
1475 base_offset = offset - unpacked.delta_base
1476 self._pending_ofs[base_offset].append(offset)
1477 elif type_num == REF_DELTA:
1478 assert isinstance(unpacked.delta_base, bytes)
1479 self._pending_ref[unpacked.delta_base].append(offset)
1480 else:
1481 self._full_ofs.append((offset, type_num))
1483 def set_pack_data(self, pack_data: PackData) -> None:
1484 self._file = pack_data._file
1486 def _walk_all_chains(self):
1487 for offset, type_num in self._full_ofs:
1488 yield from self._follow_chain(offset, type_num, None)
1489 yield from self._walk_ref_chains()
1490 assert not self._pending_ofs, repr(self._pending_ofs)
1492 def _ensure_no_pending(self) -> None:
1493 if self._pending_ref:
1494 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])
1496 def _walk_ref_chains(self):
1497 if not self._resolve_ext_ref:
1498 self._ensure_no_pending()
1499 return
1501 for base_sha, pending in sorted(self._pending_ref.items()):
1502 if base_sha not in self._pending_ref:
1503 continue
1504 try:
1505 type_num, chunks = self._resolve_ext_ref(base_sha)
1506 except KeyError:
1507 # Not an external ref, but may depend on one. Either it will
1508 # get popped via a _follow_chain call, or we will raise an
1509 # error below.
1510 continue
1511 self._ext_refs.append(base_sha)
1512 self._pending_ref.pop(base_sha)
1513 for new_offset in pending:
1514 yield from self._follow_chain(new_offset, type_num, chunks)
1516 self._ensure_no_pending()
1518 def _result(self, unpacked: UnpackedObject) -> T:
1519 raise NotImplementedError
1521 def _resolve_object(
1522 self, offset: int, obj_type_num: int, base_chunks: List[bytes]
1523 ) -> UnpackedObject:
1524 self._file.seek(offset)
1525 unpacked, _ = unpack_object(
1526 self._file.read,
1527 include_comp=self._include_comp,
1528 compute_crc32=self._compute_crc32,
1529 )
1530 unpacked.offset = offset
1531 if base_chunks is None:
1532 assert unpacked.pack_type_num == obj_type_num
1533 else:
1534 assert unpacked.pack_type_num in DELTA_TYPES
1535 unpacked.obj_type_num = obj_type_num
1536 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
1537 return unpacked
1539 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: List[bytes]):
1540 # Unlike PackData.get_object_at, there is no need to cache offsets as
1541 # this approach by design inflates each object exactly once.
1542 todo = [(offset, obj_type_num, base_chunks)]
1543 while todo:
1544 (offset, obj_type_num, base_chunks) = todo.pop()
1545 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1546 yield self._result(unpacked)
1548 unblocked = chain(
1549 self._pending_ofs.pop(unpacked.offset, []),
1550 self._pending_ref.pop(unpacked.sha(), []),
1551 )
1552 todo.extend(
1553 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
1554 for new_offset in unblocked
1555 )
1557 def __iter__(self) -> Iterator[T]:
1558 return self._walk_all_chains()
1560 def ext_refs(self):
1561 return self._ext_refs
1564class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
1565 """Delta chain iterator that yield unpacked objects."""
1567 def _result(self, unpacked):
1568 return unpacked
1571class PackIndexer(DeltaChainIterator[PackIndexEntry]):
1572 """Delta chain iterator that yields index entries."""
1574 _compute_crc32 = True
1576 def _result(self, unpacked):
1577 return unpacked.sha(), unpacked.offset, unpacked.crc32
1580class PackInflater(DeltaChainIterator[ShaFile]):
1581 """Delta chain iterator that yields ShaFile objects."""
1583 def _result(self, unpacked):
1584 return unpacked.sha_file()
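# Illustrative sketch: PackInflater walks a PackData file and yields fully
# resolved ShaFile objects, inflating each delta chain exactly once. The path
# is hypothetical and the pack is assumed to be self-contained (no external
# ref deltas, which would otherwise raise UnresolvedDeltas).
def _example_inflate_pack(pack_path) -> List[ShaFile]:
    with PackData(pack_path) as data:
        return list(PackInflater.for_pack_data(data))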
1587class SHA1Reader:
1588 """Wrapper for file-like object that remembers the SHA1 of its data."""
1590 def __init__(self, f) -> None:
1591 self.f = f
1592 self.sha1 = sha1(b"")
1594 def read(self, num=None):
1595 data = self.f.read(num)
1596 self.sha1.update(data)
1597 return data
1599 def check_sha(self):
1600 stored = self.f.read(20)
1601 if stored != self.sha1.digest():
1602 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1604 def close(self):
1605 return self.f.close()
1607 def tell(self):
1608 return self.f.tell()
1611class SHA1Writer:
1612 """Wrapper for file-like object that remembers the SHA1 of its data."""
1614 def __init__(self, f) -> None:
1615 self.f = f
1616 self.length = 0
1617 self.sha1 = sha1(b"")
1619 def write(self, data):
1620 self.sha1.update(data)
1621 self.f.write(data)
1622 self.length += len(data)
1624 def write_sha(self):
1625 sha = self.sha1.digest()
1626 assert len(sha) == 20
1627 self.f.write(sha)
1628 self.length += len(sha)
1629 return sha
1631 def close(self):
1632 sha = self.write_sha()
1633 self.f.close()
1634 return sha
1636 def offset(self):
1637 return self.length
1639 def tell(self):
1640 return self.f.tell()
1643def pack_object_header(type_num, delta_base, size):
1644 """Create a pack object header for the given object info.
1646 Args:
1647 type_num: Numeric type of the object.
1648 delta_base: Delta base offset or ref, or None for whole objects.
1649 size: Uncompressed object size.
1650 Returns: A header for a packed object.
1651 """
1652 header = []
1653 c = (type_num << 4) | (size & 15)
1654 size >>= 4
1655 while size:
1656 header.append(c | 0x80)
1657 c = size & 0x7F
1658 size >>= 7
1659 header.append(c)
1660 if type_num == OFS_DELTA:
1661 ret = [delta_base & 0x7F]
1662 delta_base >>= 7
1663 while delta_base:
1664 delta_base -= 1
1665 ret.insert(0, 0x80 | (delta_base & 0x7F))
1666 delta_base >>= 7
1667 header.extend(ret)
1668 elif type_num == REF_DELTA:
1669 assert len(delta_base) == 20
1670 header += delta_base
1671 return bytearray(header)
1674def pack_object_chunks(type, object, compression_level=-1):
1675 """Generate chunks for a pack object.
1677 Args:
1678 type: Numeric type of the object
1679 object: Object to write
1680 compression_level: the zlib compression level
1681 Returns: Chunks
1682 """
1683 if type in DELTA_TYPES:
1684 delta_base, object = object
1685 else:
1686 delta_base = None
1687 if isinstance(object, bytes):
1688 object = [object]
1689 yield bytes(pack_object_header(type, delta_base, sum(map(len, object))))
1690 compressor = zlib.compressobj(level=compression_level)
1691 for data in object:
1692 yield compressor.compress(data)
1693 yield compressor.flush()
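# Illustrative sketch: pack_object_chunks() yields the object header followed
# by the zlib-compressed body, and unpack_object() can read it back. Blob type
# number 3 and the trailing padding (unpack_object expects data after the
# compressed stream, as a real pack trailer provides) are illustrative.
def _example_pack_roundtrip() -> None:
    data = b"".join(pack_object_chunks(3, b"hello world")) + b"\x00" * 20
    unpacked, _unused = unpack_object(BytesIO(data).read)
    assert unpacked.obj_type_num == 3
    assert b"".join(unpacked.decomp_chunks) == b"hello world"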
1696def write_pack_object(write, type, object, sha=None, compression_level=-1):
1697 """Write pack object to a file.
1699 Args:
1700 write: Write function to use
1701 type: Numeric type of the object
1702 object: Object to write
1703 compression_level: the zlib compression level
1704 Returns: CRC32 checksum of the data written
1705 """
1706 crc32 = 0
1707 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
1708 write(chunk)
1709 if sha is not None:
1710 sha.update(chunk)
1711 crc32 = binascii.crc32(chunk, crc32)
1712 return crc32 & 0xFFFFFFFF
1715def write_pack(
1716 filename,
1717 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]],
1718 *,
1719 deltify: Optional[bool] = None,
1720 delta_window_size: Optional[int] = None,
1721 compression_level: int = -1,
1722):
1723 """Write a new pack data file.
1725 Args:
1726 filename: Path to the new pack file (without .pack extension)
1727 objects: Sequence of objects to write, either ShaFile instances or
1728 (ShaFile, path) tuples
1729 delta_window_size: Delta window size
1730 deltify: Whether to deltify pack objects
1731 compression_level: the zlib compression level
1732 Returns: Tuple with checksum of pack file and index file
1733 """
1734 with GitFile(filename + ".pack", "wb") as f:
1735 entries, data_sum = write_pack_objects(
1736 f.write,
1737 objects,
1738 delta_window_size=delta_window_size,
1739 deltify=deltify,
1740 compression_level=compression_level,
1741 )
1742 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
1743 with GitFile(filename + ".idx", "wb") as f:
1744 return data_sum, write_pack_index_v2(f, entries, data_sum)
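# Illustrative sketch: write_pack() writes "<basename>.pack" and
# "<basename>.idx" for a sequence of (object, path) tuples. The blob and the
# basename are made up for the example.
def _example_write_pack(basename) -> None:
    from dulwich.objects import Blob
    blob = Blob.from_string(b"example contents")
    write_pack(basename, [(blob, None)])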
1747def pack_header_chunks(num_objects):
1748 """Yield chunks for a pack header."""
1749 yield b"PACK" # Pack header
1750 yield struct.pack(b">L", 2) # Pack version
1751 yield struct.pack(b">L", num_objects) # Number of objects in pack
1754def write_pack_header(write, num_objects):
1755 """Write a pack header for the given number of objects."""
1756 if hasattr(write, "write"):
1757 write = write.write
1758 warnings.warn(
1759 "write_pack_header() now takes a write rather than file argument",
1760 DeprecationWarning,
1761 stacklevel=2,
1762 )
1763 for chunk in pack_header_chunks(num_objects):
1764 write(chunk)
1767def find_reusable_deltas(
1768 container: PackedObjectContainer,
1769 object_ids: Set[bytes],
1770 *,
1771 other_haves: Optional[Set[bytes]] = None,
1772 progress=None,
1773) -> Iterator[UnpackedObject]:
1774 if other_haves is None:
1775 other_haves = set()
1776 reused = 0
1777 for i, unpacked in enumerate(
1778 container.iter_unpacked_subset(
1779 object_ids, allow_missing=True, convert_ofs_delta=True
1780 )
1781 ):
1782 if progress is not None and i % 1000 == 0:
1783 progress(
1784 ("checking for reusable deltas: %d/%d\r" % (i, len(object_ids))).encode(
1785 "utf-8"
1786 )
1787 )
1788 if unpacked.pack_type_num == REF_DELTA:
1789 hexsha = sha_to_hex(unpacked.delta_base)
1790 if hexsha in object_ids or hexsha in other_haves:
1791 yield unpacked
1792 reused += 1
1793 if progress is not None:
1794 progress(("found %d deltas to reuse\n" % (reused,)).encode("utf-8"))
1797def deltify_pack_objects(
1798 objects: Union[Iterator[bytes], Iterator[Tuple[ShaFile, Optional[bytes]]]],
1799 *,
1800 window_size: Optional[int] = None,
1801 progress=None,
1802) -> Iterator[UnpackedObject]:
1803 """Generate deltas for pack objects.
1805 Args:
1806 objects: An iterable of (object, path) tuples to deltify.
1807 window_size: Window size; None for default
1808 Returns: Iterator of UnpackedObject entries; delta_base is None for
1809 full-text entries
1810 """
1812 def objects_with_hints():
1813 for e in objects:
1814 if isinstance(e, ShaFile):
1815 yield (e, (e.type_num, None))
1816 else:
1817 yield (e[0], (e[0].type_num, e[1]))
1819 yield from deltas_from_sorted_objects(
1820 sort_objects_for_delta(objects_with_hints()),
1821 window_size=window_size,
1822 progress=progress,
1823 )
1826def sort_objects_for_delta(
1827 objects: Union[Iterator[ShaFile], Iterator[Tuple[ShaFile, Optional[PackHint]]]],
1828) -> Iterator[ShaFile]:
1829 magic = []
1830 for entry in objects:
1831 if isinstance(entry, tuple):
1832 obj, hint = entry
1833 if hint is None:
1834 type_num = None
1835 path = None
1836 else:
1837 (type_num, path) = hint
1838 else:
1839 obj = entry
1840 magic.append((type_num, path, -obj.raw_length(), obj))
1841 # Build a list of objects ordered by the magic Linus heuristic
1842 # This helps us find good objects to diff against us
1843 magic.sort()
1844 return (x[3] for x in magic)
1847def deltas_from_sorted_objects(
1848 objects, window_size: Optional[int] = None, progress=None
1849):
1850 # TODO(jelmer): Use threads
1851 if window_size is None:
1852 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
1854 possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque()
1855 for i, o in enumerate(objects):
1856 if progress is not None and i % 1000 == 0:
1857 progress(("generating deltas: %d\r" % (i,)).encode("utf-8"))
1858 raw = o.as_raw_chunks()
1859 winner = raw
1860 winner_len = sum(map(len, winner))
1861 winner_base = None
1862 for base_id, base_type_num, base in possible_bases:
1863 if base_type_num != o.type_num:
1864 continue
1865 delta_len = 0
1866 delta = []
1867 for chunk in create_delta(base, raw):
1868 delta_len += len(chunk)
1869 if delta_len >= winner_len:
1870 break
1871 delta.append(chunk)
1872 else:
1873 winner_base = base_id
1874 winner = delta
1875 winner_len = sum(map(len, winner))
1876 yield UnpackedObject(
1877 o.type_num,
1878 sha=o.sha().digest(),
1879 delta_base=winner_base,
1880 decomp_len=winner_len,
1881 decomp_chunks=winner,
1882 )
1883 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
1884 while len(possible_bases) > window_size:
1885 possible_bases.pop()
1888def pack_objects_to_data(
1889 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]],
1890 *,
1891 deltify: Optional[bool] = None,
1892 delta_window_size: Optional[int] = None,
1893 ofs_delta: bool = True,
1894 progress=None,
1895) -> Tuple[int, Iterator[UnpackedObject]]:
1896 """Create pack data from objects.
1898 Args:
1899 objects: Sequence of ShaFile objects or (object, path) tuples to pack
1900 Returns: Tuple of (object count, iterator over UnpackedObject entries)
1901 """
1902 # TODO(jelmer): support deltaifying
1903 count = len(objects)
1904 if deltify is None:
1905 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
1906 # slow at the moment.
1907 deltify = False
1908 if deltify:
1909 return (
1910 count,
1911 deltify_pack_objects(
1912 iter(objects), # type: ignore
1913 window_size=delta_window_size,
1914 progress=progress,
1915 ),
1916 )
1917 else:
1919 def iter_without_path():
1920 for o in objects:
1921 if isinstance(o, tuple):
1922 yield full_unpacked_object(o[0])
1923 else:
1924 yield full_unpacked_object(o)
1926 return (count, iter_without_path())
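# Illustrative sketch (not part of dulwich): without deltification every
# record comes back as a full-text UnpackedObject, paired with the object
# count that the pack header needs.
def _example_pack_objects_to_data():
    from dulwich.objects import Blob
    blobs = [Blob.from_string(b"hello"), Blob.from_string(b"world")]
    count, records = pack_objects_to_data(blobs)
    assert count == 2
    assert all(rec.delta_base is None for rec in records)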
1929def generate_unpacked_objects(
1930 container: PackedObjectContainer,
1931 object_ids: Sequence[Tuple[ObjectID, Optional[PackHint]]],
1932 delta_window_size: Optional[int] = None,
1933 deltify: Optional[bool] = None,
1934 reuse_deltas: bool = True,
1935 ofs_delta: bool = True,
1936 other_haves: Optional[Set[bytes]] = None,
1937 progress=None,
1938) -> Iterator[UnpackedObject]:
1939 """Create pack data from the objects in a container.
1941 Args:
1942 object_ids: Sequence of (object id, pack hint) tuples to include
1943 Returns: Iterator over UnpackedObject entries
1944 """
1945 todo = dict(object_ids)
1946 if reuse_deltas:
1947 for unpack in find_reusable_deltas(
1948 container, set(todo), other_haves=other_haves, progress=progress
1949 ):
1950 del todo[sha_to_hex(unpack.sha())]
1951 yield unpack
1952 if deltify is None:
1953 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
1954 # slow at the moment.
1955 deltify = False
1956 if deltify:
1957 objects_to_delta = container.iterobjects_subset(
1958 todo.keys(), allow_missing=False
1959 )
1960 yield from deltas_from_sorted_objects(
1961 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
1962 window_size=delta_window_size,
1963 progress=progress,
1964 )
1965 else:
1966 for oid in todo:
1967 yield full_unpacked_object(container[oid])
1970def full_unpacked_object(o: ShaFile) -> UnpackedObject:
1971 return UnpackedObject(
1972 o.type_num,
1973 delta_base=None,
1974 crc32=None,
1975 decomp_chunks=o.as_raw_chunks(),
1976 sha=o.sha().digest(),
1977 )
1980def write_pack_from_container(
1981 write,
1982 container: PackedObjectContainer,
1983 object_ids: Sequence[Tuple[ObjectID, Optional[PackHint]]],
1984 delta_window_size: Optional[int] = None,
1985 deltify: Optional[bool] = None,
1986 reuse_deltas: bool = True,
1987 compression_level: int = -1,
1988 other_haves: Optional[Set[bytes]] = None,
1989):
1990 """Write a new pack data file.
1992 Args:
1993 write: write function to use
1994 container: PackedObjectContainer
1995 object_ids: Sequence of (object id, pack hint) tuples to write
1996 delta_window_size: Sliding window size for searching for deltas;
1997 Set to None for default window size.
1998 deltify: Whether to deltify objects
1999 compression_level: the zlib compression level to use
2000 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2001 """
2002 pack_contents_count = len(object_ids)
2003 pack_contents = generate_unpacked_objects(
2004 container,
2005 object_ids,
2006 delta_window_size=delta_window_size,
2007 deltify=deltify,
2008 reuse_deltas=reuse_deltas,
2009 other_haves=other_haves,
2010 )
2012 return write_pack_data(
2013 write,
2014 pack_contents,
2015 num_records=pack_contents_count,
2016 compression_level=compression_level,
2017 )
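# Hedged sketch (not part of dulwich): stream a pack for a set of objects out
# of any PackedObjectContainer, e.g. an existing Pack or an object store pack.
# `container`, `object_ids` and `path` are hypothetical stand-ins supplied by
# the caller; object_ids is a sequence of (object id, pack hint) tuples.
def _example_write_pack_from_container(container, object_ids, path):
    with open(path, "wb") as f:
        entries, checksum = write_pack_from_container(f.write, container, object_ids)
    return entries, checksum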
2020def write_pack_objects(
2021 write,
2022 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]],
2023 *,
2024 delta_window_size: Optional[int] = None,
2025 deltify: Optional[bool] = None,
2026 compression_level: int = -1,
2027):
2028 """Write a new pack data file.
2030 Args:
2031 write: write function to use
2032 objects: Sequence of ShaFile objects or (object, path) tuples to write
2033 delta_window_size: Sliding window size for searching for deltas;
2034 Set to None for default window size.
2035 deltify: Whether to deltify objects
2036 compression_level: the zlib compression level to use
2037 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2038 """
2039 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2041 return write_pack_data(
2042 write,
2043 pack_contents,
2044 num_records=pack_contents_count,
2045 compression_level=compression_level,
2046 )
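# Illustrative sketch (not part of dulwich): write a one-object pack into a
# buffer. The returned entries map each binary SHA to (offset, crc32) and the
# returned checksum is the trailing pack SHA-1 that ends the stream.
def _example_write_pack_objects():
    from io import BytesIO
    from dulwich.objects import Blob
    buf = BytesIO()
    entries, checksum = write_pack_objects(
        buf.write, [(Blob.from_string(b"hello"), None)]
    )
    assert buf.getvalue().endswith(checksum)
    assert all(len(sha) == 20 for sha in entries)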
2049class PackChunkGenerator:
2050 def __init__(
2051 self,
2052 num_records=None,
2053 records=None,
2054 progress=None,
2055 compression_level=-1,
2056 reuse_compressed=True,
2057 ) -> None:
2058 self.cs = sha1(b"")
2059 self.entries: Dict[Union[int, bytes], Tuple[int, int]] = {}
2060 self._it = self._pack_data_chunks(
2061 num_records=num_records,
2062 records=records,
2063 progress=progress,
2064 compression_level=compression_level,
2065 reuse_compressed=reuse_compressed,
2066 )
2068 def sha1digest(self):
2069 return self.cs.digest()
2071 def __iter__(self):
2072 return self._it
2074 def _pack_data_chunks(
2075 self,
2076 records: Iterator[UnpackedObject],
2077 *,
2078 num_records=None,
2079 progress=None,
2080 compression_level: int = -1,
2081 reuse_compressed: bool = True,
2082 ) -> Iterator[bytes]:
2083 """Iterate pack data file chunks.
2085 Args:
2086 records: Iterator over UnpackedObject
2087 num_records: Number of records (defaults to len(records) if not specified)
2088 progress: Function to report progress to
2089 compression_level: the zlib compression level
2090 Returns: Iterator over pack file chunks; self.entries and the pack checksum are filled in as the chunks are consumed.
2091 """
2092 # Write the pack
2093 if num_records is None:
2094 num_records = len(records) # type: ignore
2095 offset = 0
2096 for chunk in pack_header_chunks(num_records):
2097 yield chunk
2098 self.cs.update(chunk)
2099 offset += len(chunk)
2100 actual_num_records = 0
2101 for i, unpacked in enumerate(records):
2102 type_num = unpacked.pack_type_num
2103 if progress is not None and i % 1000 == 0:
2104 progress(
2105 ("writing pack data: %d/%d\r" % (i, num_records)).encode("ascii")
2106 )
2107 raw: Union[List[bytes], Tuple[int, List[bytes]], Tuple[bytes, List[bytes]]]
2108 if unpacked.delta_base is not None:
2109 try:
2110 base_offset, base_crc32 = self.entries[unpacked.delta_base]
2111 except KeyError:
2112 type_num = REF_DELTA
2113 assert isinstance(unpacked.delta_base, bytes)
2114 raw = (unpacked.delta_base, unpacked.decomp_chunks)
2115 else:
2116 type_num = OFS_DELTA
2117 raw = (offset - base_offset, unpacked.decomp_chunks)
2118 else:
2119 raw = unpacked.decomp_chunks
2120 if unpacked.comp_chunks is not None and reuse_compressed:
2121 chunks = unpacked.comp_chunks
2122 else:
2123 chunks = pack_object_chunks(
2124 type_num, raw, compression_level=compression_level
2125 )
2126 crc32 = 0
2127 object_size = 0
2128 for chunk in chunks:
2129 yield chunk
2130 crc32 = binascii.crc32(chunk, crc32)
2131 self.cs.update(chunk)
2132 object_size += len(chunk)
2133 actual_num_records += 1
2134 self.entries[unpacked.sha()] = (offset, crc32)
2135 offset += object_size
2136 if actual_num_records != num_records:
2137 raise AssertionError(
2138 "actual records written differs: %d != %d"
2139 % (actual_num_records, num_records)
2140 )
2142 yield self.cs.digest()
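# Illustrative sketch (not part of dulwich): PackChunkGenerator is the
# streaming core behind write_pack_data(); a consumer can forward each chunk
# (e.g. to a socket) and read entries / the pack checksum once exhausted.
def _example_pack_chunk_generator():
    from dulwich.objects import Blob
    records = [full_unpacked_object(Blob.from_string(b"streamed"))]
    gen = PackChunkGenerator(num_records=len(records), records=iter(records))
    total = sum(len(chunk) for chunk in gen)  # a real consumer would send(chunk)
    assert total > 12 and len(gen.sha1digest()) == 20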
2145def write_pack_data(
2146 write,
2147 records: Iterator[UnpackedObject],
2148 *,
2149 num_records=None,
2150 progress=None,
2151 compression_level=-1,
2152):
2153 """Write a new pack data file.
2155 Args:
2156 write: Write function to use
2157 num_records: Number of records (defaults to len(records) if None)
2158 records: Iterator over UnpackedObject entries to write
2159 progress: Function to report progress to
2160 compression_level: the zlib compression level
2161 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2162 """
2163 chunk_generator = PackChunkGenerator(
2164 num_records=num_records,
2165 records=records,
2166 progress=progress,
2167 compression_level=compression_level,
2168 )
2169 for chunk in chunk_generator:
2170 write(chunk)
2171 return chunk_generator.entries, chunk_generator.sha1digest()
2174def write_pack_index_v1(f, entries, pack_checksum):
2175 """Write a new pack index file.
2177 Args:
2178 f: A file-like object to write to
2179 entries: List of tuples with object name (sha), offset_in_pack,
2180 and crc32_checksum.
2181 pack_checksum: Checksum of the pack file.
2182 Returns: The SHA of the written index file
2183 """
2184 f = SHA1Writer(f)
2185 fan_out_table = defaultdict(lambda: 0)
2186 for name, offset, entry_checksum in entries:
2187 fan_out_table[ord(name[:1])] += 1
2188 # Fan-out table
2189 for i in range(0x100):
2190 f.write(struct.pack(">L", fan_out_table[i]))
2191 fan_out_table[i + 1] += fan_out_table[i]
2192 for name, offset, entry_checksum in entries:
2193 if not (offset <= 0xFFFFFFFF):
2194 raise TypeError("pack index format 1 only supports offsets that fit in 32 bits")
2195 f.write(struct.pack(">L20s", offset, name))
2196 assert len(pack_checksum) == 20
2197 f.write(pack_checksum)
2198 return f.write_sha()
2201def _delta_encode_size(size) -> bytes:
2202 ret = bytearray()
2203 c = size & 0x7F
2204 size >>= 7
2205 while size:
2206 ret.append(c | 0x80)
2207 c = size & 0x7F
2208 size >>= 7
2209 ret.append(c)
2210 return bytes(ret)
2213# The length of delta compression copy operations in version 2 packs is limited
2214# to 64K. To copy more, we use several copy operations. Version 3 packs allow
2215# 24-bit lengths in copy operations, but we always make version 2 packs.
2216_MAX_COPY_LEN = 0xFFFF
2219def _encode_copy_operation(start, length):
2220 scratch = bytearray([0x80])
2221 for i in range(4):
2222 if start & 0xFF << i * 8:
2223 scratch.append((start >> i * 8) & 0xFF)
2224 scratch[0] |= 1 << i
2225 for i in range(2):
2226 if length & 0xFF << i * 8:
2227 scratch.append((length >> i * 8) & 0xFF)
2228 scratch[0] |= 1 << (4 + i)
2229 return bytes(scratch)
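# Illustrative sketch (not part of dulwich): the two small encodings above.
# Sizes are 7-bit little-endian varints with the high bit as a continuation
# flag; copy operations set the high bit of the opcode and use its low bits to
# say which offset/length bytes follow (zero bytes are simply omitted).
def _example_delta_opcode_encoding():
    assert _delta_encode_size(0x7F) == b"\x7f"
    assert _delta_encode_size(0x80) == b"\x80\x01"
    # copy 0x56 bytes starting at offset 0x1234 of the base object
    assert _encode_copy_operation(0x1234, 0x56) == b"\x93\x34\x12\x56"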
2232def create_delta(base_buf, target_buf):
2233 """Use python difflib to work out how to transform base_buf to target_buf.
2235 Args:
2236 base_buf: Base buffer
2237 target_buf: Target buffer
2238 """
2239 if isinstance(base_buf, list):
2240 base_buf = b"".join(base_buf)
2241 if isinstance(target_buf, list):
2242 target_buf = b"".join(target_buf)
2243 assert isinstance(base_buf, bytes)
2244 assert isinstance(target_buf, bytes)
2245 # write delta header
2246 yield _delta_encode_size(len(base_buf))
2247 yield _delta_encode_size(len(target_buf))
2248 # write out delta opcodes
2249 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
2250 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
2251 # Git patch opcodes don't care about deletes!
2252 # if opcode == 'replace' or opcode == 'delete':
2253 # pass
2254 if opcode == "equal":
2255 # If they are equal, unpacker will use data from base_buf
2256 # Write out an opcode that says what range to use
2257 copy_start = i1
2258 copy_len = i2 - i1
2259 while copy_len > 0:
2260 to_copy = min(copy_len, _MAX_COPY_LEN)
2261 yield _encode_copy_operation(copy_start, to_copy)
2262 copy_start += to_copy
2263 copy_len -= to_copy
2264 if opcode == "replace" or opcode == "insert":
2265 # If we are replacing a range or adding one, then we just
2266 # output it to the stream (prefixed by its size)
2267 s = j2 - j1
2268 o = j1
2269 while s > 127:
2270 yield bytes([127])
2271 yield memoryview(target_buf)[o : o + 127]
2272 s -= 127
2273 o += 127
2274 yield bytes([s])
2275 yield memoryview(target_buf)[o : o + s]
2278def apply_delta(src_buf, delta):
2279 """Based on the similar function in git's patch-delta.c.
2281 Args:
2282 src_buf: Source buffer
2283 delta: Delta instructions
2284 """
2285 if not isinstance(src_buf, bytes):
2286 src_buf = b"".join(src_buf)
2287 if not isinstance(delta, bytes):
2288 delta = b"".join(delta)
2289 out = []
2290 index = 0
2291 delta_length = len(delta)
2293 def get_delta_header_size(delta, index):
2294 size = 0
2295 i = 0
2296 while delta:
2297 cmd = ord(delta[index : index + 1])
2298 index += 1
2299 size |= (cmd & ~0x80) << i
2300 i += 7
2301 if not cmd & 0x80:
2302 break
2303 return size, index
2305 src_size, index = get_delta_header_size(delta, index)
2306 dest_size, index = get_delta_header_size(delta, index)
2307 assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
2308 while index < delta_length:
2309 cmd = ord(delta[index : index + 1])
2310 index += 1
2311 if cmd & 0x80:
2312 cp_off = 0
2313 for i in range(4):
2314 if cmd & (1 << i):
2315 x = ord(delta[index : index + 1])
2316 index += 1
2317 cp_off |= x << (i * 8)
2318 cp_size = 0
2319 # Version 3 packs can contain copy sizes larger than 64K.
2320 for i in range(3):
2321 if cmd & (1 << (4 + i)):
2322 x = ord(delta[index : index + 1])
2323 index += 1
2324 cp_size |= x << (i * 8)
2325 if cp_size == 0:
2326 cp_size = 0x10000
2327 if (
2328 cp_off + cp_size < cp_size
2329 or cp_off + cp_size > src_size
2330 or cp_size > dest_size
2331 ):
2332 break
2333 out.append(src_buf[cp_off : cp_off + cp_size])
2334 elif cmd != 0:
2335 out.append(delta[index : index + cmd])
2336 index += cmd
2337 else:
2338 raise ApplyDeltaError("Invalid opcode 0")
2340 if index != delta_length:
2341 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
2343 if dest_size != chunks_length(out):
2344 raise ApplyDeltaError("dest size incorrect")
2346 return out
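# Illustrative sketch (not part of dulwich): round-trip of the two functions
# above; a delta produced from (base, target) must rebuild target when applied
# back to base, whatever opcodes the SequenceMatcher happened to pick.
def _example_delta_roundtrip():
    base = b"the quick brown fox\n" * 10
    target = base + b"jumps over the lazy dog\n"
    delta = b"".join(create_delta(base, target))
    assert b"".join(apply_delta(base, delta)) == target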
2349def write_pack_index_v2(
2350 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
2351) -> bytes:
2352 """Write a new pack index file.
2354 Args:
2355 f: File-like object to write to
2356 entries: List of tuples with object name (sha), offset_in_pack, and
2357 crc32_checksum.
2358 pack_checksum: Checksum of the pack file.
2359 Returns: The SHA of the index file written
2360 """
2361 f = SHA1Writer(f)
2362 f.write(b"\377tOc") # Magic!
2363 f.write(struct.pack(">L", 2))
2364 fan_out_table: Dict[int, int] = defaultdict(lambda: 0)
2365 for name, offset, entry_checksum in entries:
2366 fan_out_table[ord(name[:1])] += 1
2367 # Fan-out table
2368 largetable: List[int] = []
2369 for i in range(0x100):
2370 f.write(struct.pack(b">L", fan_out_table[i]))
2371 fan_out_table[i + 1] += fan_out_table[i]
2372 for name, offset, entry_checksum in entries:
2373 f.write(name)
2374 for name, offset, entry_checksum in entries:
2375 f.write(struct.pack(b">L", entry_checksum))
2376 for name, offset, entry_checksum in entries:
2377 if offset < 2**31:
2378 f.write(struct.pack(b">L", offset))
2379 else:
2380 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2381 largetable.append(offset)
2382 for offset in largetable:
2383 f.write(struct.pack(b">Q", offset))
2384 assert len(pack_checksum) == 20
2385 f.write(pack_checksum)
2386 return f.write_sha()
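# Illustrative sketch (not part of dulwich): pair write_pack_objects() with a
# v2 index. Index entries must be (sha, offset, crc32) tuples sorted by SHA,
# and the pack checksum returned by the pack writer ties the two files together.
def _example_write_pack_with_index():
    from io import BytesIO
    from dulwich.objects import Blob
    pack_buf, idx_buf = BytesIO(), BytesIO()
    entries, pack_checksum = write_pack_objects(
        pack_buf.write, [(Blob.from_string(b"indexed"), None)]
    )
    index_entries = sorted((sha, ofs, crc) for sha, (ofs, crc) in entries.items())
    write_pack_index_v2(idx_buf, index_entries, pack_checksum)
    assert idx_buf.getvalue()[:4] == b"\377tOc"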
2389write_pack_index = write_pack_index_v2
2392class Pack:
2393 """A Git pack object."""
2395 _data_load: Optional[Callable[[], PackData]]
2396 _idx_load: Optional[Callable[[], PackIndex]]
2398 _data: Optional[PackData]
2399 _idx: Optional[PackIndex]
2401 def __init__(
2402 self, basename, resolve_ext_ref: Optional[ResolveExtRefFn] = None
2403 ) -> None:
2404 self._basename = basename
2405 self._data = None
2406 self._idx = None
2407 self._idx_path = self._basename + ".idx"
2408 self._data_path = self._basename + ".pack"
2409 self._data_load = lambda: PackData(self._data_path)
2410 self._idx_load = lambda: load_pack_index(self._idx_path)
2411 self.resolve_ext_ref = resolve_ext_ref
2413 @classmethod
2414 def from_lazy_objects(cls, data_fn, idx_fn):
2415 """Create a new pack object from callables to load pack data and
2416 index objects.
2417 """
2418 ret = cls("")
2419 ret._data_load = data_fn
2420 ret._idx_load = idx_fn
2421 return ret
2423 @classmethod
2424 def from_objects(cls, data, idx):
2425 """Create a new pack object from pack data and index objects."""
2426 ret = cls("")
2427 ret._data = data
2428 ret._data_load = None
2429 ret._idx = idx
2430 ret._idx_load = None
2431 ret.check_length_and_checksum()
2432 return ret
2434 def name(self):
2435 """The SHA over the SHAs of the objects in this pack."""
2436 return self.index.objects_sha1()
2438 @property
2439 def data(self) -> PackData:
2440 """The pack data object being used."""
2441 if self._data is None:
2442 assert self._data_load
2443 self._data = self._data_load()
2444 self.check_length_and_checksum()
2445 return self._data
2447 @property
2448 def index(self) -> PackIndex:
2449 """The index being used.
2451 Note: This may be an in-memory index
2452 """
2453 if self._idx is None:
2454 assert self._idx_load
2455 self._idx = self._idx_load()
2456 return self._idx
2458 def close(self):
2459 if self._data is not None:
2460 self._data.close()
2461 if self._idx is not None:
2462 self._idx.close()
2464 def __enter__(self):
2465 return self
2467 def __exit__(self, exc_type, exc_val, exc_tb):
2468 self.close()
2470 def __eq__(self, other):
2471 return isinstance(self, type(other)) and self.index == other.index
2473 def __len__(self) -> int:
2474 """Number of entries in this pack."""
2475 return len(self.index)
2477 def __repr__(self) -> str:
2478 return f"{self.__class__.__name__}({self._basename!r})"
2480 def __iter__(self):
2481 """Iterate over all the sha1s of the objects in this pack."""
2482 return iter(self.index)
2484 def check_length_and_checksum(self) -> None:
2485 """Sanity check the length and checksum of the pack index and data."""
2486 assert len(self.index) == len(
2487 self.data
2488 ), f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
2489 idx_stored_checksum = self.index.get_pack_checksum()
2490 data_stored_checksum = self.data.get_stored_checksum()
2491 if idx_stored_checksum != data_stored_checksum:
2492 raise ChecksumMismatch(
2493 sha_to_hex(idx_stored_checksum),
2494 sha_to_hex(data_stored_checksum),
2495 )
2497 def check(self) -> None:
2498 """Check the integrity of this pack.
2500 Raises:
2501 ChecksumMismatch: if a checksum for the index or data is wrong
2502 """
2503 self.index.check()
2504 self.data.check()
2505 for obj in self.iterobjects():
2506 obj.check()
2507 # TODO: object connectivity checks
2509 def get_stored_checksum(self) -> bytes:
2510 return self.data.get_stored_checksum()
2512 def pack_tuples(self):
2513 return [(o, None) for o in self.iterobjects()]
2515 def __contains__(self, sha1: bytes) -> bool:
2516 """Check whether this pack contains a particular SHA1."""
2517 try:
2518 self.index.object_offset(sha1)
2519 return True
2520 except KeyError:
2521 return False
2523 def get_raw(self, sha1: bytes) -> Tuple[int, bytes]:
2524 offset = self.index.object_offset(sha1)
2525 obj_type, obj = self.data.get_object_at(offset)
2526 type_num, chunks = self.resolve_object(offset, obj_type, obj)
2527 return type_num, b"".join(chunks)
2529 def __getitem__(self, sha1: bytes) -> ShaFile:
2530 """Retrieve the specified SHA1."""
2531 type, uncomp = self.get_raw(sha1)
2532 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
2534 def iterobjects(self) -> Iterator[ShaFile]:
2535 """Iterate over the objects in this pack."""
2536 return iter(
2537 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
2538 )
2540 def iterobjects_subset(
2541 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
2542 ) -> Iterator[ShaFile]:
2543 return (
2544 uo
2545 for uo in PackInflater.for_pack_subset(
2546 self,
2547 shas,
2548 allow_missing=allow_missing,
2549 resolve_ext_ref=self.resolve_ext_ref,
2550 )
2551 if uo.id in shas
2552 )
2554 def iter_unpacked_subset(
2555 self,
2556 shas: Iterable[ObjectID],
2557 *,
2558 include_comp: bool = False,
2559 allow_missing: bool = False,
2560 convert_ofs_delta: bool = False,
2561 ) -> Iterator[UnpackedObject]:
2562 ofs_pending: Dict[int, List[UnpackedObject]] = defaultdict(list)
2563 ofs: Dict[bytes, int] = {}
2564 todo = set(shas)
2565 for unpacked in self.iter_unpacked(include_comp=include_comp):
2566 sha = unpacked.sha()
2567 ofs[unpacked.offset] = sha
2568 hexsha = sha_to_hex(sha)
2569 if hexsha in todo:
2570 if unpacked.pack_type_num == OFS_DELTA:
2571 assert isinstance(unpacked.delta_base, int)
2572 base_offset = unpacked.offset - unpacked.delta_base
2573 try:
2574 unpacked.delta_base = ofs[base_offset]
2575 except KeyError:
2576 ofs_pending[base_offset].append(unpacked)
2577 continue
2578 else:
2579 unpacked.pack_type_num = REF_DELTA
2580 yield unpacked
2581 todo.remove(hexsha)
2582 for child in ofs_pending.pop(unpacked.offset, []):
2583 child.pack_type_num = REF_DELTA
2584 child.delta_base = sha
2585 yield child
2586 assert not ofs_pending
2587 if not allow_missing and todo:
2588 raise UnresolvedDeltas(todo)
2590 def iter_unpacked(self, include_comp=False):
2591 ofs_to_entries = {
2592 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
2593 }
2594 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
2595 (sha, crc32) = ofs_to_entries[unpacked.offset]
2596 unpacked._sha = sha
2597 unpacked.crc32 = crc32
2598 yield unpacked
2600 def keep(self, msg: Optional[bytes] = None) -> str:
2601 """Add a .keep file for the pack, preventing git from garbage collecting it.
2603 Args:
2604 msg: A message written inside the .keep file; can be used later
2605 to determine whether or not a .keep file is obsolete.
2606 Returns: The path of the .keep file, as a string.
2607 """
2608 keepfile_name = f"{self._basename}.keep"
2609 with GitFile(keepfile_name, "wb") as keepfile:
2610 if msg:
2611 keepfile.write(msg)
2612 keepfile.write(b"\n")
2613 return keepfile_name
2615 def get_ref(self, sha: bytes) -> Tuple[Optional[int], int, OldUnpackedObject]:
2616 """Get the object for a ref SHA, only looking in this pack."""
2617 # TODO: cache these results
2618 try:
2619 offset = self.index.object_offset(sha)
2620 except KeyError:
2621 offset = None
2622 if offset:
2623 type, obj = self.data.get_object_at(offset)
2624 elif self.resolve_ext_ref:
2625 type, obj = self.resolve_ext_ref(sha)
2626 else:
2627 raise KeyError(sha)
2628 return offset, type, obj
2630 def resolve_object(
2631 self, offset: int, type: int, obj, get_ref=None
2632 ) -> Tuple[int, Iterable[bytes]]:
2633 """Resolve an object, following delta chains where necessary.
2635 Returns: Tuple with object type and contents.
2636 """
2637 # Walk down the delta chain, building a stack of deltas to reach
2638 # the requested object.
2639 base_offset = offset
2640 base_type = type
2641 base_obj = obj
2642 delta_stack = []
2643 while base_type in DELTA_TYPES:
2644 prev_offset = base_offset
2645 if get_ref is None:
2646 get_ref = self.get_ref
2647 if base_type == OFS_DELTA:
2648 (delta_offset, delta) = base_obj
2649 # TODO: clean up asserts and replace with nicer error messages
2650 base_offset = base_offset - delta_offset
2651 base_type, base_obj = self.data.get_object_at(base_offset)
2652 assert isinstance(base_type, int)
2653 elif base_type == REF_DELTA:
2654 (basename, delta) = base_obj
2655 assert isinstance(basename, bytes) and len(basename) == 20
2656 base_offset, base_type, base_obj = get_ref(basename)
2657 assert isinstance(base_type, int)
2658 if base_offset == prev_offset: # object is based on itself
2659 raise UnresolvedDeltas(sha_to_hex(basename))
2660 delta_stack.append((prev_offset, base_type, delta))
2662 # Now grab the base object (mustn't be a delta) and apply the
2663 # deltas all the way up the stack.
2664 chunks = base_obj
2665 for prev_offset, delta_type, delta in reversed(delta_stack):
2666 chunks = apply_delta(chunks, delta)
2667 # TODO(dborowitz): This can result in poor performance if
2668 # large base objects are separated from deltas in the pack.
2669 # We should reorganize so that we apply deltas to all
2670 # objects in a chain one after the other to optimize cache
2671 # performance.
2672 if prev_offset is not None:
2673 self.data._offset_cache[prev_offset] = base_type, chunks
2674 return base_type, chunks
2676 def entries(
2677 self, progress: Optional[ProgressFn] = None
2678 ) -> Iterator[PackIndexEntry]:
2679 """Yield entries summarizing the contents of this pack.
2681 Args:
2682 progress: Progress function, called with current and total
2683 object count.
2684 Returns: iterator of tuples with (sha, offset, crc32)
2685 """
2686 return self.data.iterentries(
2687 progress=progress, resolve_ext_ref=self.resolve_ext_ref
2688 )
2690 def sorted_entries(
2691 self, progress: Optional[ProgressFn] = None
2692 ) -> Iterator[PackIndexEntry]:
2693 """Return entries in this pack, sorted by SHA.
2695 Args:
2696 progress: Progress function, called with current and total
2697 object count
2698 Returns: Iterator of tuples with (sha, offset, crc32)
2699 """
2700 return self.data.sorted_entries(
2701 progress=progress, resolve_ext_ref=self.resolve_ext_ref
2702 )
2704 def get_unpacked_object(
2705 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
2706 ) -> UnpackedObject:
2707 """Get the unpacked object for a sha.
2709 Args:
2710 sha: SHA of object to fetch
2711 include_comp: Whether to include compression data in UnpackedObject
2712 """
2713 offset = self.index.object_offset(sha)
2714 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
2715 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
2716 assert isinstance(unpacked.delta_base, int)
2717 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
2718 unpacked.pack_type_num = REF_DELTA
2719 return unpacked
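# Hedged sketch (not part of dulwich): typical read access through the Pack
# class. The basename is a hypothetical path; a matching .pack and .idx file
# are assumed to exist next to each other.
def _example_open_pack():
    basename = "/tmp/example/pack-deadbeef"  # hypothetical, without extension
    with Pack(basename) as pack:
        pack.check_length_and_checksum()
        for hexsha in pack:            # the index iterates hex object ids
            obj = pack[hexsha]         # resolves deltas and inflates the object
            print(obj.type_name, hexsha)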
2722def extend_pack(
2723 f: BinaryIO,
2724 object_ids: Set[ObjectID],
2725 get_raw,
2726 *,
2727 compression_level=-1,
2728 progress=None,
2729) -> Tuple[bytes, List]:
2730 """Extend a pack file with more objects.
2732 The caller should make sure that object_ids does not contain any objects
2733 that are already in the pack.
2734 """
2735 # Update the header with the new number of objects.
2736 f.seek(0)
2737 _version, num_objects = read_pack_header(f.read)
2739 if object_ids:
2740 f.seek(0)
2741 write_pack_header(f.write, num_objects + len(object_ids))
2743 # Must flush before reading (http://bugs.python.org/issue3207)
2744 f.flush()
2746 # Rescan the rest of the pack, computing the SHA with the new header.
2747 new_sha = compute_file_sha(f, end_ofs=-20)
2749 # Must reposition before writing (http://bugs.python.org/issue3207)
2750 f.seek(0, os.SEEK_CUR)
2752 extra_entries = []
2754 # Complete the pack.
2755 for i, object_id in enumerate(object_ids):
2756 if progress is not None:
2757 progress(
2758 ("writing extra base objects: %d/%d\r" % (i, len(object_ids))).encode(
2759 "ascii"
2760 )
2761 )
2762 assert len(object_id) == 20
2763 type_num, data = get_raw(object_id)
2764 offset = f.tell()
2765 crc32 = write_pack_object(
2766 f.write,
2767 type_num,
2768 data,
2769 sha=new_sha,
2770 compression_level=compression_level,
2771 )
2772 extra_entries.append((object_id, offset, crc32))
2773 pack_sha = new_sha.digest()
2774 f.write(pack_sha)
2775 return pack_sha, extra_entries
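# Hedged sketch (not part of dulwich): append missing base objects to an
# existing pack opened read/write. `path`, `missing_ids` and `object_store`
# are hypothetical stand-ins; object_store.get_raw must return (type_num, data)
# for an object id, which is what extend_pack() expects of its get_raw argument.
def _example_extend_pack(path, missing_ids, object_store):
    with open(path, "r+b") as f:
        return extend_pack(f, missing_ids, object_store.get_raw)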
2778try:
2779 from dulwich._pack import ( # type: ignore
2780 apply_delta, # type: ignore
2781 bisect_find_sha, # type: ignore
2782 )
2783except ImportError:
2784 pass