Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 23%
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28A pack has two parts: the pack file, which stores the data, and an
29index that tells you where the data is.
31To find an object you look in each of the index files until you find a
32match for the object name. The offset recorded there then points at the
33object's data in the corresponding pack file.
34"""
36import binascii
37from collections import defaultdict, deque
38from contextlib import suppress
39from io import BytesIO, UnsupportedOperation
41try:
42 from cdifflib import CSequenceMatcher as SequenceMatcher
43except ModuleNotFoundError:
44 from difflib import SequenceMatcher
46import os
47import struct
48import sys
49import warnings
50import zlib
51from collections.abc import Iterable, Iterator, Sequence
52from hashlib import sha1
53from itertools import chain
54from os import SEEK_CUR, SEEK_END
55from struct import unpack_from
56from typing import (
57 BinaryIO,
58 Callable,
59 Generic,
60 Optional,
61 Protocol,
62 TypeVar,
63 Union,
64)
66try:
67 import mmap
68except ImportError:
69 has_mmap = False
70else:
71 has_mmap = True
73# For some reason the above try/except fails to set has_mmap = False on Plan 9
74if sys.platform == "Plan9":
75 has_mmap = False
77from . import replace_me
78from .errors import ApplyDeltaError, ChecksumMismatch
79from .file import GitFile
80from .lru_cache import LRUSizeCache
81from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
83OFS_DELTA = 6
84REF_DELTA = 7
86DELTA_TYPES = (OFS_DELTA, REF_DELTA)
89DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
91# Keep pack files under 16Mb in memory, otherwise write them out to disk
92PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
94# Default pack index version to use when none is specified
95DEFAULT_PACK_INDEX_VERSION = 2
98OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
99ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
100ProgressFn = Callable[[int, str], None]
101PackHint = tuple[int, Optional[bytes]]
104class UnresolvedDeltas(Exception):
105 """Delta objects could not be resolved."""
107 def __init__(self, shas) -> None:
108 self.shas = shas
111class ObjectContainer(Protocol):
112 def add_object(self, obj: ShaFile) -> None:
113 """Add a single object to this object store."""
115 def add_objects(
116 self,
117 objects: Sequence[tuple[ShaFile, Optional[str]]],
118 progress: Optional[Callable[[str], None]] = None,
119 ) -> None:
120 """Add a set of objects to this object store.
122 Args:
123 objects: Iterable of (object, path) tuples
124 """
126 def __contains__(self, sha1: bytes) -> bool:
127 """Check if a hex sha is present."""
129 def __getitem__(self, sha1: bytes) -> ShaFile:
130 """Retrieve an object."""
132 def get_commit_graph(self):
133 """Get the commit graph for this object store.
135 Returns:
136 CommitGraph object if available, None otherwise
137 """
138 return None
141class PackedObjectContainer(ObjectContainer):
142 def get_unpacked_object(
143 self, sha1: bytes, *, include_comp: bool = False
144 ) -> "UnpackedObject":
145 """Get a raw unresolved object."""
146 raise NotImplementedError(self.get_unpacked_object)
148 def iterobjects_subset(
149 self, shas: Iterable[bytes], *, allow_missing: bool = False
150 ) -> Iterator[ShaFile]:
151 raise NotImplementedError(self.iterobjects_subset)
153 def iter_unpacked_subset(
154 self,
155 shas: set[bytes],
156 include_comp: bool = False,
157 allow_missing: bool = False,
158 convert_ofs_delta: bool = True,
159 ) -> Iterator["UnpackedObject"]:
160 raise NotImplementedError(self.iter_unpacked_subset)
163class UnpackedObjectStream:
164 def __iter__(self) -> Iterator["UnpackedObject"]:
165 raise NotImplementedError(self.__iter__)
167 def __len__(self) -> int:
168 raise NotImplementedError(self.__len__)
171def take_msb_bytes(
172 read: Callable[[int], bytes], crc32: Optional[int] = None
173) -> tuple[list[int], Optional[int]]:
174 """Read bytes marked with most significant bit.
176 Args:
177 read: Read function
178 """
179 ret: list[int] = []
180 while len(ret) == 0 or ret[-1] & 0x80:
181 b = read(1)
182 if crc32 is not None:
183 crc32 = binascii.crc32(b, crc32)
184 ret.append(ord(b[:1]))
185 return ret, crc32
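# Minimal sketch (assumption: a hand-built two-byte header) showing how the
# MSB-continued bytes returned above encode a pack object's type and size,
# mirroring the decoding in unpack_object() below.
def _example_decode_size():
    raw, _ = take_msb_bytes(BytesIO(b"\x95\x0b").read)  # 0x95 has MSB set, 0x0b ends the run
    type_num = (raw[0] >> 4) & 0x07  # -> 1
    size = raw[0] & 0x0F
    for i, byte in enumerate(raw[1:]):
        size += (byte & 0x7F) << ((i * 7) + 4)
    return type_num, size  # -> (1, 181)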
188class PackFileDisappeared(Exception):
189 def __init__(self, obj) -> None:
190 self.obj = obj
193class UnpackedObject:
194 """Class encapsulating an object unpacked from a pack file.
196 These objects should only be created from within unpack_object. Most
197 members start out as empty and are filled in at various points by
198 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
200 End users of this object should take care that the function they're getting
201 this object from is guaranteed to set the members they need.
202 """
204 __slots__ = [
205 "_sha", # Cached binary SHA.
206 "comp_chunks", # Compressed object chunks.
207 "crc32", # CRC32.
208 "decomp_chunks", # Decompressed object chunks.
209 "decomp_len", # Decompressed length of this object.
210 "delta_base", # Delta base offset or SHA.
211 "obj_chunks", # Decompressed and delta-resolved chunks.
212 "obj_type_num", # Type of this object.
213 "offset", # Offset in its pack.
214 "pack_type_num", # Type of this object in the pack (may be a delta).
215 ]
217 obj_type_num: Optional[int]
218 obj_chunks: Optional[list[bytes]]
219 delta_base: Union[None, bytes, int]
220 decomp_chunks: list[bytes]
221 comp_chunks: Optional[list[bytes]]
223 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
224 # methods of this object.
225 def __init__(
226 self,
227 pack_type_num,
228 *,
229 delta_base=None,
230 decomp_len=None,
231 crc32=None,
232 sha=None,
233 decomp_chunks=None,
234 offset=None,
235 ) -> None:
236 self.offset = offset
237 self._sha = sha
238 self.pack_type_num = pack_type_num
239 self.delta_base = delta_base
240 self.comp_chunks = None
241 self.decomp_chunks: list[bytes] = decomp_chunks or []
242 if decomp_chunks is not None and decomp_len is None:
243 self.decomp_len = sum(map(len, decomp_chunks))
244 else:
245 self.decomp_len = decomp_len
246 self.crc32 = crc32
248 if pack_type_num in DELTA_TYPES:
249 self.obj_type_num = None
250 self.obj_chunks = None
251 else:
252 self.obj_type_num = pack_type_num
253 self.obj_chunks = self.decomp_chunks
254 self.delta_base = delta_base
256 def sha(self):
257 """Return the binary SHA of this object."""
258 if self._sha is None:
259 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
260 return self._sha
262 def sha_file(self):
263 """Return a ShaFile from this object."""
264 assert self.obj_type_num is not None and self.obj_chunks is not None
265 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
267 # Only provided for backwards compatibility with code that expects either
268 # chunks or a delta tuple.
269 def _obj(self) -> OldUnpackedObject:
270 """Return the decompressed chunks, or (delta base, delta chunks)."""
271 if self.pack_type_num in DELTA_TYPES:
272 assert isinstance(self.delta_base, (bytes, int))
273 return (self.delta_base, self.decomp_chunks)
274 else:
275 return self.decomp_chunks
277 def __eq__(self, other):
278 if not isinstance(other, UnpackedObject):
279 return False
280 for slot in self.__slots__:
281 if getattr(self, slot) != getattr(other, slot):
282 return False
283 return True
285 def __ne__(self, other):
286 return not (self == other)
288 def __repr__(self) -> str:
289 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
290 return "{}({})".format(self.__class__.__name__, ", ".join(data))
293_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
296def read_zlib_chunks(
297 read_some: Callable[[int], bytes],
298 unpacked: UnpackedObject,
299 include_comp: bool = False,
300 buffer_size: int = _ZLIB_BUFSIZE,
301) -> bytes:
302 """Read zlib data from a buffer.
304 This function requires that the buffer have additional data following the
305 compressed data, which is guaranteed to be the case for git pack files.
307 Args:
308 read_some: Read function that returns at least one byte, but may
309 return less than the requested size.
310 unpacked: An UnpackedObject to write result data to. If its crc32
311 attr is not None, the CRC32 of the compressed bytes will be computed
312 using this starting CRC32.
313 After this function, will have the following attrs set:
314 * comp_chunks (if include_comp is True)
315 * decomp_chunks
316 * decomp_len
317 * crc32
318 include_comp: If True, include compressed data in the result.
319 buffer_size: Size of the read buffer.
320 Returns: Leftover unused data from the decompression.
322 Raises:
323 zlib.error: if a decompression error occurred.
324 """
325 if unpacked.decomp_len <= -1:
326 raise ValueError("non-negative zlib data stream size expected")
327 decomp_obj = zlib.decompressobj()
329 comp_chunks = []
330 decomp_chunks = unpacked.decomp_chunks
331 decomp_len = 0
332 crc32 = unpacked.crc32
334 while True:
335 add = read_some(buffer_size)
336 if not add:
337 raise zlib.error("EOF before end of zlib stream")
338 comp_chunks.append(add)
339 decomp = decomp_obj.decompress(add)
340 decomp_len += len(decomp)
341 decomp_chunks.append(decomp)
342 unused = decomp_obj.unused_data
343 if unused:
344 left = len(unused)
345 if crc32 is not None:
346 crc32 = binascii.crc32(add[:-left], crc32)
347 if include_comp:
348 comp_chunks[-1] = add[:-left]
349 break
350 elif crc32 is not None:
351 crc32 = binascii.crc32(add, crc32)
352 if crc32 is not None:
353 crc32 &= 0xFFFFFFFF
355 if decomp_len != unpacked.decomp_len:
356 raise zlib.error("decompressed data does not match expected size")
358 unpacked.crc32 = crc32
359 if include_comp:
360 unpacked.comp_chunks = comp_chunks
361 return unused
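# Minimal usage sketch, assuming a blob-sized payload built by hand. Pack data
# always has bytes following the compressed stream (at least the pack trailer),
# so some trailing bytes are appended here to satisfy that requirement.
def _example_read_zlib_chunks():
    payload = b"hello world"
    unpacked = UnpackedObject(3, decomp_len=len(payload))  # 3 = blob type number
    buf = BytesIO(zlib.compress(payload) + b"\x00" * 20)  # fake 20-byte trailer
    leftover = read_zlib_chunks(buf.read, unpacked)
    assert b"".join(unpacked.decomp_chunks) == payload
    return leftover  # the unused trailer bytes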
364def iter_sha1(iter):
365 """Return the hexdigest of the SHA1 over a set of names.
367 Args:
368 iter: Iterator over string objects
369 Returns: 40-byte hex sha1 digest
370 """
371 sha = sha1()
372 for name in iter:
373 sha.update(name)
374 return sha.hexdigest().encode("ascii")
377def load_pack_index(path: Union[str, os.PathLike]):
378 """Load an index file by path.
380 Args:
381 path: Path to the index file
382 Returns: A PackIndex loaded from the given path
383 """
384 with GitFile(path, "rb") as f:
385 return load_pack_index_file(path, f)
388def _load_file_contents(f, size=None):
389 try:
390 fd = f.fileno()
391 except (UnsupportedOperation, AttributeError):
392 fd = None
393 # Attempt to use mmap if possible
394 if fd is not None:
395 if size is None:
396 size = os.fstat(fd).st_size
397 if has_mmap:
398 try:
399 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
400 except (OSError, ValueError):
401 # Can't mmap - perhaps a socket or invalid file descriptor
402 pass
403 else:
404 return contents, size
405 contents = f.read()
406 size = len(contents)
407 return contents, size
410def load_pack_index_file(path: Union[str, os.PathLike], f):
411 """Load an index file from a file-like object.
413 Args:
414 path: Path for the index file
415 f: File-like object
416 Returns: A PackIndex loaded from the given file
417 """
418 contents, size = _load_file_contents(f)
419 if contents[:4] == b"\377tOc":
420 version = struct.unpack(b">L", contents[4:8])[0]
421 if version == 2:
422 return PackIndex2(path, file=f, contents=contents, size=size)
423 elif version == 3:
424 return PackIndex3(path, file=f, contents=contents, size=size)
425 else:
426 raise KeyError(f"Unknown pack index format {version}")
427 else:
428 return PackIndex1(path, file=f, contents=contents, size=size)
431def bisect_find_sha(start, end, sha, unpack_name):
432 """Find a SHA in a data blob with sorted SHAs.
434 Args:
435 start: Start index of range to search
436 end: End index of range to search
437 sha: Sha to find
438 unpack_name: Callback to retrieve SHA by index
439 Returns: Index of the SHA, or None if it wasn't found
440 """
441 assert start <= end
442 while start <= end:
443 i = (start + end) // 2
444 file_sha = unpack_name(i)
445 if file_sha < sha:
446 start = i + 1
447 elif file_sha > sha:
448 end = i - 1
449 else:
450 return i
451 return None
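# Small sketch of the binary search above over an already-sorted list of
# binary SHAs; unpack_name is just list indexing here.
def _example_bisect_find_sha():
    shas = [bytes([i]) * 20 for i in (1, 5, 9)]  # sorted 20-byte names
    found = bisect_find_sha(0, len(shas) - 1, bytes([5]) * 20, shas.__getitem__)
    missing = bisect_find_sha(0, len(shas) - 1, bytes([7]) * 20, shas.__getitem__)
    return found, missing  # -> (1, None)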
454PackIndexEntry = tuple[bytes, int, Optional[int]]
457class PackIndex:
458 """An index in to a packfile.
460 Given a sha id of an object a pack index can tell you the location in the
461 packfile of that object if it has it.
462 """
464 # Default to SHA-1 for backward compatibility
465 hash_algorithm = 1
466 hash_size = 20
468 def __eq__(self, other):
469 if not isinstance(other, PackIndex):
470 return False
472 for (name1, _, _), (name2, _, _) in zip(
473 self.iterentries(), other.iterentries()
474 ):
475 if name1 != name2:
476 return False
477 return True
479 def __ne__(self, other):
480 return not self.__eq__(other)
482 def __len__(self) -> int:
483 """Return the number of entries in this pack index."""
484 raise NotImplementedError(self.__len__)
486 def __iter__(self) -> Iterator[bytes]:
487 """Iterate over the SHAs in this pack."""
488 return map(sha_to_hex, self._itersha())
490 def iterentries(self) -> Iterator[PackIndexEntry]:
491 """Iterate over the entries in this pack index.
493 Returns: iterator over tuples with object name, offset in packfile and
494 crc32 checksum.
495 """
496 raise NotImplementedError(self.iterentries)
498 def get_pack_checksum(self) -> bytes:
499 """Return the SHA1 checksum stored for the corresponding packfile.
501 Returns: 20-byte binary digest
502 """
503 raise NotImplementedError(self.get_pack_checksum)
505 @replace_me(since="0.21.0", remove_in="0.23.0")
506 def object_index(self, sha: bytes) -> int:
507 return self.object_offset(sha)
509 def object_offset(self, sha: bytes) -> int:
510 """Return the offset in to the corresponding packfile for the object.
512 Given the name of an object it will return the offset that object
513 lives at within the corresponding pack file. If the pack file doesn't
514 have the object then None will be returned.
515 """
516 raise NotImplementedError(self.object_offset)
518 def object_sha1(self, index: int) -> bytes:
519 """Return the SHA1 corresponding to the index in the pack file."""
520 for name, offset, _crc32 in self.iterentries():
521 if offset == index:
522 return name
523 else:
524 raise KeyError(index)
526 def _object_offset(self, sha: bytes) -> int:
527 """See object_offset.
529 Args:
530 sha: A *binary* SHA string (20 bytes long).
531 """
532 raise NotImplementedError(self._object_offset)
534 def objects_sha1(self) -> bytes:
535 """Return the hex SHA1 over all the shas of all objects in this pack.
537 Note: This is used for the filename of the pack.
538 """
539 return iter_sha1(self._itersha())
541 def _itersha(self) -> Iterator[bytes]:
542 """Yield all the SHA1's of the objects in the index, sorted."""
543 raise NotImplementedError(self._itersha)
545 def close(self) -> None:
546 pass
548 def check(self) -> None:
549 pass
552class MemoryPackIndex(PackIndex):
553 """Pack index that is stored entirely in memory."""
555 def __init__(self, entries, pack_checksum=None) -> None:
556 """Create a new MemoryPackIndex.
558 Args:
559 entries: Sequence of (name, offset, crc32) tuples, sorted by name
560 pack_checksum: Optional pack checksum
561 """
562 self._by_sha = {}
563 self._by_offset = {}
564 for name, offset, _crc32 in entries:
565 self._by_sha[name] = offset
566 self._by_offset[offset] = name
567 self._entries = entries
568 self._pack_checksum = pack_checksum
570 def get_pack_checksum(self):
571 return self._pack_checksum
573 def __len__(self) -> int:
574 return len(self._entries)
576 def object_offset(self, sha):
577 if len(sha) == 40:
578 sha = hex_to_sha(sha)
579 return self._by_sha[sha]
581 def object_sha1(self, offset):
582 return self._by_offset[offset]
584 def _itersha(self):
585 return iter(self._by_sha)
587 def iterentries(self):
588 return iter(self._entries)
590 @classmethod
591 def for_pack(cls, pack):
592 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
594 @classmethod
595 def clone(cls, other_index):
596 return cls(other_index.iterentries(), other_index.get_pack_checksum())
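# Tiny sketch: an in-memory index built from hypothetical (name, offset, crc32)
# entries, looked up by binary or hex SHA.
def _example_memory_index():
    name = bytes(range(20))  # hypothetical binary SHA
    index = MemoryPackIndex([(name, 12, None)])
    assert index.object_offset(name) == 12
    assert index.object_offset(sha_to_hex(name)) == 12  # hex form also accepted
    return list(index)  # iterating yields hex SHAs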
599class FilePackIndex(PackIndex):
600 """Pack index that is based on a file.
602 To look an object up, the file is mapped and the fan-out table at the
603 start is consulted: 256 four-byte big-endian counts, indexed by the first
604 byte of the SHA. The entry for a byte gives the end of the group of
605 objects whose SHAs start with that byte, and the entry for the previous
606 byte gives the start of that group. Within a group the SHAs are sorted,
607 so a binary search (bisect) over that range determines whether, and
608 where, the object is present.
609 """
611 _fan_out_table: list[int]
613 def __init__(self, filename, file=None, contents=None, size=None) -> None:
614 """Create a pack index object.
616 Provide it with the name of the index file to consider, and it will map
617 it whenever required.
618 """
619 self._filename = filename
620 # Take the size now, so it can be checked each time we map the file to
621 # ensure that it hasn't changed.
622 if file is None:
623 self._file = GitFile(filename, "rb")
624 else:
625 self._file = file
626 if contents is None:
627 self._contents, self._size = _load_file_contents(self._file, size)
628 else:
629 self._contents, self._size = (contents, size)
631 @property
632 def path(self) -> str:
633 return self._filename
635 def __eq__(self, other):
636 # Quick optimization:
637 if (
638 isinstance(other, FilePackIndex)
639 and self._fan_out_table != other._fan_out_table
640 ):
641 return False
643 return super().__eq__(other)
645 def close(self) -> None:
646 self._file.close()
647 if getattr(self._contents, "close", None) is not None:
648 self._contents.close()
650 def __len__(self) -> int:
651 """Return the number of entries in this pack index."""
652 return self._fan_out_table[-1]
654 def _unpack_entry(self, i: int) -> PackIndexEntry:
655 """Unpack the i-th entry in the index file.
657 Returns: Tuple with object name (SHA), offset in pack file and CRC32
658 checksum (if known).
659 """
660 raise NotImplementedError(self._unpack_entry)
662 def _unpack_name(self, i) -> bytes:
663 """Unpack the i-th name from the index file."""
664 raise NotImplementedError(self._unpack_name)
666 def _unpack_offset(self, i) -> int:
667 """Unpack the i-th object offset from the index file."""
668 raise NotImplementedError(self._unpack_offset)
670 def _unpack_crc32_checksum(self, i) -> Optional[int]:
671 """Unpack the crc32 checksum for the ith object from the index file."""
672 raise NotImplementedError(self._unpack_crc32_checksum)
674 def _itersha(self) -> Iterator[bytes]:
675 for i in range(len(self)):
676 yield self._unpack_name(i)
678 def iterentries(self) -> Iterator[PackIndexEntry]:
679 """Iterate over the entries in this pack index.
681 Returns: iterator over tuples with object name, offset in packfile and
682 crc32 checksum.
683 """
684 for i in range(len(self)):
685 yield self._unpack_entry(i)
687 def _read_fan_out_table(self, start_offset: int):
688 ret = []
689 for i in range(0x100):
690 fanout_entry = self._contents[
691 start_offset + i * 4 : start_offset + (i + 1) * 4
692 ]
693 ret.append(struct.unpack(">L", fanout_entry)[0])
694 return ret
696 def check(self) -> None:
697 """Check that the stored checksum matches the actual checksum."""
698 actual = self.calculate_checksum()
699 stored = self.get_stored_checksum()
700 if actual != stored:
701 raise ChecksumMismatch(stored, actual)
703 def calculate_checksum(self) -> bytes:
704 """Calculate the SHA1 checksum over this pack index.
706 Returns: 20-byte binary digest
707 """
708 return sha1(self._contents[:-20]).digest()
710 def get_pack_checksum(self) -> bytes:
711 """Return the SHA1 checksum stored for the corresponding packfile.
713 Returns: 20-byte binary digest
714 """
715 return bytes(self._contents[-40:-20])
717 def get_stored_checksum(self) -> bytes:
718 """Return the SHA1 checksum stored for this index.
720 Returns: 20-byte binary digest
721 """
722 return bytes(self._contents[-20:])
724 def object_offset(self, sha: bytes) -> int:
725 """Return the offset in to the corresponding packfile for the object.
727 Given the name of an object it will return the offset that object
728 lives at within the corresponding pack file. If the pack file doesn't
729 have the object then None will be returned.
730 """
731 if len(sha) == 40:
732 sha = hex_to_sha(sha)
733 try:
734 return self._object_offset(sha)
735 except ValueError as exc:
736 closed = getattr(self._contents, "closed", None)
737 if closed in (None, True):
738 raise PackFileDisappeared(self) from exc
739 raise
741 def _object_offset(self, sha: bytes) -> int:
742 """See object_offset.
744 Args:
745 sha: A *binary* SHA string (20 bytes long).
746 """
747 assert len(sha) == 20
748 idx = ord(sha[:1])
749 if idx == 0:
750 start = 0
751 else:
752 start = self._fan_out_table[idx - 1]
753 end = self._fan_out_table[idx]
754 i = bisect_find_sha(start, end, sha, self._unpack_name)
755 if i is None:
756 raise KeyError(sha)
757 return self._unpack_offset(i)
759 def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
760 """Iterate over all SHA1s with the given prefix."""
761 start = ord(prefix[:1])
762 if start == 0:
763 start = 0
764 else:
765 start = self._fan_out_table[start - 1]
766 end = ord(prefix[:1]) + 1
767 if end == 0x100:
768 end = len(self)
769 else:
770 end = self._fan_out_table[end]
771 assert start <= end
772 started = False
773 for i in range(start, end):
774 name: bytes = self._unpack_name(i)
775 if name.startswith(prefix):
776 yield name
777 started = True
778 elif started:
779 break
782class PackIndex1(FilePackIndex):
783 """Version 1 Pack Index file."""
785 def __init__(
786 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
787 ) -> None:
788 super().__init__(filename, file, contents, size)
789 self.version = 1
790 self._fan_out_table = self._read_fan_out_table(0)
792 def _unpack_entry(self, i):
793 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
794 return (name, offset, None)
796 def _unpack_name(self, i):
797 offset = (0x100 * 4) + (i * 24) + 4
798 return self._contents[offset : offset + 20]
800 def _unpack_offset(self, i):
801 offset = (0x100 * 4) + (i * 24)
802 return unpack_from(">L", self._contents, offset)[0]
804 def _unpack_crc32_checksum(self, i) -> None:
805 # Not stored in v1 index files
806 return None
809class PackIndex2(FilePackIndex):
810 """Version 2 Pack Index file."""
812 def __init__(
813 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
814 ) -> None:
815 super().__init__(filename, file, contents, size)
816 if self._contents[:4] != b"\377tOc":
817 raise AssertionError("Not a v2 pack index file")
818 (self.version,) = unpack_from(b">L", self._contents, 4)
819 if self.version != 2:
820 raise AssertionError(f"Version was {self.version}")
821 self._fan_out_table = self._read_fan_out_table(8)
822 self._name_table_offset = 8 + 0x100 * 4
823 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
824 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
825 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
826 self
827 )
829 def _unpack_entry(self, i):
830 return (
831 self._unpack_name(i),
832 self._unpack_offset(i),
833 self._unpack_crc32_checksum(i),
834 )
836 def _unpack_name(self, i):
837 offset = self._name_table_offset + i * 20
838 return self._contents[offset : offset + 20]
840 def _unpack_offset(self, i):
841 offset = self._pack_offset_table_offset + i * 4
842 offset = unpack_from(">L", self._contents, offset)[0]
843 if offset & (2**31):
844 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
845 offset = unpack_from(">Q", self._contents, offset)[0]
846 return offset
848 def _unpack_crc32_checksum(self, i):
849 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
852class PackIndex3(FilePackIndex):
853 """Version 3 Pack Index file.
855 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
856 """
858 def __init__(
859 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
860 ) -> None:
861 super().__init__(filename, file, contents, size)
862 if self._contents[:4] != b"\377tOc":
863 raise AssertionError("Not a v3 pack index file")
864 (self.version,) = unpack_from(b">L", self._contents, 4)
865 if self.version != 3:
866 raise AssertionError(f"Version was {self.version}")
868 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
869 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
870 if self.hash_algorithm == 1:
871 self.hash_size = 20 # SHA-1
872 elif self.hash_algorithm == 2:
873 self.hash_size = 32 # SHA-256
874 else:
875 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
877 # Read length of shortened object names
878 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
880 # Calculate offsets based on variable hash size
881 self._fan_out_table = self._read_fan_out_table(
882 16
883 ) # After header (4 + 4 + 4 + 4)
884 self._name_table_offset = 16 + 0x100 * 4
885 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
886 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
887 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
888 self
889 )
891 def _unpack_entry(self, i):
892 return (
893 self._unpack_name(i),
894 self._unpack_offset(i),
895 self._unpack_crc32_checksum(i),
896 )
898 def _unpack_name(self, i):
899 offset = self._name_table_offset + i * self.hash_size
900 return self._contents[offset : offset + self.hash_size]
902 def _unpack_offset(self, i):
903 offset = self._pack_offset_table_offset + i * 4
904 offset = unpack_from(">L", self._contents, offset)[0]
905 if offset & (2**31):
906 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
907 offset = unpack_from(">Q", self._contents, offset)[0]
908 return offset
910 def _unpack_crc32_checksum(self, i):
911 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
914def read_pack_header(read) -> tuple[int, int]:
915 """Read the header of a pack file.
917 Args:
918 read: Read function
919 Returns: Tuple of (pack version, number of objects).
920 Raises: AssertionError if the header is missing or malformed.
921 """
922 header = read(12)
923 if not header:
924 raise AssertionError("file too short to contain pack")
925 if header[:4] != b"PACK":
926 raise AssertionError(f"Invalid pack header {header!r}")
927 (version,) = unpack_from(b">L", header, 4)
928 if version not in (2, 3):
929 raise AssertionError(f"Version was {version}")
930 (num_objects,) = unpack_from(b">L", header, 8)
931 return (version, num_objects)
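# Minimal sketch parsing a hand-built 12-byte header: the b"PACK" magic, then
# a version word and an object count, both big-endian 32-bit integers.
def _example_read_pack_header():
    header = b"PACK" + struct.pack(">LL", 2, 3)
    version, num_objects = read_pack_header(BytesIO(header).read)
    return version, num_objects  # -> (2, 3)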
934def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
935 if isinstance(chunks, bytes):
936 return len(chunks)
937 else:
938 return sum(map(len, chunks))
941def unpack_object(
942 read_all: Callable[[int], bytes],
943 read_some: Optional[Callable[[int], bytes]] = None,
944 compute_crc32=False,
945 include_comp=False,
946 zlib_bufsize=_ZLIB_BUFSIZE,
947) -> tuple[UnpackedObject, bytes]:
948 """Unpack a Git object.
950 Args:
951 read_all: Read function that blocks until the number of requested
952 bytes are read.
953 read_some: Read function that returns at least one byte, but may not
954 return the number of bytes requested.
955 compute_crc32: If True, compute the CRC32 of the compressed data. If
956 False, the returned CRC32 will be None.
957 include_comp: If True, include compressed data in the result.
958 zlib_bufsize: An optional buffer size for zlib operations.
959 Returns: A tuple of (unpacked, unused), where unused is the unused data
960 leftover from decompression, and unpacked is an UnpackedObject with
961 the following attrs set:
963 * obj_chunks (for non-delta types)
964 * pack_type_num
965 * delta_base (for delta types)
966 * comp_chunks (if include_comp is True)
967 * decomp_chunks
968 * decomp_len
969 * crc32 (if compute_crc32 is True)
970 """
971 if read_some is None:
972 read_some = read_all
973 if compute_crc32:
974 crc32 = 0
975 else:
976 crc32 = None
978 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
979 type_num = (raw[0] >> 4) & 0x07
980 size = raw[0] & 0x0F
981 for i, byte in enumerate(raw[1:]):
982 size += (byte & 0x7F) << ((i * 7) + 4)
984 delta_base: Union[int, bytes, None]
985 raw_base = len(raw)
986 if type_num == OFS_DELTA:
987 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
988 raw_base += len(raw)
989 if raw[-1] & 0x80:
990 raise AssertionError
991 delta_base_offset = raw[0] & 0x7F
992 for byte in raw[1:]:
993 delta_base_offset += 1
994 delta_base_offset <<= 7
995 delta_base_offset += byte & 0x7F
996 delta_base = delta_base_offset
997 elif type_num == REF_DELTA:
998 delta_base_obj = read_all(20)
999 if crc32 is not None:
1000 crc32 = binascii.crc32(delta_base_obj, crc32)
1001 delta_base = delta_base_obj
1002 raw_base += 20
1003 else:
1004 delta_base = None
1006 unpacked = UnpackedObject(
1007 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
1008 )
1009 unused = read_zlib_chunks(
1010 read_some,
1011 unpacked,
1012 buffer_size=zlib_bufsize,
1013 include_comp=include_comp,
1014 )
1015 return unpacked, unused
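# Minimal sketch of unpack_object() over a hand-built, non-delta blob entry:
# one header byte (type 3, size 5), the zlib stream, and some trailing bytes
# standing in for the rest of the pack.
def _example_unpack_object():
    entry = bytes([(3 << 4) | 5]) + zlib.compress(b"hello") + b"\x00" * 20
    unpacked, unused = unpack_object(BytesIO(entry).read)
    assert unpacked.obj_type_num == 3
    assert b"".join(unpacked.obj_chunks) == b"hello"
    return unused  # the fake trailer bytes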
1018def _compute_object_size(value):
1019 """Compute the size of a unresolved object for use with LRUSizeCache."""
1020 (num, obj) = value
1021 if num in DELTA_TYPES:
1022 return chunks_length(obj[1])
1023 return chunks_length(obj)
1026class PackStreamReader:
1027 """Class to read a pack stream.
1029 The pack is read from a ReceivableProtocol using read() or recv() as
1030 appropriate.
1031 """
1033 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
1034 self.read_all = read_all
1035 if read_some is None:
1036 self.read_some = read_all
1037 else:
1038 self.read_some = read_some
1039 self.sha = sha1()
1040 self._offset = 0
1041 self._rbuf = BytesIO()
1042 # trailer is a deque to avoid memory allocation on small reads
1043 self._trailer: deque[bytes] = deque()
1044 self._zlib_bufsize = zlib_bufsize
1046 def _read(self, read, size):
1047 """Read up to size bytes using the given callback.
1049 As a side effect, update the verifier's hash (excluding the last 20
1050 bytes read).
1052 Args:
1053 read: The read callback to read from.
1054 size: The maximum number of bytes to read; the particular
1055 behavior is callback-specific.
1056 """
1057 data = read(size)
1059 # maintain a trailer of the last 20 bytes we've read
1060 n = len(data)
1061 self._offset += n
1062 tn = len(self._trailer)
1063 if n >= 20:
1064 to_pop = tn
1065 to_add = 20
1066 else:
1067 to_pop = max(n + tn - 20, 0)
1068 to_add = n
1069 self.sha.update(
1070 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
1071 )
1072 self._trailer.extend(data[-to_add:])
1074 # hash everything but the trailer
1075 self.sha.update(data[:-to_add])
1076 return data
1078 def _buf_len(self):
1079 buf = self._rbuf
1080 start = buf.tell()
1081 buf.seek(0, SEEK_END)
1082 end = buf.tell()
1083 buf.seek(start)
1084 return end - start
1086 @property
1087 def offset(self):
1088 return self._offset - self._buf_len()
1090 def read(self, size):
1091 """Read, blocking until size bytes are read."""
1092 buf_len = self._buf_len()
1093 if buf_len >= size:
1094 return self._rbuf.read(size)
1095 buf_data = self._rbuf.read()
1096 self._rbuf = BytesIO()
1097 return buf_data + self._read(self.read_all, size - buf_len)
1099 def recv(self, size):
1100 """Read up to size bytes, blocking until one byte is read."""
1101 buf_len = self._buf_len()
1102 if buf_len:
1103 data = self._rbuf.read(size)
1104 if size >= buf_len:
1105 self._rbuf = BytesIO()
1106 return data
1107 return self._read(self.read_some, size)
1109 def __len__(self) -> int:
1110 return self._num_objects
1112 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
1113 """Read the objects in this pack file.
1115 Args:
1116 compute_crc32: If True, compute the CRC32 of the compressed
1117 data. If False, the returned CRC32 will be None.
1118 Returns: Iterator over UnpackedObjects with the following members set:
1119 offset
1120 obj_type_num
1121 obj_chunks (for non-delta types)
1122 delta_base (for delta types)
1123 decomp_chunks
1124 decomp_len
1125 crc32 (if compute_crc32 is True)
1127 Raises:
1128 ChecksumMismatch: if the checksum of the pack contents does not
1129 match the checksum in the pack trailer.
1130 zlib.error: if an error occurred during zlib decompression.
1131 IOError: if an error occurred writing to the output file.
1132 """
1133 pack_version, self._num_objects = read_pack_header(self.read)
1135 for _ in range(self._num_objects):
1136 offset = self.offset
1137 unpacked, unused = unpack_object(
1138 self.read,
1139 read_some=self.recv,
1140 compute_crc32=compute_crc32,
1141 zlib_bufsize=self._zlib_bufsize,
1142 )
1143 unpacked.offset = offset
1145 # prepend any unused data to current read buffer
1146 buf = BytesIO()
1147 buf.write(unused)
1148 buf.write(self._rbuf.read())
1149 buf.seek(0)
1150 self._rbuf = buf
1152 yield unpacked
1154 if self._buf_len() < 20:
1155 # If the read buffer is full, then the last read() got the whole
1156 # trailer off the wire. If not, it means there is still some of the
1157 # trailer to read. We need to read() all 20 bytes; N come from the
1158 # read buffer and (20 - N) come from the wire.
1159 self.read(20)
1161 pack_sha = bytearray(self._trailer) # type: ignore
1162 if pack_sha != self.sha.digest():
1163 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
1166class PackStreamCopier(PackStreamReader):
1167 """Class to verify a pack stream as it is being read.
1169 The pack is read from a ReceivableProtocol using read() or recv() as
1170 appropriate and written out to the given file-like object.
1171 """
1173 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
1174 """Initialize the copier.
1176 Args:
1177 read_all: Read function that blocks until the number of
1178 requested bytes are read.
1179 read_some: Read function that returns at least one byte, but may
1180 not return the number of bytes requested.
1181 outfile: File-like object to write output through.
1182 delta_iter: Optional DeltaChainIterator to record deltas as we
1183 read them.
1184 """
1185 super().__init__(read_all, read_some=read_some)
1186 self.outfile = outfile
1187 self._delta_iter = delta_iter
1189 def _read(self, read, size):
1190 """Read data from the read callback and write it to the file."""
1191 data = super()._read(read, size)
1192 self.outfile.write(data)
1193 return data
1195 def verify(self, progress=None) -> None:
1196 """Verify a pack stream and write it to the output file.
1198 See PackStreamReader.read_objects for a list of exceptions this may
1199 throw.
1200 """
1201 i = 0 # default count of entries if read_objects() is empty
1202 for i, unpacked in enumerate(self.read_objects()):
1203 if self._delta_iter:
1204 self._delta_iter.record(unpacked)
1205 if progress is not None:
1206 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
1207 if progress is not None:
1208 progress(f"copied {i} pack entries\n".encode("ascii"))
1211def obj_sha(type, chunks):
1212 """Compute the SHA for a numeric type and object chunks."""
1213 sha = sha1()
1214 sha.update(object_header(type, chunks_length(chunks)))
1215 if isinstance(chunks, bytes):
1216 sha.update(chunks)
1217 else:
1218 for chunk in chunks:
1219 sha.update(chunk)
1220 return sha.digest()
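# Small sketch: obj_sha() hashes Git's canonical loose-object header followed
# by the content, so for a blob (numeric type 3) it matches hashing
# b"blob <len>\x00" + data directly.
def _example_obj_sha():
    data = b"hello"
    digest = obj_sha(3, [data])
    assert digest == sha1(b"blob 5\x00" + data).digest()
    return sha_to_hex(digest)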
1223def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
1224 """Hash a portion of a file into a new SHA.
1226 Args:
1227 f: A file-like object to read from that supports seek().
1228 start_ofs: The offset in the file to start reading at.
1229 end_ofs: The offset in the file to end reading at, relative to the
1230 end of the file.
1231 buffer_size: A buffer size for reading.
1232 Returns: A new SHA object updated with data read from the file.
1233 """
1234 sha = sha1()
1235 f.seek(0, SEEK_END)
1236 length = f.tell()
1237 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1238 raise AssertionError(
1239 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
1240 )
1241 todo = length + end_ofs - start_ofs
1242 f.seek(start_ofs)
1243 while todo:
1244 data = f.read(min(todo, buffer_size))
1245 sha.update(data)
1246 todo -= len(data)
1247 return sha
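# Minimal sketch over an in-memory file: hash the middle of the buffer,
# skipping two bytes at the start and two at the end (end_ofs is negative,
# i.e. relative to EOF).
def _example_compute_file_sha():
    f = BytesIO(b"0123456789")
    sha = compute_file_sha(f, start_ofs=2, end_ofs=-2)
    assert sha.digest() == sha1(b"234567").digest()
    return sha.hexdigest()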
1250class PackData:
1251 """The data contained in a packfile.
1253 Pack files can be accessed both sequentially for exploding a pack, and
1254 directly with the help of an index to retrieve a specific object.
1256 The objects within are either complete or a delta against another.
1258 The object header is variable length. If the MSB of a byte is set, the
1259 following byte is also part of the header.
1260 In the first byte, the three bits after the MSB give the object type
1261 (which also says whether it is a delta), and the low four bits are the
1262 least significant bits of the size. Each subsequent byte contributes its
1263 low seven bits as the next more significant bits of the size.
1265 For the complete objects the data is stored as zlib deflated data.
1266 The size in the header is the uncompressed object size, so to uncompress
1267 you need to just keep feeding data to zlib until you get an object back,
1268 or it errors on bad data. This is done here by just giving the complete
1269 buffer from the start of the deflated object on. This is bad, but until I
1270 get mmap sorted out it will have to do.
1272 Currently there are no integrity checks done. Also no attempt is made to
1273 try and detect the delta case, or a request for an object at the wrong
1274 position. It will all just throw a zlib or KeyError.
1275 """
1277 def __init__(
1278 self,
1279 filename: Union[str, os.PathLike],
1280 file=None,
1281 size=None,
1282 *,
1283 delta_window_size=None,
1284 window_memory=None,
1285 delta_cache_size=None,
1286 depth=None,
1287 threads=None,
1288 big_file_threshold=None,
1289 ) -> None:
1290 """Create a PackData object representing the pack in the given filename.
1292 The file must exist and stay readable until the object is disposed of.
1293 It must also stay the same size. It will be mapped whenever needed.
1295 Currently there is a restriction on the size of the pack as the python
1296 mmap implementation is flawed.
1297 """
1298 self._filename = filename
1299 self._size = size
1300 self._header_size = 12
1301 self.delta_window_size = delta_window_size
1302 self.window_memory = window_memory
1303 self.delta_cache_size = delta_cache_size
1304 self.depth = depth
1305 self.threads = threads
1306 self.big_file_threshold = big_file_threshold
1308 if file is None:
1309 self._file = GitFile(self._filename, "rb")
1310 else:
1311 self._file = file
1312 (version, self._num_objects) = read_pack_header(self._file.read)
1314 # Use delta_cache_size config if available, otherwise default
1315 cache_size = delta_cache_size or (1024 * 1024 * 20)
1316 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
1317 cache_size, compute_size=_compute_object_size
1318 )
1320 @property
1321 def filename(self):
1322 return os.path.basename(self._filename)
1324 @property
1325 def path(self):
1326 return self._filename
1328 @classmethod
1329 def from_file(cls, file, size=None):
1330 return cls(str(file), file=file, size=size)
1332 @classmethod
1333 def from_path(cls, path: Union[str, os.PathLike]):
1334 return cls(filename=path)
1336 def close(self) -> None:
1337 self._file.close()
1339 def __enter__(self):
1340 return self
1342 def __exit__(self, exc_type, exc_val, exc_tb):
1343 self.close()
1345 def __eq__(self, other):
1346 if isinstance(other, PackData):
1347 return self.get_stored_checksum() == other.get_stored_checksum()
1348 return False
1350 def _get_size(self):
1351 if self._size is not None:
1352 return self._size
1353 self._size = os.path.getsize(self._filename)
1354 if self._size < self._header_size:
1355 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
1356 raise AssertionError(errmsg)
1357 return self._size
1359 def __len__(self) -> int:
1360 """Returns the number of objects in this pack."""
1361 return self._num_objects
1363 def calculate_checksum(self):
1364 """Calculate the checksum for this pack.
1366 Returns: 20-byte binary SHA1 digest
1367 """
1368 return compute_file_sha(self._file, end_ofs=-20).digest()
1370 def iter_unpacked(self, *, include_comp: bool = False):
1371 self._file.seek(self._header_size)
1373 if self._num_objects is None:
1374 return
1376 for _ in range(self._num_objects):
1377 offset = self._file.tell()
1378 unpacked, unused = unpack_object(
1379 self._file.read, compute_crc32=False, include_comp=include_comp
1380 )
1381 unpacked.offset = offset
1382 yield unpacked
1383 # Back up over unused data.
1384 self._file.seek(-len(unused), SEEK_CUR)
1386 def iterentries(
1387 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
1388 ):
1389 """Yield entries summarizing the contents of this pack.
1391 Args:
1392 progress: Progress function, called with current and total
1393 object count.
1394 Returns: iterator of tuples with (sha, offset, crc32)
1395 """
1396 num_objects = self._num_objects
1397 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1398 for i, result in enumerate(indexer):
1399 if progress is not None:
1400 progress(i, num_objects)
1401 yield result
1403 def sorted_entries(
1404 self,
1405 progress: Optional[ProgressFn] = None,
1406 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1407 ):
1408 """Return entries in this pack, sorted by SHA.
1410 Args:
1411 progress: Progress function, called with current and total
1412 object count
1413 Returns: Iterator of tuples with (sha, offset, crc32)
1414 """
1415 return sorted(
1416 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
1417 )
1419 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
1420 """Create a version 1 file for this data file.
1422 Args:
1423 filename: Index filename.
1424 progress: Progress report function
1425 Returns: Checksum of index file
1426 """
1427 entries = self.sorted_entries(
1428 progress=progress, resolve_ext_ref=resolve_ext_ref
1429 )
1430 with GitFile(filename, "wb") as f:
1431 return write_pack_index_v1(f, entries, self.calculate_checksum())
1433 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
1434 """Create a version 2 index file for this data file.
1436 Args:
1437 filename: Index filename.
1438 progress: Progress report function
1439 Returns: Checksum of index file
1440 """
1441 entries = self.sorted_entries(
1442 progress=progress, resolve_ext_ref=resolve_ext_ref
1443 )
1444 with GitFile(filename, "wb") as f:
1445 return write_pack_index_v2(f, entries, self.calculate_checksum())
1447 def create_index_v3(
1448 self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
1449 ):
1450 """Create a version 3 index file for this data file.
1452 Args:
1453 filename: Index filename.
1454 progress: Progress report function
1455 resolve_ext_ref: Function to resolve external references
1456 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1457 Returns: Checksum of index file
1458 """
1459 entries = self.sorted_entries(
1460 progress=progress, resolve_ext_ref=resolve_ext_ref
1461 )
1462 with GitFile(filename, "wb") as f:
1463 return write_pack_index_v3(
1464 f, entries, self.calculate_checksum(), hash_algorithm
1465 )
1467 def create_index(
1468 self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
1469 ):
1470 """Create an index file for this data file.
1472 Args:
1473 filename: Index filename.
1474 progress: Progress report function
1475 version: Index version (1, 2, or 3)
1476 resolve_ext_ref: Function to resolve external references
1477 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
1478 Returns: Checksum of index file
1479 """
1480 if version == 1:
1481 return self.create_index_v1(
1482 filename, progress, resolve_ext_ref=resolve_ext_ref
1483 )
1484 elif version == 2:
1485 return self.create_index_v2(
1486 filename, progress, resolve_ext_ref=resolve_ext_ref
1487 )
1488 elif version == 3:
1489 return self.create_index_v3(
1490 filename,
1491 progress,
1492 resolve_ext_ref=resolve_ext_ref,
1493 hash_algorithm=hash_algorithm,
1494 )
1495 else:
1496 raise ValueError(f"unknown index format {version}")
1498 def get_stored_checksum(self):
1499 """Return the expected checksum stored in this pack."""
1500 self._file.seek(-20, SEEK_END)
1501 return self._file.read(20)
1503 def check(self) -> None:
1504 """Check the consistency of this pack."""
1505 actual = self.calculate_checksum()
1506 stored = self.get_stored_checksum()
1507 if actual != stored:
1508 raise ChecksumMismatch(stored, actual)
1510 def get_unpacked_object_at(
1511 self, offset: int, *, include_comp: bool = False
1512 ) -> UnpackedObject:
1513 """Given offset in the packfile return a UnpackedObject."""
1514 assert offset >= self._header_size
1515 self._file.seek(offset)
1516 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1517 unpacked.offset = offset
1518 return unpacked
1520 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
1521 """Given an offset in to the packfile return the object that is there.
1523 Using the associated index the location of an object can be looked up,
1524 and then the packfile can be asked directly for that object using this
1525 function.
1526 """
1527 try:
1528 return self._offset_cache[offset]
1529 except KeyError:
1530 pass
1531 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1532 return (unpacked.pack_type_num, unpacked._obj())
1535T = TypeVar("T")
1538class DeltaChainIterator(Generic[T]):
1539 """Abstract iterator over pack data based on delta chains.
1541 Each object in the pack is guaranteed to be inflated exactly once,
1542 regardless of how many objects reference it as a delta base. As a result,
1543 memory usage is proportional to the length of the longest delta chain.
1545 Subclasses can override _result to define the result type of the iterator.
1546 By default, results are UnpackedObjects with the following members set:
1548 * offset
1549 * obj_type_num
1550 * obj_chunks
1551 * pack_type_num
1552 * delta_base (for delta types)
1553 * comp_chunks (if _include_comp is True)
1554 * decomp_chunks
1555 * decomp_len
1556 * crc32 (if _compute_crc32 is True)
1557 """
1559 _compute_crc32 = False
1560 _include_comp = False
1562 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
1563 self._file = file_obj
1564 self._resolve_ext_ref = resolve_ext_ref
1565 self._pending_ofs: dict[int, list[int]] = defaultdict(list)
1566 self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
1567 self._full_ofs: list[tuple[int, int]] = []
1568 self._ext_refs: list[bytes] = []
1570 @classmethod
1571 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
1572 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1573 walker.set_pack_data(pack_data)
1574 for unpacked in pack_data.iter_unpacked(include_comp=False):
1575 walker.record(unpacked)
1576 return walker
1578 @classmethod
1579 def for_pack_subset(
1580 cls,
1581 pack: "Pack",
1582 shas: Iterable[bytes],
1583 *,
1584 allow_missing: bool = False,
1585 resolve_ext_ref=None,
1586 ):
1587 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1588 walker.set_pack_data(pack.data)
1589 todo = set()
1590 for sha in shas:
1591 assert isinstance(sha, bytes)
1592 try:
1593 off = pack.index.object_offset(sha)
1594 except KeyError:
1595 if not allow_missing:
1596 raise
1597 else:
1598 todo.add(off)
1599 done = set()
1600 while todo:
1601 off = todo.pop()
1602 unpacked = pack.data.get_unpacked_object_at(off)
1603 walker.record(unpacked)
1604 done.add(off)
1605 base_ofs = None
1606 if unpacked.pack_type_num == OFS_DELTA:
1607 base_ofs = unpacked.offset - unpacked.delta_base
1608 elif unpacked.pack_type_num == REF_DELTA:
1609 with suppress(KeyError):
1610 assert isinstance(unpacked.delta_base, bytes)
1611 base_ofs = pack.index.object_offset(unpacked.delta_base)
1612 if base_ofs is not None and base_ofs not in done:
1613 todo.add(base_ofs)
1614 return walker
1616 def record(self, unpacked: UnpackedObject) -> None:
1617 type_num = unpacked.pack_type_num
1618 offset = unpacked.offset
1619 if type_num == OFS_DELTA:
1620 base_offset = offset - unpacked.delta_base
1621 self._pending_ofs[base_offset].append(offset)
1622 elif type_num == REF_DELTA:
1623 assert isinstance(unpacked.delta_base, bytes)
1624 self._pending_ref[unpacked.delta_base].append(offset)
1625 else:
1626 self._full_ofs.append((offset, type_num))
1628 def set_pack_data(self, pack_data: PackData) -> None:
1629 self._file = pack_data._file
1631 def _walk_all_chains(self):
1632 for offset, type_num in self._full_ofs:
1633 yield from self._follow_chain(offset, type_num, None)
1634 yield from self._walk_ref_chains()
1635 assert not self._pending_ofs, repr(self._pending_ofs)
1637 def _ensure_no_pending(self) -> None:
1638 if self._pending_ref:
1639 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])
1641 def _walk_ref_chains(self):
1642 if not self._resolve_ext_ref:
1643 self._ensure_no_pending()
1644 return
1646 for base_sha, pending in sorted(self._pending_ref.items()):
1647 if base_sha not in self._pending_ref:
1648 continue
1649 try:
1650 type_num, chunks = self._resolve_ext_ref(base_sha)
1651 except KeyError:
1652 # Not an external ref, but may depend on one. Either it will
1653 # get popped via a _follow_chain call, or we will raise an
1654 # error below.
1655 continue
1656 self._ext_refs.append(base_sha)
1657 self._pending_ref.pop(base_sha)
1658 for new_offset in pending:
1659 yield from self._follow_chain(new_offset, type_num, chunks)
1661 self._ensure_no_pending()
1663 def _result(self, unpacked: UnpackedObject) -> T:
1664 raise NotImplementedError
1666 def _resolve_object(
1667 self, offset: int, obj_type_num: int, base_chunks: list[bytes]
1668 ) -> UnpackedObject:
1669 self._file.seek(offset)
1670 unpacked, _ = unpack_object(
1671 self._file.read,
1672 include_comp=self._include_comp,
1673 compute_crc32=self._compute_crc32,
1674 )
1675 unpacked.offset = offset
1676 if base_chunks is None:
1677 assert unpacked.pack_type_num == obj_type_num
1678 else:
1679 assert unpacked.pack_type_num in DELTA_TYPES
1680 unpacked.obj_type_num = obj_type_num
1681 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
1682 return unpacked
1684 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: list[bytes]):
1685 # Unlike PackData.get_object_at, there is no need to cache offsets as
1686 # this approach by design inflates each object exactly once.
1687 todo = [(offset, obj_type_num, base_chunks)]
1688 while todo:
1689 (offset, obj_type_num, base_chunks) = todo.pop()
1690 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1691 yield self._result(unpacked)
1693 unblocked = chain(
1694 self._pending_ofs.pop(unpacked.offset, []),
1695 self._pending_ref.pop(unpacked.sha(), []),
1696 )
1697 todo.extend(
1698 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
1699 for new_offset in unblocked
1700 )
1702 def __iter__(self) -> Iterator[T]:
1703 return self._walk_all_chains()
1705 def ext_refs(self):
1706 return self._ext_refs
1709class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
1710 """Delta chain iterator that yield unpacked objects."""
1712 def _result(self, unpacked):
1713 return unpacked
1716class PackIndexer(DeltaChainIterator[PackIndexEntry]):
1717 """Delta chain iterator that yields index entries."""
1719 _compute_crc32 = True
1721 def _result(self, unpacked):
1722 return unpacked.sha(), unpacked.offset, unpacked.crc32
1725class PackInflater(DeltaChainIterator[ShaFile]):
1726 """Delta chain iterator that yields ShaFile objects."""
1728 def _result(self, unpacked):
1729 return unpacked.sha_file()
1732class SHA1Reader(BinaryIO):
1733 """Wrapper for file-like object that remembers the SHA1 of its data."""
1735 def __init__(self, f) -> None:
1736 self.f = f
1737 self.sha1 = sha1(b"")
1739 def read(self, size: int = -1) -> bytes:
1740 data = self.f.read(size)
1741 self.sha1.update(data)
1742 return data
1744 def check_sha(self, allow_empty: bool = False) -> None:
1745 stored = self.f.read(20)
1746 # If the git option index.skipHash is set, the stored hash is left zeroed
1747 if stored != self.sha1.digest() and (
1748 not allow_empty
1749 or sha_to_hex(stored) != b"0000000000000000000000000000000000000000"
1750 ):
1751 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1753 def close(self):
1754 return self.f.close()
1756 def tell(self) -> int:
1757 return self.f.tell()
1759 # BinaryIO abstract methods
1760 def readable(self) -> bool:
1761 return True
1763 def writable(self) -> bool:
1764 return False
1766 def seekable(self) -> bool:
1767 return getattr(self.f, "seekable", lambda: False)()
1769 def seek(self, offset: int, whence: int = 0) -> int:
1770 return self.f.seek(offset, whence)
1772 def flush(self) -> None:
1773 if hasattr(self.f, "flush"):
1774 self.f.flush()
1776 def readline(self, size: int = -1) -> bytes:
1777 return self.f.readline(size)
1779 def readlines(self, hint: int = -1) -> list[bytes]:
1780 return self.f.readlines(hint)
1782 def writelines(self, lines) -> None:
1783 raise UnsupportedOperation("writelines")
1785 def write(self, data) -> int:
1786 raise UnsupportedOperation("write")
1788 def __enter__(self):
1789 return self
1791 def __exit__(self, type, value, traceback):
1792 self.close()
1794 def __iter__(self):
1795 return self
1797 def __next__(self) -> bytes:
1798 line = self.readline()
1799 if not line:
1800 raise StopIteration
1801 return line
1803 def fileno(self) -> int:
1804 return self.f.fileno()
1806 def isatty(self) -> bool:
1807 return getattr(self.f, "isatty", lambda: False)()
1809 def truncate(self, size: Optional[int] = None) -> int:
1810 raise UnsupportedOperation("truncate")
1813class SHA1Writer(BinaryIO):
1814 """Wrapper for file-like object that remembers the SHA1 of its data."""
1816 def __init__(self, f) -> None:
1817 self.f = f
1818 self.length = 0
1819 self.sha1 = sha1(b"")
1821 def write(self, data) -> int:
1822 self.sha1.update(data)
1823 self.f.write(data)
1824 self.length += len(data)
1825 return len(data)
1827 def write_sha(self):
1828 sha = self.sha1.digest()
1829 assert len(sha) == 20
1830 self.f.write(sha)
1831 self.length += len(sha)
1832 return sha
1834 def close(self):
1835 sha = self.write_sha()
1836 self.f.close()
1837 return sha
1839 def offset(self):
1840 return self.length
1842 def tell(self) -> int:
1843 return self.f.tell()
1845 # BinaryIO abstract methods
1846 def readable(self) -> bool:
1847 return False
1849 def writable(self) -> bool:
1850 return True
1852 def seekable(self) -> bool:
1853 return getattr(self.f, "seekable", lambda: False)()
1855 def seek(self, offset: int, whence: int = 0) -> int:
1856 return self.f.seek(offset, whence)
1858 def flush(self) -> None:
1859 if hasattr(self.f, "flush"):
1860 self.f.flush()
1862 def readline(self, size: int = -1) -> bytes:
1863 raise UnsupportedOperation("readline")
1865 def readlines(self, hint: int = -1) -> list[bytes]:
1866 raise UnsupportedOperation("readlines")
1868 def writelines(self, lines) -> None:
1869 for line in lines:
1870 self.write(line)
1872 def read(self, size: int = -1) -> bytes:
1873 raise UnsupportedOperation("read")
1875 def __enter__(self):
1876 return self
1878 def __exit__(self, type, value, traceback):
1879 self.close()
1881 def __iter__(self):
1882 return self
1884 def __next__(self) -> bytes:
1885 raise UnsupportedOperation("__next__")
1887 def fileno(self) -> int:
1888 return self.f.fileno()
1890 def isatty(self) -> bool:
1891 return getattr(self.f, "isatty", lambda: False)()
1893 def truncate(self, size: Optional[int] = None) -> int:
1894 raise UnsupportedOperation("truncate")
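# Illustrative sketch (not part of dulwich): SHA1Writer accumulates a SHA-1 over
# everything written and appends it as a 20-byte trailer, which is how pack data
# and index files get their trailing checksums. Names and values are made up.
def _example_sha1writer():
    buf = BytesIO()
    writer = SHA1Writer(buf)
    writer.write(b"hello ")
    writer.write(b"world")
    trailer = writer.write_sha()  # writes and returns the 20-byte digest
    assert trailer == sha1(b"hello world").digest()
    assert buf.getvalue() == b"hello world" + trailer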
1897def pack_object_header(type_num, delta_base, size):
1898 """Create a pack object header for the given object info.
1900 Args:
1901 type_num: Numeric type of the object.
1902 delta_base: Delta base offset or ref, or None for whole objects.
1903 size: Uncompressed object size.
1904 Returns: A header for a packed object.
1905 """
1906 header = []
1907 c = (type_num << 4) | (size & 15)
1908 size >>= 4
1909 while size:
1910 header.append(c | 0x80)
1911 c = size & 0x7F
1912 size >>= 7
1913 header.append(c)
1914 if type_num == OFS_DELTA:
1915 ret = [delta_base & 0x7F]
1916 delta_base >>= 7
1917 while delta_base:
1918 delta_base -= 1
1919 ret.insert(0, 0x80 | (delta_base & 0x7F))
1920 delta_base >>= 7
1921 header.extend(ret)
1922 elif type_num == REF_DELTA:
1923 assert len(delta_base) == 20
1924 header += delta_base
1925 return bytearray(header)
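# Illustrative sketch (not part of dulwich): the size/type varint produced by
# pack_object_header. The first byte carries the type in bits 4-6 and the low
# four size bits; remaining size bits follow in 7-bit groups, each prefixed by a
# continuation flag. The concrete numbers below are made up for demonstration.
def _example_pack_object_header():
    header = pack_object_header(3, None, 100)  # type 3 == blob, 100 bytes
    # 0xB4 == continuation bit | (3 << 4) | (100 & 15); 0x06 == 100 >> 4
    assert bytes(header) == bytes([0xB4, 0x06])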
1928def pack_object_chunks(type, object, compression_level=-1):
1929 """Generate chunks for a pack object.
1931 Args:
1932 type: Numeric type of the object
1933 object: Object to write
1934 compression_level: the zlib compression level
1935 Yields: Byte chunks making up the encoded object: the header, then zlib-compressed data
1936 """
1937 if type in DELTA_TYPES:
1938 delta_base, object = object
1939 else:
1940 delta_base = None
1941 if isinstance(object, bytes):
1942 object = [object]
1943 yield bytes(pack_object_header(type, delta_base, sum(map(len, object))))
1944 compressor = zlib.compressobj(level=compression_level)
1945 for data in object:
1946 yield compressor.compress(data)
1947 yield compressor.flush()
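# Illustrative sketch (not part of dulwich): the chunks yielded above are the
# object header followed by one complete zlib stream, so a small payload can be
# round-tripped like this. The sample payload is made up for demonstration.
def _example_pack_object_chunks():
    payload = b"hello world"
    data = b"".join(pack_object_chunks(3, payload))  # 3 == blob
    # len(payload) < 16, so the header is a single byte here.
    assert zlib.decompress(data[1:]) == payload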
1950def write_pack_object(write, type, object, sha=None, compression_level=-1):
1951 """Write pack object to a file.
1953 Args:
1954 write: Write function to use
1955 type: Numeric type of the object
1956 object: Object to write
1957 compression_level: the zlib compression level
1958 Returns: CRC32 checksum of the data written for this object
1959 """
1960 crc32 = 0
1961 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
1962 write(chunk)
1963 if sha is not None:
1964 sha.update(chunk)
1965 crc32 = binascii.crc32(chunk, crc32)
1966 return crc32 & 0xFFFFFFFF
1969def write_pack(
1970 filename,
1971 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
1972 *,
1973 deltify: Optional[bool] = None,
1974 delta_window_size: Optional[int] = None,
1975 compression_level: int = -1,
1976):
1977 """Write a new pack data file.
1979 Args:
1980 filename: Path to the new pack file (without .pack extension)
1981 delta_window_size: Delta window size
1982 deltify: Whether to deltify pack objects
1983 compression_level: the zlib compression level
1984 Returns: Tuple with checksum of pack file and index file
1985 """
1986 with GitFile(filename + ".pack", "wb") as f:
1987 entries, data_sum = write_pack_objects(
1988 f.write,
1989 objects,
1990 delta_window_size=delta_window_size,
1991 deltify=deltify,
1992 compression_level=compression_level,
1993 )
1994 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
1995 with GitFile(filename + ".idx", "wb") as f:
1996 return data_sum, write_pack_index(f, entries, data_sum)
1999def pack_header_chunks(num_objects):
2000 """Yield chunks for a pack header."""
2001 yield b"PACK" # Pack header
2002 yield struct.pack(b">L", 2) # Pack version
2003 yield struct.pack(b">L", num_objects) # Number of objects in pack
2006def write_pack_header(write, num_objects) -> None:
2007 """Write a pack header for the given number of objects."""
2008 if hasattr(write, "write"):
2009 write = write.write
2010 warnings.warn(
2011 "write_pack_header() now takes a write rather than file argument",
2012 DeprecationWarning,
2013 stacklevel=2,
2014 )
2015 for chunk in pack_header_chunks(num_objects):
2016 write(chunk)
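# Illustrative sketch (not part of dulwich): a pack file starts with a fixed
# 12-byte header: the b"PACK" magic, a 4-byte version (always 2 here) and a
# 4-byte object count, all big-endian.
def _example_pack_header():
    header = b"".join(pack_header_chunks(3))
    assert header == b"PACK" + struct.pack(">L", 2) + struct.pack(">L", 3)
    assert len(header) == 12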
2019def find_reusable_deltas(
2020 container: PackedObjectContainer,
2021 object_ids: set[bytes],
2022 *,
2023 other_haves: Optional[set[bytes]] = None,
2024 progress=None,
2025) -> Iterator[UnpackedObject]:
2026 if other_haves is None:
2027 other_haves = set()
2028 reused = 0
2029 for i, unpacked in enumerate(
2030 container.iter_unpacked_subset(
2031 object_ids, allow_missing=True, convert_ofs_delta=True
2032 )
2033 ):
2034 if progress is not None and i % 1000 == 0:
2035 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
2036 if unpacked.pack_type_num == REF_DELTA:
2037 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore
2038 if hexsha in object_ids or hexsha in other_haves:
2039 yield unpacked
2040 reused += 1
2041 if progress is not None:
2042 progress((f"found {reused} deltas to reuse\n").encode())
2045def deltify_pack_objects(
2046 objects: Union[Iterator[bytes], Iterator[tuple[ShaFile, Optional[bytes]]]],
2047 *,
2048 window_size: Optional[int] = None,
2049 progress=None,
2050) -> Iterator[UnpackedObject]:
2051 """Generate deltas for pack objects.
2053 Args:
2054 objects: An iterable of ShaFile objects or (object, path) tuples to deltify.
2055 window_size: Delta window size; None for default
2056 Returns: Iterator over UnpackedObject entries; delta_base is None
2057 for full-text entries
2058 """
2060 def objects_with_hints():
2061 for e in objects:
2062 if isinstance(e, ShaFile):
2063 yield (e, (e.type_num, None))
2064 else:
2065 yield (e[0], (e[0].type_num, e[1]))
2067 yield from deltas_from_sorted_objects(
2068 sort_objects_for_delta(objects_with_hints()),
2069 window_size=window_size,
2070 progress=progress,
2071 )
2074def sort_objects_for_delta(
2075 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]],
2076) -> Iterator[ShaFile]:
2077 magic = []
2078 for entry in objects:
2079 if isinstance(entry, tuple):
2080 obj, hint = entry
2081 if hint is None:
2082 type_num = None
2083 path = None
2084 else:
2085 (type_num, path) = hint
2086 else:
2087 obj, type_num, path = entry, None, None  # bare ShaFile entry: no hint available
2088 magic.append((type_num, path, -obj.raw_length(), obj))
2089 # Build a list of objects ordered by the magic Linus heuristic
2090 # This helps us find good base objects to delta against
2091 magic.sort()
2092 return (x[3] for x in magic)
2095def deltas_from_sorted_objects(
2096 objects, window_size: Optional[int] = None, progress=None
2097):
2098 # TODO(jelmer): Use threads
2099 if window_size is None:
2100 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
2102 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
2103 for i, o in enumerate(objects):
2104 if progress is not None and i % 1000 == 0:
2105 progress((f"generating deltas: {i}\r").encode())
2106 raw = o.as_raw_chunks()
2107 winner = raw
2108 winner_len = sum(map(len, winner))
2109 winner_base = None
2110 for base_id, base_type_num, base in possible_bases:
2111 if base_type_num != o.type_num:
2112 continue
2113 delta_len = 0
2114 delta = []
2115 for chunk in create_delta(base, raw):
2116 delta_len += len(chunk)
2117 if delta_len >= winner_len:
2118 break
2119 delta.append(chunk)
2120 else:
2121 winner_base = base_id
2122 winner = delta
2123 winner_len = sum(map(len, winner))
2124 yield UnpackedObject(
2125 o.type_num,
2126 sha=o.sha().digest(),
2127 delta_base=winner_base,
2128 decomp_len=winner_len,
2129 decomp_chunks=winner,
2130 )
2131 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
2132 while len(possible_bases) > window_size:
2133 possible_bases.pop()
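# Illustrative sketch (not part of dulwich): feeding two similar blobs through
# sort_objects_for_delta and deltas_from_sorted_objects. The larger object is
# emitted first as full text; the smaller one is emitted as a delta against it
# whenever the delta comes out smaller than its full text. Blob contents are
# made up for demonstration.
def _example_deltify():
    from .objects import Blob
    big = Blob.from_string(b"The quick brown fox jumps over the lazy dog. " * 4)
    small = Blob.from_string(b"The quick brown fox jumps over the lazy dog. " * 3 + b"END")
    unpacked = list(
        deltas_from_sorted_objects(
            sort_objects_for_delta([(big, None), (small, None)])
        )
    )
    assert unpacked[0].delta_base is None                 # stored whole
    assert unpacked[1].delta_base == big.sha().digest()   # stored as a delta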
2136def pack_objects_to_data(
2137 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2138 *,
2139 deltify: Optional[bool] = None,
2140 delta_window_size: Optional[int] = None,
2141 ofs_delta: bool = True,
2142 progress=None,
2143) -> tuple[int, Iterator[UnpackedObject]]:
2144 """Create pack data from objects.
2146 Args:
2147 objects: Pack objects
2148 Returns: Tuple of (number of objects, iterator over UnpackedObject entries)
2149 """
2150 # TODO(jelmer): support deltaifying
2151 count = len(objects)
2152 if deltify is None:
2153 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2154 # slow at the moment.
2155 deltify = False
2156 if deltify:
2157 return (
2158 count,
2159 deltify_pack_objects(
2160 iter(objects), # type: ignore
2161 window_size=delta_window_size,
2162 progress=progress,
2163 ),
2164 )
2165 else:
2167 def iter_without_path():
2168 for o in objects:
2169 if isinstance(o, tuple):
2170 yield full_unpacked_object(o[0])
2171 else:
2172 yield full_unpacked_object(o)
2174 return (count, iter_without_path())
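# Illustrative sketch (not part of dulwich): with deltification disabled (the
# current default), pack_objects_to_data simply wraps each object in a full-text
# UnpackedObject. The blob and path below are made up for demonstration.
def _example_pack_objects_to_data():
    from .objects import Blob
    blob = Blob.from_string(b"some file contents\n")
    count, unpacked = pack_objects_to_data([(blob, b"some/path")])
    assert count == 1
    entries = list(unpacked)
    assert entries[0].sha() == blob.sha().digest()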
2177def generate_unpacked_objects(
2178 container: PackedObjectContainer,
2179 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2180 delta_window_size: Optional[int] = None,
2181 deltify: Optional[bool] = None,
2182 reuse_deltas: bool = True,
2183 ofs_delta: bool = True,
2184 other_haves: Optional[set[bytes]] = None,
2185 progress=None,
2186) -> Iterator[UnpackedObject]:
2187 """Create pack data from objects.
2189 Returns: Iterator over UnpackedObject entries
2190 """
2191 todo = dict(object_ids)
2192 if reuse_deltas:
2193 for unpack in find_reusable_deltas(
2194 container, set(todo), other_haves=other_haves, progress=progress
2195 ):
2196 del todo[sha_to_hex(unpack.sha())]
2197 yield unpack
2198 if deltify is None:
2199 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2200 # slow at the moment.
2201 deltify = False
2202 if deltify:
2203 objects_to_delta = container.iterobjects_subset(
2204 todo.keys(), allow_missing=False
2205 )
2206 yield from deltas_from_sorted_objects(
2207 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
2208 window_size=delta_window_size,
2209 progress=progress,
2210 )
2211 else:
2212 for oid in todo:
2213 yield full_unpacked_object(container[oid])
2216def full_unpacked_object(o: ShaFile) -> UnpackedObject:
2217 return UnpackedObject(
2218 o.type_num,
2219 delta_base=None,
2220 crc32=None,
2221 decomp_chunks=o.as_raw_chunks(),
2222 sha=o.sha().digest(),
2223 )
2226def write_pack_from_container(
2227 write,
2228 container: PackedObjectContainer,
2229 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2230 delta_window_size: Optional[int] = None,
2231 deltify: Optional[bool] = None,
2232 reuse_deltas: bool = True,
2233 compression_level: int = -1,
2234 other_haves: Optional[set[bytes]] = None,
2235):
2236 """Write a new pack data file.
2238 Args:
2239 write: write function to use
2240 container: PackedObjectContainer
2241 delta_window_size: Sliding window size for searching for deltas;
2242 Set to None for default window size.
2243 deltify: Whether to deltify objects
2244 compression_level: the zlib compression level to use
2245 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2246 """
2247 pack_contents_count = len(object_ids)
2248 pack_contents = generate_unpacked_objects(
2249 container,
2250 object_ids,
2251 delta_window_size=delta_window_size,
2252 deltify=deltify,
2253 reuse_deltas=reuse_deltas,
2254 other_haves=other_haves,
2255 )
2257 return write_pack_data(
2258 write,
2259 pack_contents,
2260 num_records=pack_contents_count,
2261 compression_level=compression_level,
2262 )
2265def write_pack_objects(
2266 write,
2267 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2268 *,
2269 delta_window_size: Optional[int] = None,
2270 deltify: Optional[bool] = None,
2271 compression_level: int = -1,
2272):
2273 """Write a new pack data file.
2275 Args:
2276 write: write function to use
2277 objects: Sequence of (object, path) tuples to write
2278 delta_window_size: Sliding window size for searching for deltas;
2279 Set to None for default window size.
2280 deltify: Whether to deltify objects
2281 compression_level: the zlib compression level to use
2282 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2283 """
2284 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2286 return write_pack_data(
2287 write,
2288 pack_contents,
2289 num_records=pack_contents_count,
2290 compression_level=compression_level,
2291 )
2294class PackChunkGenerator:
2295 def __init__(
2296 self,
2297 num_records=None,
2298 records=None,
2299 progress=None,
2300 compression_level=-1,
2301 reuse_compressed=True,
2302 ) -> None:
2303 self.cs = sha1(b"")
2304 self.entries: dict[Union[int, bytes], tuple[int, int]] = {}
2305 self._it = self._pack_data_chunks(
2306 num_records=num_records,
2307 records=records,
2308 progress=progress,
2309 compression_level=compression_level,
2310 reuse_compressed=reuse_compressed,
2311 )
2313 def sha1digest(self):
2314 return self.cs.digest()
2316 def __iter__(self):
2317 return self._it
2319 def _pack_data_chunks(
2320 self,
2321 records: Iterator[UnpackedObject],
2322 *,
2323 num_records=None,
2324 progress=None,
2325 compression_level: int = -1,
2326 reuse_compressed: bool = True,
2327 ) -> Iterator[bytes]:
2328 """Iterate pack data file chunks.
2330 Args:
2331 records: Iterator over UnpackedObject
2332 num_records: Number of records (defaults to len(records) if not specified)
2333 progress: Function to report progress to
2334 compression_level: the zlib compression level
2335 Yields: Chunks of pack data; object offsets and CRC32s are recorded in self.entries
2336 """
2337 # Write the pack
2338 if num_records is None:
2339 num_records = len(records) # type: ignore
2340 offset = 0
2341 for chunk in pack_header_chunks(num_records):
2342 yield chunk
2343 self.cs.update(chunk)
2344 offset += len(chunk)
2345 actual_num_records = 0
2346 for i, unpacked in enumerate(records):
2347 type_num = unpacked.pack_type_num
2348 if progress is not None and i % 1000 == 0:
2349 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
2350 raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
2351 if unpacked.delta_base is not None:
2352 try:
2353 base_offset, base_crc32 = self.entries[unpacked.delta_base]
2354 except KeyError:
2355 type_num = REF_DELTA
2356 assert isinstance(unpacked.delta_base, bytes)
2357 raw = (unpacked.delta_base, unpacked.decomp_chunks)
2358 else:
2359 type_num = OFS_DELTA
2360 raw = (offset - base_offset, unpacked.decomp_chunks)
2361 else:
2362 raw = unpacked.decomp_chunks
2363 if unpacked.comp_chunks is not None and reuse_compressed:
2364 chunks = unpacked.comp_chunks
2365 else:
2366 chunks = pack_object_chunks(
2367 type_num, raw, compression_level=compression_level
2368 )
2369 crc32 = 0
2370 object_size = 0
2371 for chunk in chunks:
2372 yield chunk
2373 crc32 = binascii.crc32(chunk, crc32)
2374 self.cs.update(chunk)
2375 object_size += len(chunk)
2376 actual_num_records += 1
2377 self.entries[unpacked.sha()] = (offset, crc32)
2378 offset += object_size
2379 if actual_num_records != num_records:
2380 raise AssertionError(
2381 f"actual records written differs: {actual_num_records} != {num_records}"
2382 )
2384 yield self.cs.digest()
2387def write_pack_data(
2388 write,
2389 records: Iterator[UnpackedObject],
2390 *,
2391 num_records=None,
2392 progress=None,
2393 compression_level=-1,
2394):
2395 """Write a new pack data file.
2397 Args:
2398 write: Write function to use
2399 num_records: Number of records (defaults to len(records) if None)
2400 records: Iterator over UnpackedObject entries
2401 progress: Function to report progress to
2402 compression_level: the zlib compression level
2403 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2404 """
2405 chunk_generator = PackChunkGenerator(
2406 num_records=num_records,
2407 records=records,
2408 progress=progress,
2409 compression_level=compression_level,
2410 )
2411 for chunk in chunk_generator:
2412 write(chunk)
2413 return chunk_generator.entries, chunk_generator.sha1digest()
2416def write_pack_index_v1(f, entries, pack_checksum):
2417 """Write a new pack index file.
2419 Args:
2420 f: A file-like object to write to
2421 entries: List of tuples with object name (sha), offset_in_pack,
2422 and crc32_checksum.
2423 pack_checksum: Checksum of the pack file.
2424 Returns: The SHA of the written index file
2425 """
2426 f = SHA1Writer(f)
2427 fan_out_table = defaultdict(lambda: 0)
2428 for name, _offset, _entry_checksum in entries:
2429 fan_out_table[ord(name[:1])] += 1
2430 # Fan-out table
2431 for i in range(0x100):
2432 f.write(struct.pack(">L", fan_out_table[i]))
2433 fan_out_table[i + 1] += fan_out_table[i]
2434 for name, offset, _entry_checksum in entries:
2435 if offset > 0xFFFFFFFF:
2436 raise TypeError("pack index version 1 only supports offsets that fit in 32 bits")
2437 f.write(struct.pack(">L20s", offset, name))
2438 assert len(pack_checksum) == 20
2439 f.write(pack_checksum)
2440 return f.write_sha()
2443def _delta_encode_size(size) -> bytes:
2444 ret = bytearray()
2445 c = size & 0x7F
2446 size >>= 7
2447 while size:
2448 ret.append(c | 0x80)
2449 c = size & 0x7F
2450 size >>= 7
2451 ret.append(c)
2452 return bytes(ret)
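# Illustrative sketch (not part of dulwich): the little-endian base-128 size
# encoding used at the start of a delta. 1000 == 0b111_1101000, so the low seven
# bits go first with the continuation bit set, followed by the remaining bits.
def _example_delta_encode_size():
    assert _delta_encode_size(1000) == bytes([0x80 | 0x68, 0x07])
    # Decoding reverses it: 0x68 | (0x07 << 7) == 1000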
2455# The length of delta compression copy operations in version 2 packs is limited
2456# to 64K. To copy more, we use several copy operations. Version 3 packs allow
2457# 24-bit lengths in copy operations, but we always make version 2 packs.
2458_MAX_COPY_LEN = 0xFFFF
2461def _encode_copy_operation(start, length):
2462 scratch = bytearray([0x80])
2463 for i in range(4):
2464 if start & 0xFF << i * 8:
2465 scratch.append((start >> i * 8) & 0xFF)
2466 scratch[0] |= 1 << i
2467 for i in range(2):
2468 if length & 0xFF << i * 8:
2469 scratch.append((length >> i * 8) & 0xFF)
2470 scratch[0] |= 1 << (4 + i)
2471 return bytes(scratch)
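# Illustrative sketch (not part of dulwich): a copy opcode meaning "copy 0x56
# bytes starting at offset 0x1234 of the base object". The leading byte has the
# copy bit (0x80) plus one flag per offset/length byte that follows; zero bytes
# are simply omitted. The offset and length are made-up values.
def _example_encode_copy_operation():
    op = _encode_copy_operation(0x1234, 0x56)
    # 0x93 == 0x80 | 0x01 | 0x02 (offset bytes 0 and 1) | 0x10 (length byte 0)
    assert op == bytes([0x93, 0x34, 0x12, 0x56])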
2474def create_delta(base_buf, target_buf):
2475 """Use python difflib to work out how to transform base_buf to target_buf.
2477 Args:
2478 base_buf: Base buffer
2479 target_buf: Target buffer
2480 """
2481 if isinstance(base_buf, list):
2482 base_buf = b"".join(base_buf)
2483 if isinstance(target_buf, list):
2484 target_buf = b"".join(target_buf)
2485 assert isinstance(base_buf, bytes)
2486 assert isinstance(target_buf, bytes)
2487 # write delta header
2488 yield _delta_encode_size(len(base_buf))
2489 yield _delta_encode_size(len(target_buf))
2490 # write out delta opcodes
2491 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
2492 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
2493 # Git patch opcodes don't care about deletes!
2494 # if opcode == 'replace' or opcode == 'delete':
2495 # pass
2496 if opcode == "equal":
2497 # If they are equal, unpacker will use data from base_buf
2498 # Write out an opcode that says what range to use
2499 copy_start = i1
2500 copy_len = i2 - i1
2501 while copy_len > 0:
2502 to_copy = min(copy_len, _MAX_COPY_LEN)
2503 yield _encode_copy_operation(copy_start, to_copy)
2504 copy_start += to_copy
2505 copy_len -= to_copy
2506 if opcode == "replace" or opcode == "insert":
2507 # If we are replacing a range or adding one, then we just
2508 # output it to the stream (prefixed by its size)
2509 s = j2 - j1
2510 o = j1
2511 while s > 127:
2512 yield bytes([127])
2513 yield memoryview(target_buf)[o : o + 127]
2514 s -= 127
2515 o += 127
2516 yield bytes([s])
2517 yield memoryview(target_buf)[o : o + s]
2520def apply_delta(src_buf, delta):
2521 """Based on the similar function in git's patch-delta.c.
2523 Args:
2524 src_buf: Source buffer
2525 delta: Delta instructions
2526 """
2527 if not isinstance(src_buf, bytes):
2528 src_buf = b"".join(src_buf)
2529 if not isinstance(delta, bytes):
2530 delta = b"".join(delta)
2531 out = []
2532 index = 0
2533 delta_length = len(delta)
2535 def get_delta_header_size(delta, index):
2536 size = 0
2537 i = 0
2538 while delta:
2539 cmd = ord(delta[index : index + 1])
2540 index += 1
2541 size |= (cmd & ~0x80) << i
2542 i += 7
2543 if not cmd & 0x80:
2544 break
2545 return size, index
2547 src_size, index = get_delta_header_size(delta, index)
2548 dest_size, index = get_delta_header_size(delta, index)
2549 if src_size != len(src_buf):
2550 raise ApplyDeltaError(
2551 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
2552 )
2553 while index < delta_length:
2554 cmd = ord(delta[index : index + 1])
2555 index += 1
2556 if cmd & 0x80:
2557 cp_off = 0
2558 for i in range(4):
2559 if cmd & (1 << i):
2560 x = ord(delta[index : index + 1])
2561 index += 1
2562 cp_off |= x << (i * 8)
2563 cp_size = 0
2564 # Version 3 packs can contain copy sizes larger than 64K.
2565 for i in range(3):
2566 if cmd & (1 << (4 + i)):
2567 x = ord(delta[index : index + 1])
2568 index += 1
2569 cp_size |= x << (i * 8)
2570 if cp_size == 0:
2571 cp_size = 0x10000
2572 if (
2573 cp_off + cp_size < cp_size
2574 or cp_off + cp_size > src_size
2575 or cp_size > dest_size
2576 ):
2577 break
2578 out.append(src_buf[cp_off : cp_off + cp_size])
2579 elif cmd != 0:
2580 out.append(delta[index : index + cmd])
2581 index += cmd
2582 else:
2583 raise ApplyDeltaError("Invalid opcode 0")
2585 if index != delta_length:
2586 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
2588 if dest_size != chunks_length(out):
2589 raise ApplyDeltaError("dest size incorrect")
2591 return out
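# Illustrative sketch (not part of dulwich): create_delta and apply_delta are
# inverses; the opcodes cover the whole target, so joining the applied chunks
# reconstructs it exactly. The buffers below are made up for demonstration.
def _example_delta_roundtrip():
    base = b"the quick brown fox jumps over the lazy dog\n" * 3
    target = base + b"and then goes to sleep\n"
    delta = b"".join(create_delta(base, target))
    assert b"".join(apply_delta(base, delta)) == target
    # For similar buffers like these the delta is typically a copy opcode plus a
    # short literal insert, far smaller than the target itself.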
2594def write_pack_index_v2(
2595 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
2596) -> bytes:
2597 """Write a new pack index file.
2599 Args:
2600 f: File-like object to write to
2601 entries: List of tuples with object name (sha), offset_in_pack, and
2602 crc32_checksum.
2603 pack_checksum: Checksum of the pack file.
2604 Returns: The SHA of the index file written
2605 """
2606 f = SHA1Writer(f)
2607 f.write(b"\377tOc") # Magic!
2608 f.write(struct.pack(">L", 2))
2609 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2610 for name, offset, entry_checksum in entries:
2611 fan_out_table[ord(name[:1])] += 1
2612 # Fan-out table
2613 largetable: list[int] = []
2614 for i in range(0x100):
2615 f.write(struct.pack(b">L", fan_out_table[i]))
2616 fan_out_table[i + 1] += fan_out_table[i]
2617 for name, offset, entry_checksum in entries:
2618 f.write(name)
2619 for name, offset, entry_checksum in entries:
2620 f.write(struct.pack(b">L", entry_checksum))
2621 for name, offset, entry_checksum in entries:
2622 if offset < 2**31:
2623 f.write(struct.pack(b">L", offset))
2624 else:
2625 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2626 largetable.append(offset)
2627 for offset in largetable:
2628 f.write(struct.pack(b">Q", offset))
2629 assert len(pack_checksum) == 20
2630 f.write(pack_checksum)
2631 return f.write_sha()
2634def write_pack_index_v3(
2635 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
2636) -> bytes:
2637 """Write a new pack index file in v3 format.
2639 Args:
2640 f: File-like object to write to
2641 entries: List of tuples with object name (sha), offset_in_pack, and
2642 crc32_checksum.
2643 pack_checksum: Checksum of the pack file.
2644 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
2645 Returns: The SHA of the index file written
2646 """
2647 if hash_algorithm == 1:
2648 hash_size = 20 # SHA-1
2649 writer_cls = SHA1Writer
2650 elif hash_algorithm == 2:
2651 hash_size = 32 # SHA-256
2652 # TODO: Add SHA256Writer when SHA-256 support is implemented
2653 raise NotImplementedError("SHA-256 support not yet implemented")
2654 else:
2655 raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
2657 # Convert entries to list to allow multiple iterations
2658 entries_list = list(entries)
2660 # Calculate shortest unambiguous prefix length for object names
2661 # For now, use full hash size (this could be optimized)
2662 shortened_oid_len = hash_size
2664 f = writer_cls(f)
2665 f.write(b"\377tOc") # Magic!
2666 f.write(struct.pack(">L", 3)) # Version 3
2667 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm
2668 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length
2670 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2671 for name, offset, entry_checksum in entries_list:
2672 if len(name) != hash_size:
2673 raise ValueError(
2674 f"Object name has wrong length: expected {hash_size}, got {len(name)}"
2675 )
2676 fan_out_table[ord(name[:1])] += 1
2678 # Fan-out table
2679 largetable: list[int] = []
2680 for i in range(0x100):
2681 f.write(struct.pack(b">L", fan_out_table[i]))
2682 fan_out_table[i + 1] += fan_out_table[i]
2684 # Object names table
2685 for name, offset, entry_checksum in entries_list:
2686 f.write(name)
2688 # CRC32 checksums table
2689 for name, offset, entry_checksum in entries_list:
2690 f.write(struct.pack(b">L", entry_checksum))
2692 # Offset table
2693 for name, offset, entry_checksum in entries_list:
2694 if offset < 2**31:
2695 f.write(struct.pack(b">L", offset))
2696 else:
2697 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2698 largetable.append(offset)
2700 # Large offset table
2701 for offset in largetable:
2702 f.write(struct.pack(b">Q", offset))
2704 assert len(pack_checksum) == hash_size, (
2705 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
2706 )
2707 f.write(pack_checksum)
2708 return f.write_sha()
2711def write_pack_index(
2712 index_filename, entries, pack_checksum, progress=None, version=None
2713):
2714 """Write a pack index file.
2716 Args:
2717 index_filename: File-like object to write the index to.
2718 entries: List of (checksum, offset, crc32) tuples
2719 pack_checksum: Checksum of the pack file.
2720 progress: Progress function (not currently used)
2721 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION.
2723 Returns:
2724 SHA of the written index file
2725 """
2726 if version is None:
2727 version = DEFAULT_PACK_INDEX_VERSION
2729 if version == 1:
2730 return write_pack_index_v1(index_filename, entries, pack_checksum)
2731 elif version == 2:
2732 return write_pack_index_v2(index_filename, entries, pack_checksum)
2733 elif version == 3:
2734 return write_pack_index_v3(index_filename, entries, pack_checksum)
2735 else:
2736 raise ValueError(f"Unsupported pack index version: {version}")
2739class Pack:
2740 """A Git pack object."""
2742 _data_load: Optional[Callable[[], PackData]]
2743 _idx_load: Optional[Callable[[], PackIndex]]
2745 _data: Optional[PackData]
2746 _idx: Optional[PackIndex]
2748 def __init__(
2749 self,
2750 basename,
2751 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
2752 *,
2753 delta_window_size=None,
2754 window_memory=None,
2755 delta_cache_size=None,
2756 depth=None,
2757 threads=None,
2758 big_file_threshold=None,
2759 ) -> None:
2760 self._basename = basename
2761 self._data = None
2762 self._idx = None
2763 self._idx_path = self._basename + ".idx"
2764 self._data_path = self._basename + ".pack"
2765 self.delta_window_size = delta_window_size
2766 self.window_memory = window_memory
2767 self.delta_cache_size = delta_cache_size
2768 self.depth = depth
2769 self.threads = threads
2770 self.big_file_threshold = big_file_threshold
2771 self._data_load = lambda: PackData(
2772 self._data_path,
2773 delta_window_size=delta_window_size,
2774 window_memory=window_memory,
2775 delta_cache_size=delta_cache_size,
2776 depth=depth,
2777 threads=threads,
2778 big_file_threshold=big_file_threshold,
2779 )
2780 self._idx_load = lambda: load_pack_index(self._idx_path)
2781 self.resolve_ext_ref = resolve_ext_ref
2783 @classmethod
2784 def from_lazy_objects(cls, data_fn, idx_fn):
2785 """Create a new pack object from callables to load pack data and
2786 index objects.
2787 """
2788 ret = cls("")
2789 ret._data_load = data_fn
2790 ret._idx_load = idx_fn
2791 return ret
2793 @classmethod
2794 def from_objects(cls, data, idx):
2795 """Create a new pack object from pack data and index objects."""
2796 ret = cls("")
2797 ret._data = data
2798 ret._data_load = None
2799 ret._idx = idx
2800 ret._idx_load = None
2801 ret.check_length_and_checksum()
2802 return ret
2804 def name(self):
2805 """The SHA over the SHAs of the objects in this pack."""
2806 return self.index.objects_sha1()
2808 @property
2809 def data(self) -> PackData:
2810 """The pack data object being used."""
2811 if self._data is None:
2812 assert self._data_load
2813 self._data = self._data_load()
2814 self.check_length_and_checksum()
2815 return self._data
2817 @property
2818 def index(self) -> PackIndex:
2819 """The index being used.
2821 Note: This may be an in-memory index
2822 """
2823 if self._idx is None:
2824 assert self._idx_load
2825 self._idx = self._idx_load()
2826 return self._idx
2828 def close(self) -> None:
2829 if self._data is not None:
2830 self._data.close()
2831 if self._idx is not None:
2832 self._idx.close()
2834 def __enter__(self):
2835 return self
2837 def __exit__(self, exc_type, exc_val, exc_tb):
2838 self.close()
2840 def __eq__(self, other):
2841 return isinstance(self, type(other)) and self.index == other.index
2843 def __len__(self) -> int:
2844 """Number of entries in this pack."""
2845 return len(self.index)
2847 def __repr__(self) -> str:
2848 return f"{self.__class__.__name__}({self._basename!r})"
2850 def __iter__(self):
2851 """Iterate over all the sha1s of the objects in this pack."""
2852 return iter(self.index)
2854 def check_length_and_checksum(self) -> None:
2855 """Sanity check the length and checksum of the pack index and data."""
2856 assert len(self.index) == len(self.data), (
2857 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
2858 )
2859 idx_stored_checksum = self.index.get_pack_checksum()
2860 data_stored_checksum = self.data.get_stored_checksum()
2861 if idx_stored_checksum != data_stored_checksum:
2862 raise ChecksumMismatch(
2863 sha_to_hex(idx_stored_checksum),
2864 sha_to_hex(data_stored_checksum),
2865 )
2867 def check(self) -> None:
2868 """Check the integrity of this pack.
2870 Raises:
2871 ChecksumMismatch: if a checksum for the index or data is wrong
2872 """
2873 self.index.check()
2874 self.data.check()
2875 for obj in self.iterobjects():
2876 obj.check()
2877 # TODO: object connectivity checks
2879 def get_stored_checksum(self) -> bytes:
2880 return self.data.get_stored_checksum()
2882 def pack_tuples(self):
2883 return [(o, None) for o in self.iterobjects()]
2885 def __contains__(self, sha1: bytes) -> bool:
2886 """Check whether this pack contains a particular SHA1."""
2887 try:
2888 self.index.object_offset(sha1)
2889 return True
2890 except KeyError:
2891 return False
2893 def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
2894 offset = self.index.object_offset(sha1)
2895 obj_type, obj = self.data.get_object_at(offset)
2896 type_num, chunks = self.resolve_object(offset, obj_type, obj)
2897 return type_num, b"".join(chunks)
2899 def __getitem__(self, sha1: bytes) -> ShaFile:
2900 """Retrieve the specified SHA1."""
2901 type, uncomp = self.get_raw(sha1)
2902 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
2904 def iterobjects(self) -> Iterator[ShaFile]:
2905 """Iterate over the objects in this pack."""
2906 return iter(
2907 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
2908 )
2910 def iterobjects_subset(
2911 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
2912 ) -> Iterator[ShaFile]:
2913 return (
2914 uo
2915 for uo in PackInflater.for_pack_subset(
2916 self,
2917 shas,
2918 allow_missing=allow_missing,
2919 resolve_ext_ref=self.resolve_ext_ref,
2920 )
2921 if uo.id in shas
2922 )
2924 def iter_unpacked_subset(
2925 self,
2926 shas: Iterable[ObjectID],
2927 *,
2928 include_comp: bool = False,
2929 allow_missing: bool = False,
2930 convert_ofs_delta: bool = False,
2931 ) -> Iterator[UnpackedObject]:
2932 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
2933 ofs: dict[bytes, int] = {}
2934 todo = set(shas)
2935 for unpacked in self.iter_unpacked(include_comp=include_comp):
2936 sha = unpacked.sha()
2937 ofs[unpacked.offset] = sha
2938 hexsha = sha_to_hex(sha)
2939 if hexsha in todo:
2940 if unpacked.pack_type_num == OFS_DELTA:
2941 assert isinstance(unpacked.delta_base, int)
2942 base_offset = unpacked.offset - unpacked.delta_base
2943 try:
2944 unpacked.delta_base = ofs[base_offset]
2945 except KeyError:
2946 ofs_pending[base_offset].append(unpacked)
2947 continue
2948 else:
2949 unpacked.pack_type_num = REF_DELTA
2950 yield unpacked
2951 todo.remove(hexsha)
2952 for child in ofs_pending.pop(unpacked.offset, []):
2953 child.pack_type_num = REF_DELTA
2954 child.delta_base = sha
2955 yield child
2956 assert not ofs_pending
2957 if not allow_missing and todo:
2958 raise UnresolvedDeltas(todo)
2960 def iter_unpacked(self, include_comp=False):
2961 ofs_to_entries = {
2962 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
2963 }
2964 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
2965 (sha, crc32) = ofs_to_entries[unpacked.offset]
2966 unpacked._sha = sha
2967 unpacked.crc32 = crc32
2968 yield unpacked
2970 def keep(self, msg: Optional[bytes] = None) -> str:
2971 """Add a .keep file for the pack, preventing git from garbage collecting it.
2973 Args:
2974 msg: A message written inside the .keep file; can be used later
2975 to determine whether or not a .keep file is obsolete.
2976 Returns: The path of the .keep file, as a string.
2977 """
2978 keepfile_name = f"{self._basename}.keep"
2979 with GitFile(keepfile_name, "wb") as keepfile:
2980 if msg:
2981 keepfile.write(msg)
2982 keepfile.write(b"\n")
2983 return keepfile_name
2985 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
2986 """Get the object for a ref SHA, only looking in this pack."""
2987 # TODO: cache these results
2988 try:
2989 offset = self.index.object_offset(sha)
2990 except KeyError:
2991 offset = None
2992 if offset:
2993 type, obj = self.data.get_object_at(offset)
2994 elif self.resolve_ext_ref:
2995 type, obj = self.resolve_ext_ref(sha)
2996 else:
2997 raise KeyError(sha)
2998 return offset, type, obj
3000 def resolve_object(
3001 self, offset: int, type: int, obj, get_ref=None
3002 ) -> tuple[int, Iterable[bytes]]:
3003 """Resolve an object, possibly resolving deltas when necessary.
3005 Returns: Tuple with object type and contents.
3006 """
3007 # Walk down the delta chain, building a stack of deltas to reach
3008 # the requested object.
3009 base_offset = offset
3010 base_type = type
3011 base_obj = obj
3012 delta_stack = []
3013 while base_type in DELTA_TYPES:
3014 prev_offset = base_offset
3015 if get_ref is None:
3016 get_ref = self.get_ref
3017 if base_type == OFS_DELTA:
3018 (delta_offset, delta) = base_obj
3019 # TODO: clean up asserts and replace with nicer error messages
3020 base_offset = base_offset - delta_offset
3021 base_type, base_obj = self.data.get_object_at(base_offset)
3022 assert isinstance(base_type, int)
3023 elif base_type == REF_DELTA:
3024 (basename, delta) = base_obj
3025 assert isinstance(basename, bytes) and len(basename) == 20
3026 base_offset, base_type, base_obj = get_ref(basename)
3027 assert isinstance(base_type, int)
3028 if base_offset == prev_offset: # object is based on itself
3029 raise UnresolvedDeltas(sha_to_hex(basename))
3030 delta_stack.append((prev_offset, base_type, delta))
3032 # Now grab the base object (mustn't be a delta) and apply the
3033 # deltas all the way up the stack.
3034 chunks = base_obj
3035 for prev_offset, _delta_type, delta in reversed(delta_stack):
3036 chunks = apply_delta(chunks, delta)
3037 if prev_offset is not None:
3038 self.data._offset_cache[prev_offset] = base_type, chunks
3039 return base_type, chunks
3041 def entries(
3042 self, progress: Optional[ProgressFn] = None
3043 ) -> Iterator[PackIndexEntry]:
3044 """Yield entries summarizing the contents of this pack.
3046 Args:
3047 progress: Progress function, called with current and total
3048 object count.
3049 Returns: iterator of tuples with (sha, offset, crc32)
3050 """
3051 return self.data.iterentries(
3052 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3053 )
3055 def sorted_entries(
3056 self, progress: Optional[ProgressFn] = None
3057 ) -> Iterator[PackIndexEntry]:
3058 """Return entries in this pack, sorted by SHA.
3060 Args:
3061 progress: Progress function, called with current and total
3062 object count
3063 Returns: Iterator of tuples with (sha, offset, crc32)
3064 """
3065 return self.data.sorted_entries(
3066 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3067 )
3069 def get_unpacked_object(
3070 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
3071 ) -> UnpackedObject:
3072 """Get the unpacked object for a sha.
3074 Args:
3075 sha: SHA of object to fetch
3076 include_comp: Whether to include compression data in UnpackedObject
3077 """
3078 offset = self.index.object_offset(sha)
3079 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3080 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3081 assert isinstance(unpacked.delta_base, int)
3082 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3083 unpacked.pack_type_num = REF_DELTA
3084 return unpacked
3087def extend_pack(
3088 f: BinaryIO,
3089 object_ids: set[ObjectID],
3090 get_raw,
3091 *,
3092 compression_level=-1,
3093 progress=None,
3094) -> tuple[bytes, list]:
3095 """Extend a pack file with more objects.
3097 The caller should make sure that object_ids does not contain any objects
3098 that are already in the pack
3099 """
3100 # Update the header with the new number of objects.
3101 f.seek(0)
3102 _version, num_objects = read_pack_header(f.read)
3104 if object_ids:
3105 f.seek(0)
3106 write_pack_header(f.write, num_objects + len(object_ids))
3108 # Must flush before reading (http://bugs.python.org/issue3207)
3109 f.flush()
3111 # Rescan the rest of the pack, computing the SHA with the new header.
3112 new_sha = compute_file_sha(f, end_ofs=-20)
3114 # Must reposition before writing (http://bugs.python.org/issue3207)
3115 f.seek(0, os.SEEK_CUR)
3117 extra_entries = []
3119 # Complete the pack.
3120 for i, object_id in enumerate(object_ids):
3121 if progress is not None:
3122 progress(
3123 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
3124 )
3125 assert len(object_id) == 20
3126 type_num, data = get_raw(object_id)
3127 offset = f.tell()
3128 crc32 = write_pack_object(
3129 f.write,
3130 type_num,
3131 data,
3132 sha=new_sha,
3133 compression_level=compression_level,
3134 )
3135 extra_entries.append((object_id, offset, crc32))
3136 pack_sha = new_sha.digest()
3137 f.write(pack_sha)
3138 return pack_sha, extra_entries
3141try:
3142 from dulwich._pack import ( # type: ignore
3143 apply_delta, # type: ignore
3144 bisect_find_sha, # type: ignore
3145 )
3146except ImportError:
3147 pass