1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28They have two parts, the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in each of the index files until you find a
32match for the object name. You then use the offset obtained from the index
33as a pointer into the corresponding packfile.
34"""
36import binascii
37from collections import defaultdict, deque
38from contextlib import suppress
39from io import BytesIO, UnsupportedOperation
41try:
42 from cdifflib import CSequenceMatcher as SequenceMatcher
43except ModuleNotFoundError:
44 from difflib import SequenceMatcher
46import os
47import struct
48import sys
49import warnings
50import zlib
51from collections.abc import Iterable, Iterator, Sequence
52from hashlib import sha1
53from itertools import chain
54from os import SEEK_CUR, SEEK_END
55from struct import unpack_from
56from typing import (
57 BinaryIO,
58 Callable,
59 Generic,
60 Optional,
61 Protocol,
62 TypeVar,
63 Union,
64)
66try:
67 import mmap
68except ImportError:
69 has_mmap = False
70else:
71 has_mmap = True
73# For some reason the above try/except fails to set has_mmap = False for Plan 9
74if sys.platform == "Plan9":
75 has_mmap = False
77from . import replace_me
78from .errors import ApplyDeltaError, ChecksumMismatch
79from .file import GitFile
80from .lru_cache import LRUSizeCache
81from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
83OFS_DELTA = 6
84REF_DELTA = 7
86DELTA_TYPES = (OFS_DELTA, REF_DELTA)
89DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
91# Keep pack files under 16Mb in memory, otherwise write them out to disk
92PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
94# Default pack index version to use when none is specified
95DEFAULT_PACK_INDEX_VERSION = 2
98OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
99ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
100ProgressFn = Callable[[int, str], None]
101PackHint = tuple[int, Optional[bytes]]
104class UnresolvedDeltas(Exception):
105 """Delta objects could not be resolved."""
107 def __init__(self, shas) -> None:
108 self.shas = shas
111class ObjectContainer(Protocol):
112 def add_object(self, obj: ShaFile) -> None:
113 """Add a single object to this object store."""
115 def add_objects(
116 self,
117 objects: Sequence[tuple[ShaFile, Optional[str]]],
118 progress: Optional[Callable[[str], None]] = None,
119 ) -> None:
120 """Add a set of objects to this object store.
122 Args:
123 objects: Iterable over a list of (object, path) tuples
124 """
126 def __contains__(self, sha1: bytes) -> bool:
127 """Check if a hex sha is present."""
129 def __getitem__(self, sha1: bytes) -> ShaFile:
130 """Retrieve an object."""
132 def get_commit_graph(self):
133 """Get the commit graph for this object store.
135 Returns:
136 CommitGraph object if available, None otherwise
137 """
138 return None
141class PackedObjectContainer(ObjectContainer):
142 def get_unpacked_object(
143 self, sha1: bytes, *, include_comp: bool = False
144 ) -> "UnpackedObject":
145 """Get a raw unresolved object."""
146 raise NotImplementedError(self.get_unpacked_object)
148 def iterobjects_subset(
149 self, shas: Iterable[bytes], *, allow_missing: bool = False
150 ) -> Iterator[ShaFile]:
151 raise NotImplementedError(self.iterobjects_subset)
153 def iter_unpacked_subset(
154 self,
155 shas: set[bytes],
156 include_comp: bool = False,
157 allow_missing: bool = False,
158 convert_ofs_delta: bool = True,
159 ) -> Iterator["UnpackedObject"]:
160 raise NotImplementedError(self.iter_unpacked_subset)
163class UnpackedObjectStream:
164 def __iter__(self) -> Iterator["UnpackedObject"]:
165 raise NotImplementedError(self.__iter__)
167 def __len__(self) -> int:
168 raise NotImplementedError(self.__len__)
171def take_msb_bytes(
172 read: Callable[[int], bytes], crc32: Optional[int] = None
173) -> tuple[list[int], Optional[int]]:
174 """Read bytes marked with most significant bit.
176 Args:
177 read: Read function
178 """
179 ret: list[int] = []
180 while len(ret) == 0 or ret[-1] & 0x80:
181 b = read(1)
182 if crc32 is not None:
183 crc32 = binascii.crc32(b, crc32)
184 ret.append(ord(b[:1]))
185 return ret, crc32
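# Illustrative sketch: take_msb_bytes keeps reading single bytes while the most
# significant bit is set. The two-byte input below is a made-up value.
#
#     from io import BytesIO
#     data, crc = take_msb_bytes(BytesIO(b"\x91\x2e").read)
#     assert data == [0x91, 0x2e] and crc is None   # 0x91 has the MSB set, 0x2e ends the run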
188class PackFileDisappeared(Exception):
189 def __init__(self, obj) -> None:
190 self.obj = obj
193class UnpackedObject:
194 """Class encapsulating an object unpacked from a pack file.
196 These objects should only be created from within unpack_object. Most
197 members start out as empty and are filled in at various points by
198 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
200 End users of this object should take care that the function they're getting
201 this object from is guaranteed to set the members they need.
202 """
204 __slots__ = [
205 "_sha", # Cached binary SHA.
206 "comp_chunks", # Compressed object chunks.
207 "crc32", # CRC32.
208 "decomp_chunks", # Decompressed object chunks.
209 "decomp_len", # Decompressed length of this object.
210 "delta_base", # Delta base offset or SHA.
211 "obj_chunks", # Decompressed and delta-resolved chunks.
212 "obj_type_num", # Type of this object.
213 "offset", # Offset in its pack.
214 "pack_type_num", # Type of this object in the pack (may be a delta).
215 ]
217 obj_type_num: Optional[int]
218 obj_chunks: Optional[list[bytes]]
219 delta_base: Union[None, bytes, int]
220 decomp_chunks: list[bytes]
221 comp_chunks: Optional[list[bytes]]
223 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
224 # methods of this object.
225 def __init__(
226 self,
227 pack_type_num,
228 *,
229 delta_base=None,
230 decomp_len=None,
231 crc32=None,
232 sha=None,
233 decomp_chunks=None,
234 offset=None,
235 ) -> None:
236 self.offset = offset
237 self._sha = sha
238 self.pack_type_num = pack_type_num
239 self.delta_base = delta_base
240 self.comp_chunks = None
241 self.decomp_chunks: list[bytes] = decomp_chunks or []
242 if decomp_chunks is not None and decomp_len is None:
243 self.decomp_len = sum(map(len, decomp_chunks))
244 else:
245 self.decomp_len = decomp_len
246 self.crc32 = crc32
248 if pack_type_num in DELTA_TYPES:
249 self.obj_type_num = None
250 self.obj_chunks = None
251 else:
252 self.obj_type_num = pack_type_num
253 self.obj_chunks = self.decomp_chunks
254 self.delta_base = delta_base
256 def sha(self):
257 """Return the binary SHA of this object."""
258 if self._sha is None:
259 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
260 return self._sha
262 def sha_file(self):
263 """Return a ShaFile from this object."""
264 assert self.obj_type_num is not None and self.obj_chunks is not None
265 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
267 # Only provided for backwards compatibility with code that expects either
268 # chunks or a delta tuple.
269 def _obj(self) -> OldUnpackedObject:
270 """Return the decompressed chunks, or (delta base, delta chunks)."""
271 if self.pack_type_num in DELTA_TYPES:
272 assert isinstance(self.delta_base, (bytes, int))
273 return (self.delta_base, self.decomp_chunks)
274 else:
275 return self.decomp_chunks
277 def __eq__(self, other):
278 if not isinstance(other, UnpackedObject):
279 return False
280 for slot in self.__slots__:
281 if getattr(self, slot) != getattr(other, slot):
282 return False
283 return True
285 def __ne__(self, other):
286 return not (self == other)
288 def __repr__(self) -> str:
289 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
290 return "{}({})".format(self.__class__.__name__, ", ".join(data))
293_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
296def read_zlib_chunks(
297 read_some: Callable[[int], bytes],
298 unpacked: UnpackedObject,
299 include_comp: bool = False,
300 buffer_size: int = _ZLIB_BUFSIZE,
301) -> bytes:
302 """Read zlib data from a buffer.
304 This function requires that the buffer have additional data following the
305 compressed data, which is guaranteed to be the case for git pack files.
307 Args:
308 read_some: Read function that returns at least one byte, but may
309 return less than the requested size.
310 unpacked: An UnpackedObject to write result data to. If its crc32
311 attr is not None, the CRC32 of the compressed bytes will be computed
312 using this starting CRC32.
313 After this function, will have the following attrs set:
314 * comp_chunks (if include_comp is True)
315 * decomp_chunks
316 * decomp_len
317 * crc32
318 include_comp: If True, include compressed data in the result.
319 buffer_size: Size of the read buffer.
320 Returns: Leftover unused data from the decompression.
322 Raises:
323 zlib.error: if a decompression error occurred.
324 """
325 if unpacked.decomp_len <= -1:
326 raise ValueError("non-negative zlib data stream size expected")
327 decomp_obj = zlib.decompressobj()
329 comp_chunks = []
330 decomp_chunks = unpacked.decomp_chunks
331 decomp_len = 0
332 crc32 = unpacked.crc32
334 while True:
335 add = read_some(buffer_size)
336 if not add:
337 raise zlib.error("EOF before end of zlib stream")
338 comp_chunks.append(add)
339 decomp = decomp_obj.decompress(add)
340 decomp_len += len(decomp)
341 decomp_chunks.append(decomp)
342 unused = decomp_obj.unused_data
343 if unused:
344 left = len(unused)
345 if crc32 is not None:
346 crc32 = binascii.crc32(add[:-left], crc32)
347 if include_comp:
348 comp_chunks[-1] = add[:-left]
349 break
350 elif crc32 is not None:
351 crc32 = binascii.crc32(add, crc32)
352 if crc32 is not None:
353 crc32 &= 0xFFFFFFFF
355 if decomp_len != unpacked.decomp_len:
356 raise zlib.error("decompressed data does not match expected size")
358 unpacked.crc32 = crc32
359 if include_comp:
360 unpacked.comp_chunks = comp_chunks
361 return unused
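# Illustrative sketch: read_zlib_chunks expects at least one byte of trailing
# data after the deflate stream (as pack files always provide). The
# UnpackedObject is constructed directly here purely for demonstration.
#
#     buf = BytesIO(zlib.compress(b"hello world") + b"TRAILER")
#     unpacked = UnpackedObject(3, decomp_len=11)   # 3 = blob type number
#     leftover = read_zlib_chunks(buf.read, unpacked)
#     assert b"".join(unpacked.decomp_chunks) == b"hello world"
#     assert leftover == b"TRAILER"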
364def iter_sha1(iter):
365 """Return the hexdigest of the SHA1 over a set of names.
367 Args:
368 iter: Iterator over string objects
369 Returns: 40-byte hex sha1 digest
370 """
371 sha = sha1()
372 for name in iter:
373 sha.update(name)
374 return sha.hexdigest().encode("ascii")
377def load_pack_index(path: Union[str, os.PathLike]):
378 """Load an index file by path.
380 Args:
381 path: Path to the index file
382 Returns: A PackIndex loaded from the given path
383 """
384 with GitFile(path, "rb") as f:
385 return load_pack_index_file(path, f)
388def _load_file_contents(f, size=None):
389 try:
390 fd = f.fileno()
391 except (UnsupportedOperation, AttributeError):
392 fd = None
393 # Attempt to use mmap if possible
394 if fd is not None:
395 if size is None:
396 size = os.fstat(fd).st_size
397 if has_mmap:
398 try:
399 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
400 except (OSError, ValueError):
401 # Can't mmap - perhaps a socket or invalid file descriptor
402 pass
403 else:
404 return contents, size
405 contents = f.read()
406 size = len(contents)
407 return contents, size
410def load_pack_index_file(path: Union[str, os.PathLike], f):
411 """Load an index file from a file-like object.
413 Args:
414 path: Path for the index file
415 f: File-like object
416 Returns: A PackIndex loaded from the given file
417 """
418 contents, size = _load_file_contents(f)
419 if contents[:4] == b"\377tOc":
420 version = struct.unpack(b">L", contents[4:8])[0]
421 if version == 2:
422 return PackIndex2(path, file=f, contents=contents, size=size)
423 elif version == 3:
424 return PackIndex3(path, file=f, contents=contents, size=size)
425 else:
426 raise KeyError(f"Unknown pack index format {version}")
427 else:
428 return PackIndex1(path, file=f, contents=contents, size=size)
431def bisect_find_sha(start, end, sha, unpack_name):
432 """Find a SHA in a data blob with sorted SHAs.
434 Args:
435 start: Start index of range to search
436 end: End index of range to search
437 sha: Sha to find
438 unpack_name: Callback to retrieve SHA by index
439 Returns: Index of the SHA, or None if it wasn't found
440 """
441 assert start <= end
442 while start <= end:
443 i = (start + end) // 2
444 file_sha = unpack_name(i)
445 if file_sha < sha:
446 start = i + 1
447 elif file_sha > sha:
448 end = i - 1
449 else:
450 return i
451 return None
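# Illustrative sketch: bisecting a small, sorted list of binary SHAs. The
# unpack_name callback here is just list indexing; the values are made up.
#
#     names = [b"\x01" * 20, b"\x05" * 20, b"\x0a" * 20]
#     assert bisect_find_sha(0, len(names) - 1, b"\x05" * 20, names.__getitem__) == 1
#     assert bisect_find_sha(0, len(names) - 1, b"\x07" * 20, names.__getitem__) is None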
454PackIndexEntry = tuple[bytes, int, Optional[int]]
457class PackIndex:
458 """An index in to a packfile.
460 Given a sha id of an object a pack index can tell you the location in the
461 packfile of that object if it has it.
462 """
464 # Default to SHA-1 for backward compatibility
465 hash_algorithm = 1
466 hash_size = 20
468 def __eq__(self, other):
469 if not isinstance(other, PackIndex):
470 return False
472 for (name1, _, _), (name2, _, _) in zip(
473 self.iterentries(), other.iterentries()
474 ):
475 if name1 != name2:
476 return False
477 return True
479 def __ne__(self, other):
480 return not self.__eq__(other)
482 def __len__(self) -> int:
483 """Return the number of entries in this pack index."""
484 raise NotImplementedError(self.__len__)
486 def __iter__(self) -> Iterator[bytes]:
487 """Iterate over the SHAs in this pack."""
488 return map(sha_to_hex, self._itersha())
490 def iterentries(self) -> Iterator[PackIndexEntry]:
491 """Iterate over the entries in this pack index.
493 Returns: iterator over tuples with object name, offset in packfile and
494 crc32 checksum.
495 """
496 raise NotImplementedError(self.iterentries)
498 def get_pack_checksum(self) -> bytes:
499 """Return the SHA1 checksum stored for the corresponding packfile.
501 Returns: 20-byte binary digest
502 """
503 raise NotImplementedError(self.get_pack_checksum)
505 @replace_me(since="0.21.0", remove_in="0.23.0")
506 def object_index(self, sha: bytes) -> int:
507 return self.object_offset(sha)
509 def object_offset(self, sha: bytes) -> int:
510 """Return the offset in to the corresponding packfile for the object.
512 Given the name of an object it will return the offset that object
513 lives at within the corresponding pack file. If the pack file doesn't
514 have the object then None will be returned.
515 """
516 raise NotImplementedError(self.object_offset)
518 def object_sha1(self, index: int) -> bytes:
519 """Return the SHA1 corresponding to the index in the pack file."""
520 for name, offset, crc32 in self.iterentries():
521 if offset == index:
522 return name
523 else:
524 raise KeyError(index)
526 def _object_offset(self, sha: bytes) -> int:
527 """See object_offset.
529 Args:
 530 sha: A *binary* SHA string (20 bytes long).
531 """
532 raise NotImplementedError(self._object_offset)
534 def objects_sha1(self) -> bytes:
535 """Return the hex SHA1 over all the shas of all objects in this pack.
537 Note: This is used for the filename of the pack.
538 """
539 return iter_sha1(self._itersha())
541 def _itersha(self) -> Iterator[bytes]:
542 """Yield all the SHA1's of the objects in the index, sorted."""
543 raise NotImplementedError(self._itersha)
545 def close(self) -> None:
546 pass
548 def check(self) -> None:
549 pass
552class MemoryPackIndex(PackIndex):
553 """Pack index that is stored entirely in memory."""
555 def __init__(self, entries, pack_checksum=None) -> None:
556 """Create a new MemoryPackIndex.
558 Args:
 559 entries: Sorted sequence of (name, offset, crc32) tuples
560 pack_checksum: Optional pack checksum
561 """
562 self._by_sha = {}
563 self._by_offset = {}
564 for name, offset, crc32 in entries:
565 self._by_sha[name] = offset
566 self._by_offset[offset] = name
567 self._entries = entries
568 self._pack_checksum = pack_checksum
570 def get_pack_checksum(self):
571 return self._pack_checksum
573 def __len__(self) -> int:
574 return len(self._entries)
576 def object_offset(self, sha):
577 if len(sha) == 40:
578 sha = hex_to_sha(sha)
579 return self._by_sha[sha]
581 def object_sha1(self, offset):
582 return self._by_offset[offset]
584 def _itersha(self):
585 return iter(self._by_sha)
587 def iterentries(self):
588 return iter(self._entries)
590 @classmethod
591 def for_pack(cls, pack):
592 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
594 @classmethod
595 def clone(cls, other_index):
596 return cls(other_index.iterentries(), other_index.get_pack_checksum())
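# Illustrative sketch: building a MemoryPackIndex from made-up, pre-sorted
# (name, offset, crc32) entries and looking an offset back up.
#
#     entries = [(b"\x01" * 20, 12, 0), (b"\xfe" * 20, 345, 0)]
#     idx = MemoryPackIndex(entries)
#     assert len(idx) == 2
#     assert idx.object_offset(b"\x01" * 20) == 12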
599class FilePackIndex(PackIndex):
600 """Pack index that is based on a file.
 602 The index maps the first byte of a sha id onto one of 256 4-byte fan-out
 603 entries. The value of a fan-out entry is the end of the group of index
 604 entries that share that starting byte; indexing with the previous byte
 605 gives the start of the group.
 606 Entries are sorted by sha id within each group, so those start and end
 607 offsets bound a bisection search that determines whether the value is
 608 present.
609 """
611 _fan_out_table: list[int]
613 def __init__(self, filename, file=None, contents=None, size=None) -> None:
614 """Create a pack index object.
616 Provide it with the name of the index file to consider, and it will map
617 it whenever required.
618 """
619 self._filename = filename
620 # Take the size now, so it can be checked each time we map the file to
621 # ensure that it hasn't changed.
622 if file is None:
623 self._file = GitFile(filename, "rb")
624 else:
625 self._file = file
626 if contents is None:
627 self._contents, self._size = _load_file_contents(self._file, size)
628 else:
629 self._contents, self._size = (contents, size)
631 @property
632 def path(self) -> str:
633 return self._filename
635 def __eq__(self, other):
636 # Quick optimization:
637 if (
638 isinstance(other, FilePackIndex)
639 and self._fan_out_table != other._fan_out_table
640 ):
641 return False
643 return super().__eq__(other)
645 def close(self) -> None:
646 self._file.close()
647 if getattr(self._contents, "close", None) is not None:
648 self._contents.close()
650 def __len__(self) -> int:
651 """Return the number of entries in this pack index."""
652 return self._fan_out_table[-1]
654 def _unpack_entry(self, i: int) -> PackIndexEntry:
655 """Unpack the i-th entry in the index file.
657 Returns: Tuple with object name (SHA), offset in pack file and CRC32
658 checksum (if known).
659 """
660 raise NotImplementedError(self._unpack_entry)
662 def _unpack_name(self, i) -> bytes:
663 """Unpack the i-th name from the index file."""
664 raise NotImplementedError(self._unpack_name)
666 def _unpack_offset(self, i) -> int:
667 """Unpack the i-th object offset from the index file."""
668 raise NotImplementedError(self._unpack_offset)
670 def _unpack_crc32_checksum(self, i) -> Optional[int]:
671 """Unpack the crc32 checksum for the ith object from the index file."""
672 raise NotImplementedError(self._unpack_crc32_checksum)
674 def _itersha(self) -> Iterator[bytes]:
675 for i in range(len(self)):
676 yield self._unpack_name(i)
678 def iterentries(self) -> Iterator[PackIndexEntry]:
679 """Iterate over the entries in this pack index.
681 Returns: iterator over tuples with object name, offset in packfile and
682 crc32 checksum.
683 """
684 for i in range(len(self)):
685 yield self._unpack_entry(i)
687 def _read_fan_out_table(self, start_offset: int):
688 ret = []
689 for i in range(0x100):
690 fanout_entry = self._contents[
691 start_offset + i * 4 : start_offset + (i + 1) * 4
692 ]
693 ret.append(struct.unpack(">L", fanout_entry)[0])
694 return ret
696 def check(self) -> None:
697 """Check that the stored checksum matches the actual checksum."""
698 actual = self.calculate_checksum()
699 stored = self.get_stored_checksum()
700 if actual != stored:
701 raise ChecksumMismatch(stored, actual)
703 def calculate_checksum(self) -> bytes:
704 """Calculate the SHA1 checksum over this pack index.
706 Returns: This is a 20-byte binary digest
707 """
708 return sha1(self._contents[:-20]).digest()
710 def get_pack_checksum(self) -> bytes:
711 """Return the SHA1 checksum stored for the corresponding packfile.
713 Returns: 20-byte binary digest
714 """
715 return bytes(self._contents[-40:-20])
717 def get_stored_checksum(self) -> bytes:
718 """Return the SHA1 checksum stored for this index.
720 Returns: 20-byte binary digest
721 """
722 return bytes(self._contents[-20:])
724 def object_offset(self, sha: bytes) -> int:
725 """Return the offset in to the corresponding packfile for the object.
727 Given the name of an object it will return the offset that object
728 lives at within the corresponding pack file. If the pack file doesn't
729 have the object then None will be returned.
730 """
731 if len(sha) == 40:
732 sha = hex_to_sha(sha)
733 try:
734 return self._object_offset(sha)
735 except ValueError as exc:
736 closed = getattr(self._contents, "closed", None)
737 if closed in (None, True):
738 raise PackFileDisappeared(self) from exc
739 raise
741 def _object_offset(self, sha: bytes) -> int:
742 """See object_offset.
744 Args:
 745 sha: A *binary* SHA string (20 bytes long).
746 """
747 assert len(sha) == 20
748 idx = ord(sha[:1])
749 if idx == 0:
750 start = 0
751 else:
752 start = self._fan_out_table[idx - 1]
753 end = self._fan_out_table[idx]
754 i = bisect_find_sha(start, end, sha, self._unpack_name)
755 if i is None:
756 raise KeyError(sha)
757 return self._unpack_offset(i)
759 def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
760 """Iterate over all SHA1s with the given prefix."""
761 start = ord(prefix[:1])
762 if start == 0:
763 start = 0
764 else:
765 start = self._fan_out_table[start - 1]
766 end = ord(prefix[:1]) + 1
767 if end == 0x100:
768 end = len(self)
769 else:
770 end = self._fan_out_table[end]
771 assert start <= end
772 started = False
773 for i in range(start, end):
774 name: bytes = self._unpack_name(i)
775 if name.startswith(prefix):
776 yield name
777 started = True
778 elif started:
779 break
782class PackIndex1(FilePackIndex):
783 """Version 1 Pack Index file."""
785 def __init__(
786 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
787 ) -> None:
788 super().__init__(filename, file, contents, size)
789 self.version = 1
790 self._fan_out_table = self._read_fan_out_table(0)
792 def _unpack_entry(self, i):
793 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
794 return (name, offset, None)
796 def _unpack_name(self, i):
797 offset = (0x100 * 4) + (i * 24) + 4
798 return self._contents[offset : offset + 20]
800 def _unpack_offset(self, i):
801 offset = (0x100 * 4) + (i * 24)
802 return unpack_from(">L", self._contents, offset)[0]
804 def _unpack_crc32_checksum(self, i) -> None:
805 # Not stored in v1 index files
806 return None
809class PackIndex2(FilePackIndex):
810 """Version 2 Pack Index file."""
812 def __init__(
813 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
814 ) -> None:
815 super().__init__(filename, file, contents, size)
816 if self._contents[:4] != b"\377tOc":
817 raise AssertionError("Not a v2 pack index file")
818 (self.version,) = unpack_from(b">L", self._contents, 4)
819 if self.version != 2:
820 raise AssertionError(f"Version was {self.version}")
821 self._fan_out_table = self._read_fan_out_table(8)
822 self._name_table_offset = 8 + 0x100 * 4
823 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
824 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
825 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
826 self
827 )
829 def _unpack_entry(self, i):
830 return (
831 self._unpack_name(i),
832 self._unpack_offset(i),
833 self._unpack_crc32_checksum(i),
834 )
836 def _unpack_name(self, i):
837 offset = self._name_table_offset + i * 20
838 return self._contents[offset : offset + 20]
840 def _unpack_offset(self, i):
841 offset = self._pack_offset_table_offset + i * 4
842 offset = unpack_from(">L", self._contents, offset)[0]
843 if offset & (2**31):
844 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
845 offset = unpack_from(">Q", self._contents, offset)[0]
846 return offset
848 def _unpack_crc32_checksum(self, i):
849 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
852class PackIndex3(FilePackIndex):
853 """Version 3 Pack Index file.
855 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
856 """
858 def __init__(
859 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
860 ) -> None:
861 super().__init__(filename, file, contents, size)
862 if self._contents[:4] != b"\377tOc":
863 raise AssertionError("Not a v3 pack index file")
864 (self.version,) = unpack_from(b">L", self._contents, 4)
865 if self.version != 3:
866 raise AssertionError(f"Version was {self.version}")
868 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
869 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
870 if self.hash_algorithm == 1:
871 self.hash_size = 20 # SHA-1
872 elif self.hash_algorithm == 2:
873 self.hash_size = 32 # SHA-256
874 else:
875 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
877 # Read length of shortened object names
878 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
880 # Calculate offsets based on variable hash size
881 self._fan_out_table = self._read_fan_out_table(
882 16
883 ) # After header (4 + 4 + 4 + 4)
884 self._name_table_offset = 16 + 0x100 * 4
885 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
886 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
887 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
888 self
889 )
891 def _unpack_entry(self, i):
892 return (
893 self._unpack_name(i),
894 self._unpack_offset(i),
895 self._unpack_crc32_checksum(i),
896 )
898 def _unpack_name(self, i):
899 offset = self._name_table_offset + i * self.hash_size
900 return self._contents[offset : offset + self.hash_size]
902 def _unpack_offset(self, i):
903 offset = self._pack_offset_table_offset + i * 4
904 offset = unpack_from(">L", self._contents, offset)[0]
905 if offset & (2**31):
906 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
907 offset = unpack_from(">Q", self._contents, offset)[0]
908 return offset
910 def _unpack_crc32_checksum(self, i):
911 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
914def read_pack_header(read) -> tuple[int, int]:
915 """Read the header of a pack file.
917 Args:
918 read: Read function
 919 Returns: Tuple of (pack version, number of objects).
 920 Raises: AssertionError if the header is missing or malformed.
921 """
922 header = read(12)
923 if not header:
924 raise AssertionError("file too short to contain pack")
925 if header[:4] != b"PACK":
926 raise AssertionError(f"Invalid pack header {header!r}")
927 (version,) = unpack_from(b">L", header, 4)
928 if version not in (2, 3):
929 raise AssertionError(f"Version was {version}")
930 (num_objects,) = unpack_from(b">L", header, 8)
931 return (version, num_objects)
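# Illustrative sketch: the 12-byte pack header is b"PACK" followed by two
# big-endian 32-bit integers (version, object count).
#
#     header = b"PACK" + struct.pack(">LL", 2, 3)
#     assert read_pack_header(BytesIO(header).read) == (2, 3)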
934def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
935 if isinstance(chunks, bytes):
936 return len(chunks)
937 else:
938 return sum(map(len, chunks))
941def unpack_object(
942 read_all: Callable[[int], bytes],
943 read_some: Optional[Callable[[int], bytes]] = None,
944 compute_crc32=False,
945 include_comp=False,
946 zlib_bufsize=_ZLIB_BUFSIZE,
947) -> tuple[UnpackedObject, bytes]:
948 """Unpack a Git object.
950 Args:
951 read_all: Read function that blocks until the number of requested
952 bytes are read.
953 read_some: Read function that returns at least one byte, but may not
954 return the number of bytes requested.
955 compute_crc32: If True, compute the CRC32 of the compressed data. If
956 False, the returned CRC32 will be None.
957 include_comp: If True, include compressed data in the result.
958 zlib_bufsize: An optional buffer size for zlib operations.
959 Returns: A tuple of (unpacked, unused), where unused is the unused data
 960 leftover from decompression, and unpacked is an UnpackedObject with
961 the following attrs set:
963 * obj_chunks (for non-delta types)
964 * pack_type_num
965 * delta_base (for delta types)
966 * comp_chunks (if include_comp is True)
967 * decomp_chunks
968 * decomp_len
969 * crc32 (if compute_crc32 is True)
970 """
971 if read_some is None:
972 read_some = read_all
973 if compute_crc32:
974 crc32 = 0
975 else:
976 crc32 = None
978 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
979 type_num = (raw[0] >> 4) & 0x07
980 size = raw[0] & 0x0F
981 for i, byte in enumerate(raw[1:]):
982 size += (byte & 0x7F) << ((i * 7) + 4)
984 delta_base: Union[int, bytes, None]
985 raw_base = len(raw)
986 if type_num == OFS_DELTA:
987 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
988 raw_base += len(raw)
989 if raw[-1] & 0x80:
990 raise AssertionError
991 delta_base_offset = raw[0] & 0x7F
992 for byte in raw[1:]:
993 delta_base_offset += 1
994 delta_base_offset <<= 7
995 delta_base_offset += byte & 0x7F
996 delta_base = delta_base_offset
997 elif type_num == REF_DELTA:
998 delta_base_obj = read_all(20)
999 if crc32 is not None:
1000 crc32 = binascii.crc32(delta_base_obj, crc32)
1001 delta_base = delta_base_obj
1002 raw_base += 20
1003 else:
1004 delta_base = None
1006 unpacked = UnpackedObject(
1007 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
1008 )
1009 unused = read_zlib_chunks(
1010 read_some,
1011 unpacked,
1012 buffer_size=zlib_bufsize,
1013 include_comp=include_comp,
1014 )
1015 return unpacked, unused
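# Illustrative sketch: a hand-built single-object entry. 0x35 encodes type 3
# (blob) with size 5 in one header byte; the trailing bytes stand in for the
# data that always follows an object inside a real pack.
#
#     raw = b"\x35" + zlib.compress(b"hello") + b"rest"
#     unpacked, unused = unpack_object(BytesIO(raw).read)
#     assert unpacked.pack_type_num == 3
#     assert b"".join(unpacked.decomp_chunks) == b"hello"
#     assert unused == b"rest"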
1018def _compute_object_size(value):
1019 """Compute the size of a unresolved object for use with LRUSizeCache."""
1020 (num, obj) = value
1021 if num in DELTA_TYPES:
1022 return chunks_length(obj[1])
1023 return chunks_length(obj)
1026class PackStreamReader:
1027 """Class to read a pack stream.
1029 The pack is read from a ReceivableProtocol using read() or recv() as
1030 appropriate.
1031 """
1033 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
1034 self.read_all = read_all
1035 if read_some is None:
1036 self.read_some = read_all
1037 else:
1038 self.read_some = read_some
1039 self.sha = sha1()
1040 self._offset = 0
1041 self._rbuf = BytesIO()
1042 # trailer is a deque to avoid memory allocation on small reads
1043 self._trailer: deque[bytes] = deque()
1044 self._zlib_bufsize = zlib_bufsize
1046 def _read(self, read, size):
1047 """Read up to size bytes using the given callback.
1049 As a side effect, update the verifier's hash (excluding the last 20
1050 bytes read).
1052 Args:
1053 read: The read callback to read from.
1054 size: The maximum number of bytes to read; the particular
1055 behavior is callback-specific.
1056 """
1057 data = read(size)
1059 # maintain a trailer of the last 20 bytes we've read
1060 n = len(data)
1061 self._offset += n
1062 tn = len(self._trailer)
1063 if n >= 20:
1064 to_pop = tn
1065 to_add = 20
1066 else:
1067 to_pop = max(n + tn - 20, 0)
1068 to_add = n
1069 self.sha.update(
1070 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
1071 )
1072 self._trailer.extend(data[-to_add:])
1074 # hash everything but the trailer
1075 self.sha.update(data[:-to_add])
1076 return data
1078 def _buf_len(self):
1079 buf = self._rbuf
1080 start = buf.tell()
1081 buf.seek(0, SEEK_END)
1082 end = buf.tell()
1083 buf.seek(start)
1084 return end - start
1086 @property
1087 def offset(self):
1088 return self._offset - self._buf_len()
1090 def read(self, size):
1091 """Read, blocking until size bytes are read."""
1092 buf_len = self._buf_len()
1093 if buf_len >= size:
1094 return self._rbuf.read(size)
1095 buf_data = self._rbuf.read()
1096 self._rbuf = BytesIO()
1097 return buf_data + self._read(self.read_all, size - buf_len)
1099 def recv(self, size):
1100 """Read up to size bytes, blocking until one byte is read."""
1101 buf_len = self._buf_len()
1102 if buf_len:
1103 data = self._rbuf.read(size)
1104 if size >= buf_len:
1105 self._rbuf = BytesIO()
1106 return data
1107 return self._read(self.read_some, size)
1109 def __len__(self) -> int:
1110 return self._num_objects
1112 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
1113 """Read the objects in this pack file.
1115 Args:
1116 compute_crc32: If True, compute the CRC32 of the compressed
1117 data. If False, the returned CRC32 will be None.
1118 Returns: Iterator over UnpackedObjects with the following members set:
1119 offset
1120 obj_type_num
1121 obj_chunks (for non-delta types)
1122 delta_base (for delta types)
1123 decomp_chunks
1124 decomp_len
1125 crc32 (if compute_crc32 is True)
1127 Raises:
1128 ChecksumMismatch: if the checksum of the pack contents does not
1129 match the checksum in the pack trailer.
1130 zlib.error: if an error occurred during zlib decompression.
1131 IOError: if an error occurred writing to the output file.
1132 """
1133 pack_version, self._num_objects = read_pack_header(self.read)
1135 for i in range(self._num_objects):
1136 offset = self.offset
1137 unpacked, unused = unpack_object(
1138 self.read,
1139 read_some=self.recv,
1140 compute_crc32=compute_crc32,
1141 zlib_bufsize=self._zlib_bufsize,
1142 )
1143 unpacked.offset = offset
1145 # prepend any unused data to current read buffer
1146 buf = BytesIO()
1147 buf.write(unused)
1148 buf.write(self._rbuf.read())
1149 buf.seek(0)
1150 self._rbuf = buf
1152 yield unpacked
1154 if self._buf_len() < 20:
1155 # If the read buffer is full, then the last read() got the whole
1156 # trailer off the wire. If not, it means there is still some of the
1157 # trailer to read. We need to read() all 20 bytes; N come from the
1158 # read buffer and (20 - N) come from the wire.
1159 self.read(20)
1161 pack_sha = bytearray(self._trailer) # type: ignore
1162 if pack_sha != self.sha.digest():
1163 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
1166class PackStreamCopier(PackStreamReader):
1167 """Class to verify a pack stream as it is being read.
1169 The pack is read from a ReceivableProtocol using read() or recv() as
1170 appropriate and written out to the given file-like object.
1171 """
1173 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
1174 """Initialize the copier.
1176 Args:
1177 read_all: Read function that blocks until the number of
1178 requested bytes are read.
1179 read_some: Read function that returns at least one byte, but may
1180 not return the number of bytes requested.
1181 outfile: File-like object to write output through.
1182 delta_iter: Optional DeltaChainIterator to record deltas as we
1183 read them.
1184 """
1185 super().__init__(read_all, read_some=read_some)
1186 self.outfile = outfile
1187 self._delta_iter = delta_iter
1189 def _read(self, read, size):
1190 """Read data from the read callback and write it to the file."""
1191 data = super()._read(read, size)
1192 self.outfile.write(data)
1193 return data
1195 def verify(self, progress=None) -> None:
1196 """Verify a pack stream and write it to the output file.
1198 See PackStreamReader.iterobjects for a list of exceptions this may
1199 throw.
1200 """
1201 i = 0 # default count of entries if read_objects() is empty
1202 for i, unpacked in enumerate(self.read_objects()):
1203 if self._delta_iter:
1204 self._delta_iter.record(unpacked)
1205 if progress is not None:
1206 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
1207 if progress is not None:
1208 progress(f"copied {i} pack entries\n".encode("ascii"))
1211def obj_sha(type, chunks):
1212 """Compute the SHA for a numeric type and object chunks."""
1213 sha = sha1()
1214 sha.update(object_header(type, chunks_length(chunks)))
1215 if isinstance(chunks, bytes):
1216 sha.update(chunks)
1217 else:
1218 for chunk in chunks:
1219 sha.update(chunk)
1220 return sha.digest()
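# Illustrative sketch: obj_sha hashes the canonical git header followed by the
# object body, so it matches hashing b"blob 5\x00hello" directly (3 = blob).
#
#     assert obj_sha(3, [b"hello"]) == sha1(b"blob 5\x00hello").digest()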
1223def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
1224 """Hash a portion of a file into a new SHA.
1226 Args:
1227 f: A file-like object to read from that supports seek().
1228 start_ofs: The offset in the file to start reading at.
1229 end_ofs: The offset in the file to end reading at, relative to the
1230 end of the file.
1231 buffer_size: A buffer size for reading.
1232 Returns: A new SHA object updated with data read from the file.
1233 """
1234 sha = sha1()
1235 f.seek(0, SEEK_END)
1236 length = f.tell()
1237 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1238 raise AssertionError(
1239 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
1240 )
1241 todo = length + end_ofs - start_ofs
1242 f.seek(start_ofs)
1243 while todo:
1244 data = f.read(min(todo, buffer_size))
1245 sha.update(data)
1246 todo -= len(data)
1247 return sha
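# Illustrative sketch: hashing everything except a 20-byte trailer, which is
# how pack checksums are computed (end_ofs is relative to the end of the file).
#
#     f = BytesIO(b"A" * 30)
#     assert compute_file_sha(f, end_ofs=-20).digest() == sha1(b"A" * 10).digest()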
1250class PackData:
1251 """The data contained in a packfile.
1253 Pack files can be accessed both sequentially for exploding a pack, and
1254 directly with the help of an index to retrieve a specific object.
1256 The objects within are either complete or a delta against another.
1258 The header is variable length. If the MSB of each byte is set then it
1259 indicates that the subsequent byte is still part of the header.
 1260 For the first byte, the next three MS bits are the type, which tells you the
 1261 type of object and whether it is a delta. The low four bits of that byte are the
 1262 lowest bits of the size. For each subsequent byte the LS 7 bits are the next MS
 1263 bits of the size, i.e. the last byte of the header contains the MS bits of the size.
1265 For the complete objects the data is stored as zlib deflated data.
1266 The size in the header is the uncompressed object size, so to uncompress
1267 you need to just keep feeding data to zlib until you get an object back,
1268 or it errors on bad data. This is done here by just giving the complete
1269 buffer from the start of the deflated object on. This is bad, but until I
1270 get mmap sorted out it will have to do.
1272 Currently there are no integrity checks done. Also no attempt is made to
1273 try and detect the delta case, or a request for an object at the wrong
1274 position. It will all just throw a zlib or KeyError.
1275 """
1277 def __init__(self, filename: Union[str, os.PathLike], file=None, size=None) -> None:
1278 """Create a PackData object representing the pack in the given filename.
1280 The file must exist and stay readable until the object is disposed of.
1281 It must also stay the same size. It will be mapped whenever needed.
1283 Currently there is a restriction on the size of the pack as the python
1284 mmap implementation is flawed.
1285 """
1286 self._filename = filename
1287 self._size = size
1288 self._header_size = 12
1289 if file is None:
1290 self._file = GitFile(self._filename, "rb")
1291 else:
1292 self._file = file
1293 (version, self._num_objects) = read_pack_header(self._file.read)
1294 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
1295 1024 * 1024 * 20, compute_size=_compute_object_size
1296 )
1298 @property
1299 def filename(self):
1300 return os.path.basename(self._filename)
1302 @property
1303 def path(self):
1304 return self._filename
1306 @classmethod
1307 def from_file(cls, file, size=None):
1308 return cls(str(file), file=file, size=size)
1310 @classmethod
1311 def from_path(cls, path: Union[str, os.PathLike]):
1312 return cls(filename=path)
1314 def close(self) -> None:
1315 self._file.close()
1317 def __enter__(self):
1318 return self
1320 def __exit__(self, exc_type, exc_val, exc_tb):
1321 self.close()
1323 def __eq__(self, other):
1324 if isinstance(other, PackData):
1325 return self.get_stored_checksum() == other.get_stored_checksum()
1326 return False
1328 def _get_size(self):
1329 if self._size is not None:
1330 return self._size
1331 self._size = os.path.getsize(self._filename)
1332 if self._size < self._header_size:
1333 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
1334 raise AssertionError(errmsg)
1335 return self._size
1337 def __len__(self) -> int:
1338 """Returns the number of objects in this pack."""
1339 return self._num_objects
1341 def calculate_checksum(self):
1342 """Calculate the checksum for this pack.
1344 Returns: 20-byte binary SHA1 digest
1345 """
1346 return compute_file_sha(self._file, end_ofs=-20).digest()
1348 def iter_unpacked(self, *, include_comp: bool = False):
1349 self._file.seek(self._header_size)
1351 if self._num_objects is None:
1352 return
1354 for _ in range(self._num_objects):
1355 offset = self._file.tell()
1356 unpacked, unused = unpack_object(
1357 self._file.read, compute_crc32=False, include_comp=include_comp
1358 )
1359 unpacked.offset = offset
1360 yield unpacked
1361 # Back up over unused data.
1362 self._file.seek(-len(unused), SEEK_CUR)
1364 def iterentries(
1365 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
1366 ):
1367 """Yield entries summarizing the contents of this pack.
1369 Args:
1370 progress: Progress function, called with current and total
1371 object count.
1372 Returns: iterator of tuples with (sha, offset, crc32)
1373 """
1374 num_objects = self._num_objects
1375 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1376 for i, result in enumerate(indexer):
1377 if progress is not None:
1378 progress(i, num_objects)
1379 yield result
1381 def sorted_entries(
1382 self,
1383 progress: Optional[ProgressFn] = None,
1384 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1385 ):
1386 """Return entries in this pack, sorted by SHA.
1388 Args:
1389 progress: Progress function, called with current and total
1390 object count
1391 Returns: Iterator of tuples with (sha, offset, crc32)
1392 """
1393 return sorted(
1394 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
1395 )
1397 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
1398 """Create a version 1 file for this data file.
1400 Args:
1401 filename: Index filename.
1402 progress: Progress report function
1403 Returns: Checksum of index file
1404 """
1405 entries = self.sorted_entries(
1406 progress=progress, resolve_ext_ref=resolve_ext_ref
1407 )
1408 with GitFile(filename, "wb") as f:
1409 return write_pack_index_v1(f, entries, self.calculate_checksum())
1411 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
1412 """Create a version 2 index file for this data file.
1414 Args:
1415 filename: Index filename.
1416 progress: Progress report function
1417 Returns: Checksum of index file
1418 """
1419 entries = self.sorted_entries(
1420 progress=progress, resolve_ext_ref=resolve_ext_ref
1421 )
1422 with GitFile(filename, "wb") as f:
1423 return write_pack_index_v2(f, entries, self.calculate_checksum())
1425 def create_index_v3(
1426 self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
1427 ):
1428 """Create a version 3 index file for this data file.
1430 Args:
1431 filename: Index filename.
1432 progress: Progress report function
1433 resolve_ext_ref: Function to resolve external references
1434 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1435 Returns: Checksum of index file
1436 """
1437 entries = self.sorted_entries(
1438 progress=progress, resolve_ext_ref=resolve_ext_ref
1439 )
1440 with GitFile(filename, "wb") as f:
1441 return write_pack_index_v3(
1442 f, entries, self.calculate_checksum(), hash_algorithm
1443 )
1445 def create_index(
1446 self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
1447 ):
1448 """Create an index file for this data file.
1450 Args:
1451 filename: Index filename.
1452 progress: Progress report function
1453 version: Index version (1, 2, or 3)
1454 resolve_ext_ref: Function to resolve external references
1455 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
1456 Returns: Checksum of index file
1457 """
1458 if version == 1:
1459 return self.create_index_v1(
1460 filename, progress, resolve_ext_ref=resolve_ext_ref
1461 )
1462 elif version == 2:
1463 return self.create_index_v2(
1464 filename, progress, resolve_ext_ref=resolve_ext_ref
1465 )
1466 elif version == 3:
1467 return self.create_index_v3(
1468 filename,
1469 progress,
1470 resolve_ext_ref=resolve_ext_ref,
1471 hash_algorithm=hash_algorithm,
1472 )
1473 else:
1474 raise ValueError(f"unknown index format {version}")
1476 def get_stored_checksum(self):
1477 """Return the expected checksum stored in this pack."""
1478 self._file.seek(-20, SEEK_END)
1479 return self._file.read(20)
1481 def check(self) -> None:
1482 """Check the consistency of this pack."""
1483 actual = self.calculate_checksum()
1484 stored = self.get_stored_checksum()
1485 if actual != stored:
1486 raise ChecksumMismatch(stored, actual)
1488 def get_unpacked_object_at(
1489 self, offset: int, *, include_comp: bool = False
1490 ) -> UnpackedObject:
1491 """Given offset in the packfile return a UnpackedObject."""
1492 assert offset >= self._header_size
1493 self._file.seek(offset)
1494 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1495 unpacked.offset = offset
1496 return unpacked
1498 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
1499 """Given an offset in to the packfile return the object that is there.
1501 Using the associated index the location of an object can be looked up,
1502 and then the packfile can be asked directly for that object using this
1503 function.
1504 """
1505 try:
1506 return self._offset_cache[offset]
1507 except KeyError:
1508 pass
1509 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1510 return (unpacked.pack_type_num, unpacked._obj())
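# Illustrative sketch (placeholder path): iterating over the raw entries of an
# on-disk pack data file with the class above.
#
#     with PackData("objects/pack/pack-1234.pack") as pack_data:
#         for unpacked in pack_data.iter_unpacked():
#             print(unpacked.offset, unpacked.pack_type_num)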
1513T = TypeVar("T")
1516class DeltaChainIterator(Generic[T]):
1517 """Abstract iterator over pack data based on delta chains.
1519 Each object in the pack is guaranteed to be inflated exactly once,
1520 regardless of how many objects reference it as a delta base. As a result,
1521 memory usage is proportional to the length of the longest delta chain.
1523 Subclasses can override _result to define the result type of the iterator.
1524 By default, results are UnpackedObjects with the following members set:
1526 * offset
1527 * obj_type_num
1528 * obj_chunks
1529 * pack_type_num
1530 * delta_base (for delta types)
1531 * comp_chunks (if _include_comp is True)
1532 * decomp_chunks
1533 * decomp_len
1534 * crc32 (if _compute_crc32 is True)
1535 """
1537 _compute_crc32 = False
1538 _include_comp = False
1540 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
1541 self._file = file_obj
1542 self._resolve_ext_ref = resolve_ext_ref
1543 self._pending_ofs: dict[int, list[int]] = defaultdict(list)
1544 self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
1545 self._full_ofs: list[tuple[int, int]] = []
1546 self._ext_refs: list[bytes] = []
1548 @classmethod
1549 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
1550 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1551 walker.set_pack_data(pack_data)
1552 for unpacked in pack_data.iter_unpacked(include_comp=False):
1553 walker.record(unpacked)
1554 return walker
1556 @classmethod
1557 def for_pack_subset(
1558 cls,
1559 pack: "Pack",
1560 shas: Iterable[bytes],
1561 *,
1562 allow_missing: bool = False,
1563 resolve_ext_ref=None,
1564 ):
1565 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1566 walker.set_pack_data(pack.data)
1567 todo = set()
1568 for sha in shas:
1569 assert isinstance(sha, bytes)
1570 try:
1571 off = pack.index.object_offset(sha)
1572 except KeyError:
1573 if not allow_missing:
1574 raise
1575 else:
1576 todo.add(off)
1577 done = set()
1578 while todo:
1579 off = todo.pop()
1580 unpacked = pack.data.get_unpacked_object_at(off)
1581 walker.record(unpacked)
1582 done.add(off)
1583 base_ofs = None
1584 if unpacked.pack_type_num == OFS_DELTA:
1585 base_ofs = unpacked.offset - unpacked.delta_base
1586 elif unpacked.pack_type_num == REF_DELTA:
1587 with suppress(KeyError):
1588 assert isinstance(unpacked.delta_base, bytes)
 1589 base_ofs = pack.index.object_offset(unpacked.delta_base)
1590 if base_ofs is not None and base_ofs not in done:
1591 todo.add(base_ofs)
1592 return walker
1594 def record(self, unpacked: UnpackedObject) -> None:
1595 type_num = unpacked.pack_type_num
1596 offset = unpacked.offset
1597 if type_num == OFS_DELTA:
1598 base_offset = offset - unpacked.delta_base
1599 self._pending_ofs[base_offset].append(offset)
1600 elif type_num == REF_DELTA:
1601 assert isinstance(unpacked.delta_base, bytes)
1602 self._pending_ref[unpacked.delta_base].append(offset)
1603 else:
1604 self._full_ofs.append((offset, type_num))
1606 def set_pack_data(self, pack_data: PackData) -> None:
1607 self._file = pack_data._file
1609 def _walk_all_chains(self):
1610 for offset, type_num in self._full_ofs:
1611 yield from self._follow_chain(offset, type_num, None)
1612 yield from self._walk_ref_chains()
1613 assert not self._pending_ofs, repr(self._pending_ofs)
1615 def _ensure_no_pending(self) -> None:
1616 if self._pending_ref:
1617 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])
1619 def _walk_ref_chains(self):
1620 if not self._resolve_ext_ref:
1621 self._ensure_no_pending()
1622 return
1624 for base_sha, pending in sorted(self._pending_ref.items()):
1625 if base_sha not in self._pending_ref:
1626 continue
1627 try:
1628 type_num, chunks = self._resolve_ext_ref(base_sha)
1629 except KeyError:
1630 # Not an external ref, but may depend on one. Either it will
1631 # get popped via a _follow_chain call, or we will raise an
1632 # error below.
1633 continue
1634 self._ext_refs.append(base_sha)
1635 self._pending_ref.pop(base_sha)
1636 for new_offset in pending:
1637 yield from self._follow_chain(new_offset, type_num, chunks)
1639 self._ensure_no_pending()
1641 def _result(self, unpacked: UnpackedObject) -> T:
1642 raise NotImplementedError
1644 def _resolve_object(
1645 self, offset: int, obj_type_num: int, base_chunks: list[bytes]
1646 ) -> UnpackedObject:
1647 self._file.seek(offset)
1648 unpacked, _ = unpack_object(
1649 self._file.read,
1650 include_comp=self._include_comp,
1651 compute_crc32=self._compute_crc32,
1652 )
1653 unpacked.offset = offset
1654 if base_chunks is None:
1655 assert unpacked.pack_type_num == obj_type_num
1656 else:
1657 assert unpacked.pack_type_num in DELTA_TYPES
1658 unpacked.obj_type_num = obj_type_num
1659 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
1660 return unpacked
1662 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: list[bytes]):
1663 # Unlike PackData.get_object_at, there is no need to cache offsets as
1664 # this approach by design inflates each object exactly once.
1665 todo = [(offset, obj_type_num, base_chunks)]
1666 while todo:
1667 (offset, obj_type_num, base_chunks) = todo.pop()
1668 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1669 yield self._result(unpacked)
1671 unblocked = chain(
1672 self._pending_ofs.pop(unpacked.offset, []),
1673 self._pending_ref.pop(unpacked.sha(), []),
1674 )
1675 todo.extend(
1676 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
1677 for new_offset in unblocked
1678 )
1680 def __iter__(self) -> Iterator[T]:
1681 return self._walk_all_chains()
1683 def ext_refs(self):
1684 return self._ext_refs
1687class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
1688 """Delta chain iterator that yield unpacked objects."""
1690 def _result(self, unpacked):
1691 return unpacked
1694class PackIndexer(DeltaChainIterator[PackIndexEntry]):
1695 """Delta chain iterator that yields index entries."""
1697 _compute_crc32 = True
1699 def _result(self, unpacked):
1700 return unpacked.sha(), unpacked.offset, unpacked.crc32
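# Illustrative sketch: PackIndexer walks the delta chains of a PackData object
# and yields (sha, offset, crc32) entries suitable for writing an index;
# pack_data here stands for an already opened PackData.
#
#     indexer = PackIndexer.for_pack_data(pack_data)
#     entries = sorted(indexer)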
1703class PackInflater(DeltaChainIterator[ShaFile]):
1704 """Delta chain iterator that yields ShaFile objects."""
1706 def _result(self, unpacked):
1707 return unpacked.sha_file()
1710class SHA1Reader(BinaryIO):
1711 """Wrapper for file-like object that remembers the SHA1 of its data."""
1713 def __init__(self, f) -> None:
1714 self.f = f
1715 self.sha1 = sha1(b"")
1717 def read(self, size: int = -1) -> bytes:
1718 data = self.f.read(size)
1719 self.sha1.update(data)
1720 return data
1722 def check_sha(self, allow_empty: bool = False) -> None:
1723 stored = self.f.read(20)
1724 # If git option index.skipHash is set the index will be empty
1725 if stored != self.sha1.digest() and (
1726 not allow_empty
1727 or sha_to_hex(stored) != b"0000000000000000000000000000000000000000"
1728 ):
1729 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1731 def close(self):
1732 return self.f.close()
1734 def tell(self) -> int:
1735 return self.f.tell()
1737 # BinaryIO abstract methods
1738 def readable(self) -> bool:
1739 return True
1741 def writable(self) -> bool:
1742 return False
1744 def seekable(self) -> bool:
1745 return getattr(self.f, "seekable", lambda: False)()
1747 def seek(self, offset: int, whence: int = 0) -> int:
1748 return self.f.seek(offset, whence)
1750 def flush(self) -> None:
1751 if hasattr(self.f, "flush"):
1752 self.f.flush()
1754 def readline(self, size: int = -1) -> bytes:
1755 return self.f.readline(size)
1757 def readlines(self, hint: int = -1) -> list[bytes]:
1758 return self.f.readlines(hint)
1760 def writelines(self, lines) -> None:
1761 raise UnsupportedOperation("writelines")
1763 def write(self, data) -> int:
1764 raise UnsupportedOperation("write")
1766 def __enter__(self):
1767 return self
1769 def __exit__(self, type, value, traceback):
1770 self.close()
1772 def __iter__(self):
1773 return self
1775 def __next__(self) -> bytes:
1776 line = self.readline()
1777 if not line:
1778 raise StopIteration
1779 return line
1781 def fileno(self) -> int:
1782 return self.f.fileno()
1784 def isatty(self) -> bool:
1785 return getattr(self.f, "isatty", lambda: False)()
1787 def truncate(self, size: Optional[int] = None) -> int:
1788 raise UnsupportedOperation("truncate")
1791class SHA1Writer(BinaryIO):
1792 """Wrapper for file-like object that remembers the SHA1 of its data."""
1794 def __init__(self, f) -> None:
1795 self.f = f
1796 self.length = 0
1797 self.sha1 = sha1(b"")
1799 def write(self, data) -> int:
1800 self.sha1.update(data)
1801 self.f.write(data)
1802 self.length += len(data)
1803 return len(data)
1805 def write_sha(self):
1806 sha = self.sha1.digest()
1807 assert len(sha) == 20
1808 self.f.write(sha)
1809 self.length += len(sha)
1810 return sha
1812 def close(self):
1813 sha = self.write_sha()
1814 self.f.close()
1815 return sha
1817 def offset(self):
1818 return self.length
1820 def tell(self) -> int:
1821 return self.f.tell()
1823 # BinaryIO abstract methods
1824 def readable(self) -> bool:
1825 return False
1827 def writable(self) -> bool:
1828 return True
1830 def seekable(self) -> bool:
1831 return getattr(self.f, "seekable", lambda: False)()
1833 def seek(self, offset: int, whence: int = 0) -> int:
1834 return self.f.seek(offset, whence)
1836 def flush(self) -> None:
1837 if hasattr(self.f, "flush"):
1838 self.f.flush()
1840 def readline(self, size: int = -1) -> bytes:
1841 raise UnsupportedOperation("readline")
1843 def readlines(self, hint: int = -1) -> list[bytes]:
1844 raise UnsupportedOperation("readlines")
1846 def writelines(self, lines) -> None:
1847 for line in lines:
1848 self.write(line)
1850 def read(self, size: int = -1) -> bytes:
1851 raise UnsupportedOperation("read")
1853 def __enter__(self):
1854 return self
1856 def __exit__(self, type, value, traceback):
1857 self.close()
1859 def __iter__(self):
1860 return self
1862 def __next__(self) -> bytes:
1863 raise UnsupportedOperation("__next__")
1865 def fileno(self) -> int:
1866 return self.f.fileno()
1868 def isatty(self) -> bool:
1869 return getattr(self.f, "isatty", lambda: False)()
1871 def truncate(self, size: Optional[int] = None) -> int:
1872 raise UnsupportedOperation("truncate")
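# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of dulwich): typical use of
# the SHA-1 wrappers above.  A SHA1Writer appends the SHA-1 of everything
# written as a 20-byte trailer; the reader wrapper at the top of this section
# (SHA1Reader in dulwich) can then re-read the payload and verify that trailer
# with check_sha().
def _example_sha1_writer_reader_roundtrip() -> None:
    from io import BytesIO
    buf = BytesIO()
    writer = SHA1Writer(buf)
    writer.write(b"hello ")
    writer.write(b"world")
    trailer = writer.write_sha()  # appends and returns the 20-byte digest
    assert len(trailer) == 20
    buf.seek(0)
    reader = SHA1Reader(buf)
    payload = reader.read(len(b"hello world"))  # everything except the trailer
    assert payload == b"hello world"
    reader.check_sha()  # raises ChecksumMismatch on a corrupted trailer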
1875def pack_object_header(type_num, delta_base, size):
1876 """Create a pack object header for the given object info.
1878 Args:
1879 type_num: Numeric type of the object.
1880 delta_base: Delta base offset or ref, or None for whole objects.
1881 size: Uncompressed object size.
1882 Returns: A header for a packed object.
1883 """
1884 header = []
1885 c = (type_num << 4) | (size & 15)
1886 size >>= 4
1887 while size:
1888 header.append(c | 0x80)
1889 c = size & 0x7F
1890 size >>= 7
1891 header.append(c)
1892 if type_num == OFS_DELTA:
1893 ret = [delta_base & 0x7F]
1894 delta_base >>= 7
1895 while delta_base:
1896 delta_base -= 1
1897 ret.insert(0, 0x80 | (delta_base & 0x7F))
1898 delta_base >>= 7
1899 header.extend(ret)
1900 elif type_num == REF_DELTA:
1901 assert len(delta_base) == 20
1902 header += delta_base
1903 return bytearray(header)
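# Illustrative sketch (editor's addition): the variable-length header emitted
# by pack_object_header() for a plain (non-delta) object.  The low four bits
# of the first byte carry the size, bits 4-6 the type number, and the high bit
# marks continuation bytes.
def _example_pack_object_header() -> None:
    blob_type_num = 3  # Blob.type_num in dulwich.objects
    # size 10 fits entirely in the first byte: 0x3a == (3 << 4) | 10
    assert bytes(pack_object_header(blob_type_num, None, 10)) == b"\x3a"
    # size 300 == 12 + 18 * 16 spills into one continuation byte
    assert bytes(pack_object_header(blob_type_num, None, 300)) == b"\xbc\x12"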
1906def pack_object_chunks(type, object, compression_level=-1):
1907 """Generate chunks for a pack object.
1909 Args:
1910 type: Numeric type of the object
1911 object: Object to write
1912 compression_level: the zlib compression level
1913 Returns: Chunks
1914 """
1915 if type in DELTA_TYPES:
1916 delta_base, object = object
1917 else:
1918 delta_base = None
1919 if isinstance(object, bytes):
1920 object = [object]
1921 yield bytes(pack_object_header(type, delta_base, sum(map(len, object))))
1922 compressor = zlib.compressobj(level=compression_level)
1923 for data in object:
1924 yield compressor.compress(data)
1925 yield compressor.flush()
1928def write_pack_object(write, type, object, sha=None, compression_level=-1):
1929 """Write pack object to a file.
1931 Args:
1932 write: Write function to use
1933 type: Numeric type of the object
1934 object: Object to write
1935 compression_level: the zlib compression level
1936 Returns: CRC32 checksum of the object data written
1937 """
1938 crc32 = 0
1939 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
1940 write(chunk)
1941 if sha is not None:
1942 sha.update(chunk)
1943 crc32 = binascii.crc32(chunk, crc32)
1944 return crc32 & 0xFFFFFFFF
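# Illustrative sketch (editor's addition): writing a single payload as a pack
# entry.  write_pack_object() pushes the header plus zlib-compressed data
# through the supplied write callable and returns the entry's CRC32.
def _example_write_pack_object() -> None:
    from io import BytesIO
    buf = BytesIO()
    crc32 = write_pack_object(buf.write, 3, b"hello world")  # 3 == blob
    assert 0 <= crc32 <= 0xFFFFFFFF
    assert buf.getvalue()[:1] == b"\x3b"  # header byte: type 3, size 11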
1947def write_pack(
1948 filename,
1949 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
1950 *,
1951 deltify: Optional[bool] = None,
1952 delta_window_size: Optional[int] = None,
1953 compression_level: int = -1,
1954):
1955 """Write a new pack data file.
1957 Args:
1958 filename: Path to the new pack file (without .pack extension)
objects: Sequence of ShaFile objects, or of (object, path) tuples, to write
1959 delta_window_size: Delta window size
1960 deltify: Whether to deltify pack objects
1961 compression_level: the zlib compression level
1962 Returns: Tuple with checksum of pack file and index file
1963 """
1964 with GitFile(filename + ".pack", "wb") as f:
1965 entries, data_sum = write_pack_objects(
1966 f.write,
1967 objects,
1968 delta_window_size=delta_window_size,
1969 deltify=deltify,
1970 compression_level=compression_level,
1971 )
1972 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
1973 with GitFile(filename + ".idx", "wb") as f:
1974 return data_sum, write_pack_index(f, entries, data_sum)
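# Illustrative sketch (editor's addition): writing a complete .pack/.idx pair
# for a couple of blobs.  Assumes dulwich.objects.Blob and a writable
# directory; the basename is passed without an extension and write_pack()
# appends ".pack" and ".idx" itself.
def _example_write_pack(tmpdir: str) -> None:
    import os
    from dulwich.objects import Blob
    blobs = [Blob.from_string(b"first"), Blob.from_string(b"second")]
    basename = os.path.join(tmpdir, "pack-example")
    data_sum, idx_sum = write_pack(basename, [(b, None) for b in blobs])
    assert os.path.exists(basename + ".pack") and os.path.exists(basename + ".idx")
    assert len(data_sum) == 20  # raw SHA-1 of the pack data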
1977def pack_header_chunks(num_objects):
1978 """Yield chunks for a pack header."""
1979 yield b"PACK" # Pack header
1980 yield struct.pack(b">L", 2) # Pack version
1981 yield struct.pack(b">L", num_objects) # Number of objects in pack
1984def write_pack_header(write, num_objects) -> None:
1985 """Write a pack header for the given number of objects."""
1986 if hasattr(write, "write"):
1987 write = write.write
1988 warnings.warn(
1989 "write_pack_header() now takes a write rather than file argument",
1990 DeprecationWarning,
1991 stacklevel=2,
1992 )
1993 for chunk in pack_header_chunks(num_objects):
1994 write(chunk)
1997def find_reusable_deltas(
1998 container: PackedObjectContainer,
1999 object_ids: set[bytes],
2000 *,
2001 other_haves: Optional[set[bytes]] = None,
2002 progress=None,
2003) -> Iterator[UnpackedObject]:
2004 if other_haves is None:
2005 other_haves = set()
2006 reused = 0
2007 for i, unpacked in enumerate(
2008 container.iter_unpacked_subset(
2009 object_ids, allow_missing=True, convert_ofs_delta=True
2010 )
2011 ):
2012 if progress is not None and i % 1000 == 0:
2013 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
2014 if unpacked.pack_type_num == REF_DELTA:
2015 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore
2016 if hexsha in object_ids or hexsha in other_haves:
2017 yield unpacked
2018 reused += 1
2019 if progress is not None:
2020 progress((f"found {reused} deltas to reuse\n").encode())
2023def deltify_pack_objects(
2024 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[bytes]]]],
2025 *,
2026 window_size: Optional[int] = None,
2027 progress=None,
2028) -> Iterator[UnpackedObject]:
2029 """Generate deltas for pack objects.
2031 Args:
2032 objects: An iterable of ShaFile objects or (object, path) tuples to deltify.
2033 window_size: Window size; None for default
2034 Returns: Iterator over UnpackedObject entries; delta_base is None
2035 for full-text (non-delta) entries
2036 """
2038 def objects_with_hints():
2039 for e in objects:
2040 if isinstance(e, ShaFile):
2041 yield (e, (e.type_num, None))
2042 else:
2043 yield (e[0], (e[0].type_num, e[1]))
2045 yield from deltas_from_sorted_objects(
2046 sort_objects_for_delta(objects_with_hints()),
2047 window_size=window_size,
2048 progress=progress,
2049 )
2052def sort_objects_for_delta(
2053 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]],
2054) -> Iterator[ShaFile]:
2055 magic = []
2056 for entry in objects:
2057 if isinstance(entry, tuple):
2058 obj, hint = entry
2059 if hint is None:
2060 type_num = None
2061 path = None
2062 else:
2063 (type_num, path) = hint
2064 else:
2065 obj = entry
# No hint supplied: fall back to empty type/path hints (avoids unbound locals)
type_num = None
path = None
2066 magic.append((type_num, path, -obj.raw_length(), obj))
2067 # Build a list of objects ordered by the magic Linus heuristic
2068 # This helps us find good candidate base objects to delta against
2069 magic.sort()
2070 return (x[3] for x in magic)
2073def deltas_from_sorted_objects(
2074 objects, window_size: Optional[int] = None, progress=None
2075):
2076 # TODO(jelmer): Use threads
2077 if window_size is None:
2078 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
2080 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
2081 for i, o in enumerate(objects):
2082 if progress is not None and i % 1000 == 0:
2083 progress((f"generating deltas: {i}\r").encode())
2084 raw = o.as_raw_chunks()
2085 winner = raw
2086 winner_len = sum(map(len, winner))
2087 winner_base = None
2088 for base_id, base_type_num, base in possible_bases:
2089 if base_type_num != o.type_num:
2090 continue
2091 delta_len = 0
2092 delta = []
2093 for chunk in create_delta(base, raw):
2094 delta_len += len(chunk)
2095 if delta_len >= winner_len:
2096 break
2097 delta.append(chunk)
2098 else:
2099 winner_base = base_id
2100 winner = delta
2101 winner_len = sum(map(len, winner))
2102 yield UnpackedObject(
2103 o.type_num,
2104 sha=o.sha().digest(),
2105 delta_base=winner_base,
2106 decomp_len=winner_len,
2107 decomp_chunks=winner,
2108 )
2109 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
2110 while len(possible_bases) > window_size:
2111 possible_bases.pop()
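# Illustrative sketch (editor's addition): letting deltify_pack_objects() find
# a delta between two overlapping blobs.  Assumes dulwich.objects.Blob.  The
# larger object sorts first and is kept as full text; the smaller one comes
# out as a delta whose delta_base is the base object's binary SHA.
def _example_deltify_pack_objects() -> None:
    from dulwich.objects import Blob
    payload = bytes(range(256)) * 4
    big = Blob.from_string(payload + b"-plus-a-suffix")
    small = Blob.from_string(payload)
    unpacked = list(deltify_pack_objects(iter([small, big])))
    assert unpacked[0].delta_base is None                  # full-text entry
    assert unpacked[1].delta_base == big.sha().digest()    # delta entry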
2114def pack_objects_to_data(
2115 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2116 *,
2117 deltify: Optional[bool] = None,
2118 delta_window_size: Optional[int] = None,
2119 ofs_delta: bool = True,
2120 progress=None,
2121) -> tuple[int, Iterator[UnpackedObject]]:
2122 """Create pack data from objects.
2124 Args:
2125 objects: Pack objects
2126 Returns: Tuple with the number of objects and an iterator over UnpackedObject entries
2127 """
2128 # TODO(jelmer): enable deltification by default once it is fast enough
2129 count = len(objects)
2130 if deltify is None:
2131 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2132 # slow at the moment.
2133 deltify = False
2134 if deltify:
2135 return (
2136 count,
2137 deltify_pack_objects(
2138 iter(objects), # type: ignore
2139 window_size=delta_window_size,
2140 progress=progress,
2141 ),
2142 )
2143 else:
2145 def iter_without_path():
2146 for o in objects:
2147 if isinstance(o, tuple):
2148 yield full_unpacked_object(o[0])
2149 else:
2150 yield full_unpacked_object(o)
2152 return (count, iter_without_path())
2155def generate_unpacked_objects(
2156 container: PackedObjectContainer,
2157 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2158 delta_window_size: Optional[int] = None,
2159 deltify: Optional[bool] = None,
2160 reuse_deltas: bool = True,
2161 ofs_delta: bool = True,
2162 other_haves: Optional[set[bytes]] = None,
2163 progress=None,
2164) -> Iterator[UnpackedObject]:
2165 """Create pack data from objects.
2167 Returns: Iterator over UnpackedObject entries
2168 """
2169 todo = dict(object_ids)
2170 if reuse_deltas:
2171 for unpack in find_reusable_deltas(
2172 container, set(todo), other_haves=other_haves, progress=progress
2173 ):
2174 del todo[sha_to_hex(unpack.sha())]
2175 yield unpack
2176 if deltify is None:
2177 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2178 # slow at the moment.
2179 deltify = False
2180 if deltify:
2181 objects_to_delta = container.iterobjects_subset(
2182 todo.keys(), allow_missing=False
2183 )
2184 yield from deltas_from_sorted_objects(
2185 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
2186 window_size=delta_window_size,
2187 progress=progress,
2188 )
2189 else:
2190 for oid in todo:
2191 yield full_unpacked_object(container[oid])
2194def full_unpacked_object(o: ShaFile) -> UnpackedObject:
2195 return UnpackedObject(
2196 o.type_num,
2197 delta_base=None,
2198 crc32=None,
2199 decomp_chunks=o.as_raw_chunks(),
2200 sha=o.sha().digest(),
2201 )
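# Illustrative sketch (editor's addition): full_unpacked_object() wraps any
# ShaFile as a non-delta UnpackedObject ready for write_pack_data().
# Assumes dulwich.objects.Blob.
def _example_full_unpacked_object() -> None:
    from dulwich.objects import Blob
    blob = Blob.from_string(b"whole object")
    unpacked = full_unpacked_object(blob)
    assert unpacked.delta_base is None
    assert unpacked.sha() == blob.sha().digest()
    assert b"".join(unpacked.decomp_chunks) == b"whole object"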
2204def write_pack_from_container(
2205 write,
2206 container: PackedObjectContainer,
2207 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2208 delta_window_size: Optional[int] = None,
2209 deltify: Optional[bool] = None,
2210 reuse_deltas: bool = True,
2211 compression_level: int = -1,
2212 other_haves: Optional[set[bytes]] = None,
2213):
2214 """Write a new pack data file.
2216 Args:
2217 write: write function to use
2218 container: PackedObjectContainer
2219 delta_window_size: Sliding window size for searching for deltas;
2220 Set to None for default window size.
2221 deltify: Whether to deltify objects
2222 compression_level: the zlib compression level to use
2223 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2224 """
2225 pack_contents_count = len(object_ids)
2226 pack_contents = generate_unpacked_objects(
2227 container,
2228 object_ids,
2229 delta_window_size=delta_window_size,
2230 deltify=deltify,
2231 reuse_deltas=reuse_deltas,
2232 other_haves=other_haves,
2233 )
2235 return write_pack_data(
2236 write,
2237 pack_contents,
2238 num_records=pack_contents_count,
2239 compression_level=compression_level,
2240 )
2243def write_pack_objects(
2244 write,
2245 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2246 *,
2247 delta_window_size: Optional[int] = None,
2248 deltify: Optional[bool] = None,
2249 compression_level: int = -1,
2250):
2251 """Write a new pack data file.
2253 Args:
2254 write: write function to use
2255 objects: Sequence of (object, path) tuples to write
2256 delta_window_size: Sliding window size for searching for deltas;
2257 Set to None for default window size.
2258 deltify: Whether to deltify objects
2259 compression_level: the zlib compression level to use
2260 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2261 """
2262 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2264 return write_pack_data(
2265 write,
2266 pack_contents,
2267 num_records=pack_contents_count,
2268 compression_level=compression_level,
2269 )
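# Illustrative sketch (editor's addition): write_pack_objects() drives
# pack_objects_to_data() plus write_pack_data() and returns the entry table
# (binary SHA -> (offset, crc32)) together with the pack checksum.
# Assumes dulwich.objects.Blob.
def _example_write_pack_objects() -> None:
    from io import BytesIO
    from dulwich.objects import Blob
    blob = Blob.from_string(b"pack me")
    buf = BytesIO()
    entries, pack_checksum = write_pack_objects(buf.write, [(blob, None)])
    offset, crc32 = entries[blob.sha().digest()]
    assert buf.getvalue()[:4] == b"PACK" and len(pack_checksum) == 20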
2272class PackChunkGenerator:
"""Generate chunks for a pack data file, recording entry offsets, CRC32s and the pack checksum."""
2273 def __init__(
2274 self,
2275 num_records=None,
2276 records=None,
2277 progress=None,
2278 compression_level=-1,
2279 reuse_compressed=True,
2280 ) -> None:
2281 self.cs = sha1(b"")
2282 self.entries: dict[Union[int, bytes], tuple[int, int]] = {}
2283 self._it = self._pack_data_chunks(
2284 num_records=num_records,
2285 records=records,
2286 progress=progress,
2287 compression_level=compression_level,
2288 reuse_compressed=reuse_compressed,
2289 )
2291 def sha1digest(self):
2292 return self.cs.digest()
2294 def __iter__(self):
2295 return self._it
2297 def _pack_data_chunks(
2298 self,
2299 records: Iterator[UnpackedObject],
2300 *,
2301 num_records=None,
2302 progress=None,
2303 compression_level: int = -1,
2304 reuse_compressed: bool = True,
2305 ) -> Iterator[bytes]:
2306 """Iterate pack data file chunks.
2308 Args:
2309 records: Iterator over UnpackedObject
2310 num_records: Number of records (defaults to len(records) if not specified)
2311 progress: Function to report progress to
2312 compression_level: the zlib compression level
2313 Yields: Chunks of the pack data file; entry offsets and CRC32s are recorded in self.entries
2314 """
2315 # Write the pack
2316 if num_records is None:
2317 num_records = len(records) # type: ignore
2318 offset = 0
2319 for chunk in pack_header_chunks(num_records):
2320 yield chunk
2321 self.cs.update(chunk)
2322 offset += len(chunk)
2323 actual_num_records = 0
2324 for i, unpacked in enumerate(records):
2325 type_num = unpacked.pack_type_num
2326 if progress is not None and i % 1000 == 0:
2327 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
2328 raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
2329 if unpacked.delta_base is not None:
2330 try:
2331 base_offset, base_crc32 = self.entries[unpacked.delta_base]
2332 except KeyError:
2333 type_num = REF_DELTA
2334 assert isinstance(unpacked.delta_base, bytes)
2335 raw = (unpacked.delta_base, unpacked.decomp_chunks)
2336 else:
2337 type_num = OFS_DELTA
2338 raw = (offset - base_offset, unpacked.decomp_chunks)
2339 else:
2340 raw = unpacked.decomp_chunks
2341 if unpacked.comp_chunks is not None and reuse_compressed:
2342 chunks = unpacked.comp_chunks
2343 else:
2344 chunks = pack_object_chunks(
2345 type_num, raw, compression_level=compression_level
2346 )
2347 crc32 = 0
2348 object_size = 0
2349 for chunk in chunks:
2350 yield chunk
2351 crc32 = binascii.crc32(chunk, crc32)
2352 self.cs.update(chunk)
2353 object_size += len(chunk)
2354 actual_num_records += 1
2355 self.entries[unpacked.sha()] = (offset, crc32)
2356 offset += object_size
2357 if actual_num_records != num_records:
2358 raise AssertionError(
2359 f"actual records written differs: {actual_num_records} != {num_records}"
2360 )
2362 yield self.cs.digest()
2365def write_pack_data(
2366 write,
2367 records: Iterator[UnpackedObject],
2368 *,
2369 num_records=None,
2370 progress=None,
2371 compression_level=-1,
2372):
2373 """Write a new pack data file.
2375 Args:
2376 write: Write function to use
2377 num_records: Number of records (defaults to len(records) if None)
2378 records: Iterator over type_num, object_id, delta_base, raw
2379 progress: Function to report progress to
2380 compression_level: the zlib compression level
2381 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2382 """
2383 chunk_generator = PackChunkGenerator(
2384 num_records=num_records,
2385 records=records,
2386 progress=progress,
2387 compression_level=compression_level,
2388 )
2389 for chunk in chunk_generator:
2390 write(chunk)
2391 return chunk_generator.entries, chunk_generator.sha1digest()
2394def write_pack_index_v1(f, entries, pack_checksum):
2395 """Write a new pack index file.
2397 Args:
2398 f: A file-like object to write to
2399 entries: List of tuples with object name (sha), offset_in_pack,
2400 and crc32_checksum.
2401 pack_checksum: Checksum of the pack file.
2402 Returns: The SHA of the written index file
2403 """
2404 f = SHA1Writer(f)
2405 fan_out_table = defaultdict(lambda: 0)
2406 for name, offset, entry_checksum in entries:
2407 fan_out_table[ord(name[:1])] += 1
2408 # Fan-out table
2409 for i in range(0x100):
2410 f.write(struct.pack(">L", fan_out_table[i]))
2411 fan_out_table[i + 1] += fan_out_table[i]
2412 for name, offset, entry_checksum in entries:
2413 if not (offset <= 0xFFFFFFFF):
2414 raise TypeError("pack index version 1 only supports offsets up to 4 GiB")
2415 f.write(struct.pack(">L20s", offset, name))
2416 assert len(pack_checksum) == 20
2417 f.write(pack_checksum)
2418 return f.write_sha()
2421def _delta_encode_size(size) -> bytes:
2422 ret = bytearray()
2423 c = size & 0x7F
2424 size >>= 7
2425 while size:
2426 ret.append(c | 0x80)
2427 c = size & 0x7F
2428 size >>= 7
2429 ret.append(c)
2430 return bytes(ret)
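# Illustrative sketch (editor's addition): the little-endian base-128 varint
# used for the source and target sizes in a delta header.
def _example_delta_encode_size() -> None:
    assert _delta_encode_size(10) == b"\x0a"        # fits in a single byte
    assert _delta_encode_size(1000) == b"\xe8\x07"  # 1000 == 0x68 + 7 * 128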
2433# The length of delta compression copy operations in version 2 packs is limited
2434# to 64K. To copy more, we use several copy operations. Version 3 packs allow
2435# 24-bit lengths in copy operations, but we always make version 2 packs.
2436_MAX_COPY_LEN = 0xFFFF
2439def _encode_copy_operation(start, length):
2440 scratch = bytearray([0x80])
2441 for i in range(4):
2442 if start & 0xFF << i * 8:
2443 scratch.append((start >> i * 8) & 0xFF)
2444 scratch[0] |= 1 << i
2445 for i in range(2):
2446 if length & 0xFF << i * 8:
2447 scratch.append((length >> i * 8) & 0xFF)
2448 scratch[0] |= 1 << (4 + i)
2449 return bytes(scratch)
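# Illustrative sketch (editor's addition): the copy opcode layout -- high bit
# set, the low four flag bits select offset bytes, the next two select length
# bytes, and only the non-zero little-endian bytes follow.
def _example_encode_copy_operation() -> None:
    assert _encode_copy_operation(0, 1024) == b"\xa0\x04"
    assert _encode_copy_operation(200, 100) == b"\x91\xc8\x64"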
2452def create_delta(base_buf, target_buf):
2453 """Use python difflib to work out how to transform base_buf to target_buf.
2455 Args:
2456 base_buf: Base buffer
2457 target_buf: Target buffer
Returns: Iterator over delta chunks (size header followed by copy/insert opcodes)
2458 """
2459 if isinstance(base_buf, list):
2460 base_buf = b"".join(base_buf)
2461 if isinstance(target_buf, list):
2462 target_buf = b"".join(target_buf)
2463 assert isinstance(base_buf, bytes)
2464 assert isinstance(target_buf, bytes)
2465 # write delta header
2466 yield _delta_encode_size(len(base_buf))
2467 yield _delta_encode_size(len(target_buf))
2468 # write out delta opcodes
2469 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
2470 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
2471 # Git patch opcodes don't care about deletes!
2472 # if opcode == 'replace' or opcode == 'delete':
2473 # pass
2474 if opcode == "equal":
2475 # If they are equal, unpacker will use data from base_buf
2476 # Write out an opcode that says what range to use
2477 copy_start = i1
2478 copy_len = i2 - i1
2479 while copy_len > 0:
2480 to_copy = min(copy_len, _MAX_COPY_LEN)
2481 yield _encode_copy_operation(copy_start, to_copy)
2482 copy_start += to_copy
2483 copy_len -= to_copy
2484 if opcode == "replace" or opcode == "insert":
2485 # If we are replacing a range or adding one, then we just
2486 # output it to the stream (prefixed by its size)
2487 s = j2 - j1
2488 o = j1
2489 while s > 127:
2490 yield bytes([127])
2491 yield memoryview(target_buf)[o : o + 127]
2492 s -= 127
2493 o += 127
2494 yield bytes([s])
2495 yield memoryview(target_buf)[o : o + s]
2498def apply_delta(src_buf, delta):
2499 """Based on the similar function in git's patch-delta.c.
2501 Args:
2502 src_buf: Source buffer
2503 delta: Delta instructions
Returns: List of chunks making up the target buffer
2504 """
2505 if not isinstance(src_buf, bytes):
2506 src_buf = b"".join(src_buf)
2507 if not isinstance(delta, bytes):
2508 delta = b"".join(delta)
2509 out = []
2510 index = 0
2511 delta_length = len(delta)
2513 def get_delta_header_size(delta, index):
2514 size = 0
2515 i = 0
2516 while delta:
2517 cmd = ord(delta[index : index + 1])
2518 index += 1
2519 size |= (cmd & ~0x80) << i
2520 i += 7
2521 if not cmd & 0x80:
2522 break
2523 return size, index
2525 src_size, index = get_delta_header_size(delta, index)
2526 dest_size, index = get_delta_header_size(delta, index)
2527 if src_size != len(src_buf):
2528 raise ApplyDeltaError(
2529 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
2530 )
2531 while index < delta_length:
2532 cmd = ord(delta[index : index + 1])
2533 index += 1
2534 if cmd & 0x80:
2535 cp_off = 0
2536 for i in range(4):
2537 if cmd & (1 << i):
2538 x = ord(delta[index : index + 1])
2539 index += 1
2540 cp_off |= x << (i * 8)
2541 cp_size = 0
2542 # Version 3 packs can contain copy sizes larger than 64K.
2543 for i in range(3):
2544 if cmd & (1 << (4 + i)):
2545 x = ord(delta[index : index + 1])
2546 index += 1
2547 cp_size |= x << (i * 8)
2548 if cp_size == 0:
2549 cp_size = 0x10000
2550 if (
2551 cp_off + cp_size < cp_size
2552 or cp_off + cp_size > src_size
2553 or cp_size > dest_size
2554 ):
2555 break
2556 out.append(src_buf[cp_off : cp_off + cp_size])
2557 elif cmd != 0:
2558 out.append(delta[index : index + cmd])
2559 index += cmd
2560 else:
2561 raise ApplyDeltaError("Invalid opcode 0")
2563 if index != delta_length:
2564 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
2566 if dest_size != chunks_length(out):
2567 raise ApplyDeltaError("dest size incorrect")
2569 return out
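# Illustrative sketch (editor's addition): create_delta() and apply_delta()
# round-trip -- applying the generated delta to the base buffer reproduces
# the target byte-for-byte.
def _example_delta_roundtrip() -> None:
    base = bytes(range(256)) * 2
    target = base[:200] + b"inserted bytes" + base[200:]
    delta = b"".join(create_delta(base, target))
    assert b"".join(apply_delta(base, delta)) == target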
2572def write_pack_index_v2(
2573 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
2574) -> bytes:
2575 """Write a new pack index file.
2577 Args:
2578 f: File-like object to write to
2579 entries: List of tuples with object name (sha), offset_in_pack, and
2580 crc32_checksum.
2581 pack_checksum: Checksum of the pack file.
2582 Returns: The SHA of the index file written
2583 """
2584 f = SHA1Writer(f)
2585 f.write(b"\377tOc") # Magic!
2586 f.write(struct.pack(">L", 2))
2587 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2588 for name, offset, entry_checksum in entries:
2589 fan_out_table[ord(name[:1])] += 1
2590 # Fan-out table
2591 largetable: list[int] = []
2592 for i in range(0x100):
2593 f.write(struct.pack(b">L", fan_out_table[i]))
2594 fan_out_table[i + 1] += fan_out_table[i]
2595 for name, offset, entry_checksum in entries:
2596 f.write(name)
2597 for name, offset, entry_checksum in entries:
2598 f.write(struct.pack(b">L", entry_checksum))
2599 for name, offset, entry_checksum in entries:
2600 if offset < 2**31:
2601 f.write(struct.pack(b">L", offset))
2602 else:
2603 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2604 largetable.append(offset)
2605 for offset in largetable:
2606 f.write(struct.pack(b">Q", offset))
2607 assert len(pack_checksum) == 20
2608 f.write(pack_checksum)
2609 return f.write_sha()
2612def write_pack_index_v3(
2613 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
2614) -> bytes:
2615 """Write a new pack index file in v3 format.
2617 Args:
2618 f: File-like object to write to
2619 entries: List of tuples with object name (sha), offset_in_pack, and
2620 crc32_checksum.
2621 pack_checksum: Checksum of the pack file.
2622 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
2623 Returns: The SHA of the index file written
2624 """
2625 if hash_algorithm == 1:
2626 hash_size = 20 # SHA-1
2627 writer_cls = SHA1Writer
2628 elif hash_algorithm == 2:
2629 hash_size = 32 # SHA-256
2630 # TODO: Add SHA256Writer when SHA-256 support is implemented
2631 raise NotImplementedError("SHA-256 support not yet implemented")
2632 else:
2633 raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
2635 # Convert entries to list to allow multiple iterations
2636 entries_list = list(entries)
2638 # Calculate shortest unambiguous prefix length for object names
2639 # For now, use full hash size (this could be optimized)
2640 shortened_oid_len = hash_size
2642 f = writer_cls(f)
2643 f.write(b"\377tOc") # Magic!
2644 f.write(struct.pack(">L", 3)) # Version 3
2645 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm
2646 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length
2648 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2649 for name, offset, entry_checksum in entries_list:
2650 if len(name) != hash_size:
2651 raise ValueError(
2652 f"Object name has wrong length: expected {hash_size}, got {len(name)}"
2653 )
2654 fan_out_table[ord(name[:1])] += 1
2656 # Fan-out table
2657 largetable: list[int] = []
2658 for i in range(0x100):
2659 f.write(struct.pack(b">L", fan_out_table[i]))
2660 fan_out_table[i + 1] += fan_out_table[i]
2662 # Object names table
2663 for name, offset, entry_checksum in entries_list:
2664 f.write(name)
2666 # CRC32 checksums table
2667 for name, offset, entry_checksum in entries_list:
2668 f.write(struct.pack(b">L", entry_checksum))
2670 # Offset table
2671 for name, offset, entry_checksum in entries_list:
2672 if offset < 2**31:
2673 f.write(struct.pack(b">L", offset))
2674 else:
2675 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2676 largetable.append(offset)
2678 # Large offset table
2679 for offset in largetable:
2680 f.write(struct.pack(b">Q", offset))
2682 assert len(pack_checksum) == hash_size, (
2683 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
2684 )
2685 f.write(pack_checksum)
2686 return f.write_sha()
2689def write_pack_index(
2690 index_filename, entries, pack_checksum, progress=None, version=None
2691):
2692 """Write a pack index file.
2694 Args:
2695 index_filename: File-like object to write the index to
2696 entries: List of (checksum, offset, crc32) tuples
2697 pack_checksum: Checksum of the pack file.
2698 progress: Progress function (not currently used)
2699 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION.
2701 Returns:
2702 SHA of the written index file
2703 """
2704 if version is None:
2705 version = DEFAULT_PACK_INDEX_VERSION
2707 if version == 1:
2708 return write_pack_index_v1(index_filename, entries, pack_checksum)
2709 elif version == 2:
2710 return write_pack_index_v2(index_filename, entries, pack_checksum)
2711 elif version == 3:
2712 return write_pack_index_v3(index_filename, entries, pack_checksum)
2713 else:
2714 raise ValueError(f"Unsupported pack index version: {version}")
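# Illustrative sketch (editor's addition): writing a tiny version-2 index for
# one fabricated entry.  Entries are (binary sha, offset, crc32) tuples sorted
# by SHA; the pack checksum here is a dummy 20-byte value.
def _example_write_pack_index() -> None:
    from io import BytesIO
    entries = [(b"\x01" * 20, 12, 0)]
    buf = BytesIO()
    write_pack_index(buf, entries, b"\x00" * 20, version=2)
    assert buf.getvalue()[:4] == b"\377tOc"  # shared v2/v3 magic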
2717class Pack:
2718 """A Git pack object."""
2720 _data_load: Optional[Callable[[], PackData]]
2721 _idx_load: Optional[Callable[[], PackIndex]]
2723 _data: Optional[PackData]
2724 _idx: Optional[PackIndex]
2726 def __init__(
2727 self, basename, resolve_ext_ref: Optional[ResolveExtRefFn] = None
2728 ) -> None:
2729 self._basename = basename
2730 self._data = None
2731 self._idx = None
2732 self._idx_path = self._basename + ".idx"
2733 self._data_path = self._basename + ".pack"
2734 self._data_load = lambda: PackData(self._data_path)
2735 self._idx_load = lambda: load_pack_index(self._idx_path)
2736 self.resolve_ext_ref = resolve_ext_ref
2738 @classmethod
2739 def from_lazy_objects(cls, data_fn, idx_fn):
2740 """Create a new pack object from callables to load pack data and
2741 index objects.
2742 """
2743 ret = cls("")
2744 ret._data_load = data_fn
2745 ret._idx_load = idx_fn
2746 return ret
2748 @classmethod
2749 def from_objects(cls, data, idx):
2750 """Create a new pack object from pack data and index objects."""
2751 ret = cls("")
2752 ret._data = data
2753 ret._data_load = None
2754 ret._idx = idx
2755 ret._idx_load = None
2756 ret.check_length_and_checksum()
2757 return ret
2759 def name(self):
2760 """The SHA over the SHAs of the objects in this pack."""
2761 return self.index.objects_sha1()
2763 @property
2764 def data(self) -> PackData:
2765 """The pack data object being used."""
2766 if self._data is None:
2767 assert self._data_load
2768 self._data = self._data_load()
2769 self.check_length_and_checksum()
2770 return self._data
2772 @property
2773 def index(self) -> PackIndex:
2774 """The index being used.
2776 Note: This may be an in-memory index
2777 """
2778 if self._idx is None:
2779 assert self._idx_load
2780 self._idx = self._idx_load()
2781 return self._idx
2783 def close(self) -> None:
2784 if self._data is not None:
2785 self._data.close()
2786 if self._idx is not None:
2787 self._idx.close()
2789 def __enter__(self):
2790 return self
2792 def __exit__(self, exc_type, exc_val, exc_tb):
2793 self.close()
2795 def __eq__(self, other):
2796 return isinstance(self, type(other)) and self.index == other.index
2798 def __len__(self) -> int:
2799 """Number of entries in this pack."""
2800 return len(self.index)
2802 def __repr__(self) -> str:
2803 return f"{self.__class__.__name__}({self._basename!r})"
2805 def __iter__(self):
2806 """Iterate over all the sha1s of the objects in this pack."""
2807 return iter(self.index)
2809 def check_length_and_checksum(self) -> None:
2810 """Sanity check the length and checksum of the pack index and data."""
2811 assert len(self.index) == len(self.data), (
2812 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
2813 )
2814 idx_stored_checksum = self.index.get_pack_checksum()
2815 data_stored_checksum = self.data.get_stored_checksum()
2816 if idx_stored_checksum != data_stored_checksum:
2817 raise ChecksumMismatch(
2818 sha_to_hex(idx_stored_checksum),
2819 sha_to_hex(data_stored_checksum),
2820 )
2822 def check(self) -> None:
2823 """Check the integrity of this pack.
2825 Raises:
2826 ChecksumMismatch: if a checksum for the index or data is wrong
2827 """
2828 self.index.check()
2829 self.data.check()
2830 for obj in self.iterobjects():
2831 obj.check()
2832 # TODO: object connectivity checks
2834 def get_stored_checksum(self) -> bytes:
2835 return self.data.get_stored_checksum()
2837 def pack_tuples(self):
2838 return [(o, None) for o in self.iterobjects()]
2840 def __contains__(self, sha1: bytes) -> bool:
2841 """Check whether this pack contains a particular SHA1."""
2842 try:
2843 self.index.object_offset(sha1)
2844 return True
2845 except KeyError:
2846 return False
2848 def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
2849 offset = self.index.object_offset(sha1)
2850 obj_type, obj = self.data.get_object_at(offset)
2851 type_num, chunks = self.resolve_object(offset, obj_type, obj)
2852 return type_num, b"".join(chunks)
2854 def __getitem__(self, sha1: bytes) -> ShaFile:
2855 """Retrieve the specified SHA1."""
2856 type, uncomp = self.get_raw(sha1)
2857 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
2859 def iterobjects(self) -> Iterator[ShaFile]:
2860 """Iterate over the objects in this pack."""
2861 return iter(
2862 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
2863 )
2865 def iterobjects_subset(
2866 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
2867 ) -> Iterator[ShaFile]:
2868 return (
2869 uo
2870 for uo in PackInflater.for_pack_subset(
2871 self,
2872 shas,
2873 allow_missing=allow_missing,
2874 resolve_ext_ref=self.resolve_ext_ref,
2875 )
2876 if uo.id in shas
2877 )
2879 def iter_unpacked_subset(
2880 self,
2881 shas: Iterable[ObjectID],
2882 *,
2883 include_comp: bool = False,
2884 allow_missing: bool = False,
2885 convert_ofs_delta: bool = False,
2886 ) -> Iterator[UnpackedObject]:
2887 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
2888 ofs: dict[bytes, int] = {}
2889 todo = set(shas)
2890 for unpacked in self.iter_unpacked(include_comp=include_comp):
2891 sha = unpacked.sha()
2892 ofs[unpacked.offset] = sha
2893 hexsha = sha_to_hex(sha)
2894 if hexsha in todo:
2895 if unpacked.pack_type_num == OFS_DELTA:
2896 assert isinstance(unpacked.delta_base, int)
2897 base_offset = unpacked.offset - unpacked.delta_base
2898 try:
2899 unpacked.delta_base = ofs[base_offset]
2900 except KeyError:
2901 ofs_pending[base_offset].append(unpacked)
2902 continue
2903 else:
2904 unpacked.pack_type_num = REF_DELTA
2905 yield unpacked
2906 todo.remove(hexsha)
2907 for child in ofs_pending.pop(unpacked.offset, []):
2908 child.pack_type_num = REF_DELTA
2909 child.delta_base = sha
2910 yield child
2911 assert not ofs_pending
2912 if not allow_missing and todo:
2913 raise UnresolvedDeltas(todo)
2915 def iter_unpacked(self, include_comp=False):
2916 ofs_to_entries = {
2917 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
2918 }
2919 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
2920 (sha, crc32) = ofs_to_entries[unpacked.offset]
2921 unpacked._sha = sha
2922 unpacked.crc32 = crc32
2923 yield unpacked
2925 def keep(self, msg: Optional[bytes] = None) -> str:
2926 """Add a .keep file for the pack, preventing git from garbage collecting it.
2928 Args:
2929 msg: A message written inside the .keep file; can be used later
2930 to determine whether or not a .keep file is obsolete.
2931 Returns: The path of the .keep file, as a string.
2932 """
2933 keepfile_name = f"{self._basename}.keep"
2934 with GitFile(keepfile_name, "wb") as keepfile:
2935 if msg:
2936 keepfile.write(msg)
2937 keepfile.write(b"\n")
2938 return keepfile_name
2940 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
2941 """Get the object for a ref SHA, only looking in this pack."""
2942 # TODO: cache these results
2943 try:
2944 offset = self.index.object_offset(sha)
2945 except KeyError:
2946 offset = None
2947 if offset:
2948 type, obj = self.data.get_object_at(offset)
2949 elif self.resolve_ext_ref:
2950 type, obj = self.resolve_ext_ref(sha)
2951 else:
2952 raise KeyError(sha)
2953 return offset, type, obj
2955 def resolve_object(
2956 self, offset: int, type: int, obj, get_ref=None
2957 ) -> tuple[int, Iterable[bytes]]:
2958 """Resolve an object, possibly resolving deltas when necessary.
2960 Returns: Tuple with object type and contents.
2961 """
2962 # Walk down the delta chain, building a stack of deltas to reach
2963 # the requested object.
2964 base_offset = offset
2965 base_type = type
2966 base_obj = obj
2967 delta_stack = []
2968 while base_type in DELTA_TYPES:
2969 prev_offset = base_offset
2970 if get_ref is None:
2971 get_ref = self.get_ref
2972 if base_type == OFS_DELTA:
2973 (delta_offset, delta) = base_obj
2974 # TODO: clean up asserts and replace with nicer error messages
2975 base_offset = base_offset - delta_offset
2976 base_type, base_obj = self.data.get_object_at(base_offset)
2977 assert isinstance(base_type, int)
2978 elif base_type == REF_DELTA:
2979 (basename, delta) = base_obj
2980 assert isinstance(basename, bytes) and len(basename) == 20
2981 base_offset, base_type, base_obj = get_ref(basename)
2982 assert isinstance(base_type, int)
2983 if base_offset == prev_offset: # object is based on itself
2984 raise UnresolvedDeltas(sha_to_hex(basename))
2985 delta_stack.append((prev_offset, base_type, delta))
2987 # Now grab the base object (mustn't be a delta) and apply the
2988 # deltas all the way up the stack.
2989 chunks = base_obj
2990 for prev_offset, delta_type, delta in reversed(delta_stack):
2991 chunks = apply_delta(chunks, delta)
2992 # TODO(dborowitz): This can result in poor performance if
2993 # large base objects are separated from deltas in the pack.
2994 # We should reorganize so that we apply deltas to all
2995 # objects in a chain one after the other to optimize cache
2996 # performance.
2997 if prev_offset is not None:
2998 self.data._offset_cache[prev_offset] = base_type, chunks
2999 return base_type, chunks
3001 def entries(
3002 self, progress: Optional[ProgressFn] = None
3003 ) -> Iterator[PackIndexEntry]:
3004 """Yield entries summarizing the contents of this pack.
3006 Args:
3007 progress: Progress function, called with current and total
3008 object count.
3009 Returns: iterator of tuples with (sha, offset, crc32)
3010 """
3011 return self.data.iterentries(
3012 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3013 )
3015 def sorted_entries(
3016 self, progress: Optional[ProgressFn] = None
3017 ) -> Iterator[PackIndexEntry]:
3018 """Return entries in this pack, sorted by SHA.
3020 Args:
3021 progress: Progress function, called with current and total
3022 object count
3023 Returns: Iterator of tuples with (sha, offset, crc32)
3024 """
3025 return self.data.sorted_entries(
3026 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3027 )
3029 def get_unpacked_object(
3030 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
3031 ) -> UnpackedObject:
3032 """Get the unpacked object for a sha.
3034 Args:
3035 sha: SHA of object to fetch
3036 include_comp: Whether to include compression data in UnpackedObject
3037 """
3038 offset = self.index.object_offset(sha)
3039 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3040 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3041 assert isinstance(unpacked.delta_base, int)
3042 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3043 unpacked.pack_type_num = REF_DELTA
3044 return unpacked
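# Illustrative sketch (editor's addition): how a finished pack is usually
# consumed.  write_pack() (above) produces "<basename>.pack"/"<basename>.idx",
# after which Pack(basename) gives dict-like access by object SHA.  Assumes
# dulwich.objects.Blob and a writable directory.
def _example_pack_usage(tmpdir: str) -> None:
    import os
    from dulwich.objects import Blob
    blob = Blob.from_string(b"packed content")
    basename = os.path.join(tmpdir, "pack-demo")
    write_pack(basename, [(blob, None)])
    with Pack(basename) as pack:
        assert blob.id in pack                         # membership by hex SHA
        assert pack[blob.id].data == b"packed content"
        assert len(pack) == 1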
3047def extend_pack(
3048 f: BinaryIO,
3049 object_ids: set[ObjectID],
3050 get_raw,
3051 *,
3052 compression_level=-1,
3053 progress=None,
3054) -> tuple[bytes, list]:
3055 """Extend a pack file with more objects.
3057 The caller should make sure that object_ids does not contain any objects
3058 that are already in the pack.
Returns: Tuple with the new pack checksum and a list of (object_id, offset, crc32) entries for the added objects
3059 """
3060 # Update the header with the new number of objects.
3061 f.seek(0)
3062 _version, num_objects = read_pack_header(f.read)
3064 if object_ids:
3065 f.seek(0)
3066 write_pack_header(f.write, num_objects + len(object_ids))
3068 # Must flush before reading (http://bugs.python.org/issue3207)
3069 f.flush()
3071 # Rescan the rest of the pack, computing the SHA with the new header.
3072 new_sha = compute_file_sha(f, end_ofs=-20)
3074 # Must reposition before writing (http://bugs.python.org/issue3207)
3075 f.seek(0, os.SEEK_CUR)
3077 extra_entries = []
3079 # Complete the pack.
3080 for i, object_id in enumerate(object_ids):
3081 if progress is not None:
3082 progress(
3083 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
3084 )
3085 assert len(object_id) == 20
3086 type_num, data = get_raw(object_id)
3087 offset = f.tell()
3088 crc32 = write_pack_object(
3089 f.write,
3090 type_num,
3091 data,
3092 sha=new_sha,
3093 compression_level=compression_level,
3094 )
3095 extra_entries.append((object_id, offset, crc32))
3096 pack_sha = new_sha.digest()
3097 f.write(pack_sha)
3098 return pack_sha, extra_entries
3101try:
3102 from dulwich._pack import ( # type: ignore
3103 apply_delta, # type: ignore
3104 bisect_find_sha, # type: ignore
3105 )
3106except ImportError:
3107 pass