Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 27%
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28A pack has two parts: the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in each of the index files until you find a
32match for the object name. You then use the offset obtained from the index
33as a pointer into the corresponding pack file.
34"""
36import binascii
37from collections import defaultdict, deque
38from contextlib import suppress
39from io import BytesIO, UnsupportedOperation
41try:
42 from cdifflib import CSequenceMatcher as SequenceMatcher
43except ModuleNotFoundError:
44 from difflib import SequenceMatcher
46import os
47import struct
48import sys
49import warnings
50import zlib
51from collections.abc import Iterable, Iterator, Sequence
52from hashlib import sha1
53from itertools import chain
54from os import SEEK_CUR, SEEK_END
55from struct import unpack_from
56from typing import (
57 BinaryIO,
58 Callable,
59 Generic,
60 Optional,
61 Protocol,
62 TypeVar,
63 Union,
64)
66try:
67 import mmap
68except ImportError:
69 has_mmap = False
70else:
71 has_mmap = True
73# For some reason the above try/except fails to set has_mmap = False on Plan 9
74if sys.platform == "Plan9":
75 has_mmap = False
77from . import replace_me
78from .errors import ApplyDeltaError, ChecksumMismatch
79from .file import GitFile
80from .lru_cache import LRUSizeCache
81from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
83OFS_DELTA = 6
84REF_DELTA = 7
86DELTA_TYPES = (OFS_DELTA, REF_DELTA)
89DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
91# Keep pack files under 16Mb in memory, otherwise write them out to disk
92PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
94# Default pack index version to use when none is specified
95DEFAULT_PACK_INDEX_VERSION = 2
98OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
99ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
100ProgressFn = Callable[[int, str], None]
101PackHint = tuple[int, Optional[bytes]]
104class UnresolvedDeltas(Exception):
105 """Delta objects could not be resolved."""
107 def __init__(self, shas) -> None:
108 self.shas = shas
111class ObjectContainer(Protocol):
112 def add_object(self, obj: ShaFile) -> None:
113 """Add a single object to this object store."""
115 def add_objects(
116 self,
117 objects: Sequence[tuple[ShaFile, Optional[str]]],
118 progress: Optional[Callable[[str], None]] = None,
119 ) -> None:
120 """Add a set of objects to this object store.
122 Args:
123 objects: Iterable over a list of (object, path) tuples
124 """
126 def __contains__(self, sha1: bytes) -> bool:
127 """Check if a hex sha is present."""
129 def __getitem__(self, sha1: bytes) -> ShaFile:
130 """Retrieve an object."""
133class PackedObjectContainer(ObjectContainer):
134 def get_unpacked_object(
135 self, sha1: bytes, *, include_comp: bool = False
136 ) -> "UnpackedObject":
137 """Get a raw unresolved object."""
138 raise NotImplementedError(self.get_unpacked_object)
140 def iterobjects_subset(
141 self, shas: Iterable[bytes], *, allow_missing: bool = False
142 ) -> Iterator[ShaFile]:
143 raise NotImplementedError(self.iterobjects_subset)
145 def iter_unpacked_subset(
146 self,
147 shas: set[bytes],
148 include_comp: bool = False,
149 allow_missing: bool = False,
150 convert_ofs_delta: bool = True,
151 ) -> Iterator["UnpackedObject"]:
152 raise NotImplementedError(self.iter_unpacked_subset)
155class UnpackedObjectStream:
156 def __iter__(self) -> Iterator["UnpackedObject"]:
157 raise NotImplementedError(self.__iter__)
159 def __len__(self) -> int:
160 raise NotImplementedError(self.__len__)
163def take_msb_bytes(
164 read: Callable[[int], bytes], crc32: Optional[int] = None
165) -> tuple[list[int], Optional[int]]:
166 """Read bytes marked with most significant bit.
168 Args:
169 read: Read function
170 """
171 ret: list[int] = []
172 while len(ret) == 0 or ret[-1] & 0x80:
173 b = read(1)
174 if crc32 is not None:
175 crc32 = binascii.crc32(b, crc32)
176 ret.append(ord(b[:1]))
177 return ret, crc32
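# For example (an illustrative sketch, not from the original source): the bytes
# b"\xb5\x0b" decode to two values, since the MSB of 0xB5 marks a continuation:
#
#     from io import BytesIO
#     take_msb_bytes(BytesIO(b"\xb5\x0b").read)   # -> ([0xB5, 0x0B], None)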
180class PackFileDisappeared(Exception):
181 def __init__(self, obj) -> None:
182 self.obj = obj
185class UnpackedObject:
186 """Class encapsulating an object unpacked from a pack file.
188 These objects should only be created from within unpack_object. Most
189 members start out as empty and are filled in at various points by
190 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
192 End users of this object should take care that the function they're getting
193 this object from is guaranteed to set the members they need.
194 """
196 __slots__ = [
197 "_sha", # Cached binary SHA.
198 "comp_chunks", # Compressed object chunks.
199 "crc32", # CRC32.
200 "decomp_chunks", # Decompressed object chunks.
201 "decomp_len", # Decompressed length of this object.
202 "delta_base", # Delta base offset or SHA.
203 "obj_chunks", # Decompressed and delta-resolved chunks.
204 "obj_type_num", # Type of this object.
205 "offset", # Offset in its pack.
206 "pack_type_num", # Type of this object in the pack (may be a delta).
207 ]
209 obj_type_num: Optional[int]
210 obj_chunks: Optional[list[bytes]]
211 delta_base: Union[None, bytes, int]
212 decomp_chunks: list[bytes]
213 comp_chunks: Optional[list[bytes]]
215 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
216 # methods of this object.
217 def __init__(
218 self,
219 pack_type_num,
220 *,
221 delta_base=None,
222 decomp_len=None,
223 crc32=None,
224 sha=None,
225 decomp_chunks=None,
226 offset=None,
227 ) -> None:
228 self.offset = offset
229 self._sha = sha
230 self.pack_type_num = pack_type_num
231 self.delta_base = delta_base
232 self.comp_chunks = None
233 self.decomp_chunks: list[bytes] = decomp_chunks or []
234 if decomp_chunks is not None and decomp_len is None:
235 self.decomp_len = sum(map(len, decomp_chunks))
236 else:
237 self.decomp_len = decomp_len
238 self.crc32 = crc32
240 if pack_type_num in DELTA_TYPES:
241 self.obj_type_num = None
242 self.obj_chunks = None
243 else:
244 self.obj_type_num = pack_type_num
245 self.obj_chunks = self.decomp_chunks
246 self.delta_base = delta_base
248 def sha(self):
249 """Return the binary SHA of this object."""
250 if self._sha is None:
251 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
252 return self._sha
254 def sha_file(self):
255 """Return a ShaFile from this object."""
256 assert self.obj_type_num is not None and self.obj_chunks is not None
257 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
259 # Only provided for backwards compatibility with code that expects either
260 # chunks or a delta tuple.
261 def _obj(self) -> OldUnpackedObject:
262 """Return the decompressed chunks, or (delta base, delta chunks)."""
263 if self.pack_type_num in DELTA_TYPES:
264 assert isinstance(self.delta_base, (bytes, int))
265 return (self.delta_base, self.decomp_chunks)
266 else:
267 return self.decomp_chunks
269 def __eq__(self, other):
270 if not isinstance(other, UnpackedObject):
271 return False
272 for slot in self.__slots__:
273 if getattr(self, slot) != getattr(other, slot):
274 return False
275 return True
277 def __ne__(self, other):
278 return not (self == other)
280 def __repr__(self) -> str:
281 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
282 return "{}({})".format(self.__class__.__name__, ", ".join(data))
285_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
288def read_zlib_chunks(
289 read_some: Callable[[int], bytes],
290 unpacked: UnpackedObject,
291 include_comp: bool = False,
292 buffer_size: int = _ZLIB_BUFSIZE,
293) -> bytes:
294 """Read zlib data from a buffer.
296 This function requires that the buffer have additional data following the
297 compressed data, which is guaranteed to be the case for git pack files.
299 Args:
300 read_some: Read function that returns at least one byte, but may
301 return less than the requested size.
302 unpacked: An UnpackedObject to write result data to. If its crc32
303 attr is not None, the CRC32 of the compressed bytes will be computed
304 using this starting CRC32.
305 After this function, will have the following attrs set:
306 * comp_chunks (if include_comp is True)
307 * decomp_chunks
308 * decomp_len
309 * crc32
310 include_comp: If True, include compressed data in the result.
311 buffer_size: Size of the read buffer.
312 Returns: Leftover unused data from the decompression.
314 Raises:
315 zlib.error: if a decompression error occurred.
316 """
317 if unpacked.decomp_len <= -1:
318 raise ValueError("non-negative zlib data stream size expected")
319 decomp_obj = zlib.decompressobj()
321 comp_chunks = []
322 decomp_chunks = unpacked.decomp_chunks
323 decomp_len = 0
324 crc32 = unpacked.crc32
326 while True:
327 add = read_some(buffer_size)
328 if not add:
329 raise zlib.error("EOF before end of zlib stream")
330 comp_chunks.append(add)
331 decomp = decomp_obj.decompress(add)
332 decomp_len += len(decomp)
333 decomp_chunks.append(decomp)
334 unused = decomp_obj.unused_data
335 if unused:
336 left = len(unused)
337 if crc32 is not None:
338 crc32 = binascii.crc32(add[:-left], crc32)
339 if include_comp:
340 comp_chunks[-1] = add[:-left]
341 break
342 elif crc32 is not None:
343 crc32 = binascii.crc32(add, crc32)
344 if crc32 is not None:
345 crc32 &= 0xFFFFFFFF
347 if decomp_len != unpacked.decomp_len:
348 raise zlib.error("decompressed data does not match expected size")
350 unpacked.crc32 = crc32
351 if include_comp:
352 unpacked.comp_chunks = comp_chunks
353 return unused
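# An illustrative sketch (the literals are assumptions, not from the original
# source): decompressing a zlib stream that is followed by further data, as is
# always the case inside a pack file:
#
#     import zlib
#     from io import BytesIO
#     unpacked = UnpackedObject(3, decomp_len=5)            # 3 = blob, 5 bytes
#     buf = BytesIO(zlib.compress(b"hello") + b"rest-of-pack")
#     read_zlib_chunks(buf.read, unpacked)                  # -> b"rest-of-pack"
#     b"".join(unpacked.decomp_chunks)                      # -> b"hello"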
356def iter_sha1(iter):
357 """Return the hexdigest of the SHA1 over a set of names.
359 Args:
360 iter: Iterator over string objects
361 Returns: 40-byte hex sha1 digest
362 """
363 sha = sha1()
364 for name in iter:
365 sha.update(name)
366 return sha.hexdigest().encode("ascii")
369def load_pack_index(path: Union[str, os.PathLike]):
370 """Load an index file by path.
372 Args:
373 path: Path to the index file
374 Returns: A PackIndex loaded from the given path
375 """
376 with GitFile(path, "rb") as f:
377 return load_pack_index_file(path, f)
380def _load_file_contents(f, size=None):
381 try:
382 fd = f.fileno()
383 except (UnsupportedOperation, AttributeError):
384 fd = None
385 # Attempt to use mmap if possible
386 if fd is not None:
387 if size is None:
388 size = os.fstat(fd).st_size
389 if has_mmap:
390 try:
391 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
392 except OSError:
393 # Perhaps a socket?
394 pass
395 else:
396 return contents, size
397 contents = f.read()
398 size = len(contents)
399 return contents, size
402def load_pack_index_file(path: Union[str, os.PathLike], f):
403 """Load an index file from a file-like object.
405 Args:
406 path: Path for the index file
407 f: File-like object
408 Returns: A PackIndex loaded from the given file
409 """
410 contents, size = _load_file_contents(f)
411 if contents[:4] == b"\377tOc":
412 version = struct.unpack(b">L", contents[4:8])[0]
413 if version == 2:
414 return PackIndex2(path, file=f, contents=contents, size=size)
415 elif version == 3:
416 return PackIndex3(path, file=f, contents=contents, size=size)
417 else:
418 raise KeyError(f"Unknown pack index format {version}")
419 else:
420 return PackIndex1(path, file=f, contents=contents, size=size)
423def bisect_find_sha(start, end, sha, unpack_name):
424 """Find a SHA in a data blob with sorted SHAs.
426 Args:
427 start: Start index of range to search
428 end: End index of range to search
429 sha: Sha to find
430 unpack_name: Callback to retrieve SHA by index
431 Returns: Index of the SHA, or None if it wasn't found
432 """
433 assert start <= end
434 while start <= end:
435 i = (start + end) // 2
436 file_sha = unpack_name(i)
437 if file_sha < sha:
438 start = i + 1
439 elif file_sha > sha:
440 end = i - 1
441 else:
442 return i
443 return None
446PackIndexEntry = tuple[bytes, int, Optional[int]]
449class PackIndex:
450 """An index in to a packfile.
452 Given a sha id of an object a pack index can tell you the location in the
453 packfile of that object if it has it.
454 """
456 # Default to SHA-1 for backward compatibility
457 hash_algorithm = 1
458 hash_size = 20
460 def __eq__(self, other):
461 if not isinstance(other, PackIndex):
462 return False
464 for (name1, _, _), (name2, _, _) in zip(
465 self.iterentries(), other.iterentries()
466 ):
467 if name1 != name2:
468 return False
469 return True
471 def __ne__(self, other):
472 return not self.__eq__(other)
474 def __len__(self) -> int:
475 """Return the number of entries in this pack index."""
476 raise NotImplementedError(self.__len__)
478 def __iter__(self) -> Iterator[bytes]:
479 """Iterate over the SHAs in this pack."""
480 return map(sha_to_hex, self._itersha())
482 def iterentries(self) -> Iterator[PackIndexEntry]:
483 """Iterate over the entries in this pack index.
485 Returns: iterator over tuples with object name, offset in packfile and
486 crc32 checksum.
487 """
488 raise NotImplementedError(self.iterentries)
490 def get_pack_checksum(self) -> bytes:
491 """Return the SHA1 checksum stored for the corresponding packfile.
493 Returns: 20-byte binary digest
494 """
495 raise NotImplementedError(self.get_pack_checksum)
497 @replace_me(since="0.21.0", remove_in="0.23.0")
498 def object_index(self, sha: bytes) -> int:
499 return self.object_offset(sha)
501 def object_offset(self, sha: bytes) -> int:
502 """Return the offset in to the corresponding packfile for the object.
504 Given the name of an object it will return the offset that object
505 lives at within the corresponding pack file. If the pack file doesn't
506 have the object then None will be returned.
507 """
508 raise NotImplementedError(self.object_offset)
510 def object_sha1(self, index: int) -> bytes:
511 """Return the SHA1 corresponding to the index in the pack file."""
512 for name, offset, crc32 in self.iterentries():
513 if offset == index:
514 return name
515 else:
516 raise KeyError(index)
518 def _object_offset(self, sha: bytes) -> int:
519 """See object_offset.
521 Args:
522 sha: A *binary* SHA string. (20 bytes long)
523 """
524 raise NotImplementedError(self._object_offset)
526 def objects_sha1(self) -> bytes:
527 """Return the hex SHA1 over all the shas of all objects in this pack.
529 Note: This is used for the filename of the pack.
530 """
531 return iter_sha1(self._itersha())
533 def _itersha(self) -> Iterator[bytes]:
534 """Yield all the SHA1's of the objects in the index, sorted."""
535 raise NotImplementedError(self._itersha)
537 def close(self) -> None:
538 pass
540 def check(self) -> None:
541 pass
544class MemoryPackIndex(PackIndex):
545 """Pack index that is stored entirely in memory."""
547 def __init__(self, entries, pack_checksum=None) -> None:
548 """Create a new MemoryPackIndex.
550 Args:
551 entries: Sequence of name, idx, crc32 (sorted)
552 pack_checksum: Optional pack checksum
553 """
554 self._by_sha = {}
555 self._by_offset = {}
556 for name, offset, crc32 in entries:
557 self._by_sha[name] = offset
558 self._by_offset[offset] = name
559 self._entries = entries
560 self._pack_checksum = pack_checksum
562 def get_pack_checksum(self):
563 return self._pack_checksum
565 def __len__(self) -> int:
566 return len(self._entries)
568 def object_offset(self, sha):
569 if len(sha) == 40:
570 sha = hex_to_sha(sha)
571 return self._by_sha[sha]
573 def object_sha1(self, offset):
574 return self._by_offset[offset]
576 def _itersha(self):
577 return iter(self._by_sha)
579 def iterentries(self):
580 return iter(self._entries)
582 @classmethod
583 def for_pack(cls, pack):
584 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
586 @classmethod
587 def clone(cls, other_index):
588 return cls(other_index.iterentries(), other_index.get_pack_checksum())
591class FilePackIndex(PackIndex):
592 """Pack index that is based on a file.
594 To look up an object the index uses a fan-out table: the first 256
595 4-byte entries, indexed by the first byte of the SHA. The value stored
596 for byte N is the end of the group of entries whose SHAs start with N;
597 the entry at N - 1 gives the start of that group. Within a group the
598 entries are sorted by SHA, so the start and end offsets delimit a
599 range that can be bisected to check whether the value is
600 present.
601 """
603 _fan_out_table: list[int]
605 def __init__(self, filename, file=None, contents=None, size=None) -> None:
606 """Create a pack index object.
608 Provide it with the name of the index file to consider, and it will map
609 it whenever required.
610 """
611 self._filename = filename
612 # Take the size now, so it can be checked each time we map the file to
613 # ensure that it hasn't changed.
614 if file is None:
615 self._file = GitFile(filename, "rb")
616 else:
617 self._file = file
618 if contents is None:
619 self._contents, self._size = _load_file_contents(self._file, size)
620 else:
621 self._contents, self._size = (contents, size)
623 @property
624 def path(self) -> str:
625 return self._filename
627 def __eq__(self, other):
628 # Quick optimization:
629 if (
630 isinstance(other, FilePackIndex)
631 and self._fan_out_table != other._fan_out_table
632 ):
633 return False
635 return super().__eq__(other)
637 def close(self) -> None:
638 self._file.close()
639 if getattr(self._contents, "close", None) is not None:
640 self._contents.close()
642 def __len__(self) -> int:
643 """Return the number of entries in this pack index."""
644 return self._fan_out_table[-1]
646 def _unpack_entry(self, i: int) -> PackIndexEntry:
647 """Unpack the i-th entry in the index file.
649 Returns: Tuple with object name (SHA), offset in pack file and CRC32
650 checksum (if known).
651 """
652 raise NotImplementedError(self._unpack_entry)
654 def _unpack_name(self, i) -> bytes:
655 """Unpack the i-th name from the index file."""
656 raise NotImplementedError(self._unpack_name)
658 def _unpack_offset(self, i) -> int:
659 """Unpack the i-th object offset from the index file."""
660 raise NotImplementedError(self._unpack_offset)
662 def _unpack_crc32_checksum(self, i) -> Optional[int]:
663 """Unpack the crc32 checksum for the ith object from the index file."""
664 raise NotImplementedError(self._unpack_crc32_checksum)
666 def _itersha(self) -> Iterator[bytes]:
667 for i in range(len(self)):
668 yield self._unpack_name(i)
670 def iterentries(self) -> Iterator[PackIndexEntry]:
671 """Iterate over the entries in this pack index.
673 Returns: iterator over tuples with object name, offset in packfile and
674 crc32 checksum.
675 """
676 for i in range(len(self)):
677 yield self._unpack_entry(i)
679 def _read_fan_out_table(self, start_offset: int):
680 ret = []
681 for i in range(0x100):
682 fanout_entry = self._contents[
683 start_offset + i * 4 : start_offset + (i + 1) * 4
684 ]
685 ret.append(struct.unpack(">L", fanout_entry)[0])
686 return ret
688 def check(self) -> None:
689 """Check that the stored checksum matches the actual checksum."""
690 actual = self.calculate_checksum()
691 stored = self.get_stored_checksum()
692 if actual != stored:
693 raise ChecksumMismatch(stored, actual)
695 def calculate_checksum(self) -> bytes:
696 """Calculate the SHA1 checksum over this pack index.
698 Returns: This is a 20-byte binary digest
699 """
700 return sha1(self._contents[:-20]).digest()
702 def get_pack_checksum(self) -> bytes:
703 """Return the SHA1 checksum stored for the corresponding packfile.
705 Returns: 20-byte binary digest
706 """
707 return bytes(self._contents[-40:-20])
709 def get_stored_checksum(self) -> bytes:
710 """Return the SHA1 checksum stored for this index.
712 Returns: 20-byte binary digest
713 """
714 return bytes(self._contents[-20:])
716 def object_offset(self, sha: bytes) -> int:
717 """Return the offset in to the corresponding packfile for the object.
719 Given the name of an object it will return the offset that object
720 lives at within the corresponding pack file. If the pack file doesn't
721 have the object then None will be returned.
722 """
723 if len(sha) == 40:
724 sha = hex_to_sha(sha)
725 try:
726 return self._object_offset(sha)
727 except ValueError as exc:
728 closed = getattr(self._contents, "closed", None)
729 if closed in (None, True):
730 raise PackFileDisappeared(self) from exc
731 raise
733 def _object_offset(self, sha: bytes) -> int:
734 """See object_offset.
736 Args:
737 sha: A *binary* SHA string. (20 bytes long)
738 """
739 assert len(sha) == 20
740 idx = ord(sha[:1])
741 if idx == 0:
742 start = 0
743 else:
744 start = self._fan_out_table[idx - 1]
745 end = self._fan_out_table[idx]
746 i = bisect_find_sha(start, end, sha, self._unpack_name)
747 if i is None:
748 raise KeyError(sha)
749 return self._unpack_offset(i)
751 def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
752 """Iterate over all SHA1s with the given prefix."""
753 start = ord(prefix[:1])
754 if start == 0:
755 start = 0
756 else:
757 start = self._fan_out_table[start - 1]
758 end = ord(prefix[:1]) + 1
759 if end == 0x100:
760 end = len(self)
761 else:
762 end = self._fan_out_table[end]
763 assert start <= end
764 started = False
765 for i in range(start, end):
766 name: bytes = self._unpack_name(i)
767 if name.startswith(prefix):
768 yield name
769 started = True
770 elif started:
771 break
774class PackIndex1(FilePackIndex):
775 """Version 1 Pack Index file."""
777 def __init__(
778 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
779 ) -> None:
780 super().__init__(filename, file, contents, size)
781 self.version = 1
782 self._fan_out_table = self._read_fan_out_table(0)
784 def _unpack_entry(self, i):
785 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
786 return (name, offset, None)
788 def _unpack_name(self, i):
789 offset = (0x100 * 4) + (i * 24) + 4
790 return self._contents[offset : offset + 20]
792 def _unpack_offset(self, i):
793 offset = (0x100 * 4) + (i * 24)
794 return unpack_from(">L", self._contents, offset)[0]
796 def _unpack_crc32_checksum(self, i) -> None:
797 # Not stored in v1 index files
798 return None
801class PackIndex2(FilePackIndex):
802 """Version 2 Pack Index file."""
804 def __init__(
805 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
806 ) -> None:
807 super().__init__(filename, file, contents, size)
808 if self._contents[:4] != b"\377tOc":
809 raise AssertionError("Not a v2 pack index file")
810 (self.version,) = unpack_from(b">L", self._contents, 4)
811 if self.version != 2:
812 raise AssertionError(f"Version was {self.version}")
813 self._fan_out_table = self._read_fan_out_table(8)
814 self._name_table_offset = 8 + 0x100 * 4
815 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
816 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
817 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
818 self
819 )
821 def _unpack_entry(self, i):
822 return (
823 self._unpack_name(i),
824 self._unpack_offset(i),
825 self._unpack_crc32_checksum(i),
826 )
828 def _unpack_name(self, i):
829 offset = self._name_table_offset + i * 20
830 return self._contents[offset : offset + 20]
832 def _unpack_offset(self, i):
833 offset = self._pack_offset_table_offset + i * 4
834 offset = unpack_from(">L", self._contents, offset)[0]
835 if offset & (2**31):
836 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
837 offset = unpack_from(">Q", self._contents, offset)[0]
838 return offset
840 def _unpack_crc32_checksum(self, i):
841 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
844class PackIndex3(FilePackIndex):
845 """Version 3 Pack Index file.
847 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
848 """
850 def __init__(
851 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
852 ) -> None:
853 super().__init__(filename, file, contents, size)
854 if self._contents[:4] != b"\377tOc":
855 raise AssertionError("Not a v3 pack index file")
856 (self.version,) = unpack_from(b">L", self._contents, 4)
857 if self.version != 3:
858 raise AssertionError(f"Version was {self.version}")
860 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
861 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
862 if self.hash_algorithm == 1:
863 self.hash_size = 20 # SHA-1
864 elif self.hash_algorithm == 2:
865 self.hash_size = 32 # SHA-256
866 else:
867 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
869 # Read length of shortened object names
870 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
872 # Calculate offsets based on variable hash size
873 self._fan_out_table = self._read_fan_out_table(
874 16
875 ) # After header (4 + 4 + 4 + 4)
876 self._name_table_offset = 16 + 0x100 * 4
877 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
878 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
879 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
880 self
881 )
883 def _unpack_entry(self, i):
884 return (
885 self._unpack_name(i),
886 self._unpack_offset(i),
887 self._unpack_crc32_checksum(i),
888 )
890 def _unpack_name(self, i):
891 offset = self._name_table_offset + i * self.hash_size
892 return self._contents[offset : offset + self.hash_size]
894 def _unpack_offset(self, i):
895 offset = self._pack_offset_table_offset + i * 4
896 offset = unpack_from(">L", self._contents, offset)[0]
897 if offset & (2**31):
898 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
899 offset = unpack_from(">Q", self._contents, offset)[0]
900 return offset
902 def _unpack_crc32_checksum(self, i):
903 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
906def read_pack_header(read) -> tuple[int, int]:
907 """Read the header of a pack file.
909 Args:
910 read: Read function
911 Returns: Tuple of (pack version, number of objects).
912 Raises: AssertionError if the header is missing or malformed.
913 """
914 header = read(12)
915 if not header:
916 raise AssertionError("file too short to contain pack")
917 if header[:4] != b"PACK":
918 raise AssertionError(f"Invalid pack header {header!r}")
919 (version,) = unpack_from(b">L", header, 4)
920 if version not in (2, 3):
921 raise AssertionError(f"Version was {version}")
922 (num_objects,) = unpack_from(b">L", header, 8)
923 return (version, num_objects)
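# For instance (an illustrative sketch): the 12-byte header of an empty
# version-2 pack parses as (2, 0):
#
#     from io import BytesIO
#     header = b"PACK" + struct.pack(">L", 2) + struct.pack(">L", 0)
#     read_pack_header(BytesIO(header).read)   # -> (2, 0)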
926def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
927 if isinstance(chunks, bytes):
928 return len(chunks)
929 else:
930 return sum(map(len, chunks))
933def unpack_object(
934 read_all: Callable[[int], bytes],
935 read_some: Optional[Callable[[int], bytes]] = None,
936 compute_crc32=False,
937 include_comp=False,
938 zlib_bufsize=_ZLIB_BUFSIZE,
939) -> tuple[UnpackedObject, bytes]:
940 """Unpack a Git object.
942 Args:
943 read_all: Read function that blocks until the number of requested
944 bytes are read.
945 read_some: Read function that returns at least one byte, but may not
946 return the number of bytes requested.
947 compute_crc32: If True, compute the CRC32 of the compressed data. If
948 False, the returned CRC32 will be None.
949 include_comp: If True, include compressed data in the result.
950 zlib_bufsize: An optional buffer size for zlib operations.
951 Returns: A tuple of (unpacked, unused), where unused is the unused data
952 leftover from decompression, and unpacked is an UnpackedObject with
953 the following attrs set:
955 * obj_chunks (for non-delta types)
956 * pack_type_num
957 * delta_base (for delta types)
958 * comp_chunks (if include_comp is True)
959 * decomp_chunks
960 * decomp_len
961 * crc32 (if compute_crc32 is True)
962 """
963 if read_some is None:
964 read_some = read_all
965 if compute_crc32:
966 crc32 = 0
967 else:
968 crc32 = None
970 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
971 type_num = (raw[0] >> 4) & 0x07
972 size = raw[0] & 0x0F
973 for i, byte in enumerate(raw[1:]):
974 size += (byte & 0x7F) << ((i * 7) + 4)
976 delta_base: Union[int, bytes, None]
977 raw_base = len(raw)
978 if type_num == OFS_DELTA:
979 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
980 raw_base += len(raw)
981 if raw[-1] & 0x80:
982 raise AssertionError
983 delta_base_offset = raw[0] & 0x7F
984 for byte in raw[1:]:
985 delta_base_offset += 1
986 delta_base_offset <<= 7
987 delta_base_offset += byte & 0x7F
988 delta_base = delta_base_offset
989 elif type_num == REF_DELTA:
990 delta_base_obj = read_all(20)
991 if crc32 is not None:
992 crc32 = binascii.crc32(delta_base_obj, crc32)
993 delta_base = delta_base_obj
994 raw_base += 20
995 else:
996 delta_base = None
998 unpacked = UnpackedObject(
999 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
1000 )
1001 unused = read_zlib_chunks(
1002 read_some,
1003 unpacked,
1004 buffer_size=zlib_bufsize,
1005 include_comp=include_comp,
1006 )
1007 return unpacked, unused
1010def _compute_object_size(value):
1011 """Compute the size of a unresolved object for use with LRUSizeCache."""
1012 (num, obj) = value
1013 if num in DELTA_TYPES:
1014 return chunks_length(obj[1])
1015 return chunks_length(obj)
1018class PackStreamReader:
1019 """Class to read a pack stream.
1021 The pack is read from a ReceivableProtocol using read() or recv() as
1022 appropriate.
1023 """
1025 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
1026 self.read_all = read_all
1027 if read_some is None:
1028 self.read_some = read_all
1029 else:
1030 self.read_some = read_some
1031 self.sha = sha1()
1032 self._offset = 0
1033 self._rbuf = BytesIO()
1034 # trailer is a deque to avoid memory allocation on small reads
1035 self._trailer: deque[bytes] = deque()
1036 self._zlib_bufsize = zlib_bufsize
1038 def _read(self, read, size):
1039 """Read up to size bytes using the given callback.
1041 As a side effect, update the verifier's hash (excluding the last 20
1042 bytes read).
1044 Args:
1045 read: The read callback to read from.
1046 size: The maximum number of bytes to read; the particular
1047 behavior is callback-specific.
1048 """
1049 data = read(size)
1051 # maintain a trailer of the last 20 bytes we've read
1052 n = len(data)
1053 self._offset += n
1054 tn = len(self._trailer)
1055 if n >= 20:
1056 to_pop = tn
1057 to_add = 20
1058 else:
1059 to_pop = max(n + tn - 20, 0)
1060 to_add = n
1061 self.sha.update(
1062 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
1063 )
1064 self._trailer.extend(data[-to_add:])
1066 # hash everything but the trailer
1067 self.sha.update(data[:-to_add])
1068 return data
1070 def _buf_len(self):
1071 buf = self._rbuf
1072 start = buf.tell()
1073 buf.seek(0, SEEK_END)
1074 end = buf.tell()
1075 buf.seek(start)
1076 return end - start
1078 @property
1079 def offset(self):
1080 return self._offset - self._buf_len()
1082 def read(self, size):
1083 """Read, blocking until size bytes are read."""
1084 buf_len = self._buf_len()
1085 if buf_len >= size:
1086 return self._rbuf.read(size)
1087 buf_data = self._rbuf.read()
1088 self._rbuf = BytesIO()
1089 return buf_data + self._read(self.read_all, size - buf_len)
1091 def recv(self, size):
1092 """Read up to size bytes, blocking until one byte is read."""
1093 buf_len = self._buf_len()
1094 if buf_len:
1095 data = self._rbuf.read(size)
1096 if size >= buf_len:
1097 self._rbuf = BytesIO()
1098 return data
1099 return self._read(self.read_some, size)
1101 def __len__(self) -> int:
1102 return self._num_objects
1104 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
1105 """Read the objects in this pack file.
1107 Args:
1108 compute_crc32: If True, compute the CRC32 of the compressed
1109 data. If False, the returned CRC32 will be None.
1110 Returns: Iterator over UnpackedObjects with the following members set:
1111 offset
1112 obj_type_num
1113 obj_chunks (for non-delta types)
1114 delta_base (for delta types)
1115 decomp_chunks
1116 decomp_len
1117 crc32 (if compute_crc32 is True)
1119 Raises:
1120 ChecksumMismatch: if the checksum of the pack contents does not
1121 match the checksum in the pack trailer.
1122 zlib.error: if an error occurred during zlib decompression.
1123 IOError: if an error occurred writing to the output file.
1124 """
1125 pack_version, self._num_objects = read_pack_header(self.read)
1127 for i in range(self._num_objects):
1128 offset = self.offset
1129 unpacked, unused = unpack_object(
1130 self.read,
1131 read_some=self.recv,
1132 compute_crc32=compute_crc32,
1133 zlib_bufsize=self._zlib_bufsize,
1134 )
1135 unpacked.offset = offset
1137 # prepend any unused data to current read buffer
1138 buf = BytesIO()
1139 buf.write(unused)
1140 buf.write(self._rbuf.read())
1141 buf.seek(0)
1142 self._rbuf = buf
1144 yield unpacked
1146 if self._buf_len() < 20:
1147 # If the read buffer already holds at least 20 bytes, the last read() got
1148 # the whole trailer off the wire. If not, it means there is still some of
1149 # the trailer to read. We need to read() all 20 bytes; N come from the
1150 # read buffer and (20 - N) come from the wire.
1151 self.read(20)
1153 pack_sha = bytearray(self._trailer) # type: ignore
1154 if pack_sha != self.sha.digest():
1155 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
1158class PackStreamCopier(PackStreamReader):
1159 """Class to verify a pack stream as it is being read.
1161 The pack is read from a ReceivableProtocol using read() or recv() as
1162 appropriate and written out to the given file-like object.
1163 """
1165 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
1166 """Initialize the copier.
1168 Args:
1169 read_all: Read function that blocks until the number of
1170 requested bytes are read.
1171 read_some: Read function that returns at least one byte, but may
1172 not return the number of bytes requested.
1173 outfile: File-like object to write output through.
1174 delta_iter: Optional DeltaChainIterator to record deltas as we
1175 read them.
1176 """
1177 super().__init__(read_all, read_some=read_some)
1178 self.outfile = outfile
1179 self._delta_iter = delta_iter
1181 def _read(self, read, size):
1182 """Read data from the read callback and write it to the file."""
1183 data = super()._read(read, size)
1184 self.outfile.write(data)
1185 return data
1187 def verify(self, progress=None) -> None:
1188 """Verify a pack stream and write it to the output file.
1190 See PackStreamReader.read_objects for a list of exceptions this may
1191 raise.
1192 """
1193 i = 0 # default count of entries if read_objects() is empty
1194 for i, unpacked in enumerate(self.read_objects()):
1195 if self._delta_iter:
1196 self._delta_iter.record(unpacked)
1197 if progress is not None:
1198 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
1199 if progress is not None:
1200 progress(f"copied {i} pack entries\n".encode("ascii"))
1203def obj_sha(type, chunks):
1204 """Compute the SHA for a numeric type and object chunks."""
1205 sha = sha1()
1206 sha.update(object_header(type, chunks_length(chunks)))
1207 if isinstance(chunks, bytes):
1208 sha.update(chunks)
1209 else:
1210 for chunk in chunks:
1211 sha.update(chunk)
1212 return sha.digest()
1215def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
1216 """Hash a portion of a file into a new SHA.
1218 Args:
1219 f: A file-like object to read from that supports seek().
1220 start_ofs: The offset in the file to start reading at.
1221 end_ofs: The offset in the file to end reading at, relative to the
1222 end of the file.
1223 buffer_size: A buffer size for reading.
1224 Returns: A new SHA object updated with data read from the file.
1225 """
1226 sha = sha1()
1227 f.seek(0, SEEK_END)
1228 length = f.tell()
1229 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1230 raise AssertionError(
1231 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
1232 )
1233 todo = length + end_ofs - start_ofs
1234 f.seek(start_ofs)
1235 while todo:
1236 data = f.read(min(todo, buffer_size))
1237 sha.update(data)
1238 todo -= len(data)
1239 return sha
1242class PackData:
1243 """The data contained in a packfile.
1245 Pack files can be accessed both sequentially for exploding a pack, and
1246 directly with the help of an index to retrieve a specific object.
1248 The objects within are either complete or a delta against another.
1250 The per-object header is variable length. If the MSB of a byte is set, the
1251 subsequent byte is still part of the header.
1252 In the first byte the three bits after the MSB give the type, which tells you
1253 the type of object and whether it is a delta. The low 4 bits are the lowest
1254 bits of the size. In each subsequent byte the low 7 bits are the next more
1255 significant bits of the size, i.e. the last header byte holds the MS bits.
1257 For the complete objects the data is stored as zlib deflated data.
1258 The size in the header is the uncompressed object size, so to uncompress
1259 you need to just keep feeding data to zlib until you get an object back,
1260 or it errors on bad data. This is done here by just giving the complete
1261 buffer from the start of the deflated object on. This is bad, but until I
1262 get mmap sorted out it will have to do.
1264 Currently there are no integrity checks done. Also no attempt is made to
1265 try and detect the delta case, or a request for an object at the wrong
1266 position. It will all just throw a zlib or KeyError.
1267 """
1269 def __init__(self, filename: Union[str, os.PathLike], file=None, size=None) -> None:
1270 """Create a PackData object representing the pack in the given filename.
1272 The file must exist and stay readable until the object is disposed of.
1273 It must also stay the same size. It will be mapped whenever needed.
1275 Currently there is a restriction on the size of the pack as the python
1276 mmap implementation is flawed.
1277 """
1278 self._filename = filename
1279 self._size = size
1280 self._header_size = 12
1281 if file is None:
1282 self._file = GitFile(self._filename, "rb")
1283 else:
1284 self._file = file
1285 (version, self._num_objects) = read_pack_header(self._file.read)
1286 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
1287 1024 * 1024 * 20, compute_size=_compute_object_size
1288 )
1290 @property
1291 def filename(self):
1292 return os.path.basename(self._filename)
1294 @property
1295 def path(self):
1296 return self._filename
1298 @classmethod
1299 def from_file(cls, file, size=None):
1300 return cls(str(file), file=file, size=size)
1302 @classmethod
1303 def from_path(cls, path: Union[str, os.PathLike]):
1304 return cls(filename=path)
1306 def close(self) -> None:
1307 self._file.close()
1309 def __enter__(self):
1310 return self
1312 def __exit__(self, exc_type, exc_val, exc_tb):
1313 self.close()
1315 def __eq__(self, other):
1316 if isinstance(other, PackData):
1317 return self.get_stored_checksum() == other.get_stored_checksum()
1318 return False
1320 def _get_size(self):
1321 if self._size is not None:
1322 return self._size
1323 self._size = os.path.getsize(self._filename)
1324 if self._size < self._header_size:
1325 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
1326 raise AssertionError(errmsg)
1327 return self._size
1329 def __len__(self) -> int:
1330 """Returns the number of objects in this pack."""
1331 return self._num_objects
1333 def calculate_checksum(self):
1334 """Calculate the checksum for this pack.
1336 Returns: 20-byte binary SHA1 digest
1337 """
1338 return compute_file_sha(self._file, end_ofs=-20).digest()
1340 def iter_unpacked(self, *, include_comp: bool = False):
1341 self._file.seek(self._header_size)
1343 if self._num_objects is None:
1344 return
1346 for _ in range(self._num_objects):
1347 offset = self._file.tell()
1348 unpacked, unused = unpack_object(
1349 self._file.read, compute_crc32=False, include_comp=include_comp
1350 )
1351 unpacked.offset = offset
1352 yield unpacked
1353 # Back up over unused data.
1354 self._file.seek(-len(unused), SEEK_CUR)
1356 def iterentries(
1357 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
1358 ):
1359 """Yield entries summarizing the contents of this pack.
1361 Args:
1362 progress: Progress function, called with current and total
1363 object count.
1364 Returns: iterator of tuples with (sha, offset, crc32)
1365 """
1366 num_objects = self._num_objects
1367 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1368 for i, result in enumerate(indexer):
1369 if progress is not None:
1370 progress(i, num_objects)
1371 yield result
1373 def sorted_entries(
1374 self,
1375 progress: Optional[ProgressFn] = None,
1376 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1377 ):
1378 """Return entries in this pack, sorted by SHA.
1380 Args:
1381 progress: Progress function, called with current and total
1382 object count
1383 Returns: Iterator of tuples with (sha, offset, crc32)
1384 """
1385 return sorted(
1386 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
1387 )
1389 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
1390 """Create a version 1 file for this data file.
1392 Args:
1393 filename: Index filename.
1394 progress: Progress report function
1395 Returns: Checksum of index file
1396 """
1397 entries = self.sorted_entries(
1398 progress=progress, resolve_ext_ref=resolve_ext_ref
1399 )
1400 with GitFile(filename, "wb") as f:
1401 return write_pack_index_v1(f, entries, self.calculate_checksum())
1403 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
1404 """Create a version 2 index file for this data file.
1406 Args:
1407 filename: Index filename.
1408 progress: Progress report function
1409 Returns: Checksum of index file
1410 """
1411 entries = self.sorted_entries(
1412 progress=progress, resolve_ext_ref=resolve_ext_ref
1413 )
1414 with GitFile(filename, "wb") as f:
1415 return write_pack_index_v2(f, entries, self.calculate_checksum())
1417 def create_index_v3(
1418 self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
1419 ):
1420 """Create a version 3 index file for this data file.
1422 Args:
1423 filename: Index filename.
1424 progress: Progress report function
1425 resolve_ext_ref: Function to resolve external references
1426 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1427 Returns: Checksum of index file
1428 """
1429 entries = self.sorted_entries(
1430 progress=progress, resolve_ext_ref=resolve_ext_ref
1431 )
1432 with GitFile(filename, "wb") as f:
1433 return write_pack_index_v3(
1434 f, entries, self.calculate_checksum(), hash_algorithm
1435 )
1437 def create_index(
1438 self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
1439 ):
1440 """Create an index file for this data file.
1442 Args:
1443 filename: Index filename.
1444 progress: Progress report function
1445 version: Index version (1, 2, or 3)
1446 resolve_ext_ref: Function to resolve external references
1447 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
1448 Returns: Checksum of index file
1449 """
1450 if version == 1:
1451 return self.create_index_v1(
1452 filename, progress, resolve_ext_ref=resolve_ext_ref
1453 )
1454 elif version == 2:
1455 return self.create_index_v2(
1456 filename, progress, resolve_ext_ref=resolve_ext_ref
1457 )
1458 elif version == 3:
1459 return self.create_index_v3(
1460 filename,
1461 progress,
1462 resolve_ext_ref=resolve_ext_ref,
1463 hash_algorithm=hash_algorithm,
1464 )
1465 else:
1466 raise ValueError(f"unknown index format {version}")
1468 def get_stored_checksum(self):
1469 """Return the expected checksum stored in this pack."""
1470 self._file.seek(-20, SEEK_END)
1471 return self._file.read(20)
1473 def check(self) -> None:
1474 """Check the consistency of this pack."""
1475 actual = self.calculate_checksum()
1476 stored = self.get_stored_checksum()
1477 if actual != stored:
1478 raise ChecksumMismatch(stored, actual)
1480 def get_unpacked_object_at(
1481 self, offset: int, *, include_comp: bool = False
1482 ) -> UnpackedObject:
1483 """Given offset in the packfile return a UnpackedObject."""
1484 assert offset >= self._header_size
1485 self._file.seek(offset)
1486 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1487 unpacked.offset = offset
1488 return unpacked
1490 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
1491 """Given an offset in to the packfile return the object that is there.
1493 Using the associated index the location of an object can be looked up,
1494 and then the packfile can be asked directly for that object using this
1495 function.
1496 """
1497 try:
1498 return self._offset_cache[offset]
1499 except KeyError:
1500 pass
1501 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1502 return (unpacked.pack_type_num, unpacked._obj())
1505T = TypeVar("T")
1508class DeltaChainIterator(Generic[T]):
1509 """Abstract iterator over pack data based on delta chains.
1511 Each object in the pack is guaranteed to be inflated exactly once,
1512 regardless of how many objects reference it as a delta base. As a result,
1513 memory usage is proportional to the length of the longest delta chain.
1515 Subclasses can override _result to define the result type of the iterator.
1516 By default, results are UnpackedObjects with the following members set:
1518 * offset
1519 * obj_type_num
1520 * obj_chunks
1521 * pack_type_num
1522 * delta_base (for delta types)
1523 * comp_chunks (if _include_comp is True)
1524 * decomp_chunks
1525 * decomp_len
1526 * crc32 (if _compute_crc32 is True)
1527 """
1529 _compute_crc32 = False
1530 _include_comp = False
1532 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
1533 self._file = file_obj
1534 self._resolve_ext_ref = resolve_ext_ref
1535 self._pending_ofs: dict[int, list[int]] = defaultdict(list)
1536 self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
1537 self._full_ofs: list[tuple[int, int]] = []
1538 self._ext_refs: list[bytes] = []
1540 @classmethod
1541 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
1542 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1543 walker.set_pack_data(pack_data)
1544 for unpacked in pack_data.iter_unpacked(include_comp=False):
1545 walker.record(unpacked)
1546 return walker
1548 @classmethod
1549 def for_pack_subset(
1550 cls,
1551 pack: "Pack",
1552 shas: Iterable[bytes],
1553 *,
1554 allow_missing: bool = False,
1555 resolve_ext_ref=None,
1556 ):
1557 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1558 walker.set_pack_data(pack.data)
1559 todo = set()
1560 for sha in shas:
1561 assert isinstance(sha, bytes)
1562 try:
1563 off = pack.index.object_offset(sha)
1564 except KeyError:
1565 if not allow_missing:
1566 raise
1567 else:
1568 todo.add(off)
1569 done = set()
1570 while todo:
1571 off = todo.pop()
1572 unpacked = pack.data.get_unpacked_object_at(off)
1573 walker.record(unpacked)
1574 done.add(off)
1575 base_ofs = None
1576 if unpacked.pack_type_num == OFS_DELTA:
1577 base_ofs = unpacked.offset - unpacked.delta_base
1578 elif unpacked.pack_type_num == REF_DELTA:
1579 with suppress(KeyError):
1580 assert isinstance(unpacked.delta_base, bytes)
1581 base_ofs = pack.index.object_index(unpacked.delta_base)
1582 if base_ofs is not None and base_ofs not in done:
1583 todo.add(base_ofs)
1584 return walker
1586 def record(self, unpacked: UnpackedObject) -> None:
1587 type_num = unpacked.pack_type_num
1588 offset = unpacked.offset
1589 if type_num == OFS_DELTA:
1590 base_offset = offset - unpacked.delta_base
1591 self._pending_ofs[base_offset].append(offset)
1592 elif type_num == REF_DELTA:
1593 assert isinstance(unpacked.delta_base, bytes)
1594 self._pending_ref[unpacked.delta_base].append(offset)
1595 else:
1596 self._full_ofs.append((offset, type_num))
1598 def set_pack_data(self, pack_data: PackData) -> None:
1599 self._file = pack_data._file
1601 def _walk_all_chains(self):
1602 for offset, type_num in self._full_ofs:
1603 yield from self._follow_chain(offset, type_num, None)
1604 yield from self._walk_ref_chains()
1605 assert not self._pending_ofs, repr(self._pending_ofs)
1607 def _ensure_no_pending(self) -> None:
1608 if self._pending_ref:
1609 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])
1611 def _walk_ref_chains(self):
1612 if not self._resolve_ext_ref:
1613 self._ensure_no_pending()
1614 return
1616 for base_sha, pending in sorted(self._pending_ref.items()):
1617 if base_sha not in self._pending_ref:
1618 continue
1619 try:
1620 type_num, chunks = self._resolve_ext_ref(base_sha)
1621 except KeyError:
1622 # Not an external ref, but may depend on one. Either it will
1623 # get popped via a _follow_chain call, or we will raise an
1624 # error below.
1625 continue
1626 self._ext_refs.append(base_sha)
1627 self._pending_ref.pop(base_sha)
1628 for new_offset in pending:
1629 yield from self._follow_chain(new_offset, type_num, chunks)
1631 self._ensure_no_pending()
1633 def _result(self, unpacked: UnpackedObject) -> T:
1634 raise NotImplementedError
1636 def _resolve_object(
1637 self, offset: int, obj_type_num: int, base_chunks: list[bytes]
1638 ) -> UnpackedObject:
1639 self._file.seek(offset)
1640 unpacked, _ = unpack_object(
1641 self._file.read,
1642 include_comp=self._include_comp,
1643 compute_crc32=self._compute_crc32,
1644 )
1645 unpacked.offset = offset
1646 if base_chunks is None:
1647 assert unpacked.pack_type_num == obj_type_num
1648 else:
1649 assert unpacked.pack_type_num in DELTA_TYPES
1650 unpacked.obj_type_num = obj_type_num
1651 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
1652 return unpacked
1654 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: list[bytes]):
1655 # Unlike PackData.get_object_at, there is no need to cache offsets as
1656 # this approach by design inflates each object exactly once.
1657 todo = [(offset, obj_type_num, base_chunks)]
1658 while todo:
1659 (offset, obj_type_num, base_chunks) = todo.pop()
1660 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1661 yield self._result(unpacked)
1663 unblocked = chain(
1664 self._pending_ofs.pop(unpacked.offset, []),
1665 self._pending_ref.pop(unpacked.sha(), []),
1666 )
1667 todo.extend(
1668 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
1669 for new_offset in unblocked
1670 )
1672 def __iter__(self) -> Iterator[T]:
1673 return self._walk_all_chains()
1675 def ext_refs(self):
1676 return self._ext_refs
1679class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
1680 """Delta chain iterator that yield unpacked objects."""
1682 def _result(self, unpacked):
1683 return unpacked
1686class PackIndexer(DeltaChainIterator[PackIndexEntry]):
1687 """Delta chain iterator that yields index entries."""
1689 _compute_crc32 = True
1691 def _result(self, unpacked):
1692 return unpacked.sha(), unpacked.offset, unpacked.crc32
1695class PackInflater(DeltaChainIterator[ShaFile]):
1696 """Delta chain iterator that yields ShaFile objects."""
1698 def _result(self, unpacked):
1699 return unpacked.sha_file()
1702class SHA1Reader(BinaryIO):
1703 """Wrapper for file-like object that remembers the SHA1 of its data."""
1705 def __init__(self, f) -> None:
1706 self.f = f
1707 self.sha1 = sha1(b"")
1709 def read(self, size: int = -1) -> bytes:
1710 data = self.f.read(size)
1711 self.sha1.update(data)
1712 return data
1714 def check_sha(self, allow_empty: bool = False) -> None:
1715 stored = self.f.read(20)
1716 # If git option index.skipHash is set the index will be empty
1717 if stored != self.sha1.digest() and (
1718 not allow_empty
1719 or sha_to_hex(stored) != b"0000000000000000000000000000000000000000"
1720 ):
1721 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1723 def close(self):
1724 return self.f.close()
1726 def tell(self) -> int:
1727 return self.f.tell()
1729 # BinaryIO abstract methods
1730 def readable(self) -> bool:
1731 return True
1733 def writable(self) -> bool:
1734 return False
1736 def seekable(self) -> bool:
1737 return getattr(self.f, "seekable", lambda: False)()
1739 def seek(self, offset: int, whence: int = 0) -> int:
1740 return self.f.seek(offset, whence)
1742 def flush(self) -> None:
1743 if hasattr(self.f, "flush"):
1744 self.f.flush()
1746 def readline(self, size: int = -1) -> bytes:
1747 return self.f.readline(size)
1749 def readlines(self, hint: int = -1) -> list[bytes]:
1750 return self.f.readlines(hint)
1752 def writelines(self, lines) -> None:
1753 raise UnsupportedOperation("writelines")
1755 def write(self, data) -> int:
1756 raise UnsupportedOperation("write")
1758 def __enter__(self):
1759 return self
1761 def __exit__(self, type, value, traceback):
1762 self.close()
1764 def __iter__(self):
1765 return self
1767 def __next__(self) -> bytes:
1768 line = self.readline()
1769 if not line:
1770 raise StopIteration
1771 return line
1773 def fileno(self) -> int:
1774 return self.f.fileno()
1776 def isatty(self) -> bool:
1777 return getattr(self.f, "isatty", lambda: False)()
1779 def truncate(self, size: Optional[int] = None) -> int:
1780 raise UnsupportedOperation("truncate")
1783class SHA1Writer(BinaryIO):
1784 """Wrapper for file-like object that remembers the SHA1 of its data."""
1786 def __init__(self, f) -> None:
1787 self.f = f
1788 self.length = 0
1789 self.sha1 = sha1(b"")
1791 def write(self, data) -> int:
1792 self.sha1.update(data)
1793 self.f.write(data)
1794 self.length += len(data)
1795 return len(data)
1797 def write_sha(self):
1798 sha = self.sha1.digest()
1799 assert len(sha) == 20
1800 self.f.write(sha)
1801 self.length += len(sha)
1802 return sha
1804 def close(self):
1805 sha = self.write_sha()
1806 self.f.close()
1807 return sha
1809 def offset(self):
1810 return self.length
1812 def tell(self) -> int:
1813 return self.f.tell()
1815 # BinaryIO abstract methods
1816 def readable(self) -> bool:
1817 return False
1819 def writable(self) -> bool:
1820 return True
1822 def seekable(self) -> bool:
1823 return getattr(self.f, "seekable", lambda: False)()
1825 def seek(self, offset: int, whence: int = 0) -> int:
1826 return self.f.seek(offset, whence)
1828 def flush(self) -> None:
1829 if hasattr(self.f, "flush"):
1830 self.f.flush()
1832 def readline(self, size: int = -1) -> bytes:
1833 raise UnsupportedOperation("readline")
1835 def readlines(self, hint: int = -1) -> list[bytes]:
1836 raise UnsupportedOperation("readlines")
1838 def writelines(self, lines) -> None:
1839 for line in lines:
1840 self.write(line)
1842 def read(self, size: int = -1) -> bytes:
1843 raise UnsupportedOperation("read")
1845 def __enter__(self):
1846 return self
1848 def __exit__(self, type, value, traceback):
1849 self.close()
1851 def __iter__(self):
1852 return self
1854 def __next__(self) -> bytes:
1855 raise UnsupportedOperation("__next__")
1857 def fileno(self) -> int:
1858 return self.f.fileno()
1860 def isatty(self) -> bool:
1861 return getattr(self.f, "isatty", lambda: False)()
1863 def truncate(self, size: Optional[int] = None) -> int:
1864 raise UnsupportedOperation("truncate")
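# Editor's illustrative sketch (not part of the original module): SHA1Writer
# hashes everything written through it, so the trailing 20-byte checksum of a
# pack or index file can be appended with write_sha(). The BytesIO target is
# an assumption of the example.
#
#     out = SHA1Writer(BytesIO())
#     out.write(b"PACK")
#     trailer = out.write_sha()  # SHA-1 digest of everything written so far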
1867def pack_object_header(type_num, delta_base, size):
1868 """Create a pack object header for the given object info.
1870 Args:
1871 type_num: Numeric type of the object.
1872 delta_base: Delta base offset or ref, or None for whole objects.
1873 size: Uncompressed object size.
1874 Returns: A header for a packed object.
1875 """
1876 header = []
1877 c = (type_num << 4) | (size & 15)
1878 size >>= 4
1879 while size:
1880 header.append(c | 0x80)
1881 c = size & 0x7F
1882 size >>= 7
1883 header.append(c)
1884 if type_num == OFS_DELTA:
1885 ret = [delta_base & 0x7F]
1886 delta_base >>= 7
1887 while delta_base:
1888 delta_base -= 1
1889 ret.insert(0, 0x80 | (delta_base & 0x7F))
1890 delta_base >>= 7
1891 header.extend(ret)
1892 elif type_num == REF_DELTA:
1893 assert len(delta_base) == 20
1894 header += delta_base
1895 return bytearray(header)
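# Worked example (editor's note, not part of the original module): per the
# encoding above, a 100-byte non-delta blob (type 3) produces a two-byte
# header: 0xb4 carries the continuation bit, the type and the low four size
# bits; 0x06 carries the remaining size bits.
#
#     bytes(pack_object_header(3, None, 100)) == b"\xb4\x06"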
1898def pack_object_chunks(type, object, compression_level=-1):
1899 """Generate chunks for a pack object.
1901 Args:
1902 type: Numeric type of the object
1903 object: Object to write
1904 compression_level: the zlib compression level
1905 Returns: Chunks
1906 """
1907 if type in DELTA_TYPES:
1908 delta_base, object = object
1909 else:
1910 delta_base = None
1911 if isinstance(object, bytes):
1912 object = [object]
1913 yield bytes(pack_object_header(type, delta_base, sum(map(len, object))))
1914 compressor = zlib.compressobj(level=compression_level)
1915 for data in object:
1916 yield compressor.compress(data)
1917 yield compressor.flush()
1920def write_pack_object(write, type, object, sha=None, compression_level=-1):
1921 """Write pack object to a file.
1923 Args:
1924 write: Write function to use
1925 type: Numeric type of the object
1926 object: Object to write
1927 compression_level: the zlib compression level
1928 Returns: CRC32 checksum of the data written
1929 """
1930 crc32 = 0
1931 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
1932 write(chunk)
1933 if sha is not None:
1934 sha.update(chunk)
1935 crc32 = binascii.crc32(chunk, crc32)
1936 return crc32 & 0xFFFFFFFF
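# Editor's illustrative sketch (not part of the original module): serializing
# one full (non-delta) object into an in-memory buffer. The type number 3
# (blob) and the payload are assumptions of the example.
#
#     buf = BytesIO()
#     crc = write_pack_object(buf.write, 3, b"example payload")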
1939def write_pack(
1940 filename,
1941 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
1942 *,
1943 deltify: Optional[bool] = None,
1944 delta_window_size: Optional[int] = None,
1945 compression_level: int = -1,
1946):
1947 """Write a new pack data file.
1949 Args:
1950 filename: Path to the new pack file (without .pack extension)
1951 delta_window_size: Delta window size
1952 deltify: Whether to deltify pack objects
1953 compression_level: the zlib compression level
1954 Returns: Tuple with checksum of pack file and index file
1955 """
1956 with GitFile(filename + ".pack", "wb") as f:
1957 entries, data_sum = write_pack_objects(
1958 f.write,
1959 objects,
1960 delta_window_size=delta_window_size,
1961 deltify=deltify,
1962 compression_level=compression_level,
1963 )
1964 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
1965 with GitFile(filename + ".idx", "wb") as f:
1966 return data_sum, write_pack_index(f, entries, data_sum)
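# Editor's illustrative sketch (not part of the original module): writing a
# tiny pack plus its index to disk. The path, the Blob import and its content
# are assumptions of the example.
#
#     from dulwich.objects import Blob
#     blob = Blob.from_string(b"example content")
#     data_sum, idx_sum = write_pack("/tmp/pack-example", [(blob, None)])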
1969def pack_header_chunks(num_objects):
1970 """Yield chunks for a pack header."""
1971 yield b"PACK" # Pack header
1972 yield struct.pack(b">L", 2) # Pack version
1973 yield struct.pack(b">L", num_objects) # Number of objects in pack
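# Worked example (editor's note, not part of the original module): a pack
# holding three objects starts with this fixed 12-byte header.
#
#     b"".join(pack_header_chunks(3)) == b"PACK\x00\x00\x00\x02\x00\x00\x00\x03"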
1976def write_pack_header(write, num_objects) -> None:
1977 """Write a pack header for the given number of objects."""
1978 if hasattr(write, "write"):
1979 write = write.write
1980 warnings.warn(
1981 "write_pack_header() now takes a write rather than file argument",
1982 DeprecationWarning,
1983 stacklevel=2,
1984 )
1985 for chunk in pack_header_chunks(num_objects):
1986 write(chunk)
1989def find_reusable_deltas(
1990 container: PackedObjectContainer,
1991 object_ids: set[bytes],
1992 *,
1993 other_haves: Optional[set[bytes]] = None,
1994 progress=None,
1995) -> Iterator[UnpackedObject]:
1996 if other_haves is None:
1997 other_haves = set()
1998 reused = 0
1999 for i, unpacked in enumerate(
2000 container.iter_unpacked_subset(
2001 object_ids, allow_missing=True, convert_ofs_delta=True
2002 )
2003 ):
2004 if progress is not None and i % 1000 == 0:
2005 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
2006 if unpacked.pack_type_num == REF_DELTA:
2007 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore
2008 if hexsha in object_ids or hexsha in other_haves:
2009 yield unpacked
2010 reused += 1
2011 if progress is not None:
2012 progress((f"found {reused} deltas to reuse\n").encode())
2015def deltify_pack_objects(
2016 objects: Union[Iterator[bytes], Iterator[tuple[ShaFile, Optional[bytes]]]],
2017 *,
2018 window_size: Optional[int] = None,
2019 progress=None,
2020) -> Iterator[UnpackedObject]:
2021 """Generate deltas for pack objects.
2023 Args:
2024 objects: An iterable of ShaFile objects or (object, path) tuples to deltify.
2025 window_size: Window size; None for default
2026 Returns: Iterator over UnpackedObject entries;
2027 delta_base is None for full-text entries
2028 """
2030 def objects_with_hints():
2031 for e in objects:
2032 if isinstance(e, ShaFile):
2033 yield (e, (e.type_num, None))
2034 else:
2035 yield (e[0], (e[0].type_num, e[1]))
2037 yield from deltas_from_sorted_objects(
2038 sort_objects_for_delta(objects_with_hints()),
2039 window_size=window_size,
2040 progress=progress,
2041 )
2044def sort_objects_for_delta(
2045 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]],
2046) -> Iterator[ShaFile]:
2047 magic = []
2048 for entry in objects:
2049 if isinstance(entry, tuple):
2050 obj, hint = entry
2051 if hint is None:
2052 type_num = None
2053 path = None
2054 else:
2055 (type_num, path) = hint
2056 else:
2057 obj, type_num, path = entry, None, None  # bare object: no hint to carry over
2058 magic.append((type_num, path, -obj.raw_length(), obj))
2059 # Build a list of objects ordered by the magic Linus heuristic
2060 # This helps us find good objects to diff against
2061 magic.sort()
2062 return (x[3] for x in magic)
2065def deltas_from_sorted_objects(
2066 objects, window_size: Optional[int] = None, progress=None
2067):
2068 # TODO(jelmer): Use threads
2069 if window_size is None:
2070 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
2072 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
2073 for i, o in enumerate(objects):
2074 if progress is not None and i % 1000 == 0:
2075 progress((f"generating deltas: {i}\r").encode())
2076 raw = o.as_raw_chunks()
2077 winner = raw
2078 winner_len = sum(map(len, winner))
2079 winner_base = None
2080 for base_id, base_type_num, base in possible_bases:
2081 if base_type_num != o.type_num:
2082 continue
2083 delta_len = 0
2084 delta = []
2085 for chunk in create_delta(base, raw):
2086 delta_len += len(chunk)
2087 if delta_len >= winner_len:
2088 break
2089 delta.append(chunk)
2090 else:
2091 winner_base = base_id
2092 winner = delta
2093 winner_len = sum(map(len, winner))
2094 yield UnpackedObject(
2095 o.type_num,
2096 sha=o.sha().digest(),
2097 delta_base=winner_base,
2098 decomp_len=winner_len,
2099 decomp_chunks=winner,
2100 )
2101 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
2102 while len(possible_bases) > window_size:
2103 possible_bases.pop()
2106def pack_objects_to_data(
2107 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2108 *,
2109 deltify: Optional[bool] = None,
2110 delta_window_size: Optional[int] = None,
2111 ofs_delta: bool = True,
2112 progress=None,
2113) -> tuple[int, Iterator[UnpackedObject]]:
2114 """Create pack data from objects.
2116 Args:
2117 objects: Sequence of ShaFile objects or (object, path) tuples to pack
2118 Returns: Tuple of (number of objects, iterator over UnpackedObject entries)
2119 """
2120 # TODO(jelmer): support deltaifying
2121 count = len(objects)
2122 if deltify is None:
2123 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2124 # slow at the moment.
2125 deltify = False
2126 if deltify:
2127 return (
2128 count,
2129 deltify_pack_objects(
2130 iter(objects), # type: ignore
2131 window_size=delta_window_size,
2132 progress=progress,
2133 ),
2134 )
2135 else:
2137 def iter_without_path():
2138 for o in objects:
2139 if isinstance(o, tuple):
2140 yield full_unpacked_object(o[0])
2141 else:
2142 yield full_unpacked_object(o)
2144 return (count, iter_without_path())
2147def generate_unpacked_objects(
2148 container: PackedObjectContainer,
2149 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2150 delta_window_size: Optional[int] = None,
2151 deltify: Optional[bool] = None,
2152 reuse_deltas: bool = True,
2153 ofs_delta: bool = True,
2154 other_haves: Optional[set[bytes]] = None,
2155 progress=None,
2156) -> Iterator[UnpackedObject]:
2157 """Create pack data from objects.
2159 Returns: Iterator over UnpackedObject entries
2160 """
2161 todo = dict(object_ids)
2162 if reuse_deltas:
2163 for unpack in find_reusable_deltas(
2164 container, set(todo), other_haves=other_haves, progress=progress
2165 ):
2166 del todo[sha_to_hex(unpack.sha())]
2167 yield unpack
2168 if deltify is None:
2169 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2170 # slow at the moment.
2171 deltify = False
2172 if deltify:
2173 objects_to_delta = container.iterobjects_subset(
2174 todo.keys(), allow_missing=False
2175 )
2176 yield from deltas_from_sorted_objects(
2177 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
2178 window_size=delta_window_size,
2179 progress=progress,
2180 )
2181 else:
2182 for oid in todo:
2183 yield full_unpacked_object(container[oid])
2186def full_unpacked_object(o: ShaFile) -> UnpackedObject:
2187 return UnpackedObject(
2188 o.type_num,
2189 delta_base=None,
2190 crc32=None,
2191 decomp_chunks=o.as_raw_chunks(),
2192 sha=o.sha().digest(),
2193 )
2196def write_pack_from_container(
2197 write,
2198 container: PackedObjectContainer,
2199 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2200 delta_window_size: Optional[int] = None,
2201 deltify: Optional[bool] = None,
2202 reuse_deltas: bool = True,
2203 compression_level: int = -1,
2204 other_haves: Optional[set[bytes]] = None,
2205):
2206 """Write a new pack data file.
2208 Args:
2209 write: write function to use
2210 container: PackedObjectContainer
2211 delta_window_size: Sliding window size for searching for deltas;
2212 Set to None for default window size.
2213 deltify: Whether to deltify objects
2214 compression_level: the zlib compression level to use
2215 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2216 """
2217 pack_contents_count = len(object_ids)
2218 pack_contents = generate_unpacked_objects(
2219 container,
2220 object_ids,
2221 delta_window_size=delta_window_size,
2222 deltify=deltify,
2223 reuse_deltas=reuse_deltas,
2224 other_haves=other_haves,
2225 )
2227 return write_pack_data(
2228 write,
2229 pack_contents,
2230 num_records=pack_contents_count,
2231 compression_level=compression_level,
2232 )
2235def write_pack_objects(
2236 write,
2237 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2238 *,
2239 delta_window_size: Optional[int] = None,
2240 deltify: Optional[bool] = None,
2241 compression_level: int = -1,
2242):
2243 """Write a new pack data file.
2245 Args:
2246 write: write function to use
2247 objects: Sequence of ShaFile objects or (object, path) tuples to write
2248 delta_window_size: Sliding window size for searching for deltas;
2249 Set to None for default window size.
2250 deltify: Whether to deltify objects
2251 compression_level: the zlib compression level to use
2252 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2253 """
2254 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2256 return write_pack_data(
2257 write,
2258 pack_contents,
2259 num_records=pack_contents_count,
2260 compression_level=compression_level,
2261 )
2264class PackChunkGenerator:
    """Generate chunks for a pack data file, recording per-object offsets, CRC32s and the pack checksum."""
2265 def __init__(
2266 self,
2267 num_records=None,
2268 records=None,
2269 progress=None,
2270 compression_level=-1,
2271 reuse_compressed=True,
2272 ) -> None:
2273 self.cs = sha1(b"")
2274 self.entries: dict[Union[int, bytes], tuple[int, int]] = {}
2275 self._it = self._pack_data_chunks(
2276 num_records=num_records,
2277 records=records,
2278 progress=progress,
2279 compression_level=compression_level,
2280 reuse_compressed=reuse_compressed,
2281 )
2283 def sha1digest(self):
2284 return self.cs.digest()
2286 def __iter__(self):
2287 return self._it
2289 def _pack_data_chunks(
2290 self,
2291 records: Iterator[UnpackedObject],
2292 *,
2293 num_records=None,
2294 progress=None,
2295 compression_level: int = -1,
2296 reuse_compressed: bool = True,
2297 ) -> Iterator[bytes]:
2298 """Iterate pack data file chunks.
2300 Args:
2301 records: Iterator over UnpackedObject
2302 num_records: Number of records (defaults to len(records) if not specified)
2303 progress: Function to report progress to
2304 compression_level: the zlib compression level
2305 Yields: Chunks of the pack data file; per-object (offset, crc32) pairs are recorded in self.entries
2306 """
2307 # Write the pack
2308 if num_records is None:
2309 num_records = len(records) # type: ignore
2310 offset = 0
2311 for chunk in pack_header_chunks(num_records):
2312 yield chunk
2313 self.cs.update(chunk)
2314 offset += len(chunk)
2315 actual_num_records = 0
2316 for i, unpacked in enumerate(records):
2317 type_num = unpacked.pack_type_num
2318 if progress is not None and i % 1000 == 0:
2319 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
2320 raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
2321 if unpacked.delta_base is not None:
2322 try:
2323 base_offset, base_crc32 = self.entries[unpacked.delta_base]
2324 except KeyError:
2325 type_num = REF_DELTA
2326 assert isinstance(unpacked.delta_base, bytes)
2327 raw = (unpacked.delta_base, unpacked.decomp_chunks)
2328 else:
2329 type_num = OFS_DELTA
2330 raw = (offset - base_offset, unpacked.decomp_chunks)
2331 else:
2332 raw = unpacked.decomp_chunks
2333 if unpacked.comp_chunks is not None and reuse_compressed:
2334 chunks = unpacked.comp_chunks
2335 else:
2336 chunks = pack_object_chunks(
2337 type_num, raw, compression_level=compression_level
2338 )
2339 crc32 = 0
2340 object_size = 0
2341 for chunk in chunks:
2342 yield chunk
2343 crc32 = binascii.crc32(chunk, crc32)
2344 self.cs.update(chunk)
2345 object_size += len(chunk)
2346 actual_num_records += 1
2347 self.entries[unpacked.sha()] = (offset, crc32)
2348 offset += object_size
2349 if actual_num_records != num_records:
2350 raise AssertionError(
2351 f"actual records written differs: {actual_num_records} != {num_records}"
2352 )
2354 yield self.cs.digest()
2357def write_pack_data(
2358 write,
2359 records: Iterator[UnpackedObject],
2360 *,
2361 num_records=None,
2362 progress=None,
2363 compression_level=-1,
2364):
2365 """Write a new pack data file.
2367 Args:
2368 write: Write function to use
2369 num_records: Number of records (defaults to len(records) if None)
2370 records: Iterator over type_num, object_id, delta_base, raw
2371 progress: Function to report progress to
2372 compression_level: the zlib compression level
2373 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2374 """
2375 chunk_generator = PackChunkGenerator(
2376 num_records=num_records,
2377 records=records,
2378 progress=progress,
2379 compression_level=compression_level,
2380 )
2381 for chunk in chunk_generator:
2382 write(chunk)
2383 return chunk_generator.entries, chunk_generator.sha1digest()
2386def write_pack_index_v1(f, entries, pack_checksum):
2387 """Write a new pack index file.
2389 Args:
2390 f: A file-like object to write to
2391 entries: List of tuples with object name (sha), offset_in_pack,
2392 and crc32_checksum.
2393 pack_checksum: Checksum of the pack file.
2394 Returns: The SHA of the written index file
2395 """
2396 f = SHA1Writer(f)
2397 fan_out_table = defaultdict(lambda: 0)
2398 for name, offset, entry_checksum in entries:
2399 fan_out_table[ord(name[:1])] += 1
2400 # Fan-out table
2401 for i in range(0x100):
2402 f.write(struct.pack(">L", fan_out_table[i]))
2403 fan_out_table[i + 1] += fan_out_table[i]
2404 for name, offset, entry_checksum in entries:
2405 if not (offset <= 0xFFFFFFFF):
2406 raise TypeError("pack format 1 only supports offsets < 4 GiB")
2407 f.write(struct.pack(">L20s", offset, name))
2408 assert len(pack_checksum) == 20
2409 f.write(pack_checksum)
2410 return f.write_sha()
2413def _delta_encode_size(size) -> bytes:
2414 ret = bytearray()
2415 c = size & 0x7F
2416 size >>= 7
2417 while size:
2418 ret.append(c | 0x80)
2419 c = size & 0x7F
2420 size >>= 7
2421 ret.append(c)
2422 return bytes(ret)
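# Worked example (editor's note, not part of the original module): sizes are
# encoded as little-endian base-128 varints, seven bits per byte with the high
# bit as a continuation flag.
#
#     _delta_encode_size(100) == b"\x64"
#     _delta_encode_size(200) == b"\xc8\x01"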
2425# The length of delta compression copy operations in version 2 packs is limited
2426# to 64K. To copy more, we use several copy operations. Version 3 packs allow
2427# 24-bit lengths in copy operations, but we always make version 2 packs.
2428_MAX_COPY_LEN = 0xFFFF
2431def _encode_copy_operation(start, length):
2432 scratch = bytearray([0x80])
2433 for i in range(4):
2434 if start & 0xFF << i * 8:
2435 scratch.append((start >> i * 8) & 0xFF)
2436 scratch[0] |= 1 << i
2437 for i in range(2):
2438 if length & 0xFF << i * 8:
2439 scratch.append((length >> i * 8) & 0xFF)
2440 scratch[0] |= 1 << (4 + i)
2441 return bytes(scratch)
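# Worked example (editor's note, not part of the original module): copying 16
# bytes from offset 0 of the base needs only the command byte plus one length
# byte; copying from offset 0x1234 adds two offset bytes, flagged in the low
# bits of the command byte.
#
#     _encode_copy_operation(0, 16) == b"\x90\x10"
#     _encode_copy_operation(0x1234, 16) == b"\x93\x34\x12\x10"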
2444def create_delta(base_buf, target_buf):
2445 """Use python difflib to work out how to transform base_buf to target_buf.
2447 Args:
2448 base_buf: Base buffer
2449 target_buf: Target buffer
2450 """
2451 if isinstance(base_buf, list):
2452 base_buf = b"".join(base_buf)
2453 if isinstance(target_buf, list):
2454 target_buf = b"".join(target_buf)
2455 assert isinstance(base_buf, bytes)
2456 assert isinstance(target_buf, bytes)
2457 # write delta header
2458 yield _delta_encode_size(len(base_buf))
2459 yield _delta_encode_size(len(target_buf))
2460 # write out delta opcodes
2461 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
2462 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
2463 # Git patch opcodes don't care about deletes!
2464 # if opcode == 'replace' or opcode == 'delete':
2465 # pass
2466 if opcode == "equal":
2467 # If they are equal, unpacker will use data from base_buf
2468 # Write out an opcode that says what range to use
2469 copy_start = i1
2470 copy_len = i2 - i1
2471 while copy_len > 0:
2472 to_copy = min(copy_len, _MAX_COPY_LEN)
2473 yield _encode_copy_operation(copy_start, to_copy)
2474 copy_start += to_copy
2475 copy_len -= to_copy
2476 if opcode == "replace" or opcode == "insert":
2477 # If we are replacing a range or adding one, then we just
2478 # output it to the stream (prefixed by its size)
2479 s = j2 - j1
2480 o = j1
2481 while s > 127:
2482 yield bytes([127])
2483 yield memoryview(target_buf)[o : o + 127]
2484 s -= 127
2485 o += 127
2486 yield bytes([s])
2487 yield memoryview(target_buf)[o : o + s]
2490def apply_delta(src_buf, delta):
2491 """Based on the similar function in git's patch-delta.c.
2493 Args:
2494 src_buf: Source buffer
2495 delta: Delta instructions
2496 """
2497 if not isinstance(src_buf, bytes):
2498 src_buf = b"".join(src_buf)
2499 if not isinstance(delta, bytes):
2500 delta = b"".join(delta)
2501 out = []
2502 index = 0
2503 delta_length = len(delta)
2505 def get_delta_header_size(delta, index):
2506 size = 0
2507 i = 0
2508 while delta:
2509 cmd = ord(delta[index : index + 1])
2510 index += 1
2511 size |= (cmd & ~0x80) << i
2512 i += 7
2513 if not cmd & 0x80:
2514 break
2515 return size, index
2517 src_size, index = get_delta_header_size(delta, index)
2518 dest_size, index = get_delta_header_size(delta, index)
2519 if src_size != len(src_buf):
2520 raise ApplyDeltaError(
2521 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
2522 )
2523 while index < delta_length:
2524 cmd = ord(delta[index : index + 1])
2525 index += 1
2526 if cmd & 0x80:
2527 cp_off = 0
2528 for i in range(4):
2529 if cmd & (1 << i):
2530 x = ord(delta[index : index + 1])
2531 index += 1
2532 cp_off |= x << (i * 8)
2533 cp_size = 0
2534 # Version 3 packs can contain copy sizes larger than 64K.
2535 for i in range(3):
2536 if cmd & (1 << (4 + i)):
2537 x = ord(delta[index : index + 1])
2538 index += 1
2539 cp_size |= x << (i * 8)
2540 if cp_size == 0:
2541 cp_size = 0x10000
2542 if (
2543 cp_off + cp_size < cp_size
2544 or cp_off + cp_size > src_size
2545 or cp_size > dest_size
2546 ):
2547 break
2548 out.append(src_buf[cp_off : cp_off + cp_size])
2549 elif cmd != 0:
2550 out.append(delta[index : index + cmd])
2551 index += cmd
2552 else:
2553 raise ApplyDeltaError("Invalid opcode 0")
2555 if index != delta_length:
2556 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
2558 if dest_size != chunks_length(out):
2559 raise ApplyDeltaError("dest size incorrect")
2561 return out
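# Editor's illustrative sketch (not part of the original module): create_delta
# and apply_delta round-trip; the two buffers are assumptions of the example.
#
#     base = b"the quick brown fox jumps over the lazy dog"
#     target = b"the quick brown fox jumps over the lazy cat"
#     delta = b"".join(create_delta(base, target))
#     assert b"".join(apply_delta(base, delta)) == target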
2564def write_pack_index_v2(
2565 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
2566) -> bytes:
2567 """Write a new pack index file.
2569 Args:
2570 f: File-like object to write to
2571 entries: List of tuples with object name (sha), offset_in_pack, and
2572 crc32_checksum.
2573 pack_checksum: Checksum of the pack file.
2574 Returns: The SHA of the index file written
2575 """
2576 f = SHA1Writer(f)
2577 f.write(b"\377tOc") # Magic!
2578 f.write(struct.pack(">L", 2))
2579 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2580 for name, offset, entry_checksum in entries:
2581 fan_out_table[ord(name[:1])] += 1
2582 # Fan-out table
2583 largetable: list[int] = []
2584 for i in range(0x100):
2585 f.write(struct.pack(b">L", fan_out_table[i]))
2586 fan_out_table[i + 1] += fan_out_table[i]
2587 for name, offset, entry_checksum in entries:
2588 f.write(name)
2589 for name, offset, entry_checksum in entries:
2590 f.write(struct.pack(b">L", entry_checksum))
2591 for name, offset, entry_checksum in entries:
2592 if offset < 2**31:
2593 f.write(struct.pack(b">L", offset))
2594 else:
2595 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2596 largetable.append(offset)
2597 for offset in largetable:
2598 f.write(struct.pack(b">Q", offset))
2599 assert len(pack_checksum) == 20
2600 f.write(pack_checksum)
2601 return f.write_sha()
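# Editor's illustrative sketch (not part of the original module): writing a
# one-entry v2 index into memory. Entries are assumed to be pre-sorted by
# object name (write_pack() above sorts them); the object id, offset and
# checksums are made up for the example.
#
#     buf = BytesIO()
#     entries = [(hex_to_sha(b"aa" * 20), 12, 0)]
#     write_pack_index_v2(buf, entries, b"\x00" * 20)
#     len(buf.getvalue())  # 8 + 1024 + 20 + 4 + 4 + 20 + 20 = 1100 bytes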
2604def write_pack_index_v3(
2605 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
2606) -> bytes:
2607 """Write a new pack index file in v3 format.
2609 Args:
2610 f: File-like object to write to
2611 entries: List of tuples with object name (sha), offset_in_pack, and
2612 crc32_checksum.
2613 pack_checksum: Checksum of the pack file.
2614 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
2615 Returns: The SHA of the index file written
2616 """
2617 if hash_algorithm == 1:
2618 hash_size = 20 # SHA-1
2619 writer_cls = SHA1Writer
2620 elif hash_algorithm == 2:
2621 hash_size = 32 # SHA-256
2622 # TODO: Add SHA256Writer when SHA-256 support is implemented
2623 raise NotImplementedError("SHA-256 support not yet implemented")
2624 else:
2625 raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
2627 # Convert entries to list to allow multiple iterations
2628 entries_list = list(entries)
2630 # Calculate shortest unambiguous prefix length for object names
2631 # For now, use full hash size (this could be optimized)
2632 shortened_oid_len = hash_size
2634 f = writer_cls(f)
2635 f.write(b"\377tOc") # Magic!
2636 f.write(struct.pack(">L", 3)) # Version 3
2637 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm
2638 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length
2640 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2641 for name, offset, entry_checksum in entries_list:
2642 if len(name) != hash_size:
2643 raise ValueError(
2644 f"Object name has wrong length: expected {hash_size}, got {len(name)}"
2645 )
2646 fan_out_table[ord(name[:1])] += 1
2648 # Fan-out table
2649 largetable: list[int] = []
2650 for i in range(0x100):
2651 f.write(struct.pack(b">L", fan_out_table[i]))
2652 fan_out_table[i + 1] += fan_out_table[i]
2654 # Object names table
2655 for name, offset, entry_checksum in entries_list:
2656 f.write(name)
2658 # CRC32 checksums table
2659 for name, offset, entry_checksum in entries_list:
2660 f.write(struct.pack(b">L", entry_checksum))
2662 # Offset table
2663 for name, offset, entry_checksum in entries_list:
2664 if offset < 2**31:
2665 f.write(struct.pack(b">L", offset))
2666 else:
2667 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2668 largetable.append(offset)
2670 # Large offset table
2671 for offset in largetable:
2672 f.write(struct.pack(b">Q", offset))
2674 assert len(pack_checksum) == hash_size, (
2675 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
2676 )
2677 f.write(pack_checksum)
2678 return f.write_sha()
2681def write_pack_index(
2682 index_filename, entries, pack_checksum, progress=None, version=None
2683):
2684 """Write a pack index file.
2686 Args:
2687 index_filename: Index filename.
2688 entries: List of (checksum, offset, crc32) tuples
2689 pack_checksum: Checksum of the pack file.
2690 progress: Progress function (not currently used)
2691 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION.
2693 Returns:
2694 SHA of the written index file
2695 """
2696 if version is None:
2697 version = DEFAULT_PACK_INDEX_VERSION
2699 if version == 1:
2700 return write_pack_index_v1(index_filename, entries, pack_checksum)
2701 elif version == 2:
2702 return write_pack_index_v2(index_filename, entries, pack_checksum)
2703 elif version == 3:
2704 return write_pack_index_v3(index_filename, entries, pack_checksum)
2705 else:
2706 raise ValueError(f"Unsupported pack index version: {version}")
2709class Pack:
2710 """A Git pack object."""
2712 _data_load: Optional[Callable[[], PackData]]
2713 _idx_load: Optional[Callable[[], PackIndex]]
2715 _data: Optional[PackData]
2716 _idx: Optional[PackIndex]
2718 def __init__(
2719 self, basename, resolve_ext_ref: Optional[ResolveExtRefFn] = None
2720 ) -> None:
2721 self._basename = basename
2722 self._data = None
2723 self._idx = None
2724 self._idx_path = self._basename + ".idx"
2725 self._data_path = self._basename + ".pack"
2726 self._data_load = lambda: PackData(self._data_path)
2727 self._idx_load = lambda: load_pack_index(self._idx_path)
2728 self.resolve_ext_ref = resolve_ext_ref
2730 @classmethod
2731 def from_lazy_objects(cls, data_fn, idx_fn):
2732 """Create a new pack object from callables to load pack data and
2733 index objects.
2734 """
2735 ret = cls("")
2736 ret._data_load = data_fn
2737 ret._idx_load = idx_fn
2738 return ret
2740 @classmethod
2741 def from_objects(cls, data, idx):
2742 """Create a new pack object from pack data and index objects."""
2743 ret = cls("")
2744 ret._data = data
2745 ret._data_load = None
2746 ret._idx = idx
2747 ret._idx_load = None
2748 ret.check_length_and_checksum()
2749 return ret
2751 def name(self):
2752 """The SHA over the SHAs of the objects in this pack."""
2753 return self.index.objects_sha1()
2755 @property
2756 def data(self) -> PackData:
2757 """The pack data object being used."""
2758 if self._data is None:
2759 assert self._data_load
2760 self._data = self._data_load()
2761 self.check_length_and_checksum()
2762 return self._data
2764 @property
2765 def index(self) -> PackIndex:
2766 """The index being used.
2768 Note: This may be an in-memory index
2769 """
2770 if self._idx is None:
2771 assert self._idx_load
2772 self._idx = self._idx_load()
2773 return self._idx
2775 def close(self) -> None:
2776 if self._data is not None:
2777 self._data.close()
2778 if self._idx is not None:
2779 self._idx.close()
2781 def __enter__(self):
2782 return self
2784 def __exit__(self, exc_type, exc_val, exc_tb):
2785 self.close()
2787 def __eq__(self, other):
2788 return isinstance(self, type(other)) and self.index == other.index
2790 def __len__(self) -> int:
2791 """Number of entries in this pack."""
2792 return len(self.index)
2794 def __repr__(self) -> str:
2795 return f"{self.__class__.__name__}({self._basename!r})"
2797 def __iter__(self):
2798 """Iterate over all the sha1s of the objects in this pack."""
2799 return iter(self.index)
2801 def check_length_and_checksum(self) -> None:
2802 """Sanity check the length and checksum of the pack index and data."""
2803 assert len(self.index) == len(self.data), (
2804 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
2805 )
2806 idx_stored_checksum = self.index.get_pack_checksum()
2807 data_stored_checksum = self.data.get_stored_checksum()
2808 if idx_stored_checksum != data_stored_checksum:
2809 raise ChecksumMismatch(
2810 sha_to_hex(idx_stored_checksum),
2811 sha_to_hex(data_stored_checksum),
2812 )
2814 def check(self) -> None:
2815 """Check the integrity of this pack.
2817 Raises:
2818 ChecksumMismatch: if a checksum for the index or data is wrong
2819 """
2820 self.index.check()
2821 self.data.check()
2822 for obj in self.iterobjects():
2823 obj.check()
2824 # TODO: object connectivity checks
2826 def get_stored_checksum(self) -> bytes:
2827 return self.data.get_stored_checksum()
2829 def pack_tuples(self):
2830 return [(o, None) for o in self.iterobjects()]
2832 def __contains__(self, sha1: bytes) -> bool:
2833 """Check whether this pack contains a particular SHA1."""
2834 try:
2835 self.index.object_offset(sha1)
2836 return True
2837 except KeyError:
2838 return False
2840 def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
2841 offset = self.index.object_offset(sha1)
2842 obj_type, obj = self.data.get_object_at(offset)
2843 type_num, chunks = self.resolve_object(offset, obj_type, obj)
2844 return type_num, b"".join(chunks)
2846 def __getitem__(self, sha1: bytes) -> ShaFile:
2847 """Retrieve the specified SHA1."""
2848 type, uncomp = self.get_raw(sha1)
2849 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
2851 def iterobjects(self) -> Iterator[ShaFile]:
2852 """Iterate over the objects in this pack."""
2853 return iter(
2854 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
2855 )
2857 def iterobjects_subset(
2858 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
2859 ) -> Iterator[ShaFile]:
2860 return (
2861 uo
2862 for uo in PackInflater.for_pack_subset(
2863 self,
2864 shas,
2865 allow_missing=allow_missing,
2866 resolve_ext_ref=self.resolve_ext_ref,
2867 )
2868 if uo.id in shas
2869 )
2871 def iter_unpacked_subset(
2872 self,
2873 shas: Iterable[ObjectID],
2874 *,
2875 include_comp: bool = False,
2876 allow_missing: bool = False,
2877 convert_ofs_delta: bool = False,
2878 ) -> Iterator[UnpackedObject]:
2879 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
2880 ofs: dict[bytes, int] = {}
2881 todo = set(shas)
2882 for unpacked in self.iter_unpacked(include_comp=include_comp):
2883 sha = unpacked.sha()
2884 ofs[unpacked.offset] = sha
2885 hexsha = sha_to_hex(sha)
2886 if hexsha in todo:
2887 if unpacked.pack_type_num == OFS_DELTA:
2888 assert isinstance(unpacked.delta_base, int)
2889 base_offset = unpacked.offset - unpacked.delta_base
2890 try:
2891 unpacked.delta_base = ofs[base_offset]
2892 except KeyError:
2893 ofs_pending[base_offset].append(unpacked)
2894 continue
2895 else:
2896 unpacked.pack_type_num = REF_DELTA
2897 yield unpacked
2898 todo.remove(hexsha)
2899 for child in ofs_pending.pop(unpacked.offset, []):
2900 child.pack_type_num = REF_DELTA
2901 child.delta_base = sha
2902 yield child
2903 assert not ofs_pending
2904 if not allow_missing and todo:
2905 raise UnresolvedDeltas(todo)
2907 def iter_unpacked(self, include_comp=False):
2908 ofs_to_entries = {
2909 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
2910 }
2911 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
2912 (sha, crc32) = ofs_to_entries[unpacked.offset]
2913 unpacked._sha = sha
2914 unpacked.crc32 = crc32
2915 yield unpacked
2917 def keep(self, msg: Optional[bytes] = None) -> str:
2918 """Add a .keep file for the pack, preventing git from garbage collecting it.
2920 Args:
2921 msg: A message written inside the .keep file; can be used later
2922 to determine whether or not a .keep file is obsolete.
2923 Returns: The path of the .keep file, as a string.
2924 """
2925 keepfile_name = f"{self._basename}.keep"
2926 with GitFile(keepfile_name, "wb") as keepfile:
2927 if msg:
2928 keepfile.write(msg)
2929 keepfile.write(b"\n")
2930 return keepfile_name
2932 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
2933 """Get the object for a ref SHA, only looking in this pack."""
2934 # TODO: cache these results
2935 try:
2936 offset = self.index.object_offset(sha)
2937 except KeyError:
2938 offset = None
2939 if offset:
2940 type, obj = self.data.get_object_at(offset)
2941 elif self.resolve_ext_ref:
2942 type, obj = self.resolve_ext_ref(sha)
2943 else:
2944 raise KeyError(sha)
2945 return offset, type, obj
2947 def resolve_object(
2948 self, offset: int, type: int, obj, get_ref=None
2949 ) -> tuple[int, Iterable[bytes]]:
2950 """Resolve an object, possibly resolving deltas when necessary.
2952 Returns: Tuple with object type and contents.
2953 """
2954 # Walk down the delta chain, building a stack of deltas to reach
2955 # the requested object.
2956 base_offset = offset
2957 base_type = type
2958 base_obj = obj
2959 delta_stack = []
2960 while base_type in DELTA_TYPES:
2961 prev_offset = base_offset
2962 if get_ref is None:
2963 get_ref = self.get_ref
2964 if base_type == OFS_DELTA:
2965 (delta_offset, delta) = base_obj
2966 # TODO: clean up asserts and replace with nicer error messages
2967 base_offset = base_offset - delta_offset
2968 base_type, base_obj = self.data.get_object_at(base_offset)
2969 assert isinstance(base_type, int)
2970 elif base_type == REF_DELTA:
2971 (basename, delta) = base_obj
2972 assert isinstance(basename, bytes) and len(basename) == 20
2973 base_offset, base_type, base_obj = get_ref(basename)
2974 assert isinstance(base_type, int)
2975 if base_offset == prev_offset: # object is based on itself
2976 raise UnresolvedDeltas(sha_to_hex(basename))
2977 delta_stack.append((prev_offset, base_type, delta))
2979 # Now grab the base object (mustn't be a delta) and apply the
2980 # deltas all the way up the stack.
2981 chunks = base_obj
2982 for prev_offset, delta_type, delta in reversed(delta_stack):
2983 chunks = apply_delta(chunks, delta)
2984 # TODO(dborowitz): This can result in poor performance if
2985 # large base objects are separated from deltas in the pack.
2986 # We should reorganize so that we apply deltas to all
2987 # objects in a chain one after the other to optimize cache
2988 # performance.
2989 if prev_offset is not None:
2990 self.data._offset_cache[prev_offset] = base_type, chunks
2991 return base_type, chunks
2993 def entries(
2994 self, progress: Optional[ProgressFn] = None
2995 ) -> Iterator[PackIndexEntry]:
2996 """Yield entries summarizing the contents of this pack.
2998 Args:
2999 progress: Progress function, called with current and total
3000 object count.
3001 Returns: iterator of tuples with (sha, offset, crc32)
3002 """
3003 return self.data.iterentries(
3004 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3005 )
3007 def sorted_entries(
3008 self, progress: Optional[ProgressFn] = None
3009 ) -> Iterator[PackIndexEntry]:
3010 """Return entries in this pack, sorted by SHA.
3012 Args:
3013 progress: Progress function, called with current and total
3014 object count
3015 Returns: Iterator of tuples with (sha, offset, crc32)
3016 """
3017 return self.data.sorted_entries(
3018 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3019 )
3021 def get_unpacked_object(
3022 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
3023 ) -> UnpackedObject:
3024 """Get the unpacked object for a sha.
3026 Args:
3027 sha: SHA of object to fetch
3028 include_comp: Whether to include compression data in UnpackedObject
3029 """
3030 offset = self.index.object_offset(sha)
3031 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3032 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3033 assert isinstance(unpacked.delta_base, int)
3034 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3035 unpacked.pack_type_num = REF_DELTA
3036 return unpacked
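# Editor's illustrative sketch (not part of the original module): opening an
# on-disk pack by its basename and reading one object through the index. The
# path and the hex object id are assumptions of the example.
#
#     with Pack("objects/pack/pack-1234") as p:
#         hexsha = b"aa" * 20  # 40-byte hex object id
#         if hexsha in p:
#             obj = p[hexsha]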
3039def extend_pack(
3040 f: BinaryIO,
3041 object_ids: set[ObjectID],
3042 get_raw,
3043 *,
3044 compression_level=-1,
3045 progress=None,
3046) -> tuple[bytes, list]:
3047 """Extend a pack file with more objects.
3049 The caller should make sure that object_ids does not contain any objects
3050 that are already in the pack
3051 """
3052 # Update the header with the new number of objects.
3053 f.seek(0)
3054 _version, num_objects = read_pack_header(f.read)
3056 if object_ids:
3057 f.seek(0)
3058 write_pack_header(f.write, num_objects + len(object_ids))
3060 # Must flush before reading (http://bugs.python.org/issue3207)
3061 f.flush()
3063 # Rescan the rest of the pack, computing the SHA with the new header.
3064 new_sha = compute_file_sha(f, end_ofs=-20)
3066 # Must reposition before writing (http://bugs.python.org/issue3207)
3067 f.seek(0, os.SEEK_CUR)
3069 extra_entries = []
3071 # Complete the pack.
3072 for i, object_id in enumerate(object_ids):
3073 if progress is not None:
3074 progress(
3075 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
3076 )
3077 assert len(object_id) == 20
3078 type_num, data = get_raw(object_id)
3079 offset = f.tell()
3080 crc32 = write_pack_object(
3081 f.write,
3082 type_num,
3083 data,
3084 sha=new_sha,
3085 compression_level=compression_level,
3086 )
3087 extra_entries.append((object_id, offset, crc32))
3088 pack_sha = new_sha.digest()
3089 f.write(pack_sha)
3090 return pack_sha, extra_entries
3093try:
3094 from dulwich._pack import ( # type: ignore
3095 apply_delta, # type: ignore
3096 bisect_find_sha, # type: ignore
3097 )
3098except ImportError:
3099 pass