# midx.py -- Multi-Pack-Index (MIDX) support
# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
22"""Multi-Pack-Index (MIDX) support.
24A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
25enabling fast object lookup across all packs without opening each pack index.
27The MIDX file format consists of:
28- A header with signature, version, and hash algorithm
29- A chunk lookup table
30- Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
31- A trailer with checksum
33This module provides:
34- Reading MIDX files
35- Writing MIDX files
36- Integration with pack-based object stores
38Limitations:
39- Incremental MIDX chains are not yet supported (base_midx_files must be 0)
40- BTMP (bitmapped packfiles) chunk is not yet implemented
41- RIDX (reverse index) chunk is not yet implemented
43Note: Incremental MIDX chains were introduced in Git 2.47 as an experimental
44feature, where multiple MIDX files can be chained together. The format includes
45a base_midx_files field in the header and uses a multi-pack-index.d/ directory
46with a multi-pack-index-chain file. This feature is not yet supported by Dulwich
47as the specification is still evolving.
48"""

__all__ = [
    "CHUNK_BTMP",
    "CHUNK_LOFF",
    "CHUNK_OIDF",
    "CHUNK_OIDL",
    "CHUNK_OOFF",
    "CHUNK_PNAM",
    "CHUNK_RIDX",
    "HASH_ALGORITHM_SHA1",
    "HASH_ALGORITHM_SHA256",
    "MIDX_SIGNATURE",
    "MIDX_VERSION",
    "MultiPackIndex",
    "load_midx",
    "load_midx_file",
    "write_midx",
]

import os
import struct
from collections.abc import Iterator
from io import UnsupportedOperation
from typing import IO, Any

try:
    import mmap
except ImportError:
    has_mmap = False
else:
    has_mmap = True

from .file import GitFile, _GitFile
from .objects import ObjectID, RawObjectID
from .pack import SHA1Writer

# MIDX signature
MIDX_SIGNATURE = b"MIDX"

# MIDX version
MIDX_VERSION = 1

# Chunk identifiers (4 bytes each)
CHUNK_PNAM = b"PNAM"  # Packfile names
CHUNK_OIDF = b"OIDF"  # OID fanout table
CHUNK_OIDL = b"OIDL"  # OID lookup table
CHUNK_OOFF = b"OOFF"  # Object offsets
CHUNK_LOFF = b"LOFF"  # Large offsets (optional)
CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional)
CHUNK_RIDX = b"RIDX"  # Reverse index (optional)

# Hash algorithm identifiers
HASH_ALGORITHM_SHA1 = 1
HASH_ALGORITHM_SHA256 = 2


class MultiPackIndex:
    """Multi-pack-index for efficient object lookup across multiple pack files."""

    def __init__(
        self,
        filename: str | os.PathLike[str],
        file: IO[bytes] | _GitFile | None = None,
        contents: bytes | None = None,
        size: int | None = None,
    ) -> None:
        """Initialize a MultiPackIndex.

        Args:
            filename: Path to the MIDX file
            file: Optional file object
            contents: Optional mmap'd contents
            size: Optional size of the MIDX file
        """
        self._filename = os.fspath(filename)
        self._file = file
        self._size = size

        # Instance variables that will be set during parsing
        self.version: int
        self.hash_algorithm: int
        self.hash_size: int
        self.chunk_count: int
        self.base_midx_files: int
        self.pack_count: int
        self.pack_names: list[str]
        self.object_count: int
        self._chunks: dict[bytes, int]
        self._fanout_table: list[int]
        self._oidl_offset: int
        self._ooff_offset: int
        self._loff_offset: int

        # Load file contents
        if contents is None:
            if file is None:
                with GitFile(filename, "rb") as f:
                    self._contents, self._size = self._load_file_contents(f, size)
            else:
                self._contents, self._size = self._load_file_contents(file, size)
        else:
            self._contents = contents
            if size is None:
                self._size = len(contents)

        # Parse header
        self._parse_header()

        # Parse chunk lookup table
        self._parse_chunk_table()

    def _load_file_contents(
        self, f: IO[bytes] | _GitFile, size: int | None = None
    ) -> tuple[bytes | Any, int]:
        """Load contents from a file, preferring mmap when possible.

        Args:
            f: File-like object to load
            size: Expected size, or None to determine from file

        Returns:
            Tuple of (contents, size)
        """
        try:
            fd = f.fileno()
        except (UnsupportedOperation, AttributeError):
            fd = None

        # Attempt to use mmap if possible
        if fd is not None:
            if size is None:
                size = os.fstat(fd).st_size
            if has_mmap:
                try:
                    contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
                except (OSError, ValueError):
                    # Can't mmap - perhaps a socket or invalid file descriptor
                    pass
                else:
                    return contents, size

        # Fall back to reading entire file into memory
        contents_bytes = f.read()
        size = len(contents_bytes)
        return contents_bytes, size

    def _parse_header(self) -> None:
        """Parse the MIDX header."""
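        # Header layout (12 bytes), illustrated for a hypothetical SHA-1 MIDX
        # covering three packs with five chunks:
        #   bytes 0-3:  b"MIDX"       signature
        #   byte  4:    0x01          version
        #   byte  5:    0x01          hash algorithm (1 = SHA-1)
        #   byte  6:    0x05          chunk count
        #   byte  7:    0x00          base MIDX files
        #   bytes 8-11: 00 00 00 03   pack count (big-endian uint32)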
        if len(self._contents) < 12:
            raise ValueError("MIDX file too small")

        # Check signature
        signature = self._contents[0:4]
        if signature != MIDX_SIGNATURE:
            raise ValueError(f"Invalid MIDX signature: {signature!r}")

        # Read version
        self.version = self._contents[4]
        if self.version != MIDX_VERSION:
            raise ValueError(f"Unsupported MIDX version: {self.version}")

        # Read object ID version (hash algorithm)
        self.hash_algorithm = self._contents[5]
        if self.hash_algorithm == HASH_ALGORITHM_SHA1:
            self.hash_size = 20
        elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
            self.hash_size = 32
        else:
            raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")

        # Read chunk count
        self.chunk_count = self._contents[6]

        # Read base MIDX files count (currently always 0)
        self.base_midx_files = self._contents[7]
        if self.base_midx_files != 0:
            raise ValueError("Incremental MIDX not yet supported")

        # Read pack file count
        (self.pack_count,) = struct.unpack(">L", self._contents[8:12])

    def _parse_chunk_table(self) -> None:
        """Parse the chunk lookup table."""
        self._chunks = {}

        # Chunk table starts at offset 12
        offset = 12

        # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset)
        for _ in range(self.chunk_count + 1):  # +1 for terminator
            chunk_id = self._contents[offset : offset + 4]
            (chunk_offset,) = struct.unpack(
                ">Q", self._contents[offset + 4 : offset + 12]
            )

            if chunk_id == b"\x00\x00\x00\x00":
                # Terminator entry
                break

            self._chunks[chunk_id] = chunk_offset
            offset += 12

        # Parse required chunks
        self._parse_pnam_chunk()
        self._parse_oidf_chunk()
        self._parse_oidl_chunk()
        self._parse_ooff_chunk()

        # Parse optional chunks
        if CHUNK_LOFF in self._chunks:
            self._parse_loff_chunk()

    def _parse_pnam_chunk(self) -> None:
        """Parse the Packfile Names (PNAM) chunk."""
        if CHUNK_PNAM not in self._chunks:
            raise ValueError("Required PNAM chunk not found")

        offset = self._chunks[CHUNK_PNAM]
        self.pack_names = []

        # Find the end of the PNAM chunk (next chunk or end of chunks section)
        next_offset = min(
            (o for o in self._chunks.values() if o > offset),
            default=len(self._contents),
        )
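        # e.g. with chunks at {PNAM: 72, OIDF: 144, OIDL: 1168, OOFF: 1368},
        # the PNAM chunk ends where the next-lowest offset (OIDF at 144) begins.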

        # Parse null-terminated pack names
        current = offset
        while current < next_offset:
            # Find the next null terminator
            null_pos = self._contents.find(b"\x00", current, next_offset)
            if null_pos == -1:
                break

            pack_name = self._contents[current:null_pos].decode("utf-8")
            if pack_name:  # Skip empty strings (padding)
                self.pack_names.append(pack_name)
            current = null_pos + 1

    def _parse_oidf_chunk(self) -> None:
        """Parse the OID Fanout (OIDF) chunk."""
        if CHUNK_OIDF not in self._chunks:
            raise ValueError("Required OIDF chunk not found")

        offset = self._chunks[CHUNK_OIDF]
        self._fanout_table = []

        # Read 256 4-byte entries
        for i in range(256):
            (count,) = struct.unpack(
                ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
            )
            self._fanout_table.append(count)

        # Total object count is the last entry
        self.object_count = self._fanout_table[255]

    def _parse_oidl_chunk(self) -> None:
        """Parse the OID Lookup (OIDL) chunk."""
        if CHUNK_OIDL not in self._chunks:
            raise ValueError("Required OIDL chunk not found")

        self._oidl_offset = self._chunks[CHUNK_OIDL]

    def _parse_ooff_chunk(self) -> None:
        """Parse the Object Offsets (OOFF) chunk."""
        if CHUNK_OOFF not in self._chunks:
            raise ValueError("Required OOFF chunk not found")

        self._ooff_offset = self._chunks[CHUNK_OOFF]

    def _parse_loff_chunk(self) -> None:
        """Parse the Large Offsets (LOFF) chunk."""
        self._loff_offset = self._chunks[CHUNK_LOFF]

    def __len__(self) -> int:
        """Return the number of objects in this MIDX."""
        return self.object_count

    def _get_oid(self, index: int) -> RawObjectID:
        """Get the object ID at the given index.

        Args:
            index: Index of the object

        Returns:
            Binary object ID
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")

        offset = self._oidl_offset + index * self.hash_size
        return RawObjectID(self._contents[offset : offset + self.hash_size])

    def _get_pack_info(self, index: int) -> tuple[int, int]:
        """Get pack ID and offset for object at the given index.

        Args:
            index: Index of the object

        Returns:
            Tuple of (pack_id, offset)
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")

        # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
        offset = self._ooff_offset + index * 8

        (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
        (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])

        # Check if this is a large offset (MSB set)
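        # e.g. a stored value of 0x80000002 decodes to index 2 in the LOFF
        # chunk, whose 8-byte entry holds the real pack offset.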
        if pack_offset & 0x80000000:
            # Look up in LOFF chunk
            if CHUNK_LOFF not in self._chunks:
                raise ValueError("Large offset found but no LOFF chunk")

            large_index = pack_offset & 0x7FFFFFFF
            large_offset_pos = self._loff_offset + large_index * 8
            (pack_offset,) = struct.unpack(
                ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
            )

        return pack_id, pack_offset

    def object_offset(self, sha: ObjectID | RawObjectID) -> tuple[str, int] | None:
        """Return the pack name and offset for the given object.

        Args:
            sha: Binary SHA-1 or SHA-256 hash

        Returns:
            Tuple of (pack_name, offset) or None if not found
        """
        if len(sha) != self.hash_size:
            raise ValueError(
                f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
            )

        # Use fanout table to narrow search range
        first_byte = sha[0]
        start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
        end_idx = self._fanout_table[first_byte]
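        # e.g. for first_byte 0x2A, fanout[0x29] objects sort below this bucket
        # and fanout[0x2A] is the cumulative count through it, so the half-open
        # range [start_idx, end_idx) covers exactly the OIDs starting with 0x2A.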

        # Binary search within the range
        while start_idx < end_idx:
            mid = (start_idx + end_idx) // 2
            mid_sha = self._get_oid(mid)

            if mid_sha == sha:
                # Found it!
                pack_id, offset = self._get_pack_info(mid)
                return self.pack_names[pack_id], offset
            elif mid_sha < sha:
                start_idx = mid + 1
            else:
                end_idx = mid

        return None

    def __contains__(self, sha: ObjectID | RawObjectID) -> bool:
        """Check if the given object SHA is in this MIDX.

        Args:
            sha: Binary SHA hash

        Returns:
            True if the object is in this MIDX
        """
        return self.object_offset(sha) is not None

    def iterentries(self) -> Iterator[tuple[RawObjectID, str, int]]:
        """Iterate over all entries in this MIDX.

        Yields:
            Tuples of (sha, pack_name, offset)
        """
        for i in range(self.object_count):
            sha = self._get_oid(i)
            pack_id, offset = self._get_pack_info(i)
            pack_name = self.pack_names[pack_id]
            yield sha, pack_name, offset

    def close(self) -> None:
        """Close the MIDX file and release mmap resources."""
        # Close mmap'd contents first if it's an mmap object
        if self._contents is not None and has_mmap:
            if isinstance(self._contents, mmap.mmap):
                self._contents.close()
            self._contents = None

        # Close file handle
        if self._file is not None:
            self._file.close()
            self._file = None


def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
    """Load a multi-pack-index file by path.

    Args:
        path: Path to the MIDX file

    Returns:
        A MultiPackIndex loaded from the given path
    """
    with GitFile(path, "rb") as f:
        return load_midx_file(path, f)


def load_midx_file(
    path: str | os.PathLike[str], f: IO[bytes] | _GitFile
) -> MultiPackIndex:
    """Load a multi-pack-index from a file-like object.

    Args:
        path: Path for the MIDX file
        f: File-like object

    Returns:
        A MultiPackIndex loaded from the given file
    """
    return MultiPackIndex(path, file=f)


def write_midx(
    f: IO[bytes],
    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file.

    Args:
        f: File-like object to write to
        pack_index_entries: List of (pack_name, entries) tuples where entries are
            (sha, offset, crc32) tuples, sorted by SHA
        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
        SHA-1 checksum of the written MIDX file
    """
    if hash_algorithm == HASH_ALGORITHM_SHA1:
        hash_size = 20
    elif hash_algorithm == HASH_ALGORITHM_SHA256:
        hash_size = 32
    else:
        raise ValueError(f"Unknown hash algorithm: {hash_algorithm}")

    # Wrap file in SHA1Writer to compute checksum
    writer = SHA1Writer(f)

    # Sort pack entries by pack name (required by Git)
    pack_index_entries_sorted = sorted(pack_index_entries, key=lambda x: x[0])

    # Collect all objects from all packs
    all_objects: list[tuple[RawObjectID, int, int]] = []  # (sha, pack_id, offset)
    pack_names: list[str] = []

    for pack_id, (pack_name, entries) in enumerate(pack_index_entries_sorted):
        pack_names.append(pack_name)
        for sha, offset, _crc32 in entries:
            all_objects.append((sha, pack_id, offset))

    # Sort all objects by SHA
    all_objects.sort(key=lambda x: x[0])

    # Calculate offsets for chunks
    num_packs = len(pack_names)
    num_objects = len(all_objects)

    # Header: 12 bytes
    header_size = 12

    # Chunk count: PNAM, OIDF, OIDL, OOFF, and optionally LOFF
    # We'll determine if LOFF is needed later
    chunk_count = 4  # PNAM, OIDF, OIDL, OOFF

    # Check if we need LOFF chunk (for offsets >= 2^31)
    need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects)
    if need_loff:
        chunk_count += 1
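    # e.g. an offset of 0xC0000000 (3 GiB) exceeds the 31 usable bits of an
    # OOFF entry, so it must be spilled to the 8-byte-wide LOFF table.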

    # Chunk table: (chunk_count + 1) * 12 bytes (including terminator)
    chunk_table_size = (chunk_count + 1) * 12

    # Calculate chunk offsets
    current_offset = header_size + chunk_table_size

    # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary
    pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names)
    # Pad to 4-byte boundary
    pnam_padding = (4 - len(pnam_data) % 4) % 4
    pnam_data += b"\x00" * pnam_padding
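    # e.g. names totalling 53 bytes (NULs included) need 3 padding bytes, while
    # a 52-byte total needs none: (4 - 52 % 4) % 4 == 0.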
    pnam_offset = current_offset
    current_offset += len(pnam_data)

    # OIDF chunk: 256 * 4 bytes
    oidf_offset = current_offset
    oidf_size = 256 * 4
    current_offset += oidf_size

    # OIDL chunk: num_objects * hash_size bytes
    oidl_offset = current_offset
    oidl_size = num_objects * hash_size
    current_offset += oidl_size

    # OOFF chunk: num_objects * 8 bytes (4 for pack_id + 4 for offset)
    ooff_offset = current_offset
    ooff_size = num_objects * 8
    current_offset += ooff_size

    # LOFF chunk (if needed): variable size
    # We'll calculate the exact size when we know how many large offsets we have
    loff_offset = current_offset if need_loff else 0
    large_offsets: list[int] = []

    # Calculate trailer offset (where checksum starts)
    # We need to pre-calculate large offset count for accurate trailer offset
    if need_loff:
        # Count large offsets
        large_offset_count = sum(1 for _, _, offset in all_objects if offset >= 2**31)
        loff_size = large_offset_count * 8
        trailer_offset = current_offset + loff_size
    else:
        trailer_offset = current_offset
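
    # Resulting file layout, in write order:
    #   header | chunk table | PNAM | OIDF | OIDL | OOFF | [LOFF] | checksum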

    # Write header
    writer.write(MIDX_SIGNATURE)  # 4 bytes: signature
    writer.write(bytes([MIDX_VERSION]))  # 1 byte: version
    writer.write(bytes([hash_algorithm]))  # 1 byte: hash algorithm
    writer.write(bytes([chunk_count]))  # 1 byte: chunk count
    writer.write(bytes([0]))  # 1 byte: base MIDX files (always 0)
    writer.write(struct.pack(">L", num_packs))  # 4 bytes: pack count

    # Write chunk table
    chunk_table = [
        (CHUNK_PNAM, pnam_offset),
        (CHUNK_OIDF, oidf_offset),
        (CHUNK_OIDL, oidl_offset),
        (CHUNK_OOFF, ooff_offset),
    ]
    if need_loff:
        chunk_table.append((CHUNK_LOFF, loff_offset))

    for chunk_id, chunk_offset in chunk_table:
        writer.write(chunk_id)  # 4 bytes
        writer.write(struct.pack(">Q", chunk_offset))  # 8 bytes

    # Write terminator (points to where trailer/checksum starts)
    writer.write(b"\x00\x00\x00\x00")  # 4 bytes
    writer.write(struct.pack(">Q", trailer_offset))  # 8 bytes

    # Write PNAM chunk
    writer.write(pnam_data)

    # Write OIDF chunk (fanout table)
    fanout: list[int] = [0] * 256
    for sha, _pack_id, _offset in all_objects:
        first_byte = sha[0]
        fanout[first_byte] += 1

    # Convert counts to cumulative
    cumulative = 0
    for i in range(256):
        cumulative += fanout[i]
        writer.write(struct.pack(">L", cumulative))
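    # e.g. 5 objects starting with 0x00 and 7 with 0x01 yield a table beginning
    # 5, 12, ...; entry 255 always equals the total object count.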

    # Write OIDL chunk (object IDs)
    for sha, _pack_id, _offset in all_objects:
        writer.write(sha)

    # Write OOFF chunk (pack ID and offset for each object)
    for _sha, pack_id, offset in all_objects:
        writer.write(struct.pack(">L", pack_id))

        if offset >= 2**31:
            # Use large offset table
            large_offset_index = len(large_offsets)
            large_offsets.append(offset)
            # Set MSB to indicate large offset
            writer.write(struct.pack(">L", 0x80000000 | large_offset_index))
        else:
            writer.write(struct.pack(">L", offset))

    # Write LOFF chunk if needed
    if need_loff:
        for large_offset in large_offsets:
            writer.write(struct.pack(">Q", large_offset))

    # Write checksum
    return writer.write_sha()


def write_midx_file(
    path: str | os.PathLike[str],
    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file to disk.

    Args:
        path: Path where to write the MIDX file
        pack_index_entries: List of (pack_name, entries) tuples where entries are
            (sha, offset, crc32) tuples, sorted by SHA
        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
        SHA-1 checksum of the written MIDX file
    """
    with GitFile(path, "wb") as f:
        return write_midx(f, pack_index_entries, hash_algorithm)
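
# A minimal write-side sketch (the pack name and entry values are hypothetical;
# real entries would typically come from each pack's index):
#
#     entries = [("pack-1234.pack", [(RawObjectID(b"\x00" * 20), 12, None)])]
#     checksum = write_midx_file("multi-pack-index", entries)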

# TODO: Add support for incremental MIDX chains
# TODO: Add support for BTMP and RIDX chunks for bitmap integration