# midx.py -- Multi-Pack-Index (MIDX) support
# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
22"""Multi-Pack-Index (MIDX) support.
24A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
25enabling fast object lookup across all packs without opening each pack index.
27The MIDX file format consists of:
28- A header with signature, version, and hash algorithm
29- A chunk lookup table
30- Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
31- A trailer with checksum
33This module provides:
34- Reading MIDX files
35- Writing MIDX files
36- Integration with pack-based object stores
38Limitations:
39- Incremental MIDX chains are not yet supported (base_midx_files must be 0)
40- BTMP (bitmapped packfiles) chunk is not yet implemented
41- RIDX (reverse index) chunk is not yet implemented
43Note: Incremental MIDX chains were introduced in Git 2.47 as an experimental
44feature, where multiple MIDX files can be chained together. The format includes
45a base_midx_files field in the header and uses a multi-pack-index.d/ directory
46with a multi-pack-index-chain file. This feature is not yet supported by Dulwich
47as the specification is still evolving.
48"""

__all__ = [
    "CHUNK_BTMP",
    "CHUNK_LOFF",
    "CHUNK_OIDF",
    "CHUNK_OIDL",
    "CHUNK_OOFF",
    "CHUNK_PNAM",
    "CHUNK_RIDX",
    "HASH_ALGORITHM_SHA1",
    "HASH_ALGORITHM_SHA256",
    "MIDX_SIGNATURE",
    "MIDX_VERSION",
    "MultiPackIndex",
    "load_midx",
    "load_midx_file",
    "write_midx",
]

import os
import struct
from collections.abc import Iterator
from io import UnsupportedOperation
from typing import IO, Any

try:
    import mmap
except ImportError:
    has_mmap = False
else:
    has_mmap = True

from .file import GitFile, _GitFile
from .objects import ObjectID, RawObjectID
from .pack import SHA1Writer

# MIDX signature
MIDX_SIGNATURE = b"MIDX"

# MIDX version
MIDX_VERSION = 1

# Chunk identifiers (4 bytes each)
CHUNK_PNAM = b"PNAM"  # Packfile names
CHUNK_OIDF = b"OIDF"  # OID fanout table
CHUNK_OIDL = b"OIDL"  # OID lookup table
CHUNK_OOFF = b"OOFF"  # Object offsets
CHUNK_LOFF = b"LOFF"  # Large offsets (optional)
CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional)
CHUNK_RIDX = b"RIDX"  # Reverse index (optional)

# Hash algorithm identifiers
HASH_ALGORITHM_SHA1 = 1
HASH_ALGORITHM_SHA256 = 2


class MultiPackIndex:
    """Multi-pack-index for efficient object lookup across multiple pack files."""

    def __init__(
        self,
        filename: str | os.PathLike[str],
        file: IO[bytes] | _GitFile | None = None,
        contents: bytes | None = None,
        size: int | None = None,
    ) -> None:
        """Initialize a MultiPackIndex.

        Args:
            filename: Path to the MIDX file
            file: Optional file object
            contents: Optional mmap'd contents
            size: Optional size of the MIDX file
        """
        self._filename = os.fspath(filename)
        self._file = file
        self._size = size

        # Instance variables that will be set during parsing
        self.version: int
        self.hash_algorithm: int
        self.hash_size: int
        self.chunk_count: int
        self.base_midx_files: int
        self.pack_count: int
        self.pack_names: list[str]
        self.object_count: int
        self._chunks: dict[bytes, int]
        self._fanout_table: list[int]
        self._oidl_offset: int
        self._ooff_offset: int
        self._loff_offset: int

        # Load file contents
        if contents is None:
            if file is None:
                with GitFile(filename, "rb") as f:
                    self._contents, self._size = self._load_file_contents(f, size)
            else:
                self._contents, self._size = self._load_file_contents(file, size)
        else:
            self._contents = contents
            if size is None:
                self._size = len(contents)

        # Parse header
        self._parse_header()

        # Parse chunk lookup table
        self._parse_chunk_table()

    def _load_file_contents(
        self, f: IO[bytes] | _GitFile, size: int | None = None
    ) -> tuple[bytes | Any, int]:
        """Load contents from a file, preferring mmap when possible.

        Args:
            f: File-like object to load
            size: Expected size, or None to determine from file

        Returns:
            Tuple of (contents, size)
        """
        try:
            fd = f.fileno()
        except (UnsupportedOperation, AttributeError):
            fd = None

        # Attempt to use mmap if possible
        if fd is not None:
            if size is None:
                size = os.fstat(fd).st_size
            if has_mmap:
                try:
                    contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
                except (OSError, ValueError):
                    # Can't mmap - perhaps a socket or invalid file descriptor
                    pass
                else:
                    return contents, size

        # Fall back to reading entire file into memory
        contents_bytes = f.read()
        size = len(contents_bytes)
        return contents_bytes, size

    def _parse_header(self) -> None:
        """Parse the MIDX header."""
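        # Header layout (12 bytes), illustrated for a hypothetical SHA-1 MIDX
        # covering three packs with five chunks:
        #   bytes 0-3:  b"MIDX"       signature
        #   byte  4:    0x01          version
        #   byte  5:    0x01          hash algorithm (1 = SHA-1)
        #   byte  6:    0x05          chunk count
        #   byte  7:    0x00          base MIDX files
        #   bytes 8-11: 00 00 00 03   pack count (big-endian uint32)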
        if len(self._contents) < 12:
            raise ValueError("MIDX file too small")

        # Check signature
        signature = self._contents[0:4]
        if signature != MIDX_SIGNATURE:
            raise ValueError(f"Invalid MIDX signature: {signature!r}")

        # Read version
        self.version = self._contents[4]
        if self.version != MIDX_VERSION:
            raise ValueError(f"Unsupported MIDX version: {self.version}")

        # Read object ID version (hash algorithm)
        self.hash_algorithm = self._contents[5]
        if self.hash_algorithm == HASH_ALGORITHM_SHA1:
            self.hash_size = 20
        elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
            self.hash_size = 32
        else:
            raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")

        # Read chunk count
        self.chunk_count = self._contents[6]

        # Read base MIDX files count (currently always 0)
        self.base_midx_files = self._contents[7]
        if self.base_midx_files != 0:
            raise ValueError("Incremental MIDX not yet supported")

        # Read pack file count
        (self.pack_count,) = struct.unpack(">L", self._contents[8:12])

    def _parse_chunk_table(self) -> None:
        """Parse the chunk lookup table."""
        self._chunks = {}

        # Chunk table starts at offset 12
        offset = 12

        # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset)
        for _ in range(self.chunk_count + 1):  # +1 for terminator
            chunk_id = self._contents[offset : offset + 4]
            (chunk_offset,) = struct.unpack(
                ">Q", self._contents[offset + 4 : offset + 12]
            )

            if chunk_id == b"\x00\x00\x00\x00":
                # Terminator entry
                break

            self._chunks[chunk_id] = chunk_offset
            offset += 12

        # Parse required chunks
        self._parse_pnam_chunk()
        self._parse_oidf_chunk()
        self._parse_oidl_chunk()
        self._parse_ooff_chunk()

        # Parse optional chunks
        if CHUNK_LOFF in self._chunks:
            self._parse_loff_chunk()

    def _parse_pnam_chunk(self) -> None:
        """Parse the Packfile Names (PNAM) chunk."""
        if CHUNK_PNAM not in self._chunks:
            raise ValueError("Required PNAM chunk not found")

        offset = self._chunks[CHUNK_PNAM]
        self.pack_names = []

        # Find the end of the PNAM chunk (next chunk or end of chunks section)
        next_offset = min(
            (o for o in self._chunks.values() if o > offset),
            default=len(self._contents),
        )
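        # e.g. with chunks at {PNAM: 72, OIDF: 144, OIDL: 1168, OOFF: 1368},
        # the PNAM chunk ends where the next-lowest offset (OIDF at 144) begins.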

        # Parse null-terminated pack names
        current = offset
        while current < next_offset:
            # Find the next null terminator
            null_pos = self._contents.find(b"\x00", current, next_offset)
            if null_pos == -1:
                break

            pack_name = self._contents[current:null_pos].decode("utf-8")
            if pack_name:  # Skip empty strings (padding)
                self.pack_names.append(pack_name)
            current = null_pos + 1

    def _parse_oidf_chunk(self) -> None:
        """Parse the OID Fanout (OIDF) chunk."""
        if CHUNK_OIDF not in self._chunks:
            raise ValueError("Required OIDF chunk not found")

        offset = self._chunks[CHUNK_OIDF]
        self._fanout_table = []

        # Read 256 4-byte entries
        for i in range(256):
            (count,) = struct.unpack(
                ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
            )
            self._fanout_table.append(count)

        # Total object count is the last entry
        self.object_count = self._fanout_table[255]

    def _parse_oidl_chunk(self) -> None:
        """Parse the OID Lookup (OIDL) chunk."""
        if CHUNK_OIDL not in self._chunks:
            raise ValueError("Required OIDL chunk not found")

        self._oidl_offset = self._chunks[CHUNK_OIDL]

    def _parse_ooff_chunk(self) -> None:
        """Parse the Object Offsets (OOFF) chunk."""
        if CHUNK_OOFF not in self._chunks:
            raise ValueError("Required OOFF chunk not found")

        self._ooff_offset = self._chunks[CHUNK_OOFF]

    def _parse_loff_chunk(self) -> None:
        """Parse the Large Offsets (LOFF) chunk."""
        self._loff_offset = self._chunks[CHUNK_LOFF]

    def __len__(self) -> int:
        """Return the number of objects in this MIDX."""
        return self.object_count

    def _get_oid(self, index: int) -> RawObjectID:
        """Get the object ID at the given index.

        Args:
            index: Index of the object

        Returns:
            Binary object ID
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")

        offset = self._oidl_offset + index * self.hash_size
        return RawObjectID(self._contents[offset : offset + self.hash_size])

    def _get_pack_info(self, index: int) -> tuple[int, int]:
        """Get pack ID and offset for object at the given index.

        Args:
            index: Index of the object

        Returns:
            Tuple of (pack_id, offset)
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")

        # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
        offset = self._ooff_offset + index * 8

        (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
        (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])

        # Check if this is a large offset (MSB set)
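        # e.g. a stored value of 0x80000002 decodes to index 2 in the LOFF
        # chunk, whose 8-byte entry holds the real pack offset.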
        if pack_offset & 0x80000000:
            # Look up in LOFF chunk
            if CHUNK_LOFF not in self._chunks:
                raise ValueError("Large offset found but no LOFF chunk")

            large_index = pack_offset & 0x7FFFFFFF
            large_offset_pos = self._loff_offset + large_index * 8
            (pack_offset,) = struct.unpack(
                ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
            )

        return pack_id, pack_offset

    def object_offset(self, sha: ObjectID | RawObjectID) -> tuple[str, int] | None:
        """Return the pack name and offset for the given object.

        Args:
            sha: Binary SHA-1 or SHA-256 hash

        Returns:
            Tuple of (pack_name, offset) or None if not found
        """
        if len(sha) != self.hash_size:
            raise ValueError(
                f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
            )

        # Use fanout table to narrow search range
        first_byte = sha[0]
        start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
        end_idx = self._fanout_table[first_byte]
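        # e.g. for first_byte 0x2A, fanout[0x29] objects sort below this bucket
        # and fanout[0x2A] is the cumulative count through it, so the half-open
        # range [start_idx, end_idx) covers exactly the OIDs starting with 0x2A.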

        # Binary search within the range
        while start_idx < end_idx:
            mid = (start_idx + end_idx) // 2
            mid_sha = self._get_oid(mid)

            if mid_sha == sha:
                # Found it!
                pack_id, offset = self._get_pack_info(mid)
                return self.pack_names[pack_id], offset
            elif mid_sha < sha:
                start_idx = mid + 1
            else:
                end_idx = mid

        return None

    def __contains__(self, sha: ObjectID | RawObjectID) -> bool:
        """Check if the given object SHA is in this MIDX.

        Args:
            sha: Binary SHA hash

        Returns:
            True if the object is in this MIDX
        """
        return self.object_offset(sha) is not None

    def iterentries(self) -> Iterator[tuple[RawObjectID, str, int]]:
        """Iterate over all entries in this MIDX.

        Yields:
            Tuples of (sha, pack_name, offset)
        """
        for i in range(self.object_count):
            sha = self._get_oid(i)
            pack_id, offset = self._get_pack_info(i)
            pack_name = self.pack_names[pack_id]
            yield sha, pack_name, offset

    def close(self) -> None:
        """Close the MIDX file and release mmap resources."""
        # Close mmap'd contents first if it's an mmap object
        if self._contents is not None and has_mmap:
            if isinstance(self._contents, mmap.mmap):
                self._contents.close()
            self._contents = None

        # Close file handle
        if self._file is not None:
            self._file.close()
            self._file = None


def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
    """Load a multi-pack-index file by path.

    Args:
        path: Path to the MIDX file

    Returns:
        A MultiPackIndex loaded from the given path
    """
    with GitFile(path, "rb") as f:
        return load_midx_file(path, f)


def load_midx_file(
    path: str | os.PathLike[str], f: IO[bytes] | _GitFile
) -> MultiPackIndex:
    """Load a multi-pack-index from a file-like object.

    Args:
        path: Path for the MIDX file
        f: File-like object

    Returns:
        A MultiPackIndex loaded from the given file
    """
    return MultiPackIndex(path, file=f)


def write_midx(
    f: IO[bytes],
    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file.

    Args:
        f: File-like object to write to
        pack_index_entries: List of (pack_name, entries) tuples where entries are
            (sha, offset, crc32) tuples, sorted by SHA
        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
        SHA-1 checksum of the written MIDX file
    """
    if hash_algorithm == HASH_ALGORITHM_SHA1:
        hash_size = 20
    elif hash_algorithm == HASH_ALGORITHM_SHA256:
        hash_size = 32
    else:
        raise ValueError(f"Unknown hash algorithm: {hash_algorithm}")

    # Wrap file in SHA1Writer to compute checksum
    writer = SHA1Writer(f)

    # Sort pack entries by pack name (required by Git)
    pack_index_entries_sorted = sorted(pack_index_entries, key=lambda x: x[0])

    # Collect all objects from all packs
    all_objects: list[tuple[RawObjectID, int, int]] = []  # (sha, pack_id, offset)
    pack_names: list[str] = []

    for pack_id, (pack_name, entries) in enumerate(pack_index_entries_sorted):
        pack_names.append(pack_name)
        for sha, offset, _crc32 in entries:
            all_objects.append((sha, pack_id, offset))

    # Sort all objects by SHA
    all_objects.sort(key=lambda x: x[0])

    # Calculate offsets for chunks
    num_packs = len(pack_names)
    num_objects = len(all_objects)

    # Header: 12 bytes
    header_size = 12

    # Chunk count: PNAM, OIDF, OIDL, OOFF, and optionally LOFF
    # We'll determine if LOFF is needed later
    chunk_count = 4  # PNAM, OIDF, OIDL, OOFF

    # Check if we need LOFF chunk (for offsets >= 2^31)
    need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects)
    if need_loff:
        chunk_count += 1
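    # e.g. an offset of 0xC0000000 (3 GiB) exceeds the 31 usable bits of an
    # OOFF entry, so it must be spilled to the 8-byte-wide LOFF table.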

    # Chunk table: (chunk_count + 1) * 12 bytes (including terminator)
    chunk_table_size = (chunk_count + 1) * 12

    # Calculate chunk offsets
    current_offset = header_size + chunk_table_size

    # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary
    pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names)
    # Pad to 4-byte boundary
    pnam_padding = (4 - len(pnam_data) % 4) % 4
    pnam_data += b"\x00" * pnam_padding
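    # e.g. names totalling 53 bytes (NULs included) need 3 padding bytes, while
    # a 52-byte total needs none: (4 - 52 % 4) % 4 == 0.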
    pnam_offset = current_offset
    current_offset += len(pnam_data)

    # OIDF chunk: 256 * 4 bytes
    oidf_offset = current_offset
    oidf_size = 256 * 4
    current_offset += oidf_size

    # OIDL chunk: num_objects * hash_size bytes
    oidl_offset = current_offset
    oidl_size = num_objects * hash_size
    current_offset += oidl_size

    # OOFF chunk: num_objects * 8 bytes (4 for pack_id + 4 for offset)
    ooff_offset = current_offset
    ooff_size = num_objects * 8
    current_offset += ooff_size

    # LOFF chunk (if needed): variable size
    # We'll calculate the exact size when we know how many large offsets we have
    loff_offset = current_offset if need_loff else 0
    large_offsets: list[int] = []

    # Calculate trailer offset (where checksum starts)
    # We need to pre-calculate large offset count for accurate trailer offset
    if need_loff:
        # Count large offsets
        large_offset_count = sum(1 for _, _, offset in all_objects if offset >= 2**31)
        loff_size = large_offset_count * 8
        trailer_offset = current_offset + loff_size
    else:
        trailer_offset = current_offset
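
    # Resulting file layout, in write order:
    #   header | chunk table | PNAM | OIDF | OIDL | OOFF | [LOFF] | checksum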

    # Write header
    writer.write(MIDX_SIGNATURE)  # 4 bytes: signature
    writer.write(bytes([MIDX_VERSION]))  # 1 byte: version
    writer.write(bytes([hash_algorithm]))  # 1 byte: hash algorithm
    writer.write(bytes([chunk_count]))  # 1 byte: chunk count
    writer.write(bytes([0]))  # 1 byte: base MIDX files (always 0)
    writer.write(struct.pack(">L", num_packs))  # 4 bytes: pack count

    # Write chunk table
    chunk_table = [
        (CHUNK_PNAM, pnam_offset),
        (CHUNK_OIDF, oidf_offset),
        (CHUNK_OIDL, oidl_offset),
        (CHUNK_OOFF, ooff_offset),
    ]
    if need_loff:
        chunk_table.append((CHUNK_LOFF, loff_offset))

    for chunk_id, chunk_offset in chunk_table:
        writer.write(chunk_id)  # 4 bytes
        writer.write(struct.pack(">Q", chunk_offset))  # 8 bytes

    # Write terminator (points to where trailer/checksum starts)
    writer.write(b"\x00\x00\x00\x00")  # 4 bytes
    writer.write(struct.pack(">Q", trailer_offset))  # 8 bytes

    # Write PNAM chunk
    writer.write(pnam_data)

    # Write OIDF chunk (fanout table)
    fanout: list[int] = [0] * 256
    for sha, _pack_id, _offset in all_objects:
        first_byte = sha[0]
        fanout[first_byte] += 1

    # Convert counts to cumulative
    cumulative = 0
    for i in range(256):
        cumulative += fanout[i]
        writer.write(struct.pack(">L", cumulative))
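    # e.g. 5 objects starting with 0x00 and 7 with 0x01 yield a table beginning
    # 5, 12, ...; entry 255 always equals the total object count.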

    # Write OIDL chunk (object IDs)
    for sha, _pack_id, _offset in all_objects:
        writer.write(sha)

    # Write OOFF chunk (pack ID and offset for each object)
    for _sha, pack_id, offset in all_objects:
        writer.write(struct.pack(">L", pack_id))

        if offset >= 2**31:
            # Use large offset table
            large_offset_index = len(large_offsets)
            large_offsets.append(offset)
            # Set MSB to indicate large offset
            writer.write(struct.pack(">L", 0x80000000 | large_offset_index))
        else:
            writer.write(struct.pack(">L", offset))

    # Write LOFF chunk if needed
    if need_loff:
        for large_offset in large_offsets:
            writer.write(struct.pack(">Q", large_offset))

    # Write checksum
    return writer.write_sha()


def write_midx_file(
    path: str | os.PathLike[str],
    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file to disk.

    Args:
        path: Path where to write the MIDX file
        pack_index_entries: List of (pack_name, entries) tuples where entries are
            (sha, offset, crc32) tuples, sorted by SHA
        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
        SHA-1 checksum of the written MIDX file
    """
    with GitFile(path, "wb") as f:
        return write_midx(f, pack_index_entries, hash_algorithm)
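
# A minimal write-side sketch (the pack name and entry values are hypothetical;
# real entries would typically come from each pack's index):
#
#     entries = [("pack-1234.pack", [(RawObjectID(b"\x00" * 20), 12, None)])]
#     checksum = write_midx_file("multi-pack-index", entries)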

# TODO: Add support for incremental MIDX chains
# TODO: Add support for BTMP and RIDX chunks for bitmap integration