# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for computing digests for a single file.

Example usage for `SimpleFileHasher`:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

Example usage for `ShardedFileHasher`, reading only the second part of a file:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
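
Example usage for `Blake3FileHasher` (a sketch; the digest bytes depend on the
file contents, so only the algorithm name is shown):
```python
>>> hasher = Blake3FileHasher("/tmp/file")
>>> digest = hasher.compute()
>>> digest.algorithm
'blake3'
```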
36"""
37
38import pathlib
39
40import blake3
41from typing_extensions import override
42
43from model_signing._hashing import hashing
44
45
46class FileHasher(hashing.HashEngine):
47 """Generic file hash engine.
48
49 This class is intentionally empty (and abstract, via inheritance) to be used
50 only as a type annotation (to signal that API expects a hasher capable of
51 hashing files, instead of any `HashEngine` instance).
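
    Example (a sketch of using this class purely as an annotation):
    ```python
    >>> def digest_model_file(hasher: FileHasher) -> hashing.Digest:
    ...     return hasher.compute()
    ```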
52 """
53
54
55class SimpleFileHasher(FileHasher):
56 """Simple file hash engine that computes the digest iteratively.
57
58 To compute the hash of a file, we read the file exactly once, including for
59 very large files that don't fit in memory. Files are read in chunks and each
60 chunk is passed to the `update` method of an inner
61 `hashing.StreamingHashEngine`, instance. This ensures that the file digest
62 will not change even if the chunk size changes. As such, we can dynamically
63 determine an optimal value for the chunk argument.
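
    Example (a sketch illustrating that the digest does not depend on
    `chunk_size`; `path` is a placeholder for a `pathlib.Path`):
    ```python
    >>> chunked = SimpleFileHasher(path, SHA256(), chunk_size=2)
    >>> whole = SimpleFileHasher(path, SHA256(), chunk_size=0)
    >>> chunked.compute().digest_hex == whole.compute().digest_hex
    True
    ```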
64 """
65
66 def __init__(
67 self,
68 file: pathlib.Path,
69 content_hasher: hashing.StreamingHashEngine,
70 *,
71 chunk_size: int = 1_048_576,
72 digest_name_override: str | None = None,
73 ):
74 """Initializes an instance to hash a file with a specific `HashEngine`.
75
76 Args:
77 file: The file to hash. Use `set_file` to reset it.
78 content_hasher: A `hashing.StreamingHashEngine` instance used to
79 compute the digest of the file.
80 chunk_size: The amount of file to read at once. Default is 1MB. A
81 special value of 0 signals to attempt to read everything in a
82 single call.
83 digest_name_override: Optional string to allow overriding the
84 `digest_name` property to support shorter, standardized names.
85 """
86 if chunk_size < 0:
87 raise ValueError(
88 f"Chunk size must be non-negative, got {chunk_size}."
89 )
90
91 self._file = file
92 self._content_hasher = content_hasher
93 self._chunk_size = chunk_size
94 self._digest_name_override = digest_name_override
95
96 def set_file(self, file: pathlib.Path) -> None:
97 """Redefines the file to be hashed in `compute`.
98
99 Args:
100 file: The new file to be hashed.
101 """
102 self._file = file
103
104 @property
105 @override
106 def digest_name(self) -> str:
107 if self._digest_name_override is not None:
108 return self._digest_name_override
109 # Since there is no difference between hashing the file with this engine
110 # or reading the file in memory and then using the content hasher
111 # directly, we must have the same digest_name.
112 return self._content_hasher.digest_name
113
114 @override
115 def compute(self) -> hashing.Digest:
116 self._content_hasher.reset()
117
118 if self._chunk_size == 0:
119 with open(self._file, "rb") as f:
120 self._content_hasher.update(f.read())
121 else:
122 with open(self._file, "rb") as f:
123 while True:
124 data = f.read(self._chunk_size)
125 if not data:
126 break
127 self._content_hasher.update(data)
128
129 digest = self._content_hasher.compute()
130 return hashing.Digest(self.digest_name, digest.digest_value)
131
132 @property
133 @override
134 def digest_size(self) -> int:
135 return self._content_hasher.digest_size
136
137
138class Blake3FileHasher(FileHasher):
139 """Simple file hash engine that uses BLAKE3 in parallel.
140
141 This hash engine uses the fastest BLAKE3 settings, by using memory mapping
142 and multiple workers. This will greatly increase speed on SSDs, but may
143 not perform well on HDDs. For HDDs, you can set max_threads to 1.
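
    Example (a sketch; `max_threads=1` is the HDD-friendly setting mentioned
    above):
    ```python
    >>> hasher = Blake3FileHasher(pathlib.Path("/tmp/file"), max_threads=1)
    >>> digest = hasher.compute()
    ```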
144 """
145
146 def __init__(
147 self,
148 file: pathlib.Path,
149 *,
150 max_threads: int = blake3.blake3.AUTO,
151 digest_name_override: str | None = None,
152 ):
153 """Initializes an instance to hash a file.
154
155 Args:
156 file: The file to hash. Use `set_file` to reset it.
157 max_threads: how many BLAKE3 workers to use. Defaults to number of
158 logical cores.
159 digest_name_override: Optional string to allow overriding the
160 `digest_name` property to support shorter, standardized names.
161 """
162 self._file = file
163 self._digest_name_override = digest_name_override
164 self._blake3 = blake3.blake3(max_threads=max_threads)
165
166 def set_file(self, file: pathlib.Path) -> None:
167 """Redefines the file to be hashed in `compute`.
168
169 Args:
170 file: The new file to be hashed.
171 """
172 self._file = file
173
174 @property
175 @override
176 def digest_name(self) -> str:
177 if self._digest_name_override is not None:
178 return self._digest_name_override
179 return "blake3"
180
181 @override
182 def compute(self) -> hashing.Digest:
183 self._blake3.reset()
184 self._blake3.update_mmap(self._file)
185 return hashing.Digest(self.digest_name, self._blake3.digest())
186
187 @property
188 @override
189 def digest_size(self) -> int:
190 return 32
191
192
193class ShardedFileHasher(SimpleFileHasher):
194 """File hash engine that hashes a portion (shard) of the file.
195
196 By invoking this engine in parallel across disjoint shards, we can speed up
197 hashing a single file. However, the hash output depends on the shard size.
198
199 It is the responsibility of the user to compose the digests of each shard
200 into a single digest for the entire file.
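
    Example of composing shard digests (a sketch, not an API of this module;
    concatenating the shard digests in order and hashing the result is one
    possible convention; `path` is a placeholder for a `pathlib.Path`):
    ```python
    >>> shards = [
    ...     ShardedFileHasher(path, SHA256(), start=0, end=4),
    ...     ShardedFileHasher(path, SHA256(), start=4, end=8),
    ... ]
    >>> composer = SHA256()
    >>> for shard in shards:
    ...     composer.update(shard.compute().digest_value)
    >>> file_digest = composer.compute()
    ```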
201 """
202
203 def __init__(
204 self,
205 file: pathlib.Path,
206 content_hasher: hashing.StreamingHashEngine,
207 *,
208 start: int,
209 end: int,
210 chunk_size: int = 1_048_576,
211 shard_size: int = 1_000_000_000,
212 digest_name_override: str | None = None,
213 ):
214 """Initializes an instance to hash a file with a specific `HashEngine`.
215
216 Args:
217 file: The file to hash. Use `set_file` to reset it.
218 content_hasher: A `hashing.HashEngine` instance used to compute the
219 digest of the file shard.
220 start: The file offset to start reading from. Must be valid. Reset
221 with `set_shard`.
222 end: The file offset to stop reading at. Must be stricly greater
223 than start. The entire shard length must be less than the
224 configured `shard_size`. Reset with `set_shard`.
225 chunk_size: The amount of file to read at once. Default is 1MB. A
226 special value of 0 signals to attempt to read everything in a
227 single call.
228 shard_size: The size of a file shard. Default is 1 GB.
229 digest_name_override: Optional string to allow overriding the
230 `digest_name` property to support shorter, standardized names.
231 """
232 super().__init__(
233 file=file,
234 content_hasher=content_hasher,
235 chunk_size=chunk_size,
236 digest_name_override=digest_name_override,
237 )
238
239 if shard_size <= 0:
240 raise ValueError(
241 f"Shard size must be strictly positive, got {shard_size}."
242 )
243 self.shard_size = shard_size
244
245 self.set_shard(start=start, end=end)
246
247 def set_shard(self, *, start: int, end: int) -> None:
248 """Redefines the file shard to be hashed in `compute`.
249
250 Args:
251 start: The file offset to start reading from. Must be valid.
252 end: The file offset to stop reading at. Must be stricly greater
253 than start. The entire shard length must be less than the
254 configured `shard_size`.
255 """
256 if start < 0:
257 raise ValueError(
258 f"File start offset must be non-negative, got {start}."
259 )
260 if end <= start:
261 raise ValueError(
262 "File end offset must be stricly higher that file start offset,"
263 f" got {start=}, {end=}."
264 )
265 read_length = end - start
266 if read_length > self.shard_size:
267 raise ValueError(
268 f"Must not read more than shard_size={self.shard_size}, got"
269 f" {read_length}."
270 )
271
272 self._start = start
273 self._end = end
274
275 @override
276 def compute(self) -> hashing.Digest:
277 self._content_hasher.reset()
278
279 with open(self._file, "rb") as f:
280 f.seek(self._start)
281 to_read = self._end - self._start
282 if self._chunk_size == 0 or self._chunk_size >= to_read:
283 data = f.read(to_read)
284 self._content_hasher.update(data)
285 else:
286 while to_read >= 0:
287 data = f.read(min(self._chunk_size, to_read))
288 if not data:
289 break
290 to_read -= len(data)
291 self._content_hasher.update(data)
292
293 digest = self._content_hasher.compute()
294 return hashing.Digest(self.digest_name, digest.digest_value)
295
296 @property
297 @override
298 def digest_name(self) -> str:
299 if self._digest_name_override is not None:
300 return self._digest_name_override
301 return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"