# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for computing digests for a single file.

Example usage for `SimpleFileHasher`:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

Example usage for `ShardedFileHasher`, reading only the second part of a file:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
"""

import pathlib
from typing import Optional

import blake3
from typing_extensions import override

from model_signing._hashing import hashing


class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    This class is intentionally empty (and abstract, via inheritance) to be
    used only as a type annotation (to signal that an API expects a hasher
    capable of hashing files, instead of any `HashEngine` instance).
    """


class SimpleFileHasher(FileHasher):
    """Simple file hash engine that computes the digest iteratively.

    To compute the hash of a file, we read the file exactly once, including
    for very large files that don't fit in memory. Files are read in chunks
    and each chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    will not change even if the chunk size changes. As such, we can
    dynamically determine an optimal value for the `chunk_size` argument.
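
    For example, the digest is invariant to `chunk_size` (a minimal sketch;
    assumes `/tmp/file` exists and `SHA256` is the streaming engine used in
    the module-level examples above):
    ```python
    >>> h1 = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=2)
    >>> h2 = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=1_048_576)
    >>> h1.compute().digest_hex == h2.compute().digest_hex
    True
    ```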
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 1_048_576,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # Since there is no difference between hashing the file with this
        # engine and reading the file in memory and then using the content
        # hasher directly, we must use the same digest_name.
        return self._content_hasher.digest_name

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        if self._chunk_size == 0:
            with open(self._file, "rb") as f:
                self._content_hasher.update(f.read())
        else:
            with open(self._file, "rb") as f:
                while True:
                    data = f.read(self._chunk_size)
                    if not data:
                        break
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_size(self) -> int:
        return self._content_hasher.digest_size


class Blake3FileHasher(FileHasher):
    """Simple file hash engine that uses BLAKE3 in parallel.

    This hash engine uses the fastest BLAKE3 settings, by using memory mapping
    and multiple workers. This will greatly increase speed on SSDs, but may
    not perform well on HDDs. For HDDs, you can set `max_threads` to 1.
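
    Example usage (a minimal sketch; assumes `/tmp/file` exists, as in the
    module-level examples above):
    ```python
    >>> hasher = Blake3FileHasher(pathlib.Path("/tmp/file"))
    >>> digest = hasher.compute()
    >>> len(digest.digest_value)
    32
    ```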
    """

    def __init__(
        self,
        file: pathlib.Path,
        *,
        max_threads: int = blake3.blake3.AUTO,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            max_threads: How many BLAKE3 worker threads to use. Defaults to
                the number of logical cores.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        self._file = file
        self._digest_name_override = digest_name_override
        self._blake3 = blake3.blake3(max_threads=max_threads)

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return "blake3"

    @override
    def compute(self) -> hashing.Digest:
        self._blake3.reset()
        self._blake3.update_mmap(self._file)
        return hashing.Digest(self.digest_name, self._blake3.digest())

    @property
    @override
    def digest_size(self) -> int:
        return 32


class ShardedFileHasher(SimpleFileHasher):
    """File hash engine that hashes a portion (shard) of the file.

    By invoking this engine in parallel across disjoint shards, we can speed
    up hashing a single file. However, the hash output depends on the shard
    size.

    It is the responsibility of the user to compose the digests of each shard
    into a single digest for the entire file.
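
    One possible composition hashes the concatenation of the shard digests, in
    shard order (a sketch; assumes `/tmp/file` holds "0123abcd" and uses the
    `SHA256` streaming engine from the module-level examples above; any scheme
    works as long as it is fixed across signing and verification):
    ```python
    >>> shard1 = ShardedFileHasher("/tmp/file", SHA256(), start=0, end=4)
    >>> shard2 = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
    >>> combined = SHA256()
    >>> combined.update(shard1.compute().digest_value)
    >>> combined.update(shard2.compute().digest_value)
    >>> file_digest = combined.compute()
    ```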
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 1_048_576,
        shard_size: int = 1_000_000_000,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file shard.
            start: The file offset to start reading from. Must be valid. Reset
                with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`. Reset with `set_shard`.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.
            shard_size: The size of a file shard. Default is 1 GB.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self.shard_size = shard_size

        self.set_shard(start=start, end=end)

    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be valid.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly greater than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self.shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self.shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
                while to_read > 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"