# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15"""Machinery for computing digests for a single file.
16
17Example usage for `SimpleFileHasher`:
18```python
19>>> with open("/tmp/file", "w") as f:
20... f.write("abcd")
21>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
22>>> digest = hasher.compute()
23>>> digest.digest_hex
24'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
25```
26
27Example usage for `ShardedFileHasher`, reading only the second part of a file:
28```python
29>>> with open("/tmp/file", "w") as f:
30... f.write("0123abcd")
31>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
32>>> digest = hasher.compute()
33>>> digest.digest_hex
34'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
35```
36"""

import pathlib
from typing import Optional

from typing_extensions import override

from model_signing._hashing import hashing


class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    This class is intentionally empty (and abstract, via inheritance) to be
    used only as a type annotation (to signal that an API expects a hasher
    capable of hashing files, instead of any `HashEngine` instance).
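
    For illustration, an API accepting any file hasher could be annotated as
    follows (a minimal sketch; `compute_file_digest` is a hypothetical helper,
    not part of this module):

    ```python
    def compute_file_digest(hasher: FileHasher) -> hashing.Digest:
        return hasher.compute()
    ```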
52 """


class SimpleFileHasher(FileHasher):
    """Simple file hash engine that computes the digest iteratively.

    To compute the hash of a file, we read the file exactly once, including
    for very large files that don't fit in memory. Files are read in chunks
    and each chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    will not change even if the chunk size changes. As such, we can
    dynamically determine an optimal value for the chunk argument.
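
    For illustration, the digest does not depend on the chunk size (a minimal
    sketch, assuming `/tmp/file` was written as in the module docstring
    example):

    ```python
    >>> a = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=2).compute()
    >>> b = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=0).compute()
    >>> a.digest_hex == b.digest_hex
    True
    ```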
64 """
65
    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 1_048_576,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file.
            chunk_size: The amount of the file to read at once. Default is
                1 MiB (1,048,576 bytes). A special value of 0 signals to
                attempt to read everything in a single call.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # Since there is no difference between hashing the file with this
        # engine or reading the file in memory and then using the content
        # hasher directly, we must have the same digest_name.
        return self._content_hasher.digest_name

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

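        # A chunk size of 0 signals reading the whole file in a single call.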
        if self._chunk_size == 0:
            with open(self._file, "rb") as f:
                self._content_hasher.update(f.read())
        else:
            with open(self._file, "rb") as f:
                while True:
                    data = f.read(self._chunk_size)
                    if not data:
                        break
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_size(self) -> int:
        return self._content_hasher.digest_size


class ShardedFileHasher(SimpleFileHasher):
    """File hash engine that hashes a portion (shard) of the file.

    By invoking this engine in parallel across disjoint shards, we can speed
    up hashing a single file. However, the hash output depends on the shard
    size.

    It is the responsibility of the user to compose the digests of each shard
    into a single digest for the entire file.
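
    For illustration, shards can be hashed in parallel and the shard digests
    composed by the caller. A minimal sketch (the file setup matches the
    module docstring example; the composition shown, hashing the concatenated
    shard digests, is only one possible scheme and is not prescribed by this
    module):

    ```python
    >>> from concurrent.futures import ThreadPoolExecutor
    >>> hashers = [
    ...     ShardedFileHasher("/tmp/file", SHA256(), start=0, end=4),
    ...     ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8),
    ... ]
    >>> with ThreadPoolExecutor() as pool:
    ...     digests = list(pool.map(lambda h: h.compute(), hashers))
    >>> combined = SHA256()
    >>> for d in digests:
    ...     combined.update(d.digest_value)
    >>> file_digest = combined.compute()
    ```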
146 """
147
    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 1_048_576,
        shard_size: int = 1_000_000_000,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file shard.
            start: The file offset to start reading from. Must be valid.
                Reset with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`. Reset with `set_shard`.
            chunk_size: The amount of the file to read at once. Default is
                1 MiB (1,048,576 bytes). A special value of 0 signals to
                attempt to read everything in a single call.
            shard_size: The size of a file shard. Default is 1 GB
                (1,000,000,000 bytes).
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self.shard_size = shard_size

        self.set_shard(start=start, end=end)

    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be valid.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly greater than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self.shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self.shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
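            # Hash only the bytes in [start, end): seek to the shard start
            # and read exactly `end - start` bytes.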
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
                while to_read > 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
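        # The shard size is part of the digest name because the hash output
        # depends on it (e.g., "sha256-sharded-1000000000" with the default
        # shard size, assuming a content hasher named "sha256").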
        return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"