# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for computing digests for a single file.

Example usage for `SimpleFileHasher`:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

Example usage for `ShardedFileHasher`, reading only the second part of a file:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
"""

import pathlib

import blake3
from typing_extensions import override

from model_signing._hashing import hashing


class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    This class is intentionally empty (and abstract, via inheritance) to be
    used only as a type annotation (to signal that an API expects a hasher
    capable of hashing files, instead of any `HashEngine` instance).
    """


class SimpleFileHasher(FileHasher):
    """Simple file hash engine that computes the digest iteratively.

    To compute the hash of a file, we read the file exactly once, including
    for very large files that don't fit in memory. Files are read in chunks,
    and each chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    does not change even if the chunk size changes, so we can dynamically
    determine an optimal value for the `chunk_size` argument.
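
    Example (an illustrative sketch; assumes the `SHA256` engine and the
    `/tmp/file` contents from the module docstring): different chunk sizes
    yield the same digest.
    ```python
    >>> h1 = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=2)
    >>> h2 = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=0)
    >>> h1.compute().digest_hex == h2.compute().digest_hex
    True
    ```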

    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 1_048_576,
        digest_name_override: str | None = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file.
            chunk_size: The amount of the file to read at once. Default is
                1 MiB. A special value of 0 signals to attempt to read
                everything in a single call.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # Since there is no difference between hashing the file with this
        # engine and reading the file into memory and then using the content
        # hasher directly, we must have the same digest_name.
        return self._content_hasher.digest_name

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        if self._chunk_size == 0:
            with open(self._file, "rb") as f:
                self._content_hasher.update(f.read())
        else:
            with open(self._file, "rb") as f:
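                # Read in `chunk_size` pieces until EOF; per the class
                # docstring, the digest is independent of the chunk size.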
                while True:
                    data = f.read(self._chunk_size)
                    if not data:
                        break
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_size(self) -> int:
        return self._content_hasher.digest_size


class Blake3FileHasher(FileHasher):
    """Simple file hash engine that uses BLAKE3 in parallel.

    This hash engine uses the fastest BLAKE3 settings: memory mapping and
    multiple worker threads. This greatly increases speed on SSDs, but may
    not perform well on HDDs; for HDDs, set `max_threads` to 1.
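
    Example (an illustrative sketch; assumes the `/tmp/file` contents from
    the module docstring and that the `blake3` package is installed):
    ```python
    >>> hasher = Blake3FileHasher(pathlib.Path("/tmp/file"))
    >>> digest = hasher.compute()
    >>> digest.digest_name
    'blake3'
    ```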

    """

    def __init__(
        self,
        file: pathlib.Path,
        *,
        max_threads: int = blake3.blake3.AUTO,
        digest_name_override: str | None = None,
    ):
        """Initializes an instance to hash a file.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            max_threads: How many BLAKE3 worker threads to use. Defaults to
                the number of logical cores.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        self._file = file
        self._digest_name_override = digest_name_override
        self._blake3 = blake3.blake3(max_threads=max_threads)

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return "blake3"

    @override
    def compute(self) -> hashing.Digest:
        self._blake3.reset()
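        # `update_mmap` memory-maps the file, letting BLAKE3 hash it with
        # the configured number of worker threads (see class docstring).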
        self._blake3.update_mmap(self._file)
        return hashing.Digest(self.digest_name, self._blake3.digest())

    @property
    @override
    def digest_size(self) -> int:
        return 32


class ShardedFileHasher(SimpleFileHasher):
    """File hash engine that hashes a portion (shard) of the file.

    By invoking this engine in parallel across disjoint shards, we can speed
    up hashing a single file. However, the hash output depends on the shard
    size.

    It is the responsibility of the user to compose the digests of each shard
    into a single digest for the entire file.
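
    Example (an illustrative sketch; assumes the `SHA256` engine and the
    eight-byte `/tmp/file` from the module docstring, and shows just one
    possible composition, hashing the concatenated shard digests):
    ```python
    >>> h1 = ShardedFileHasher("/tmp/file", SHA256(), start=0, end=4)
    >>> h2 = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
    >>> combined = SHA256()
    >>> combined.update(h1.compute().digest_value + h2.compute().digest_value)
    >>> file_digest = combined.compute()
    ```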

    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 1_048_576,
        shard_size: int = 1_000_000_000,
        digest_name_override: str | None = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file shard.
            start: The file offset to start reading from. Must be valid.
                Reset with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`. Reset with `set_shard`.
            chunk_size: The amount of the file to read at once. Default is
                1 MiB. A special value of 0 signals to attempt to read
                everything in a single call.
            shard_size: The size of a file shard. Default is 1 GB.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self.shard_size = shard_size

        self.set_shard(start=start, end=end)

    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be valid.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly greater than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self.shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self.shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
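                # Consume the shard in chunks; once `to_read` reaches 0 (or
                # the file ends early), `f.read` returns b"" and we break.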
                while to_read >= 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"