# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for computing digests for a single file.

Example usage for `SimpleFileHasher`:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

Example usage for `ShardedFileHasher`, reading only the second part of a file:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

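Example usage for `Blake3FileHasher` (an illustrative sketch; only properties
defined in this module are shown, since the BLAKE3 digest bytes are not
spelled out here):
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = Blake3FileHasher(pathlib.Path("/tmp/file"))
>>> digest = hasher.compute()
>>> hasher.digest_name
'blake3'
>>> len(digest.digest_value)  # BLAKE3 digests are 32 bytes by default
32
```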
"""

import pathlib
from typing import Optional

import blake3
from typing_extensions import override

from model_signing._hashing import hashing


class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    This class is intentionally empty (and abstract, via inheritance). It is
    meant to be used only as a type annotation, to signal that an API expects
    a hasher capable of hashing files rather than any `HashEngine` instance.

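    For example (a sketch; `digest_file` is a hypothetical helper, not part of
    this module), an API can require a file-capable hasher via the annotation:
    ```python
    def digest_file(hasher: FileHasher) -> hashing.Digest:
        # Accepts any concrete FileHasher, e.g. SimpleFileHasher.
        return hasher.compute()
    ```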
    """


class SimpleFileHasher(FileHasher):
    """Simple file hash engine that computes the digest iteratively.

    To compute the hash of a file, we read the file exactly once, even for
    very large files that don't fit in memory. The file is read in chunks and
    each chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    does not change even if the chunk size changes, so we can dynamically
    determine an optimal value for the chunk size argument.

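    For instance (an illustrative sketch reusing the module-level example file
    and assuming `SHA256()` is the streaming engine used there), different
    chunk sizes produce the same digest:
    ```python
    >>> with open("/tmp/file", "w") as f:
    ...     f.write("abcd")
    >>> small_chunks = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=2)
    >>> single_read = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=0)
    >>> # chunk_size=0 reads the whole file in a single call
    >>> small_chunks.compute().digest_hex == single_read.compute().digest_hex
    True
    ```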
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 1_048_576,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file.
            chunk_size: The amount of the file to read at once. Default is
                1MB. A special value of 0 signals to attempt to read
                everything in a single call.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # Since there is no difference between hashing the file with this
        # engine and reading the file in memory and then using the content
        # hasher directly, we must have the same digest_name.
        return self._content_hasher.digest_name

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        if self._chunk_size == 0:
            with open(self._file, "rb") as f:
                self._content_hasher.update(f.read())
        else:
            with open(self._file, "rb") as f:
                while True:
                    data = f.read(self._chunk_size)
                    if not data:
                        break
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_size(self) -> int:
        return self._content_hasher.digest_size


class Blake3FileHasher(FileHasher):
    """Simple file hash engine that uses BLAKE3 in parallel.

    This hash engine uses the fastest BLAKE3 settings: memory mapping and
    multiple worker threads. This greatly increases speed on SSDs, but may
    not perform well on HDDs. For HDDs, you can set `max_threads` to 1.

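    A minimal sketch of that tuning (illustrative only; `/tmp/file` is a
    placeholder path and the hashers are only constructed, not run):
    ```python
    >>> ssd_hasher = Blake3FileHasher(pathlib.Path("/tmp/file"))  # multi-threaded default
    >>> hdd_hasher = Blake3FileHasher(pathlib.Path("/tmp/file"), max_threads=1)  # HDD-friendly
    ```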
    """

    def __init__(
        self,
        file: pathlib.Path,
        *,
        max_threads: int = blake3.blake3.AUTO,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            max_threads: How many BLAKE3 worker threads to use. Defaults to
                the number of logical cores.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        self._file = file
        self._digest_name_override = digest_name_override
        self._blake3 = blake3.blake3(max_threads=max_threads)

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return "blake3"

    @override
    def compute(self) -> hashing.Digest:
        self._blake3.reset()
        self._blake3.update_mmap(self._file)
        return hashing.Digest(self.digest_name, self._blake3.digest())

    @property
    @override
    def digest_size(self) -> int:
        return 32


class ShardedFileHasher(SimpleFileHasher):
    """File hash engine that hashes a portion (shard) of the file.

    By invoking this engine in parallel across disjoint shards, we can speed
    up hashing a single file. However, the hash output depends on the shard
    size.

    It is the responsibility of the user to compose the digests of each shard
    into a single digest for the entire file.

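    One possible composition (an illustrative sketch, not a prescribed scheme;
    it assumes `SHA256()` is the streaming engine from the module-level
    examples) hashes the concatenation of the shard digests:
    ```python
    >>> with open("/tmp/file", "w") as f:
    ...     f.write("0123abcd")
    >>> shards = [
    ...     ShardedFileHasher("/tmp/file", SHA256(), start=0, end=4),
    ...     ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8),
    ... ]
    >>> combiner = SHA256()  # any streaming engine can combine shard digests
    >>> for shard in shards:
    ...     combiner.update(shard.compute().digest_value)
    >>> file_digest = combiner.compute()
    ```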
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 1_048_576,
        shard_size: int = 1_000_000_000,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.HashEngine` instance used to compute
                the digest of the file shard.
            start: The file offset to start reading from. Must be valid.
                Reset with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`. Reset with `set_shard`.
            chunk_size: The amount of the file to read at once. Default is
                1MB. A special value of 0 signals to attempt to read
                everything in a single call.
            shard_size: The size of a file shard. Default is 1 GB.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self.shard_size = shard_size

        self.set_shard(start=start, end=end)

    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be valid.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly greater than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self.shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self.shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
                while to_read >= 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"