# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for computing digests for a single file.

Example usage for `SimpleFileHasher`:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

Example usage for `ShardedFileHasher`, reading only the second part of a file:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
"""

import pathlib
from typing import Optional

import blake3
from typing_extensions import override

from model_signing._hashing import hashing


class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    This class is intentionally empty (and abstract, via inheritance) to be
    used only as a type annotation (to signal that an API expects a hasher
    capable of hashing files, instead of any `HashEngine` instance).
    """


class SimpleFileHasher(FileHasher):
    """Simple file hash engine that computes the digest iteratively.

    To compute the hash of a file, we read the file exactly once, including
    for very large files that don't fit in memory. Files are read in chunks
    and each chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    will not change even if the chunk size changes. As such, we can
    dynamically determine an optimal value for the `chunk_size` argument.
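
    For example, the digest is invariant to `chunk_size` (a minimal sketch;
    assumes `/tmp/file` exists and `SHA256` is the streaming engine used in
    the module-level examples above):
    ```python
    >>> h1 = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=2)
    >>> h2 = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=1_048_576)
    >>> h1.compute().digest_hex == h2.compute().digest_hex
    True
    ```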
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 1_048_576,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # Since there is no difference between hashing the file with this
        # engine and reading the file in memory and then using the content
        # hasher directly, we must use the same digest_name.
        return self._content_hasher.digest_name

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        if self._chunk_size == 0:
            with open(self._file, "rb") as f:
                self._content_hasher.update(f.read())
        else:
            with open(self._file, "rb") as f:
                while True:
                    data = f.read(self._chunk_size)
                    if not data:
                        break
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_size(self) -> int:
        return self._content_hasher.digest_size


class Blake3FileHasher(FileHasher):
    """Simple file hash engine that uses BLAKE3 in parallel.

    This hash engine uses the fastest BLAKE3 settings, by using memory mapping
    and multiple workers. This will greatly increase speed on SSDs, but may
    not perform well on HDDs. For HDDs, you can set `max_threads` to 1.
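
    Example usage (a minimal sketch; assumes `/tmp/file` exists, as in the
    module-level examples above):
    ```python
    >>> hasher = Blake3FileHasher(pathlib.Path("/tmp/file"))
    >>> digest = hasher.compute()
    >>> len(digest.digest_value)
    32
    ```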
    """

    def __init__(
        self,
        file: pathlib.Path,
        *,
        max_threads: int = blake3.blake3.AUTO,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            max_threads: How many BLAKE3 worker threads to use. Defaults to
                the number of logical cores.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        self._file = file
        self._digest_name_override = digest_name_override
        self._blake3 = blake3.blake3(max_threads=max_threads)

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return "blake3"

    @override
    def compute(self) -> hashing.Digest:
        self._blake3.reset()
        self._blake3.update_mmap(self._file)
        return hashing.Digest(self.digest_name, self._blake3.digest())

    @property
    @override
    def digest_size(self) -> int:
        return 32


class ShardedFileHasher(SimpleFileHasher):
    """File hash engine that hashes a portion (shard) of the file.

    By invoking this engine in parallel across disjoint shards, we can speed
    up hashing a single file. However, the hash output depends on the shard
    size.

    It is the responsibility of the user to compose the digests of each shard
    into a single digest for the entire file.
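
    One possible composition hashes the concatenation of the shard digests, in
    shard order (a sketch; assumes `/tmp/file` holds "0123abcd" and uses the
    `SHA256` streaming engine from the module-level examples above; any scheme
    works as long as it is fixed across signing and verification):
    ```python
    >>> shard1 = ShardedFileHasher("/tmp/file", SHA256(), start=0, end=4)
    >>> shard2 = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
    >>> combined = SHA256()
    >>> combined.update(shard1.compute().digest_value)
    >>> combined.update(shard2.compute().digest_value)
    >>> file_digest = combined.compute()
    ```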
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 1_048_576,
        shard_size: int = 1_000_000_000,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file shard.
            start: The file offset to start reading from. Must be valid. Reset
                with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`. Reset with `set_shard`.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.
            shard_size: The size of a file shard. Default is 1 GB.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self.shard_size = shard_size

        self.set_shard(start=start, end=end)

    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be valid.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly greater than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self.shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self.shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
                while to_read > 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"