# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for computing digests for a single file.

Example usage for `SimpleFileHasher`:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

Example usage for `ShardedFileHasher`, reading only the second part of a file:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
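
Example usage for `Blake3FileHasher` (a sketch; the digest bytes depend on the
file contents, so only the algorithm name is shown):
```python
>>> hasher = Blake3FileHasher("/tmp/file")
>>> digest = hasher.compute()
>>> digest.algorithm
'blake3'
```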
36"""
37
38import pathlib
39
40import blake3
41from typing_extensions import override
42
43from model_signing._hashing import hashing
44
45
46class FileHasher(hashing.HashEngine):
47 """Generic file hash engine.
48
49 This class is intentionally empty (and abstract, via inheritance) to be used
50 only as a type annotation (to signal that API expects a hasher capable of
51 hashing files, instead of any `HashEngine` instance).
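
    Example (a sketch of using this class purely as an annotation):
    ```python
    >>> def digest_model_file(hasher: FileHasher) -> hashing.Digest:
    ...     return hasher.compute()
    ```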
52 """
53
54
55class SimpleFileHasher(FileHasher):
56 """Simple file hash engine that computes the digest iteratively.
57
58 To compute the hash of a file, we read the file exactly once, including for
59 very large files that don't fit in memory. Files are read in chunks and each
60 chunk is passed to the `update` method of an inner
61 `hashing.StreamingHashEngine`, instance. This ensures that the file digest
62 will not change even if the chunk size changes. As such, we can dynamically
63 determine an optimal value for the chunk argument.
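
    Example (a sketch illustrating that the digest does not depend on
    `chunk_size`; `path` is a placeholder for a `pathlib.Path`):
    ```python
    >>> chunked = SimpleFileHasher(path, SHA256(), chunk_size=2)
    >>> whole = SimpleFileHasher(path, SHA256(), chunk_size=0)
    >>> chunked.compute().digest_hex == whole.compute().digest_hex
    True
    ```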
64 """
65
66 def __init__(
67 self,
68 file: pathlib.Path,
69 content_hasher: hashing.StreamingHashEngine,
70 *,
71 chunk_size: int = 1_048_576,
72 digest_name_override: str | None = None,
73 ):
74 """Initializes an instance to hash a file with a specific `HashEngine`.
75
76 Args:
77 file: The file to hash. Use `set_file` to reset it.
78 content_hasher: A `hashing.StreamingHashEngine` instance used to
79 compute the digest of the file.
80 chunk_size: The amount of file to read at once. Default is 1MB. A
81 special value of 0 signals to attempt to read everything in a
82 single call.
83 digest_name_override: Optional string to allow overriding the
84 `digest_name` property to support shorter, standardized names.
85 """
86 if chunk_size < 0:
87 raise ValueError(
88 f"Chunk size must be non-negative, got {chunk_size}."
89 )
90
91 self._file = file
92 self._content_hasher = content_hasher
93 self._chunk_size = chunk_size
94 self._digest_name_override = digest_name_override
95
96 def set_file(self, file: pathlib.Path) -> None:
97 """Redefines the file to be hashed in `compute`.
98
99 Args:
100 file: The new file to be hashed.
101 """
102 self._file = file
103
104 @property
105 @override
106 def digest_name(self) -> str:
107 if self._digest_name_override is not None:
108 return self._digest_name_override
109 # Since there is no difference between hashing the file with this engine
110 # or reading the file in memory and then using the content hasher
111 # directly, we must have the same digest_name.
112 return self._content_hasher.digest_name
113
114 @override
115 def compute(self) -> hashing.Digest:
116 self._content_hasher.reset()
117
118 if self._chunk_size == 0:
119 with open(self._file, "rb") as f:
120 self._content_hasher.update(f.read())
121 else:
122 with open(self._file, "rb") as f:
123 while True:
124 data = f.read(self._chunk_size)
125 if not data:
126 break
127 self._content_hasher.update(data)
128
129 digest = self._content_hasher.compute()
130 return hashing.Digest(self.digest_name, digest.digest_value)
131
132 @property
133 @override
134 def digest_size(self) -> int:
135 return self._content_hasher.digest_size
136
137
138class Blake3FileHasher(FileHasher):
139 """Simple file hash engine that uses BLAKE3 in parallel.
140
141 This hash engine uses the fastest BLAKE3 settings, by using memory mapping
142 and multiple workers. This will greatly increase speed on SSDs, but may
143 not perform well on HDDs. For HDDs, you can set max_threads to 1.
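
    Example (a sketch; `max_threads=1` is the HDD-friendly setting mentioned
    above):
    ```python
    >>> hasher = Blake3FileHasher(pathlib.Path("/tmp/file"), max_threads=1)
    >>> digest = hasher.compute()
    ```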
144 """
145
146 def __init__(
147 self,
148 file: pathlib.Path,
149 *,
150 max_threads: int = blake3.blake3.AUTO,
151 digest_name_override: str | None = None,
152 ):
153 """Initializes an instance to hash a file.
154
155 Args:
156 file: The file to hash. Use `set_file` to reset it.
157 max_threads: how many BLAKE3 workers to use. Defaults to number of
158 logical cores.
159 digest_name_override: Optional string to allow overriding the
160 `digest_name` property to support shorter, standardized names.
161 """
162 self._file = file
163 self._digest_name_override = digest_name_override
164 self._blake3 = blake3.blake3(max_threads=max_threads)
165
166 def set_file(self, file: pathlib.Path) -> None:
167 """Redefines the file to be hashed in `compute`.
168
169 Args:
170 file: The new file to be hashed.
171 """
172 self._file = file
173
174 @property
175 @override
176 def digest_name(self) -> str:
177 if self._digest_name_override is not None:
178 return self._digest_name_override
179 return "blake3"
180
181 @override
182 def compute(self) -> hashing.Digest:
183 self._blake3.reset()
184 self._blake3.update_mmap(self._file)
185 return hashing.Digest(self.digest_name, self._blake3.digest())
186
187 @property
188 @override
189 def digest_size(self) -> int:
190 return 32
191
192
193class ShardedFileHasher(SimpleFileHasher):
194 """File hash engine that hashes a portion (shard) of the file.
195
196 By invoking this engine in parallel across disjoint shards, we can speed up
197 hashing a single file. However, the hash output depends on the shard size.
198
199 It is the responsibility of the user to compose the digests of each shard
200 into a single digest for the entire file.
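
    Example of composing shard digests (a sketch, not an API of this module;
    concatenating the shard digests in order and hashing the result is one
    possible convention; `path` is a placeholder for a `pathlib.Path`):
    ```python
    >>> shards = [
    ...     ShardedFileHasher(path, SHA256(), start=0, end=4),
    ...     ShardedFileHasher(path, SHA256(), start=4, end=8),
    ... ]
    >>> composer = SHA256()
    >>> for shard in shards:
    ...     composer.update(shard.compute().digest_value)
    >>> file_digest = composer.compute()
    ```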
201 """
202
203 def __init__(
204 self,
205 file: pathlib.Path,
206 content_hasher: hashing.StreamingHashEngine,
207 *,
208 start: int,
209 end: int,
210 chunk_size: int = 1_048_576,
211 shard_size: int = 1_000_000_000,
212 digest_name_override: str | None = None,
213 ):
214 """Initializes an instance to hash a file with a specific `HashEngine`.
215
216 Args:
217 file: The file to hash. Use `set_file` to reset it.
218 content_hasher: A `hashing.HashEngine` instance used to compute the
219 digest of the file shard.
220 start: The file offset to start reading from. Must be valid. Reset
221 with `set_shard`.
222 end: The file offset to stop reading at. Must be stricly greater
223 than start. The entire shard length must be less than the
224 configured `shard_size`. Reset with `set_shard`.
225 chunk_size: The amount of file to read at once. Default is 1MB. A
226 special value of 0 signals to attempt to read everything in a
227 single call.
228 shard_size: The size of a file shard. Default is 1 GB.
229 digest_name_override: Optional string to allow overriding the
230 `digest_name` property to support shorter, standardized names.
231 """
232 super().__init__(
233 file=file,
234 content_hasher=content_hasher,
235 chunk_size=chunk_size,
236 digest_name_override=digest_name_override,
237 )
238
239 if shard_size <= 0:
240 raise ValueError(
241 f"Shard size must be strictly positive, got {shard_size}."
242 )
243 self.shard_size = shard_size
244
245 self.set_shard(start=start, end=end)
246
247 def set_shard(self, *, start: int, end: int) -> None:
248 """Redefines the file shard to be hashed in `compute`.
249
250 Args:
251 start: The file offset to start reading from. Must be valid.
252 end: The file offset to stop reading at. Must be stricly greater
253 than start. The entire shard length must be less than the
254 configured `shard_size`.
255 """
256 if start < 0:
257 raise ValueError(
258 f"File start offset must be non-negative, got {start}."
259 )
260 if end <= start:
261 raise ValueError(
262 "File end offset must be stricly higher that file start offset,"
263 f" got {start=}, {end=}."
264 )
265 read_length = end - start
266 if read_length > self.shard_size:
267 raise ValueError(
268 f"Must not read more than shard_size={self.shard_size}, got"
269 f" {read_length}."
270 )
271
272 self._start = start
273 self._end = end
274
275 @override
276 def compute(self) -> hashing.Digest:
277 self._content_hasher.reset()
278
279 with open(self._file, "rb") as f:
280 f.seek(self._start)
281 to_read = self._end - self._start
282 if self._chunk_size == 0 or self._chunk_size >= to_read:
283 data = f.read(to_read)
284 self._content_hasher.update(data)
285 else:
286 while to_read >= 0:
287 data = f.read(min(self._chunk_size, to_read))
288 if not data:
289 break
290 to_read -= len(data)
291 self._content_hasher.update(data)
292
293 digest = self._content_hasher.compute()
294 return hashing.Digest(self.digest_name, digest.digest_value)
295
296 @property
297 @override
298 def digest_name(self) -> str:
299 if self._digest_name_override is not None:
300 return self._digest_name_override
301 return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"