# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15"""Machinery for computing digests for a single file.
16
17Example usage for `SimpleFileHasher`:
18```python
19>>> with open("/tmp/file", "w") as f:
20... f.write("abcd")
21>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
22>>> digest = hasher.compute()
23>>> digest.digest_hex
24'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
25```
26
27Example usage for `ShardedFileHasher`, reading only the second part of a file:
28```python
29>>> with open("/tmp/file", "w") as f:
30... f.write("0123abcd")
31>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
32>>> digest = hasher.compute()
33>>> digest.digest_hex
34'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
35```
36"""

import pathlib
from typing import Optional

from typing_extensions import override

from model_signing._hashing import hashing


class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    This class is intentionally empty (and abstract, via inheritance) to be
    used only as a type annotation (to signal that an API expects a hasher
    capable of hashing files, instead of any `HashEngine` instance).
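
    For illustration, an API accepting any file hasher could be annotated as
    follows (a minimal sketch; `compute_file_digest` is a hypothetical helper,
    not part of this module):

    ```python
    def compute_file_digest(hasher: FileHasher) -> hashing.Digest:
        return hasher.compute()
    ```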
52 """


class SimpleFileHasher(FileHasher):
    """Simple file hash engine that computes the digest iteratively.

    To compute the hash of a file, we read the file exactly once, including
    for very large files that don't fit in memory. Files are read in chunks
    and each chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    will not change even if the chunk size changes. As such, we can
    dynamically determine an optimal value for the chunk argument.
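
    For illustration, the digest does not depend on the chunk size (a minimal
    sketch, assuming `/tmp/file` was written as in the module docstring
    example):

    ```python
    >>> a = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=2).compute()
    >>> b = SimpleFileHasher("/tmp/file", SHA256(), chunk_size=0).compute()
    >>> a.digest_hex == b.digest_hex
    True
    ```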
64 """
65
    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 1_048_576,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file.
            chunk_size: The amount of the file to read at once. Default is
                1 MiB (1,048,576 bytes). A special value of 0 signals to
                attempt to read everything in a single call.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # Since there is no difference between hashing the file with this
        # engine or reading the file in memory and then using the content
        # hasher directly, we must have the same digest_name.
        return self._content_hasher.digest_name

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

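        # A chunk size of 0 signals reading the whole file in a single call.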
        if self._chunk_size == 0:
            with open(self._file, "rb") as f:
                self._content_hasher.update(f.read())
        else:
            with open(self._file, "rb") as f:
                while True:
                    data = f.read(self._chunk_size)
                    if not data:
                        break
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_size(self) -> int:
        return self._content_hasher.digest_size


class ShardedFileHasher(SimpleFileHasher):
    """File hash engine that hashes a portion (shard) of the file.

    By invoking this engine in parallel across disjoint shards, we can speed
    up hashing a single file. However, the hash output depends on the shard
    size.

    It is the responsibility of the user to compose the digests of each shard
    into a single digest for the entire file.
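
    For illustration, shards can be hashed in parallel and the shard digests
    composed by the caller. A minimal sketch (the file setup matches the
    module docstring example; the composition shown, hashing the concatenated
    shard digests, is only one possible scheme and is not prescribed by this
    module):

    ```python
    >>> from concurrent.futures import ThreadPoolExecutor
    >>> hashers = [
    ...     ShardedFileHasher("/tmp/file", SHA256(), start=0, end=4),
    ...     ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8),
    ... ]
    >>> with ThreadPoolExecutor() as pool:
    ...     digests = list(pool.map(lambda h: h.compute(), hashers))
    >>> combined = SHA256()
    >>> for d in digests:
    ...     combined.update(d.digest_value)
    >>> file_digest = combined.compute()
    ```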
146 """
147
    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 1_048_576,
        shard_size: int = 1_000_000_000,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file shard.
            start: The file offset to start reading from. Must be valid.
                Reset with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`. Reset with `set_shard`.
            chunk_size: The amount of the file to read at once. Default is
                1 MiB (1,048,576 bytes). A special value of 0 signals to
                attempt to read everything in a single call.
            shard_size: The size of a file shard. Default is 1 GB
                (1,000,000,000 bytes).
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self.shard_size = shard_size

        self.set_shard(start=start, end=end)

    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be valid.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly greater than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self.shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self.shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
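            # Hash only the bytes in [start, end): seek to the shard start
            # and read exactly `end - start` bytes.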
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
                while to_read > 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
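        # The shard size is part of the digest name because the hash output
        # depends on it (e.g., "sha256-sharded-1000000000" with the default
        # shard size, assuming a content hasher named "sha256").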
        return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"