
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for computing digests for a single file.

Example usage for `SimpleFileHasher`:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("abcd")
>>> hasher = SimpleFileHasher("/tmp/file", SHA256())
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```

Example usage for `ShardedFileHasher`, reading only the second part of a file:
```python
>>> with open("/tmp/file", "w") as f:
...     f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
"""

import pathlib
from typing import Optional

from typing_extensions import override

from model_signing._hashing import hashing



class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    This class is intentionally empty (and abstract, via inheritance) to be
    used only as a type annotation (to signal that an API expects a hasher
    capable of hashing files, instead of any `HashEngine` instance).
    """
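
    # A hypothetical sketch (not part of this API) of using `FileHasher` as
    # an annotation, so callers must supply a file-capable hasher rather than
    # any `HashEngine`:
    #
    #     def digest_file(hasher: FileHasher) -> hashing.Digest:
    #         return hasher.compute()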



class SimpleFileHasher(FileHasher):
    """Simple file hash engine that computes the digest iteratively.

    To compute the hash of a file, we read the file exactly once, including
    for very large files that don't fit in memory. Files are read in chunks
    and each chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    will not change even if the chunk size changes. As such, we can
    dynamically determine an optimal value for the chunk size.
    """
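
    # A minimal sketch of the chunk-size invariance described above; `path`
    # and `SHA256` are illustrative, mirroring the module docstring:
    #
    #     tiny_chunks = SimpleFileHasher(path, SHA256(), chunk_size=1)
    #     one_read = SimpleFileHasher(path, SHA256(), chunk_size=0)
    #     assert (tiny_chunks.compute().digest_value
    #             == one_read.compute().digest_value)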


    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 1_048_576,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
                compute the digest of the file.
            chunk_size: The number of bytes to read at once. Default is 1 MiB.
                A special value of 0 signals to attempt to read everything in
                a single call.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`.

        Args:
            file: The new file to be hashed.
        """
        self._file = file

    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # Since there is no difference between hashing the file with this
        # engine or reading the file in memory and then using the content
        # hasher directly, we must have the same digest_name.
        return self._content_hasher.digest_name

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

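        # Read everything in one call when chunk_size is 0; otherwise stream
        # the file in fixed-size chunks so memory use stays bounded.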

        if self._chunk_size == 0:
            with open(self._file, "rb") as f:
                self._content_hasher.update(f.read())
        else:
            with open(self._file, "rb") as f:
                while True:
                    data = f.read(self._chunk_size)
                    if not data:
                        break
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @property
    @override
    def digest_size(self) -> int:
        return self._content_hasher.digest_size



class ShardedFileHasher(SimpleFileHasher):
    """File hash engine that hashes a portion (shard) of the file.

    By invoking this engine in parallel across disjoint shards, we can speed
    up hashing a single file. However, the hash output depends on the shard
    size.

    It is the responsibility of the user to compose the digests of each shard
    into a single digest for the entire file.
    """
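
    # A minimal sketch of composing shard digests into one whole-file digest.
    # Hashing the concatenation of per-shard digests (with stdlib `hashlib`)
    # is only an illustration; this module does not prescribe a scheme:
    #
    #     hashers = [
    #         ShardedFileHasher("/tmp/file", SHA256(), start=s, end=e)
    #         for s, e in [(0, 4), (4, 8)]
    #     ]
    #     combined = hashlib.sha256(
    #         b"".join(h.compute().digest_value for h in hashers)
    #     ).hexdigest()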


    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 1_048_576,
        shard_size: int = 1_000_000_000,
        digest_name_override: Optional[str] = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.HashEngine` instance used to compute
                the digest of the file shard.
            start: The file offset to start reading from. Must be valid.
                Reset with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`. Reset with `set_shard`.
            chunk_size: The number of bytes to read at once. Default is 1 MiB.
                A special value of 0 signals to attempt to read everything in
                a single call.
            shard_size: The size of a file shard. Default is 1 GB.
            digest_name_override: Optional string to allow overriding the
                `digest_name` property to support shorter, standardized names.
        """

        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self.shard_size = shard_size

        self.set_shard(start=start, end=end)


    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be valid.
            end: The file offset to stop reading at. Must be strictly greater
                than start. The entire shard length must not exceed the
                configured `shard_size`.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly greater than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self.shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self.shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end


    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
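                # Stream the shard in fixed-size chunks, tracking how many
                # bytes remain; an empty read means EOF arrived early.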

                while to_read >= 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)


    @property
    @override
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
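        # Record the shard size in the digest name: the same file hashed with
        # a different shard size yields different per-shard digests.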

        return f"{self._content_hasher.digest_name}-sharded-{self.shard_size}"