Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/model_signing/hashing.py: 86%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

92 statements  

1# Copyright 2024 The Sigstore Authors 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15"""High level API for the hashing interface of `model_signing` library. 

16 

17Hashing is used both for signing and verification and users should ensure that 

18the same configuration is used in both cases. 

19 

20The module could also be used to just hash a single model, without signing it: 

21 

22```python 

23model_signing.hashing.hash(model_path) 

24``` 

25 

26This module allows setting up the hashing configuration to a single variable and 

27then sharing it between signing and verification. 

28 

29```python 

30hashing_config = model_signing.hashing.Config().set_ignored_paths( 

31 paths=["README.md"], ignore_git_paths=True 

32) 

33 

34signing_config = ( 

35 model_signing.signing.Config() 

36 .use_elliptic_key_signer(private_key="key") 

37 .set_hashing_config(hashing_config) 

38) 

39 

40verifying_config = ( 

41 model_signing.verifying.Config() 

42 .use_elliptic_key_verifier(public_key="key.pub") 

43 .set_hashing_config(hashing_config) 

44) 

45``` 

46 

47The API defined here is stable and backwards compatible. 

48""" 

49 

50from collections.abc import Callable, Iterable 

51import os 

52import pathlib 

53import sys 

54from typing import Literal 

55 

56import blake3 

57 

58from model_signing import manifest 

59from model_signing._hashing import hashing 

60from model_signing._hashing import io 

61from model_signing._hashing import memory 

62from model_signing._serialization import file 

63from model_signing._serialization import file_shard 

64 

65 

66if sys.version_info >= (3, 11): 

67 from typing import Self 

68else: 

69 from typing_extensions import Self 

70 

71 

72# `TypeAlias` only exists from Python 3.10 

73# `TypeAlias` is deprecated in Python 3.12 in favor of `type` 

74from typing import TypeAlias 

75 

76 

77# Type alias to support `os.PathLike`, `str` and `bytes` objects in the API 

78# When Python 3.12 is the minimum supported version we can use `type` 

79# When Python 3.11 is the minimum supported version we can use `|` 

80PathLike: TypeAlias = str | bytes | os.PathLike 

81 

82 

def hash(model_path: PathLike) -> manifest.Manifest:
    """Hashes a model using the default configuration.

    Hashing is the shared part between signing and verification and is also
    expected to be the slowest component. When serializing a model, we need to
    spend time proportional to the model size on disk.

    The result is a "manifest" of the model: a collection of every object in
    the model paired with its hash. An object is currently either a file or a
    shard of a file. Large models with large files hash much faster when every
    shard is hashed in parallel, at the cost of a larger signature payload.
    Future releases could support hashing individual tensors or tensor slices
    for further speed optimizations on very large models.

    Args:
        model_path: The path to the model to hash.

    Returns:
        A manifest of the hashed model.
    """
    # Delegate to a fresh default configuration.
    default_config = Config()
    return default_config.hash(model_path)

105 

106 

107class Config: 

108 """Configuration to use when hashing models. 

109 

110 Hashing is the shared part between signing and verification and is also 

111 expected to be the slowest component. When serializing a model, we need to 

112 spend time proportional to the model size on disk. 

113 

114 Hashing builds a "manifest" of the model. A manifest is a collection of 

115 every object in the model, paired with the corresponding hash. Currently, we 

116 consider an object in the model to be either a file or a shard of the file. 

117 Large models with large files will be hashed much faster when every shard is 

118 hashed in parallel, at the cost of generating a larger payload for the 

119 signature. In future releases we could support hashing individual tensors or 

120 tensor slices for further speed optimizations for very large models. 

121 

122 This configuration class supports configuring the hashing granularity. By 

123 default, we hash at file level granularity. 

124 

125 This configuration class also supports configuring the hash method used to 

126 generate the hash for every object in the model. We currently support 

127 SHA256, BLAKE2, and BLAKE3, with SHA256 being the default. 

128 

129 This configuration class also supports configuring which paths from the 

130 model directory should be ignored. These are files that doesn't impact the 

131 behavior of the model, or files that won't be distributed with the model. By 

132 default, only files that are associated with a git repository (`.git`, 

133 `.gitattributes`, `.gitignore`, etc.) are ignored. 

134 """ 

135 

136 def __init__(self): 

137 """Initializes the default configuration for hashing.""" 

138 self._ignored_paths = frozenset() 

139 self._ignore_git_paths = True 

140 self.use_file_serialization() 

141 self._allow_symlinks = False 

142 

143 def hash( 

144 self, 

145 model_path: PathLike, 

146 *, 

147 files_to_hash: Iterable[PathLike] | None = None, 

148 ) -> manifest.Manifest: 

149 """Hashes a model using the current configuration.""" 

150 # All paths in ``_ignored_paths`` are expected to be relative to the 

151 # model directory. Join them to ``model_path`` and ensure they do not 

152 # escape it. 

153 model_path = pathlib.Path(model_path) 

154 ignored_paths = [] 

155 for p in self._ignored_paths: 

156 full = model_path / p 

157 try: 

158 full.relative_to(model_path) 

159 except ValueError: 

160 continue 

161 ignored_paths.append(full) 

162 

163 if self._ignore_git_paths: 

164 ignored_paths.extend( 

165 [ 

166 model_path / p 

167 for p in [ 

168 ".git/", 

169 ".gitattributes", 

170 ".github/", 

171 ".gitignore", 

172 ] 

173 ] 

174 ) 

175 

176 self._serializer.set_allow_symlinks(self._allow_symlinks) 

177 

178 return self._serializer.serialize( 

179 pathlib.Path(model_path), 

180 ignore_paths=ignored_paths, 

181 files_to_hash=files_to_hash, 

182 ) 

183 

184 def _build_stream_hasher( 

185 self, 

186 hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256", 

187 ) -> hashing.StreamingHashEngine: 

188 """Builds a streaming hasher from a constant string. 

189 

190 Args: 

191 hashing_algorithm: The hashing algorithm to use. 

192 

193 Returns: 

194 An instance of the requested hasher. 

195 """ 

196 match hashing_algorithm: 

197 case "sha256": 

198 return memory.SHA256() 

199 case "blake2": 

200 return memory.BLAKE2() 

201 case "blake3": 

202 return memory.BLAKE3() 

203 case _: 

204 raise ValueError( 

205 f"Unsupported hashing method {hashing_algorithm}" 

206 ) 

207 

208 def _build_file_hasher_factory( 

209 self, 

210 hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256", 

211 chunk_size: int = 1048576, 

212 max_workers: int | None = None, 

213 ) -> Callable[[pathlib.Path], io.FileHasher]: 

214 """Builds the hasher factory for a serialization by file. 

215 

216 Args: 

217 hashing_algorithm: The hashing algorithm to use to hash a file. 

218 chunk_size: The amount of file to read at once. Default is 1MB. A 

219 special value of 0 signals to attempt to read everything in a 

220 single call. This is ignored for BLAKE3. 

221 max_workers: Maximum number of workers to use in parallel. Defaults 

222 to the number of logical cores. Only relevant for BLAKE3. 

223 

224 Returns: 

225 The hasher factory that should be used by the active serialization 

226 method. 

227 """ 

228 if max_workers is None: 

229 max_workers = blake3.blake3.AUTO 

230 

231 def _factory(path: pathlib.Path) -> io.FileHasher: 

232 if hashing_algorithm == "blake3": 

233 return io.Blake3FileHasher(path, max_threads=max_workers) 

234 hasher = self._build_stream_hasher(hashing_algorithm) 

235 return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size) 

236 

237 return _factory 

238 

239 def _build_sharded_file_hasher_factory( 

240 self, 

241 hashing_algorithm: Literal["sha256", "blake2"] = "sha256", 

242 chunk_size: int = 1048576, 

243 shard_size: int = 1_000_000_000, 

244 ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]: 

245 """Builds the hasher factory for a serialization by file shards. 

246 

247 This is not recommended for BLAKE3 because it is not necessary. BLAKE3 

248 already operates in parallel. 

249 

250 Args: 

251 hashing_algorithm: The hashing algorithm to use to hash a shard. 

252 chunk_size: The amount of file to read at once. Default is 1MB. A 

253 special value of 0 signals to attempt to read everything in a 

254 single call. 

255 shard_size: The size of a file shard. Default is 1 GB. 

256 

257 Returns: 

258 The hasher factory that should be used by the active serialization 

259 method. 

260 """ 

261 

262 def _factory( 

263 path: pathlib.Path, start: int, end: int 

264 ) -> io.ShardedFileHasher: 

265 hasher = self._build_stream_hasher(hashing_algorithm) 

266 return io.ShardedFileHasher( 

267 path, 

268 hasher, 

269 start=start, 

270 end=end, 

271 chunk_size=chunk_size, 

272 shard_size=shard_size, 

273 ) 

274 

275 return _factory 

276 

277 def use_file_serialization( 

278 self, 

279 *, 

280 hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256", 

281 chunk_size: int = 1048576, 

282 max_workers: int | None = None, 

283 allow_symlinks: bool = False, 

284 ignore_paths: Iterable[pathlib.Path] = frozenset(), 

285 ) -> Self: 

286 """Configures serialization to build a manifest of (file, hash) pairs. 

287 

288 The serialization method in this configuration is changed to one where 

289 every file in the model is paired with its digest and a manifest 

290 containing all these pairings is being built. 

291 

292 Args: 

293 hashing_algorithm: The hashing algorithm to use to hash a file. 

294 chunk_size: The amount of file to read at once. Default is 1MB. A 

295 special value of 0 signals to attempt to read everything in a 

296 single call. Ignored for BLAKE3. 

297 max_workers: Maximum number of workers to use in parallel. Default 

298 is to defer to the `concurrent.futures` library to select the best 

299 value for the current machine, or the number of logical cores 

300 when doing BLAKE3 hashing. When reading files off of slower 

301 hardware like an HDD rather than an SSD, and using BLAKE3, 

302 setting max_workers to 1 may improve performance. 

303 allow_symlinks: Controls whether symbolic links are included. If a 

304 symlink is present but the flag is `False` (default) the 

305 serialization would raise an error. 

306 

307 Returns: 

308 The new hashing configuration with the new serialization method. 

309 """ 

310 self._serializer = file.Serializer( 

311 self._build_file_hasher_factory( 

312 hashing_algorithm, chunk_size, max_workers 

313 ), 

314 max_workers=max_workers, 

315 allow_symlinks=allow_symlinks, 

316 ignore_paths=ignore_paths, 

317 ) 

318 return self 

319 

320 def use_shard_serialization( 

321 self, 

322 *, 

323 hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256", 

324 chunk_size: int = 1048576, 

325 shard_size: int = 1_000_000_000, 

326 max_workers: int | None = None, 

327 allow_symlinks: bool = False, 

328 ignore_paths: Iterable[pathlib.Path] = frozenset(), 

329 ) -> Self: 

330 """Configures serialization to build a manifest of (shard, hash) pairs. 

331 

332 For BLAKE3 this is equivalent to not sharding. Sharding is bypassed 

333 because BLAKE3 already operates in parallel. This means the chunk_size 

334 and shard_size args are ignored. 

335 

336 The serialization method in this configuration is changed to one where 

337 every file in the model is sharded in equal sized shards, every shard is 

338 paired with its digest and a manifest containing all these pairings is 

339 being built. 

340 

341 Args: 

342 hashing_algorithm: The hashing algorithm to use to hash a shard. 

343 chunk_size: The amount of file to read at once. Default is 1MB. A 

344 special value of 0 signals to attempt to read everything in a 

345 single call. 

346 shard_size: The size of a file shard. Default is 1 GB. 

347 max_workers: Maximum number of workers to use in parallel. Default 

348 is to defer to the `concurrent.futures` library to select the best 

349 value for the current machine. 

350 allow_symlinks: Controls whether symbolic links are included. If a 

351 symlink is present but the flag is `False` (default) the 

352 serialization would raise an error. 

353 ignore_paths: Paths of files to ignore. 

354 

355 Returns: 

356 The new hashing configuration with the new serialization method. 

357 """ 

358 if hashing_algorithm == "blake3": 

359 return self.use_file_serialization( 

360 hashing_algorithm=hashing_algorithm, 

361 chunk_size=chunk_size, 

362 max_workers=max_workers, 

363 allow_symlinks=allow_symlinks, 

364 ignore_paths=ignore_paths, 

365 ) 

366 

367 self._serializer = file_shard.Serializer( 

368 self._build_sharded_file_hasher_factory( 

369 hashing_algorithm, chunk_size, shard_size 

370 ), 

371 max_workers=max_workers, 

372 allow_symlinks=allow_symlinks, 

373 ignore_paths=ignore_paths, 

374 ) 

375 return self 

376 

377 def set_ignored_paths( 

378 self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True 

379 ) -> Self: 

380 """Configures the paths to be ignored during serialization of a model. 

381 

382 If the model is a single file, there are no paths that are ignored. If 

383 the model is a directory, all paths are considered as relative to the 

384 model directory, since we never look at files outside of it. 

385 

386 If an ignored path is a directory, serialization will ignore both the 

387 path and any of its children. 

388 

389 Args: 

390 paths: The paths to ignore. 

391 ignore_git_paths: Whether to ignore git related paths (default) or 

392 include them in the signature. 

393 

394 Returns: 

395 The new hashing configuration with a new set of ignored paths. 

396 """ 

397 # Preserve the user-provided relative paths; they are resolved against 

398 # the model directory later when hashing. 

399 self._ignored_paths = frozenset(pathlib.Path(p) for p in paths) 

400 self._ignore_git_paths = ignore_git_paths 

401 return self 

402 

403 def add_ignored_paths( 

404 self, *, model_path: PathLike, paths: Iterable[PathLike] 

405 ) -> None: 

406 """Add more paths to ignore to existing set of paths. 

407 

408 Args: 

409 model_path: The path to the model 

410 paths: Additional paths to ignore. All path must be relative to 

411 the model directory. 

412 """ 

413 newset = set(self._ignored_paths) 

414 model_path = pathlib.Path(model_path) 

415 for p in paths: 

416 candidate = pathlib.Path(p) 

417 full = model_path / candidate 

418 try: 

419 full.relative_to(model_path) 

420 except ValueError: 

421 continue 

422 newset.add(candidate) 

423 self._ignored_paths = newset 

424 

425 def set_allow_symlinks(self, allow_symlinks: bool) -> Self: 

426 """Set whether following symlinks is allowed.""" 

427 self._allow_symlinks = allow_symlinks 

428 return self