1# Copyright 2024 The Sigstore Authors
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""Machinery for signing and verification of ML models.
16
17The serialization API produces a manifest representation of the models, and we
18use that to implement integrity checking of models in different computational
19patterns. This means that all manifests need to be kept only in memory.
20
Since we need to support multiple signing methods (e.g., Sigstore, key,
certificate, etc.), we provide a `Signer` abstract class with a single `sign`
method that takes a signing payload and converts it to a signature in the
supported format.
25
Finally, every signature needs to be verified. We pair every `Signer` subclass
with a `Verifier` which takes a signature, verifies the authenticity of the
payload and then expands that to a manifest.
29
30Regarding the data formats, for signing, we need to convert the manifest to the
31signing payload. We only support manifests serialized to in-toto formats
32described by https://github.com/in-toto/attestation/tree/main/spec/v1. The
33envelope format is DSSE, as described in
34https://github.com/secure-systems-lab/dsse. The signature is in a Sigstore
35bundle format over the DSSE payload. The format is described at
36https://docs.sigstore.dev/about/bundle/.
37"""
38
39import abc
40import json
41import pathlib
42import sys
43from typing import Any
44
45from in_toto_attestation.v1 import statement
46
47from model_signing import manifest
48from model_signing._hashing import hashing
49from model_signing._hashing import memory
50
51
52if sys.version_info >= (3, 11):
53 from typing import Self
54else:
55 from typing_extensions import Self
56
57
# Payload type that a verified signature must report; `Verifier.verify`
# rejects any other value.
_IN_TOTO_JSON_PAYLOAD_TYPE: str = "application/vnd.in-toto+json"


# Value of the `_type` field that the decoded in-toto statement must carry.
_IN_TOTO_STATEMENT_TYPE: str = "https://in-toto.io/Statement/v1"


# Predicate type written into new payloads and expected when reading them.
_PREDICATE_TYPE: str = "https://model_signing/signature/v1.0"


# Predicate type of payloads in the v0.2 format. Still accepted when reading
# signatures; such payloads go through `dsse_payload_to_manifest_compat`.
_PREDICATE_TYPE_COMPAT: str = "https://model_signing/Digests/v0.1"
72
73
def dsse_payload_to_manifest(dsse_payload: dict[str, Any]) -> manifest.Manifest:
    """Builds a manifest from the DSSE payload read from a signature.

    The payload here is a dictionary that represents the payload part of the
    DSSE envelope contained in the Sigstore bundle. Payloads that declare the
    v0.2 predicate type are delegated to `dsse_payload_to_manifest_compat`.

    Args:
        dsse_payload: The decoded in-toto statement (the DSSE payload) to
          convert to a manifest.

    Returns:
        A manifest representing the signed model.

    Raises:
        ValueError: The payload cannot be deserialized to a manifest: the
          predicate type is not supported, there is not exactly one subject,
          a required field is missing or has an unexpected shape, a digest
          is not valid hex, or the root digest does not match the digest
          recomputed from the listed resources.
    """
    # The payload is read from a signature file, so its shape cannot be
    # trusted. Failed lookups are reported as the documented `ValueError`
    # instead of leaking `KeyError`/`TypeError` to callers.
    try:
        obtained_predicate_type = dsse_payload["predicateType"]
        if obtained_predicate_type != _PREDICATE_TYPE:
            if obtained_predicate_type == _PREDICATE_TYPE_COMPAT:
                return dsse_payload_to_manifest_compat(dsse_payload)
            raise ValueError(
                f"Predicate type mismatch, expected {_PREDICATE_TYPE}, "
                f"got {obtained_predicate_type}"
            )

        subjects = dsse_payload["subject"]
        if len(subjects) != 1:
            raise ValueError(f"Expected only one subject, got {subjects}")

        model_name = subjects[0]["name"]
        expected_digest = subjects[0]["digest"]["sha256"]

        predicate = dsse_payload["predicate"]
        serialization_args = predicate["serialization"]
        serialization = manifest.SerializationType.from_args(
            serialization_args
        )

        # The root digest is SHA256 over the raw resource digests, taken in
        # the order in which the resources are listed in the predicate.
        hasher = memory.SHA256()
        items = []
        for resource in predicate["resources"]:
            name = resource["name"]
            algorithm = resource["algorithm"]
            digest_value = resource["digest"]
            digest = hashing.Digest(algorithm, bytes.fromhex(digest_value))
            hasher.update(digest.digest_value)
            items.append(serialization.new_item(name, digest))
    except (KeyError, TypeError) as e:
        raise ValueError(
            f"Malformed DSSE payload, cannot build a manifest: {e!r}"
        ) from e

    obtained_digest = hasher.compute().digest_hex
    if obtained_digest != expected_digest:
        raise ValueError(
            f"Manifest is inconsistent. Root digest is {expected_digest}, "
            f"but the included resources hash to {obtained_digest}"
        )

    return manifest.Manifest(model_name, items, serialization)
127
128
def dsse_payload_to_manifest_compat(
    dsse_payload: dict[str, Any],
) -> manifest.Manifest:
    """Builds a manifest from a v0.2-format DSSE payload read from a signature.

    This is the same as `dsse_payload_to_manifest` but using a DSSE payload as
    defined at v0.2 release. This was experimental but got used in production
    before v1.0 so we need to patch support to it while verifiers migrate to the
    forward compatible format defined by v1.0.

    In this format every subject describes one item of the model; there is no
    model name and no serialization section, so both are filled with fixed
    placeholder values.

    Args:
        dsse_payload: The decoded in-toto statement (the DSSE payload) to
          convert to a manifest.

    Returns:
        A manifest representing the signed model.

    Raises:
        ValueError: The payload cannot be deserialized to a manifest: a
          required field is missing or has an unexpected shape, or a digest
          is not valid hex.
    """
    # Model name is not defined, use a constant.
    model_name = "compat-undefined-not-present"

    # Serialization format is not present, build a fake one.
    # NOTE(review): `allow_symlinks` is the *string* "false" here, while the
    # v1.0 payload example carries a JSON boolean. A non-empty string is
    # truthy in Python, so confirm how `from_args` interprets this value.
    serialization = manifest.SerializationType.from_args(
        {"method": "files", "hash_type": "sha256", "allow_symlinks": "false"}
    )

    algorithm = "sha256"  # hardcoded, the only supported one

    # The only field with actual content is the subject. The payload is read
    # from a signature file, so failed lookups are reported as the documented
    # `ValueError` instead of leaking `KeyError`/`TypeError` to callers.
    items = []
    try:
        for subject in dsse_payload["subject"]:
            name = subject["name"]
            digest_value = subject["digest"][algorithm]
            digest = hashing.Digest(algorithm, bytes.fromhex(digest_value))
            items.append(serialization.new_item(name, digest))
    except (KeyError, TypeError) as e:
        raise ValueError(
            f"Malformed DSSE payload, cannot build a manifest: {e!r}"
        ) from e

    # There is no verification that the manifest is missing items at this point.
    return manifest.Manifest(model_name, items, serialization)
167
168
class Payload:
    """In-toto payload used to represent a model for signing.

    This payload represents all the objects (files, shards, etc.) of the model
    paired with their hashes. It can be seen as a serialization of a manifest.
    The hashes are all recorded under the predicate, given that for the subject
    we are limited on what hashes we can use
    (https://github.com/sigstore/sigstore-python/issues/1018). Each hash
    follows the format of a ResourceDescriptor: it is an object containing a
    name for the object, the hashing algorithm, and the digest value. These are
    recorded in the predicate, as part of the `"resources"` list.

    The subject is a name for the model (taken from the model's directory) and
    a global digest over all the computed digests. This is SHA256 computed over
    all the digests, in the order they show up in the predicate (we
    canonicalize this to be in alphabetical order). This digest can be used to
    refer to the model from other metadata documents without having to carry
    the entire set of resource descriptors around.

    To ensure backwards compatibility, the predicate contains a
    `"serialization"` section which describes the method used to serialize a
    model to the manifest used to generate this payload. The section includes a
    method name and a list of all relevant values needed to recompute the
    serialization.

    Future extensions to the model signature (e.g., incorporating model cards,
    etc.) can be added as part of the predicate. For v1.0 of the predicate the
    only supported fields in the predicate are `"serialization"` and
    `"resources"`. Any other field should be ignored by verifiers adhering to
    v1.0 version.

    Example:
    ```json
    {
      "_type": "https://in-toto.io/Statement/v1",
      "subject": [
        {
          "name": "sample_model",
          "digest": {
            "sha256": "143cc6..."
          }
        }
      ],
      "predicateType": "https://model_signing/signature/v1.0",
      "predicate": {
        "serialization": {
          "method": "files",
          "hash_type": "sha256",
          "allow_symlinks": true,
          "ignore_paths": [
            "model.sig",
            ".git",
            ".gitattributes",
            ".github",
            ".gitignore"
          ]
        },
        "resources": [
          {
            "algorithm": "sha256",
            "digest": "fdd892...",
            "name": "d0/f00"
          },
          {
            "algorithm": "sha256",
            "digest": "e16940...",
            "name": "d0/f01"
          },
          {
            "algorithm": "sha256",
            "digest": "407822...",
            "name": "d0/f02"
          },
          ...
          {
            "algorithm": "sha256",
            "digest": "912bcf...",
            "name": "f3"
          }
        ]
      }
    }
    ```
    """

    def __init__(self, manifest: manifest.Manifest):
        """Builds an instance of this in-toto payload.

        The resulting in-toto statement is stored in `self.statement`.

        Args:
            manifest: the manifest to convert to signing payload.
        """
        descriptors = list(manifest.resource_descriptors())

        # Root digest: SHA256 over every resource digest, in manifest order.
        root_hasher = memory.SHA256()
        for entry in descriptors:
            root_hasher.update(entry.digest.digest_value)

        # One record per resource, in the same order used for the root digest.
        resources = [
            {
                "name": entry.identifier,
                "algorithm": entry.digest.algorithm,
                "digest": entry.digest.digest_hex,
            }
            for entry in descriptors
        ]

        model_descriptor = statement.ResourceDescriptor(
            name=manifest.model_name,
            digest={"sha256": root_hasher.compute().digest_hex},
        )

        self.statement = statement.Statement(
            subjects=[model_descriptor.pb],
            predicate_type=_PREDICATE_TYPE,
            predicate={
                "serialization": manifest.serialization_type,
                "resources": resources,
                # other properties can go here
            },
        )
288
289
class Signature(abc.ABC):
    """Abstract signature, wrapping a sigstore bundle.

    Only sigstore bundle signature formats are supported. Two concrete classes
    are still needed, because traditional signing and Sigstore signing rely on
    different bundle types: one class wraps `sigstore_models.Bundle`, the other
    wraps the bundle as defined by `sigstore_protobuf_specs`.
    """

    @abc.abstractmethod
    def write(self, path: pathlib.Path) -> None:
        """Persists the signature to disk.

        Args:
            path: The location where the signature is written.
        """

    @classmethod
    @abc.abstractmethod
    def read(cls, path: pathlib.Path) -> Self:
        """Loads a signature from disk.

        No signature verification takes place here, beyond what is required
        to parse the signature file.

        Args:
            path: The location from which the signature is read.

        Returns:
            An instance of the class, suitable to be handed to a `Verifier`
            for signature and integrity verification.
        """
322
323
class Signer(abc.ABC):
    """Abstract base for all signers.

    How the key material is managed is left to each concrete signer.
    """

    @abc.abstractmethod
    def sign(self, payload: Payload) -> Signature:
        """Produces a signature over the given signing payload.

        Args:
            payload: The `Payload` instance that should be signed.

        Returns:
            A valid signature.
        """
340
341
class Verifier(abc.ABC):
    """Generic signature verifier.

    Every subclass of `Verifier` is paired with a subclass of `Signer`. This is
    to ensure that they support the same signature formats as well as have
    similar key materials.

    If the signature is valid, the payload is expanded to a `Manifest` instance
    which can then be used to check the model integrity.
    """

    def verify(self, signature: Signature) -> manifest.Manifest:
        """Verifies the signature.

        The signed content is first checked by `_verify_signed_content`. The
        returned payload must then have the in-toto JSON payload type and
        decode to an in-toto v1 statement, which is converted to a manifest.

        Args:
            signature: The signature to verify.

        Returns:
            A `manifest.Manifest` instance that represents the model.

        Raises:
            ValueError: Signature verification fails. This includes a wrong
              payload type, a payload that is not valid JSON or not a JSON
              object, and a missing or unexpected statement type.
        """
        payload_type, payload = self._verify_signed_content(signature)

        if payload_type != _IN_TOTO_JSON_PAYLOAD_TYPE:
            raise ValueError(
                f"Expected DSSE payload {_IN_TOTO_JSON_PAYLOAD_TYPE}, "
                f"but got {payload_type}"
            )

        # Invalid JSON raises `json.JSONDecodeError`, a `ValueError` subclass,
        # so it already matches the documented contract.
        decoded = json.loads(payload)

        # Valid JSON is not necessarily an object; reject other shapes here
        # instead of failing with `TypeError` on the lookup below.
        if not isinstance(decoded, dict):
            raise ValueError(
                "Expected the DSSE payload to be a JSON object, "
                f"but got {type(decoded).__name__}"
            )

        # `.get` keeps a missing `_type` from surfacing as `KeyError`.
        obtained_type = decoded.get("_type")
        if obtained_type != _IN_TOTO_STATEMENT_TYPE:
            raise ValueError(
                f"Expected in-toto {_IN_TOTO_STATEMENT_TYPE} payload, "
                f"but got {obtained_type}"
            )

        return dsse_payload_to_manifest(decoded)

    @abc.abstractmethod
    def _verify_signed_content(self, signature: Signature) -> tuple[str, bytes]:
        """Verifies the signed content and extracts payload type and payload.

        Subclasses only need to implement this method.

        Args:
            signature: The signature to verify.

        Returns:
            A tuple containing the payload type and the payload. The payload
            is the serialized JSON document; `verify` takes care of decoding
            it.
        """