/src/model_load_fuzzer.cc
Line | Count | Source |
1 | | // Copyright 2026 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | // Fuzzer for SentencePiece model loading and post-load operations. |
16 | | // Feeds arbitrary binary data as a serialized ModelProto; if the model loads, |
17 | | // exercises encoding/decoding, normalization, vocabulary lookups and sampling. |
18 | | |
19 | | #include <cstddef> |
20 | | #include <cstdint> |
21 | | #include <string> |
22 | | #include <vector> |
23 | | |
24 | | #include <fuzzer/FuzzedDataProvider.h> |
25 | | #include "sentencepiece_processor.h" |
26 | | |
27 | 4.72k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { |
28 | 4.72k | if (size < 4) |
29 | 2 | return 0; |
30 | | 
31 | 4.72k | FuzzedDataProvider fdp(data, size); |
32 | | 
33 | | // Split the input: a random-length prefix becomes the candidate model bytes, the remaining bytes become the test text |
34 | 4.72k | std::string model_data = fdp.ConsumeRandomLengthString(size); |
35 | 4.72k | std::string test_text = fdp.ConsumeRemainingBytesAsString(); |
36 | | 
37 | 4.72k | sentencepiece::SentencePieceProcessor processor; |
38 | | 
39 | | // Try to load the fuzz-provided bytes as a serialized model proto; inputs that fail to load end the run here |
40 | 4.72k | auto status = processor.LoadFromSerializedProto(model_data); |
41 | 4.72k | if (!status.ok()) |
42 | 3.16k | return 0; |
43 | | 
44 | | // Model loaded successfully - exercise the processor API below (none of the calls has its return value checked) |
45 | | 
46 | | // Encode the test text into piece strings |
47 | 1.55k | std::vector<std::string> pieces; |
48 | 1.55k | processor.Encode(test_text, &pieces); |
49 | | 
50 | | // Encode the same text into integer IDs |
51 | 1.55k | std::vector<int> ids; |
52 | 1.55k | processor.Encode(test_text, &ids); |
53 | | 
54 | | // Decode the pieces back into a string, only when encoding produced at least one piece |
55 | 1.55k | if (!pieces.empty()) { |
56 | 1.49k | std::string decoded; |
57 | 1.49k | processor.Decode(pieces, &decoded); |
58 | 1.49k | } |
59 | | 
60 | | // Decode the IDs back into a string, only when encoding produced at least one ID |
61 | 1.55k | if (!ids.empty()) { |
62 | 1.49k | std::string decoded; |
63 | 1.49k | processor.Decode(ids, &decoded); |
64 | 1.49k | } |
65 | | 
66 | | // Normalize the raw test text into a separate output string |
67 | 1.55k | std::string normalized; |
68 | 1.55k | processor.Normalize(test_text, &normalized); |
69 | | 
70 | | // Vocabulary operations, skipped when the model reports no pieces |
71 | 1.55k | int vocab_size = processor.GetPieceSize(); |
72 | 1.55k | if (vocab_size > 0) { |
73 | | // IdToPiece -> PieceToId round-trip plus score/type queries, capped at the first 10 IDs |
74 | 11.4k | for (int i = 0; i < vocab_size && i < 10; i++) { |
75 | 9.94k | std::string piece = processor.IdToPiece(i); |
76 | 9.94k | processor.PieceToId(piece); |
77 | 9.94k | processor.GetScore(i); |
78 | 9.94k | processor.IsUnknown(i); |
79 | 9.94k | processor.IsControl(i); |
80 | 9.94k | processor.IsUnused(i); |
81 | 9.94k | processor.IsByte(i); |
82 | 9.94k | } |
83 | | 
84 | | // Look up the whole test text as if it were a single piece |
85 | 1.55k | processor.PieceToId(test_text); |
86 | 1.55k | } |
87 | | 
88 | | // Query the special token IDs (unk, bos, eos, pad); the returned values are not inspected |
89 | 1.55k | processor.unk_id(); |
90 | 1.55k | processor.bos_id(); |
91 | 1.55k | processor.eos_id(); |
92 | 1.55k | processor.pad_id(); |
93 | | 
94 | | // N-best encoding, requesting only 2 candidates to avoid slowness |
95 | 1.55k | std::vector<std::vector<std::string>> nbest_pieces; |
96 | 1.55k | processor.NBestEncode(test_text, 2, &nbest_pieces); |
97 | | 
98 | | // Sampled encoding with arguments 1 and 0.5 (presumably nbest_size and alpha - confirm against the API) |
99 | 1.55k | std::vector<std::string> sampled; |
100 | 1.55k | processor.SampleEncode(test_text, 1, 0.5, &sampled); |
101 | | 
102 | | // Plain and sampled encoding again, this time through the serialized-proto variants |
103 | 1.55k | processor.EncodeAsSerializedProto(test_text); |
104 | 1.55k | processor.SampleEncodeAsSerializedProto(test_text, 1, 0.5); |
105 | | 
106 | | // Retrieve the serialized model proto from the processor |
107 | 1.55k | processor.serialized_model_proto(); |
108 | | 
109 | 1.55k | return 0; |
110 | 4.72k | } |