/src/processor_text_fuzzer.cc
Line | Count | Source |
1 | | // Copyright 2026 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | // Fuzzer that loads a valid pre-built sentencepiece model (embedded in |
16 | | // the binary as a byte array) and then fuzzes all encoding, decoding, |
17 | | // normalization, and vocabulary operations with fuzz-derived text input. |
18 | | |
19 | | #include <cstddef> |
20 | | #include <cstdint> |
21 | | #include <cstdlib> |
22 | | #include <memory> |
23 | | #include <string> |
24 | | #include <vector> |
25 | | |
26 | | #include <fuzzer/FuzzedDataProvider.h> |
27 | | #include "sentencepiece_processor.h" |
28 | | |
29 | | // Generated at build time by: xxd -i processor_text_fuzzer_model |
30 | | #include "embedded_model.h" |
31 | | |
32 | | static std::unique_ptr<sentencepiece::SentencePieceProcessor> g_processor; |
33 | | |
34 | 1 | extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) { |
35 | 1 | g_processor = std::make_unique<sentencepiece::SentencePieceProcessor>(); |
36 | | |
37 | | // Load the model from the embedded byte array |
38 | 1 | std::string model_data( |
39 | 1 | reinterpret_cast<const char *>(kEmbeddedModelData), |
40 | 1 | kEmbeddedModelSize); |
41 | 1 | auto status = g_processor->LoadFromSerializedProto(model_data); |
42 | 1 | if (!status.ok()) { |
43 | 0 | fprintf(stderr, "Failed to load embedded model: %s\n", |
44 | 0 | status.ToString().c_str()); |
45 | 0 | abort(); |
46 | 0 | } |
47 | | |
48 | 1 | return 0; |
49 | 1 | } |
50 | | |
51 | 0 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { |
52 | 0 | if (!g_processor || size < 2) |
53 | 0 | return 0; |
54 | | |
55 | 0 | FuzzedDataProvider fdp(data, size); |
56 | 0 | uint8_t ops = fdp.ConsumeIntegral<uint8_t>(); |
57 | 0 | std::string text = fdp.ConsumeRemainingBytesAsString(); |
58 | | |
59 | | // === Core encoding operations === |
60 | 0 | if (ops & 0x01) { |
61 | | // Encode to pieces (strings) |
62 | 0 | std::vector<std::string> pieces; |
63 | 0 | g_processor->Encode(text, &pieces); |
64 | | |
65 | | // Decode back from pieces |
66 | 0 | if (!pieces.empty()) { |
67 | 0 | std::string decoded; |
68 | 0 | g_processor->Decode(pieces, &decoded); |
69 | 0 | } |
70 | 0 | } |
71 | |
|
72 | 0 | if (ops & 0x02) { |
73 | | // Encode to IDs |
74 | 0 | std::vector<int> ids; |
75 | 0 | g_processor->Encode(text, &ids); |
76 | | |
77 | | // Decode back from IDs |
78 | 0 | if (!ids.empty()) { |
79 | 0 | std::string decoded; |
80 | 0 | g_processor->Decode(ids, &decoded); |
81 | 0 | } |
82 | 0 | } |
83 | | |
84 | | // === Advanced encoding operations === |
85 | 0 | if (ops & 0x04) { |
86 | | // NBest encoding - use small nbest to avoid being too slow |
87 | 0 | std::vector<std::vector<std::string>> nbest_pieces; |
88 | 0 | g_processor->NBestEncode(text, 3, &nbest_pieces); |
89 | | |
90 | | // NBest encode to IDs |
91 | 0 | std::vector<std::vector<int>> nbest_ids; |
92 | 0 | g_processor->NBestEncode(text, 3, &nbest_ids); |
93 | 0 | } |
94 | |
|
95 | 0 | if (ops & 0x08) { |
96 | | // Sample encoding with various alpha values |
97 | 0 | std::vector<std::string> sampled; |
98 | 0 | g_processor->SampleEncode(text, -1, 0.5, &sampled); |
99 | |
|
100 | 0 | std::vector<int> sampled_ids; |
101 | 0 | g_processor->SampleEncode(text, -1, 0.1, &sampled_ids); |
102 | 0 | } |
103 | |
|
104 | 0 | if (ops & 0x10) { |
105 | | // Encode as serialized proto (exercises protobuf serialization path) |
106 | 0 | auto serialized = g_processor->EncodeAsSerializedProto(text); |
107 | 0 | (void)serialized; |
108 | |
|
109 | 0 | auto nb_serialized = g_processor->NBestEncodeAsSerializedProto(text, 2); |
110 | 0 | (void)nb_serialized; |
111 | |
|
112 | 0 | auto sample_serialized = g_processor->SampleEncodeAsSerializedProto(text, -1, 0.5); |
113 | 0 | (void)sample_serialized; |
114 | 0 | } |
115 | | |
116 | | // === Normalization === |
117 | 0 | if (ops & 0x20) { |
118 | 0 | std::string normalized; |
119 | 0 | g_processor->Normalize(text, &normalized); |
120 | | |
121 | | // Normalize with alignment info |
122 | 0 | std::string normalized2; |
123 | 0 | std::vector<size_t> norm_to_orig; |
124 | 0 | g_processor->Normalize(text, &normalized2, &norm_to_orig); |
125 | 0 | } |
126 | | |
127 | | // === Vocabulary operations === |
128 | 0 | if (ops & 0x40) { |
129 | | // PieceToId with fuzz text |
130 | 0 | g_processor->PieceToId(text); |
131 | | |
132 | | // Try splitting text into substrings and looking them up |
133 | 0 | if (text.size() > 2) { |
134 | 0 | for (size_t i = 0; i < text.size() && i < 8; i++) { |
135 | 0 | std::string sub = text.substr(0, i + 1); |
136 | 0 | int id = g_processor->PieceToId(sub); |
137 | 0 | if (id >= 0 && id < g_processor->GetPieceSize()) { |
138 | 0 | g_processor->IdToPiece(id); |
139 | 0 | g_processor->GetScore(id); |
140 | 0 | g_processor->IsUnknown(id); |
141 | 0 | g_processor->IsControl(id); |
142 | 0 | g_processor->IsUnused(id); |
143 | 0 | g_processor->IsByte(id); |
144 | 0 | } |
145 | 0 | } |
146 | 0 | } |
147 | 0 | } |
148 | | |
149 | | // === Entropy calculation === |
150 | 0 | if (ops & 0x80) { |
151 | 0 | if (text.size() > 0 && text.size() < 256) { |
152 | 0 | float entropy = g_processor->CalculateEntropy(text, 0.5); |
153 | 0 | (void)entropy; |
154 | 0 | } |
155 | 0 | } |
156 | |
|
157 | 0 | return 0; |
158 | 0 | } |