Line | Count | Source |
1 | | // Copyright 2026 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include <cstddef> |
16 | | #include <cstdint> |
17 | | #include <string> |
18 | | #include <vector> |
19 | | |
20 | | #include <fuzzer/FuzzedDataProvider.h> |
21 | | #include "sentencepiece_trainer.h" |
22 | | |
23 | 10.1k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { |
24 | 10.1k | FuzzedDataProvider fdp(data, size); |
25 | | |
26 | | // Training is expensive, so we limit the number of sentences and their length. |
27 | | // We also limit the vocab_size. |
28 | | |
29 | 10.1k | int vocab_size = fdp.ConsumeIntegralInRange<int>(10, 100); |
30 | 10.1k | std::string model_type; |
31 | 10.1k | switch (fdp.ConsumeIntegralInRange<int>(0, 3)) { |
32 | 6.89k | case 0: model_type = "unigram"; break; |
33 | 1.81k | case 1: model_type = "bpe"; break; |
34 | 1.17k | case 2: model_type = "word"; break; |
35 | 246 | case 3: model_type = "char"; break; |
36 | 10.1k | } |
37 | | |
38 | | // We use an empty model_prefix and pass a pointer to a string to receive |
39 | | // the serialized model proto. This avoids writing to disk. |
40 | 10.1k | std::string args = "--vocab_size=" + std::to_string(vocab_size) + |
41 | 10.1k | " --model_type=" + model_type; |
42 | | |
43 | | // Randomly add some other common flags |
44 | 10.1k | if (fdp.ConsumeBool()) { |
45 | 2.54k | args += " --character_coverage=" + std::to_string(fdp.ConsumeFloatingPointInRange<float>(0.98, 1.0)); |
46 | 2.54k | } |
47 | 10.1k | if (fdp.ConsumeBool()) { |
48 | 2.04k | args += " --input_sentence_size=" + std::to_string(fdp.ConsumeIntegralInRange<int>(100, 500)); |
49 | 2.04k | } |
50 | 10.1k | if (fdp.ConsumeBool()) { |
51 | 2.53k | args += " --shuffle_input_sentence=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
52 | 2.53k | } |
53 | 10.1k | if (fdp.ConsumeBool()) { |
54 | 3.55k | args += " --split_by_unicode_script=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
55 | 3.55k | } |
56 | 10.1k | if (fdp.ConsumeBool()) { |
57 | 2.64k | args += " --split_by_whitespace=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
58 | 2.64k | } |
59 | 10.1k | if (fdp.ConsumeBool()) { |
60 | 2.75k | args += " --split_by_number=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
61 | 2.75k | } |
62 | 10.1k | if (fdp.ConsumeBool()) { |
63 | 1.65k | args += " --byte_fallback=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
64 | 1.65k | } |
65 | | |
66 | | // Mandatory for performance in fuzzing |
67 | 10.1k | args += " --num_threads=1"; |
68 | | |
69 | | // Generate a small number of training sentences |
70 | 10.1k | int num_sentences = fdp.ConsumeIntegralInRange<int>(1, 50); |
71 | 10.1k | std::vector<std::string> sentences; |
72 | 269k | for (int i = 0; i < num_sentences; ++i) { |
73 | | // Keep sentences relatively short |
74 | 259k | std::string s = fdp.ConsumeRandomLengthString(200); |
75 | 259k | if (!s.empty()) { |
76 | 65.3k | sentences.push_back(s); |
77 | 65.3k | } |
78 | 259k | } |
79 | | |
80 | | // Ensure we have at least one non-empty sentence to avoid immediate |
81 | | // failure in some trainer types that CHECK(!sentences_.empty()). |
82 | 10.1k | if (sentences.empty()) { |
83 | 452 | sentences.push_back("the quick brown fox jumps over the lazy dog"); |
84 | 452 | } |
85 | | |
86 | 10.1k | std::string serialized_model_proto; |
87 | | // Train can return various errors for invalid combinations of parameters, |
88 | | // which is expected. |
89 | 10.1k | sentencepiece::SentencePieceTrainer::Train(args, sentences, &serialized_model_proto); |
90 | | |
91 | 10.1k | return 0; |
92 | 10.1k | } |