Line | Count | Source |
1 | | // Copyright 2026 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include <cstddef> |
16 | | #include <cstdint> |
17 | | #include <string> |
18 | | #include <vector> |
19 | | |
20 | | #include <fuzzer/FuzzedDataProvider.h> |
21 | | #include "sentencepiece_trainer.h" |
22 | | |
23 | 11.6k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { |
24 | 11.6k | FuzzedDataProvider fdp(data, size); |
25 | | |
26 | | // Training is expensive, so we limit the number of sentences and their length. |
27 | | // We also limit the vocab_size. |
28 | | |
29 | 11.6k | int vocab_size = fdp.ConsumeIntegralInRange<int>(10, 100); |
30 | 11.6k | std::string model_type; |
31 | 11.6k | switch (fdp.ConsumeIntegralInRange<int>(0, 3)) { |
32 | 8.15k | case 0: model_type = "unigram"; break; |
33 | 1.79k | case 1: model_type = "bpe"; break; |
34 | 1.43k | case 2: model_type = "word"; break; |
35 | 324 | case 3: model_type = "char"; break; |
36 | 11.6k | } |
37 | | |
38 | | // We use an empty model_prefix and pass a pointer to a string to receive |
39 | | // the serialized model proto. This avoids writing to disk. |
40 | 11.6k | std::string args = "--vocab_size=" + std::to_string(vocab_size) + |
41 | 11.6k | " --model_type=" + model_type; |
42 | | |
43 | | // Randomly add some other common flags |
44 | 11.6k | if (fdp.ConsumeBool()) { |
45 | 3.97k | args += " --character_coverage=" + std::to_string(fdp.ConsumeFloatingPointInRange<float>(0.98, 1.0)); |
46 | 3.97k | } |
47 | 11.6k | if (fdp.ConsumeBool()) { |
48 | 2.56k | args += " --input_sentence_size=" + std::to_string(fdp.ConsumeIntegralInRange<int>(100, 500)); |
49 | 2.56k | } |
50 | 11.6k | if (fdp.ConsumeBool()) { |
51 | 2.90k | args += " --shuffle_input_sentence=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
52 | 2.90k | } |
53 | 11.6k | if (fdp.ConsumeBool()) { |
54 | 4.74k | args += " --split_by_unicode_script=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
55 | 4.74k | } |
56 | 11.6k | if (fdp.ConsumeBool()) { |
57 | 3.22k | args += " --split_by_whitespace=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
58 | 3.22k | } |
59 | 11.6k | if (fdp.ConsumeBool()) { |
60 | 3.64k | args += " --split_by_number=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
61 | 3.64k | } |
62 | 11.6k | if (fdp.ConsumeBool()) { |
63 | 2.07k | args += " --byte_fallback=" + std::string(fdp.ConsumeBool() ? "true" : "false"); |
64 | 2.07k | } |
65 | | |
66 | | // Mandatory for performance in fuzzing |
67 | 11.6k | args += " --num_threads=1"; |
68 | | |
69 | | // Generate a small number of training sentences |
70 | 11.6k | int num_sentences = fdp.ConsumeIntegralInRange<int>(1, 50); |
71 | 11.6k | std::vector<std::string> sentences; |
72 | 286k | for (int i = 0; i < num_sentences; ++i) { |
73 | | // Keep sentences relatively short |
74 | 274k | std::string s = fdp.ConsumeRandomLengthString(200); |
75 | 274k | if (!s.empty()) { |
76 | 55.4k | sentences.push_back(s); |
77 | 55.4k | } |
78 | 274k | } |
79 | | |
80 | | // Ensure we have at least one non-empty sentence to avoid immediate |
81 | | // failure in some trainer types that CHECK(!sentences_.empty()). |
82 | 11.6k | if (sentences.empty()) { |
83 | 260 | sentences.push_back("the quick brown fox jumps over the lazy dog"); |
84 | 260 | } |
85 | | |
86 | 11.6k | std::string serialized_model_proto; |
87 | | // Train can return various errors for invalid combinations of parameters, |
88 | | // which is expected. |
89 | 11.6k | sentencepiece::SentencePieceTrainer::Train(args, sentences, &serialized_model_proto); |
90 | | |
91 | 11.6k | return 0; |
92 | 11.6k | } |