Coverage Report

Created: 2026-06-22 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/trainer_fuzzer.cc
Line
Count
Source
1
// Copyright 2026 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#include <cstddef>
16
#include <cstdint>
17
#include <string>
18
#include <vector>
19
20
#include <fuzzer/FuzzedDataProvider.h>
21
#include "sentencepiece_trainer.h"
22
23
8.54k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
24
8.54k
  FuzzedDataProvider fdp(data, size);
25
26
  // Training is expensive, so we limit the number of sentences and their length.
27
  // We also limit the vocab_size.
28
  
29
8.54k
  int vocab_size = fdp.ConsumeIntegralInRange<int>(10, 100);
30
8.54k
  std::string model_type;
31
8.54k
  switch (fdp.ConsumeIntegralInRange<int>(0, 3)) {
32
5.91k
    case 0: model_type = "unigram"; break;
33
1.44k
    case 1: model_type = "bpe"; break;
34
933
    case 2: model_type = "word"; break;
35
260
    case 3: model_type = "char"; break;
36
8.54k
  }
37
38
  // We use an empty model_prefix and pass a pointer to a string to receive
39
  // the serialized model proto. This avoids writing to disk.
40
8.54k
  std::string args = "--vocab_size=" + std::to_string(vocab_size) +
41
8.54k
                     " --model_type=" + model_type;
42
43
  // Randomly add some other common flags
44
8.54k
  if (fdp.ConsumeBool()) {
45
2.17k
    args += " --character_coverage=" + std::to_string(fdp.ConsumeFloatingPointInRange<float>(0.98, 1.0));
46
2.17k
  }
47
8.54k
  if (fdp.ConsumeBool()) {
48
1.76k
    args += " --input_sentence_size=" + std::to_string(fdp.ConsumeIntegralInRange<int>(100, 500));
49
1.76k
  }
50
8.54k
  if (fdp.ConsumeBool()) {
51
2.06k
    args += " --shuffle_input_sentence=" + std::string(fdp.ConsumeBool() ? "true" : "false");
52
2.06k
  }
53
8.54k
  if (fdp.ConsumeBool()) {
54
3.10k
    args += " --split_by_unicode_script=" + std::string(fdp.ConsumeBool() ? "true" : "false");
55
3.10k
  }
56
8.54k
  if (fdp.ConsumeBool()) {
57
2.30k
    args += " --split_by_whitespace=" + std::string(fdp.ConsumeBool() ? "true" : "false");
58
2.30k
  }
59
8.54k
  if (fdp.ConsumeBool()) {
60
2.31k
    args += " --split_by_number=" + std::string(fdp.ConsumeBool() ? "true" : "false");
61
2.31k
  }
62
8.54k
  if (fdp.ConsumeBool()) {
63
1.48k
    args += " --byte_fallback=" + std::string(fdp.ConsumeBool() ? "true" : "false");
64
1.48k
  }
65
  
66
  // Mandatory for performance in fuzzing
67
8.54k
  args += " --num_threads=1";
68
69
  // Generate a small number of training sentences
70
8.54k
  int num_sentences = fdp.ConsumeIntegralInRange<int>(1, 50);
71
8.54k
  std::vector<std::string> sentences;
72
234k
  for (int i = 0; i < num_sentences; ++i) {
73
    // Keep sentences relatively short
74
225k
    std::string s = fdp.ConsumeRandomLengthString(200);
75
225k
    if (!s.empty()) {
76
58.8k
      sentences.push_back(s);
77
58.8k
    }
78
225k
  }
79
80
  // Ensure we have at least one non-empty sentence to avoid immediate 
81
  // failure in some trainer types that CHECK(!sentences_.empty()).
82
8.54k
  if (sentences.empty()) {
83
439
    sentences.push_back("the quick brown fox jumps over the lazy dog");
84
439
  }
85
86
8.54k
  std::string serialized_model_proto;
87
  // Train can return various errors for invalid combinations of parameters,
88
  // which is expected.
89
8.54k
  sentencepiece::SentencePieceTrainer::Train(args, sentences, &serialized_model_proto);
90
91
8.54k
  return 0;
92
8.54k
}