Coverage Report

Created: 2026-05-04 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/trainer_fuzzer.cc
Line
Count
Source
1
// Copyright 2026 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#include <cstddef>
16
#include <cstdint>
17
#include <string>
18
#include <vector>
19
20
#include <fuzzer/FuzzedDataProvider.h>
21
#include "sentencepiece_trainer.h"
22
23
11.6k
// libFuzzer entry point. Derives a SentencePiece training flag string and a
// small in-memory corpus from the fuzz input, runs
// SentencePieceTrainer::Train() on them, and always returns 0.
//
// data/size: the raw fuzz input. It is consumed in order through
// FuzzedDataProvider, so the sequence of Consume*() calls below defines the
// input format; reordering them changes how existing inputs are interpreted.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
24
11.6k
  FuzzedDataProvider fdp(data, size);
25
26
  // Training is expensive, so we limit the number of sentences and their length.
27
  // We also limit the vocab_size (requested range: [10, 100]).
28
  
29
11.6k
  int vocab_size = fdp.ConsumeIntegralInRange<int>(10, 100);
30
11.6k
  std::string model_type;
31
11.6k
  // The requested range [0, 3] matches the four cases below one-to-one, which
  // is why the switch has no default branch.
  switch (fdp.ConsumeIntegralInRange<int>(0, 3)) {
32
8.15k
    case 0: model_type = "unigram"; break;
33
1.79k
    case 1: model_type = "bpe"; break;
34
1.43k
    case 2: model_type = "word"; break;
35
324
    case 3: model_type = "char"; break;
36
11.6k
  }
37
38
  // No --model_prefix flag is added to args. Instead, Train() below is given a
  // pointer to a string that receives the serialized model proto.
39
  // The intent is to avoid writing to disk (NOTE(review): confirm against Train()).
40
11.6k
  std::string args = "--vocab_size=" + std::to_string(vocab_size) +
41
11.6k
                     " --model_type=" + model_type;
42
43
  // Randomly add some other common flags. Each flag is gated by its own
  // fuzz-chosen bool, so any subset of them may be appended.
44
11.6k
  if (fdp.ConsumeBool()) {
45
3.97k
    args += " --character_coverage=" + std::to_string(fdp.ConsumeFloatingPointInRange<float>(0.98, 1.0));
46
3.97k
  }
47
11.6k
  if (fdp.ConsumeBool()) {
48
2.56k
    args += " --input_sentence_size=" + std::to_string(fdp.ConsumeIntegralInRange<int>(100, 500));
49
2.56k
  }
50
11.6k
  if (fdp.ConsumeBool()) {
51
2.90k
    args += " --shuffle_input_sentence=" + std::string(fdp.ConsumeBool() ? "true" : "false");
52
2.90k
  }
53
11.6k
  if (fdp.ConsumeBool()) {
54
4.74k
    args += " --split_by_unicode_script=" + std::string(fdp.ConsumeBool() ? "true" : "false");
55
4.74k
  }
56
11.6k
  if (fdp.ConsumeBool()) {
57
3.22k
    args += " --split_by_whitespace=" + std::string(fdp.ConsumeBool() ? "true" : "false");
58
3.22k
  }
59
11.6k
  if (fdp.ConsumeBool()) {
60
3.64k
    args += " --split_by_number=" + std::string(fdp.ConsumeBool() ? "true" : "false");
61
3.64k
  }
62
11.6k
  if (fdp.ConsumeBool()) {
63
2.07k
    args += " --byte_fallback=" + std::string(fdp.ConsumeBool() ? "true" : "false");
64
2.07k
  }
65
  
66
  // Mandatory for performance in fuzzing; appended unconditionally.
67
11.6k
  args += " --num_threads=1";
68
69
  // Generate a small number of training sentences (between 1 and 50 attempts).
70
11.6k
  int num_sentences = fdp.ConsumeIntegralInRange<int>(1, 50);
71
11.6k
  std::vector<std::string> sentences;
72
286k
  for (int i = 0; i < num_sentences; ++i) {
73
    // Keep sentences relatively short (a limit of 200 is requested per string).
    // Empty strings are dropped, so fewer than num_sentences may be kept.
74
274k
    std::string s = fdp.ConsumeRandomLengthString(200);
75
274k
    if (!s.empty()) {
76
55.4k
      sentences.push_back(s);
77
55.4k
    }
78
274k
  }
79
80
  // Ensure we have at least one non-empty sentence to avoid immediate
81
  // failure in some trainer types that CHECK(!sentences_.empty()).
82
11.6k
  if (sentences.empty()) {
83
260
    sentences.push_back("the quick brown fox jumps over the lazy dog");
84
260
  }
85
86
11.6k
  std::string serialized_model_proto;
87
  // Train can return various errors for invalid combinations of parameters,
88
  // which is expected, so the value returned by Train() is not inspected.
89
11.6k
  sentencepiece::SentencePieceTrainer::Train(args, sentences, &serialized_model_proto);
90
91
11.6k
  return 0;
92
11.6k
}