Coverage Report

Created: 2026-05-30 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/trainer_fuzzer.cc
Line
Count
Source
1
// Copyright 2026 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#include <cstddef>
16
#include <cstdint>
17
#include <string>
18
#include <vector>
19
20
#include <fuzzer/FuzzedDataProvider.h>
21
#include "sentencepiece_trainer.h"
22
23
10.1k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
24
10.1k
  FuzzedDataProvider fdp(data, size);
25
26
  // Training is expensive, so cap the workload: at most 50 sentences of at most 200 characters each.
27
  // The vocab_size is likewise restricted to the range [10, 100].
28
  
29
10.1k
  int vocab_size = fdp.ConsumeIntegralInRange<int>(10, 100);
30
10.1k
  std::string model_type;
31
10.1k
  switch (fdp.ConsumeIntegralInRange<int>(0, 3)) {
32
6.89k
    case 0: model_type = "unigram"; break;
33
1.81k
    case 1: model_type = "bpe"; break;
34
1.17k
    case 2: model_type = "word"; break;
35
246
    case 3: model_type = "char"; break;
36
10.1k
  }
37
38
  // No --model_prefix flag is added to args; instead Train() is handed a pointer to a string
39
  // that receives the serialized model proto. This is intended to avoid writing to disk.
40
10.1k
  std::string args = "--vocab_size=" + std::to_string(vocab_size) +
41
10.1k
                     " --model_type=" + model_type;
42
43
  // Optionally append further common flags; each one is gated on its own fuzzer-chosen bool.
44
10.1k
  if (fdp.ConsumeBool()) {
45
2.54k
    args += " --character_coverage=" + std::to_string(fdp.ConsumeFloatingPointInRange<float>(0.98, 1.0));
46
2.54k
  }
47
10.1k
  if (fdp.ConsumeBool()) {
48
2.04k
    args += " --input_sentence_size=" + std::to_string(fdp.ConsumeIntegralInRange<int>(100, 500));
49
2.04k
  }
50
10.1k
  if (fdp.ConsumeBool()) {
51
2.53k
    args += " --shuffle_input_sentence=" + std::string(fdp.ConsumeBool() ? "true" : "false");
52
2.53k
  }
53
10.1k
  if (fdp.ConsumeBool()) {
54
3.55k
    args += " --split_by_unicode_script=" + std::string(fdp.ConsumeBool() ? "true" : "false");
55
3.55k
  }
56
10.1k
  if (fdp.ConsumeBool()) {
57
2.64k
    args += " --split_by_whitespace=" + std::string(fdp.ConsumeBool() ? "true" : "false");
58
2.64k
  }
59
10.1k
  if (fdp.ConsumeBool()) {
60
2.75k
    args += " --split_by_number=" + std::string(fdp.ConsumeBool() ? "true" : "false");
61
2.75k
  }
62
10.1k
  if (fdp.ConsumeBool()) {
63
1.65k
    args += " --byte_fallback=" + std::string(fdp.ConsumeBool() ? "true" : "false");
64
1.65k
  }
65
  
66
  // Always train with a single thread; mandatory for performance in fuzzing.
67
10.1k
  args += " --num_threads=1";
68
69
  // Draw between 1 and 50 candidate training sentences from the fuzz input; empty draws are skipped.
70
10.1k
  int num_sentences = fdp.ConsumeIntegralInRange<int>(1, 50);
71
10.1k
  std::vector<std::string> sentences;
72
269k
  for (int i = 0; i < num_sentences; ++i) {
73
    // Keep sentences relatively short: each one is at most 200 characters.
74
259k
    std::string s = fdp.ConsumeRandomLengthString(200);
75
259k
    if (!s.empty()) {
76
65.3k
      sentences.push_back(s);
77
65.3k
    }
78
259k
  }
79
80
  // Fall back to a fixed sentence when no non-empty sentence was drawn, to avoid immediate
81
  // failure in some trainer types that CHECK(!sentences_.empty()).
82
10.1k
  if (sentences.empty()) {
83
452
    sentences.push_back("the quick brown fox jumps over the lazy dog");
84
452
  }
85
86
10.1k
  std::string serialized_model_proto;
87
  // Train can return various errors for invalid combinations of parameters,
88
  // which is expected, so its return value is deliberately ignored here.
89
10.1k
  sentencepiece::SentencePieceTrainer::Train(args, sentences, &serialized_model_proto);
90
91
10.1k
  return 0;
92
10.1k
}