/src/trainer_fuzzer.cc

Source
// Copyright 2026 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

#include <fuzzer/FuzzedDataProvider.h>
#include "sentencepiece_trainer.h"

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  FuzzedDataProvider fdp(data, size);

  // Training is expensive, so we limit the number of sentences and their length.
  // We also limit the vocab_size.
  
  int vocab_size = fdp.ConsumeIntegralInRange<int>(10, 100);
  std::string model_type;
  switch (fdp.ConsumeIntegralInRange<int>(0, 3)) {
    case 0: model_type = "unigram"; break;
    case 1: model_type = "bpe"; break;
    case 2: model_type = "word"; break;
    case 3: model_type = "char"; break;
  }

  // We use an empty model_prefix and pass a pointer to a string to receive
  // the serialized model proto. This avoids writing to disk.
  std::string args = "--vocab_size=" + std::to_string(vocab_size) +
                     " --model_type=" + model_type;

  // Randomly add some other common flags
  if (fdp.ConsumeBool()) {
    args += " --character_coverage=" + std::to_string(fdp.ConsumeFloatingPointInRange<float>(0.98, 1.0));
  }
  if (fdp.ConsumeBool()) {
    args += " --input_sentence_size=" + std::to_string(fdp.ConsumeIntegralInRange<int>(100, 500));
  }
  if (fdp.ConsumeBool()) {
    args += " --shuffle_input_sentence=" + std::string(fdp.ConsumeBool() ? "true" : "false");
  }
  if (fdp.ConsumeBool()) {
    args += " --split_by_unicode_script=" + std::string(fdp.ConsumeBool() ? "true" : "false");
  }
  if (fdp.ConsumeBool()) {
    args += " --split_by_whitespace=" + std::string(fdp.ConsumeBool() ? "true" : "false");
  }
  if (fdp.ConsumeBool()) {
    args += " --split_by_number=" + std::string(fdp.ConsumeBool() ? "true" : "false");
  }
  if (fdp.ConsumeBool()) {
    args += " --byte_fallback=" + std::string(fdp.ConsumeBool() ? "true" : "false");
  }
  
  // Mandatory for performance in fuzzing
  args += " --num_threads=1";

  // Generate a small number of training sentences
  int num_sentences = fdp.ConsumeIntegralInRange<int>(1, 50);
  std::vector<std::string> sentences;
  for (int i = 0; i < num_sentences; ++i) {
    // Keep sentences relatively short
    std::string s = fdp.ConsumeRandomLengthString(200);
    if (!s.empty()) {
      sentences.push_back(s);
    }
  }

  // Ensure we have at least one non-empty sentence to avoid immediate 
  // failure in some trainer types that CHECK(!sentences_.empty()).
  if (sentences.empty()) {
    sentences.push_back("the quick brown fox jumps over the lazy dog");
  }

  std::string serialized_model_proto;
  // Train can return various errors for invalid combinations of parameters,
  // which is expected.
  sentencepiece::SentencePieceTrainer::Train(args, sentences, &serialized_model_proto);

  return 0;
}

Line	Count	Source
1		// Copyright 2026 Google LLC
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// http://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		#include <cstddef>
16		#include <cstdint>
17		#include <string>
18		#include <vector>
19
20		#include <fuzzer/FuzzedDataProvider.h>
21		#include "sentencepiece_trainer.h"
22
23	11.6k	extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
24	11.6k	FuzzedDataProvider fdp(data, size);
25
26		// Training is expensive, so we limit the number of sentences and their length.
27		// We also limit the vocab_size.
28
29	11.6k	int vocab_size = fdp.ConsumeIntegralInRange<int>(10, 100);
30	11.6k	std::string model_type;
31	11.6k	switch (fdp.ConsumeIntegralInRange<int>(0, 3)) {
32	8.15k	case 0: model_type = "unigram"; break;
33	1.79k	case 1: model_type = "bpe"; break;
34	1.43k	case 2: model_type = "word"; break;
35	324	case 3: model_type = "char"; break;
36	11.6k	}
37
38		// We use an empty model_prefix and pass a pointer to a string to receive
39		// the serialized model proto. This avoids writing to disk.
40	11.6k	std::string args = "--vocab_size=" + std::to_string(vocab_size) +
41	11.6k	" --model_type=" + model_type;
42
43		// Randomly add some other common flags
44	11.6k	if (fdp.ConsumeBool()) {
45	3.97k	args += " --character_coverage=" + std::to_string(fdp.ConsumeFloatingPointInRange<float>(0.98, 1.0));
46	3.97k	}
47	11.6k	if (fdp.ConsumeBool()) {
48	2.56k	args += " --input_sentence_size=" + std::to_string(fdp.ConsumeIntegralInRange<int>(100, 500));
49	2.56k	}
50	11.6k	if (fdp.ConsumeBool()) {
51	2.90k	args += " --shuffle_input_sentence=" + std::string(fdp.ConsumeBool() ? "true" : "false");
52	2.90k	}
53	11.6k	if (fdp.ConsumeBool()) {
54	4.74k	args += " --split_by_unicode_script=" + std::string(fdp.ConsumeBool() ? "true" : "false");
55	4.74k	}
56	11.6k	if (fdp.ConsumeBool()) {
57	3.22k	args += " --split_by_whitespace=" + std::string(fdp.ConsumeBool() ? "true" : "false");
58	3.22k	}
59	11.6k	if (fdp.ConsumeBool()) {
60	3.64k	args += " --split_by_number=" + std::string(fdp.ConsumeBool() ? "true" : "false");
61	3.64k	}
62	11.6k	if (fdp.ConsumeBool()) {
63	2.07k	args += " --byte_fallback=" + std::string(fdp.ConsumeBool() ? "true" : "false");
64	2.07k	}
65
66		// Mandatory for performance in fuzzing
67	11.6k	args += " --num_threads=1";
68
69		// Generate a small number of training sentences
70	11.6k	int num_sentences = fdp.ConsumeIntegralInRange<int>(1, 50);
71	11.6k	std::vector<std::string> sentences;
72	286k	for (int i = 0; i < num_sentences; ++i) {
73		// Keep sentences relatively short
74	274k	std::string s = fdp.ConsumeRandomLengthString(200);
75	274k	if (!s.empty()) {
76	55.4k	sentences.push_back(s);
77	55.4k	}
78	274k	}
79
80		// Ensure we have at least one non-empty sentence to avoid immediate
81		// failure in some trainer types that CHECK(!sentences_.empty()).
82	11.6k	if (sentences.empty()) {
83	260	sentences.push_back("the quick brown fox jumps over the lazy dog");
84	260	}
85
86	11.6k	std::string serialized_model_proto;
87		// Train can return various errors for invalid combinations of parameters,
88		// which is expected.
89	11.6k	sentencepiece::SentencePieceTrainer::Train(args, sentences, &serialized_model_proto);
90
91	11.6k	return 0;
92	11.6k	}

Coverage Report

Created: 2026-05-04 07:01