/src/model_load_fuzzer.cc

Source
// Copyright 2026 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Fuzzer for SentencePiece model loading and post-load operations.
// Feeds arbitrary binary data as a serialized ModelProto, then exercises
// encoding/decoding if the model loads successfully.

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

#include <fuzzer/FuzzedDataProvider.h>
#include "sentencepiece_processor.h"

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  if (size < 4)
    return 0;

  FuzzedDataProvider fdp(data, size);

  // Split data: most goes to model, some to test text
  std::string model_data = fdp.ConsumeRandomLengthString(size);
  std::string test_text = fdp.ConsumeRemainingBytesAsString();

  sentencepiece::SentencePieceProcessor processor;

  // Try loading fuzz data as a serialized model proto
  auto status = processor.LoadFromSerializedProto(model_data);
  if (!status.ok())
    return 0;

  // Model loaded successfully - exercise all major operations

  // Basic encoding
  std::vector<std::string> pieces;
  processor.Encode(test_text, &pieces);

  // Encode to IDs
  std::vector<int> ids;
  processor.Encode(test_text, &ids);

  // Decode from pieces
  if (!pieces.empty()) {
    std::string decoded;
    processor.Decode(pieces, &decoded);
  }

  // Decode from IDs
  if (!ids.empty()) {
    std::string decoded;
    processor.Decode(ids, &decoded);
  }

  // Normalization
  std::string normalized;
  processor.Normalize(test_text, &normalized);

  // Vocabulary operations
  int vocab_size = processor.GetPieceSize();
  if (vocab_size > 0) {
    // PieceToId / IdToPiece round-trip
    for (int i = 0; i < vocab_size && i < 10; i++) {
      std::string piece = processor.IdToPiece(i);
      processor.PieceToId(piece);
      processor.GetScore(i);
      processor.IsUnknown(i);
      processor.IsControl(i);
      processor.IsUnused(i);
      processor.IsByte(i);
    }

    // Try lookup with test_text as a piece
    processor.PieceToId(test_text);
  }

  // Special token IDs
  processor.unk_id();
  processor.bos_id();
  processor.eos_id();
  processor.pad_id();

  // NBest encoding (with small nbest_size to avoid slowness)
  std::vector<std::vector<std::string>> nbest_pieces;
  processor.NBestEncode(test_text, 2, &nbest_pieces);

  // Sample encoding
  std::vector<std::string> sampled;
  processor.SampleEncode(test_text, 1, 0.5, &sampled);

  // Encode as serialized proto
  processor.EncodeAsSerializedProto(test_text);
  processor.SampleEncodeAsSerializedProto(test_text, 1, 0.5);

  // Get serialized model
  processor.serialized_model_proto();

  return 0;
}

Line	Count	Source
1		// Copyright 2026 Google LLC
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// http://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		// Fuzzer for SentencePiece model loading and post-load operations.
16		// Feeds arbitrary binary data as a serialized ModelProto, then exercises
17		// encoding/decoding if the model loads successfully.
18
19		#include <cstddef>
20		#include <cstdint>
21		#include <string>
22		#include <vector>
23
24		#include <fuzzer/FuzzedDataProvider.h>
25		#include "sentencepiece_processor.h"
26
27	4.72k	extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
28	4.72k	if (size < 4)
29	2	return 0;
30
31	4.72k	FuzzedDataProvider fdp(data, size);
32
33		// Split data: most goes to model, some to test text
34	4.72k	std::string model_data = fdp.ConsumeRandomLengthString(size);
35	4.72k	std::string test_text = fdp.ConsumeRemainingBytesAsString();
36
37	4.72k	sentencepiece::SentencePieceProcessor processor;
38
39		// Try loading fuzz data as a serialized model proto
40	4.72k	auto status = processor.LoadFromSerializedProto(model_data);
41	4.72k	if (!status.ok())
42	3.16k	return 0;
43
44		// Model loaded successfully - exercise all major operations
45
46		// Basic encoding
47	1.55k	std::vector<std::string> pieces;
48	1.55k	processor.Encode(test_text, &pieces);
49
50		// Encode to IDs
51	1.55k	std::vector<int> ids;
52	1.55k	processor.Encode(test_text, &ids);
53
54		// Decode from pieces
55	1.55k	if (!pieces.empty()) {
56	1.49k	std::string decoded;
57	1.49k	processor.Decode(pieces, &decoded);
58	1.49k	}
59
60		// Decode from IDs
61	1.55k	if (!ids.empty()) {
62	1.49k	std::string decoded;
63	1.49k	processor.Decode(ids, &decoded);
64	1.49k	}
65
66		// Normalization
67	1.55k	std::string normalized;
68	1.55k	processor.Normalize(test_text, &normalized);
69
70		// Vocabulary operations
71	1.55k	int vocab_size = processor.GetPieceSize();
72	1.55k	if (vocab_size > 0) {
73		// PieceToId / IdToPiece round-trip
74	11.4k	for (int i = 0; i < vocab_size && i < 10; i++) {
75	9.94k	std::string piece = processor.IdToPiece(i);
76	9.94k	processor.PieceToId(piece);
77	9.94k	processor.GetScore(i);
78	9.94k	processor.IsUnknown(i);
79	9.94k	processor.IsControl(i);
80	9.94k	processor.IsUnused(i);
81	9.94k	processor.IsByte(i);
82	9.94k	}
83
84		// Try lookup with test_text as a piece
85	1.55k	processor.PieceToId(test_text);
86	1.55k	}
87
88		// Special token IDs
89	1.55k	processor.unk_id();
90	1.55k	processor.bos_id();
91	1.55k	processor.eos_id();
92	1.55k	processor.pad_id();
93
94		// NBest encoding (with small nbest_size to avoid slowness)
95	1.55k	std::vector<std::vector<std::string>> nbest_pieces;
96	1.55k	processor.NBestEncode(test_text, 2, &nbest_pieces);
97
98		// Sample encoding
99	1.55k	std::vector<std::string> sampled;
100	1.55k	processor.SampleEncode(test_text, 1, 0.5, &sampled);
101
102		// Encode as serialized proto
103	1.55k	processor.EncodeAsSerializedProto(test_text);
104	1.55k	processor.SampleEncodeAsSerializedProto(test_text, 1, 0.5);
105
106		// Get serialized model
107	1.55k	processor.serialized_model_proto();
108
109	1.55k	return 0;
110	4.72k	}

Coverage Report

Created: 2026-03-31 06:06