/src/processor_text_fuzzer.cc

Source
// Copyright 2026 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Fuzzer that loads a valid pre-built sentencepiece model (embedded in
// the binary as a byte array) and then fuzzes all encoding, decoding,
// normalization, and vocabulary operations with fuzz-derived text input.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>

#include <fuzzer/FuzzedDataProvider.h>
#include "sentencepiece_processor.h"

// Generated at build time by: xxd -i processor_text_fuzzer_model
#include "embedded_model.h"

namespace {
// Processor shared by every fuzz iteration. It is created and loaded in
// LLVMFuzzerInitialize and only read from LLVMFuzzerTestOneInput.
std::unique_ptr<sentencepiece::SentencePieceProcessor> g_processor;
}  // namespace

// libFuzzer one-time initialization hook.
//
// Creates the global processor and loads the model that is embedded in the
// binary (kEmbeddedModelData / kEmbeddedModelSize from embedded_model.h).
// If loading fails, the error is printed to stderr and the process aborts:
// LLVMFuzzerTestOneInput drives every operation through g_processor, so
// continuing without a loaded model would be pointless.
//
// argc / argv: command-line arguments handed over by the fuzzing engine;
//              not used by this harness.
// Returns 0 on success (failure does not return).
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
  // Explicitly mark the engine-provided arguments as unused.
  (void)argc;
  (void)argv;

  g_processor = std::make_unique<sentencepiece::SentencePieceProcessor>();

  // Copy the embedded byte array into a string so it can be handed to
  // LoadFromSerializedProto.
  const std::string model_data(
      reinterpret_cast<const char *>(kEmbeddedModelData),
      kEmbeddedModelSize);
  auto status = g_processor->LoadFromSerializedProto(model_data);
  if (!status.ok()) {
    std::fprintf(stderr, "Failed to load embedded model: %s\n",
                 status.ToString().c_str());
    std::abort();
  }

  return 0;
}

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  if (!g_processor || size < 2)
    return 0;

  FuzzedDataProvider fdp(data, size);
  uint8_t ops = fdp.ConsumeIntegral<uint8_t>();
  std::string text = fdp.ConsumeRemainingBytesAsString();

  // === Core encoding operations ===
  if (ops & 0x01) {
    // Encode to pieces (strings)
    std::vector<std::string> pieces;
    g_processor->Encode(text, &pieces);

    // Decode back from pieces
    if (!pieces.empty()) {
      std::string decoded;
      g_processor->Decode(pieces, &decoded);
    }
  }

  if (ops & 0x02) {
    // Encode to IDs
    std::vector<int> ids;
    g_processor->Encode(text, &ids);

    // Decode back from IDs
    if (!ids.empty()) {
      std::string decoded;
      g_processor->Decode(ids, &decoded);
    }
  }

  // === Advanced encoding operations ===
  if (ops & 0x04) {
    // NBest encoding - use small nbest to avoid being too slow
    std::vector<std::vector<std::string>> nbest_pieces;
    g_processor->NBestEncode(text, 3, &nbest_pieces);

    // NBest encode to IDs
    std::vector<std::vector<int>> nbest_ids;
    g_processor->NBestEncode(text, 3, &nbest_ids);
  }

  if (ops & 0x08) {
    // Sample encoding with various alpha values
    std::vector<std::string> sampled;
    g_processor->SampleEncode(text, -1, 0.5, &sampled);

    std::vector<int> sampled_ids;
    g_processor->SampleEncode(text, -1, 0.1, &sampled_ids);
  }

  if (ops & 0x10) {
    // Encode as serialized proto (exercises protobuf serialization path)
    auto serialized = g_processor->EncodeAsSerializedProto(text);
    (void)serialized;

    auto nb_serialized = g_processor->NBestEncodeAsSerializedProto(text, 2);
    (void)nb_serialized;

    auto sample_serialized = g_processor->SampleEncodeAsSerializedProto(text, -1, 0.5);
    (void)sample_serialized;
  }

  // === Normalization ===
  if (ops & 0x20) {
    std::string normalized;
    g_processor->Normalize(text, &normalized);

    // Normalize with alignment info
    std::string normalized2;
    std::vector<size_t> norm_to_orig;
    g_processor->Normalize(text, &normalized2, &norm_to_orig);
  }

  // === Vocabulary operations ===
  if (ops & 0x40) {
    // PieceToId with fuzz text
    g_processor->PieceToId(text);

    // Try splitting text into substrings and looking them up
    if (text.size() > 2) {
      for (size_t i = 0; i < text.size() && i < 8; i++) {
        std::string sub = text.substr(0, i + 1);
        int id = g_processor->PieceToId(sub);
        if (id >= 0 && id < g_processor->GetPieceSize()) {
          g_processor->IdToPiece(id);
          g_processor->GetScore(id);
          g_processor->IsUnknown(id);
          g_processor->IsControl(id);
          g_processor->IsUnused(id);
          g_processor->IsByte(id);
        }
      }
    }
  }

  // === Entropy calculation ===
  if (ops & 0x80) {
    if (text.size() > 0 && text.size() < 256) {
      float entropy = g_processor->CalculateEntropy(text, 0.5);
      (void)entropy;
    }
  }

  return 0;
}

Line	Count	Source
1		// Copyright 2026 Google LLC
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// http://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		// Fuzzer that loads a valid pre-built sentencepiece model (embedded in
16		// the binary as a byte array) and then fuzzes all encoding, decoding,
17		// normalization, and vocabulary operations with fuzz-derived text input.
18
19		#include <cstddef>
20		#include <cstdint>
21		#include <cstdlib>
22		#include <memory>
23		#include <string>
24		#include <vector>
25
26		#include <fuzzer/FuzzedDataProvider.h>
27		#include "sentencepiece_processor.h"
28
29		// Generated at build time by: xxd -i processor_text_fuzzer_model
30		#include "embedded_model.h"
31
32		static std::unique_ptr<sentencepiece::SentencePieceProcessor> g_processor;
33
34	1	extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
35	1	g_processor = std::make_unique<sentencepiece::SentencePieceProcessor>();
36
37		// Load the model from the embedded byte array
38	1	std::string model_data(
39	1	reinterpret_cast<const char *>(kEmbeddedModelData),
40	1	kEmbeddedModelSize);
41	1	auto status = g_processor->LoadFromSerializedProto(model_data);
42	1	if (!status.ok()) {
43	0	fprintf(stderr, "Failed to load embedded model: %s\n",
44	0	status.ToString().c_str());
45	0	abort();
46	0	}
47
48	1	return 0;
49	1	}
50
51	0	extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
52	0	if (!g_processor || size < 2)
53	0	return 0;
54
55	0	FuzzedDataProvider fdp(data, size);
56	0	uint8_t ops = fdp.ConsumeIntegral<uint8_t>();
57	0	std::string text = fdp.ConsumeRemainingBytesAsString();
58
59		// === Core encoding operations ===
60	0	if (ops & 0x01) {
61		// Encode to pieces (strings)
62	0	std::vector<std::string> pieces;
63	0	g_processor->Encode(text, &pieces);
64
65		// Decode back from pieces
66	0	if (!pieces.empty()) {
67	0	std::string decoded;
68	0	g_processor->Decode(pieces, &decoded);
69	0	}
70	0	}
71
72	0	if (ops & 0x02) {
73		// Encode to IDs
74	0	std::vector<int> ids;
75	0	g_processor->Encode(text, &ids);
76
77		// Decode back from IDs
78	0	if (!ids.empty()) {
79	0	std::string decoded;
80	0	g_processor->Decode(ids, &decoded);
81	0	}
82	0	}
83
84		// === Advanced encoding operations ===
85	0	if (ops & 0x04) {
86		// NBest encoding - use small nbest to avoid being too slow
87	0	std::vector<std::vector<std::string>> nbest_pieces;
88	0	g_processor->NBestEncode(text, 3, &nbest_pieces);
89
90		// NBest encode to IDs
91	0	std::vector<std::vector<int>> nbest_ids;
92	0	g_processor->NBestEncode(text, 3, &nbest_ids);
93	0	}
94
95	0	if (ops & 0x08) {
96		// Sample encoding with various alpha values
97	0	std::vector<std::string> sampled;
98	0	g_processor->SampleEncode(text, -1, 0.5, &sampled);
99
100	0	std::vector<int> sampled_ids;
101	0	g_processor->SampleEncode(text, -1, 0.1, &sampled_ids);
102	0	}
103
104	0	if (ops & 0x10) {
105		// Encode as serialized proto (exercises protobuf serialization path)
106	0	auto serialized = g_processor->EncodeAsSerializedProto(text);
107	0	(void)serialized;
108
109	0	auto nb_serialized = g_processor->NBestEncodeAsSerializedProto(text, 2);
110	0	(void)nb_serialized;
111
112	0	auto sample_serialized = g_processor->SampleEncodeAsSerializedProto(text, -1, 0.5);
113	0	(void)sample_serialized;
114	0	}
115
116		// === Normalization ===
117	0	if (ops & 0x20) {
118	0	std::string normalized;
119	0	g_processor->Normalize(text, &normalized);
120
121		// Normalize with alignment info
122	0	std::string normalized2;
123	0	std::vector<size_t> norm_to_orig;
124	0	g_processor->Normalize(text, &normalized2, &norm_to_orig);
125	0	}
126
127		// === Vocabulary operations ===
128	0	if (ops & 0x40) {
129		// PieceToId with fuzz text
130	0	g_processor->PieceToId(text);
131
132		// Try splitting text into substrings and looking them up
133	0	if (text.size() > 2) {
134	0	for (size_t i = 0; i < text.size() && i < 8; i++) {
135	0	std::string sub = text.substr(0, i + 1);
136	0	int id = g_processor->PieceToId(sub);
137	0	if (id >= 0 && id < g_processor->GetPieceSize()) {
138	0	g_processor->IdToPiece(id);
139	0	g_processor->GetScore(id);
140	0	g_processor->IsUnknown(id);
141	0	g_processor->IsControl(id);
142	0	g_processor->IsUnused(id);
143	0	g_processor->IsByte(id);
144	0	}
145	0	}
146	0	}
147	0	}
148
149		// === Entropy calculation ===
150	0	if (ops & 0x80) {
151	0	if (text.size() > 0 && text.size() < 256) {
152	0	float entropy = g_processor->CalculateEntropy(text, 0.5);
153	0	(void)entropy;
154	0	}
155	0	}
156
157	0	return 0;
158	0	}

Coverage Report

Created: 2026-03-31 06:06