/src/processor_text_fuzzer.cc

Source
// Copyright 2026 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Fuzzer that loads a valid pre-built sentencepiece model (embedded in
// the binary as a byte array) and then fuzzes all encoding, decoding,
// normalization, and vocabulary operations with fuzz-derived text input.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>

#include <fuzzer/FuzzedDataProvider.h>
#include "sentencepiece_processor.h"

// Generated at build time by: xxd -i processor_text_fuzzer_model
#include "embedded_model.h"

static std::unique_ptr<sentencepiece::SentencePieceProcessor> g_processor;

// One-time initialization hook: creates the global processor and loads the
// model embedded in the binary (kEmbeddedModelData / kEmbeddedModelSize).
//
// If the embedded model fails to load, the error is printed to stderr and the
// process aborts, so the harness never runs against an unloaded processor.
//
// Always returns 0 when loading succeeds.
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
  // The command line is not consulted by this harness.
  (void)argc;
  (void)argv;

  g_processor = std::make_unique<sentencepiece::SentencePieceProcessor>();

  // Copy the embedded byte array into a string and hand it to the processor
  // as a serialized model proto.
  const std::string model_data(
      reinterpret_cast<const char *>(kEmbeddedModelData), kEmbeddedModelSize);
  const auto status = g_processor->LoadFromSerializedProto(model_data);
  if (!status.ok()) {
    std::fprintf(stderr, "Failed to load embedded model: %s\n",
                 status.ToString().c_str());
    std::abort();
  }

  return 0;
}

namespace {

// Bits of the selector byte; each set bit enables one group of operations
// for the current input.
constexpr uint8_t kOpEncodePieces = 0x01;
constexpr uint8_t kOpEncodeIds = 0x02;
constexpr uint8_t kOpNBestEncode = 0x04;
constexpr uint8_t kOpSampleEncode = 0x08;
constexpr uint8_t kOpSerializedProto = 0x10;
constexpr uint8_t kOpNormalize = 0x20;
constexpr uint8_t kOpVocabulary = 0x40;
constexpr uint8_t kOpEntropy = 0x80;

// Small n-best sizes keep the n-best paths from being too slow.
constexpr int kNBestSize = 3;
constexpr int kNBestProtoSize = 2;
// Value passed as nbest_size to every sampling call.
constexpr int kSampleNBestSize = -1;
// Upper bound on the number of text prefixes looked up in the vocabulary.
constexpr size_t kMaxPrefixLookups = 8;
// Entropy is only computed for non-empty inputs shorter than this.
constexpr size_t kEntropyMaxInputSize = 256;

// Encodes `text` to string pieces and, if any were produced, decodes them.
void FuzzEncodePieces(const std::string &text) {
  std::vector<std::string> pieces;
  g_processor->Encode(text, &pieces);

  if (!pieces.empty()) {
    std::string decoded;
    g_processor->Decode(pieces, &decoded);
  }
}

// Encodes `text` to vocabulary ids and, if any were produced, decodes them.
void FuzzEncodeIds(const std::string &text) {
  std::vector<int> ids;
  g_processor->Encode(text, &ids);

  if (!ids.empty()) {
    std::string decoded;
    g_processor->Decode(ids, &decoded);
  }
}

// Runs n-best encoding of `text`, first to pieces and then to ids.
void FuzzNBestEncode(const std::string &text) {
  std::vector<std::vector<std::string>> nbest_pieces;
  g_processor->NBestEncode(text, kNBestSize, &nbest_pieces);

  std::vector<std::vector<int>> nbest_ids;
  g_processor->NBestEncode(text, kNBestSize, &nbest_ids);
}

// Runs sample encoding of `text` to pieces (alpha 0.5) and ids (alpha 0.1).
void FuzzSampleEncode(const std::string &text) {
  std::vector<std::string> sampled_pieces;
  g_processor->SampleEncode(text, kSampleNBestSize, 0.5f, &sampled_pieces);

  std::vector<int> sampled_ids;
  g_processor->SampleEncode(text, kSampleNBestSize, 0.1f, &sampled_ids);
}

// Exercises the variants that return a serialized proto; results are dropped.
void FuzzSerializedProto(const std::string &text) {
  const auto encoded = g_processor->EncodeAsSerializedProto(text);
  (void)encoded;

  const auto nbest_encoded =
      g_processor->NBestEncodeAsSerializedProto(text, kNBestProtoSize);
  (void)nbest_encoded;

  const auto sample_encoded =
      g_processor->SampleEncodeAsSerializedProto(text, kSampleNBestSize, 0.5f);
  (void)sample_encoded;
}

// Normalizes `text`, once plainly and once requesting the alignment vector.
void FuzzNormalize(const std::string &text) {
  std::string normalized;
  g_processor->Normalize(text, &normalized);

  std::string normalized_with_alignment;
  std::vector<size_t> norm_to_orig;
  g_processor->Normalize(text, &normalized_with_alignment, &norm_to_orig);
}

// Looks up `text` as a piece; for inputs longer than two bytes, also looks up
// its first few prefixes and queries the per-id accessors for every prefix
// that maps to an id inside [0, GetPieceSize()).
void FuzzVocabulary(const std::string &text) {
  g_processor->PieceToId(text);

  if (text.size() <= 2) return;

  for (size_t i = 0; i < text.size() && i < kMaxPrefixLookups; ++i) {
    const std::string prefix = text.substr(0, i + 1);
    const int id = g_processor->PieceToId(prefix);
    if (id < 0 || id >= g_processor->GetPieceSize()) continue;

    g_processor->IdToPiece(id);
    g_processor->GetScore(id);
    g_processor->IsUnknown(id);
    g_processor->IsControl(id);
    g_processor->IsUnused(id);
    g_processor->IsByte(id);
  }
}

// Computes the entropy of `text` (alpha 0.5) for non-empty inputs below the
// size limit; the value itself is discarded.
void FuzzEntropy(const std::string &text) {
  if (text.empty() || text.size() >= kEntropyMaxInputSize) return;

  const float entropy = g_processor->CalculateEntropy(text, 0.5f);
  (void)entropy;
}

}  // namespace

// Fuzz entry point. One byte taken from the input via ConsumeIntegral selects
// which operation groups run (see the kOp* flags); all remaining bytes become
// the text passed to those operations. Inputs shorter than two bytes, or calls
// made while the global processor is unset, are ignored.
//
// Always returns 0.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  if (!g_processor || size < 2) return 0;

  FuzzedDataProvider fdp(data, size);
  const uint8_t ops = fdp.ConsumeIntegral<uint8_t>();
  const std::string text = fdp.ConsumeRemainingBytesAsString();

  // === Core encoding operations ===
  if (ops & kOpEncodePieces) FuzzEncodePieces(text);
  if (ops & kOpEncodeIds) FuzzEncodeIds(text);

  // === Advanced encoding operations ===
  if (ops & kOpNBestEncode) FuzzNBestEncode(text);
  if (ops & kOpSampleEncode) FuzzSampleEncode(text);
  if (ops & kOpSerializedProto) FuzzSerializedProto(text);

  // === Normalization ===
  if (ops & kOpNormalize) FuzzNormalize(text);

  // === Vocabulary operations ===
  if (ops & kOpVocabulary) FuzzVocabulary(text);

  // === Entropy calculation ===
  if (ops & kOpEntropy) FuzzEntropy(text);

  return 0;
}

Line	Count	Source
1		// Copyright 2026 Google LLC
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// http://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		// Fuzzer that loads a valid pre-built sentencepiece model (embedded in
16		// the binary as a byte array) and then fuzzes all encoding, decoding,
17		// normalization, and vocabulary operations with fuzz-derived text input.
18
19		#include <cstddef>
20		#include <cstdint>
21		#include <cstdlib>
22		#include <memory>
23		#include <string>
24		#include <vector>
25
26		#include <fuzzer/FuzzedDataProvider.h>
27		#include "sentencepiece_processor.h"
28
29		// Generated at build time by: xxd -i processor_text_fuzzer_model
30		#include "embedded_model.h"
31
32		static std::unique_ptr<sentencepiece::SentencePieceProcessor> g_processor;
33
34	1	extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
35	1	g_processor = std::make_unique<sentencepiece::SentencePieceProcessor>();
36
37		// Load the model from the embedded byte array
38	1	std::string model_data(
39	1	reinterpret_cast<const char *>(kEmbeddedModelData),
40	1	kEmbeddedModelSize);
41	1	auto status = g_processor->LoadFromSerializedProto(model_data);
42	1	if (!status.ok()) {
43	0	fprintf(stderr, "Failed to load embedded model: %s\n",
44	0	status.ToString().c_str());
45	0	abort();
46	0	}
47
48	1	return 0;
49	1	}
50
51		extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
52		if (!g_processor || size < 2)
53		return 0;
54
55		FuzzedDataProvider fdp(data, size);
56		uint8_t ops = fdp.ConsumeIntegral<uint8_t>();
57		std::string text = fdp.ConsumeRemainingBytesAsString();
58
59		// === Core encoding operations ===
60		if (ops & 0x01) {
61		// Encode to pieces (strings)
62		std::vector<std::string> pieces;
63		g_processor->Encode(text, &pieces);
64
65		// Decode back from pieces
66		if (!pieces.empty()) {
67		std::string decoded;
68		g_processor->Decode(pieces, &decoded);
69		}
70		}
71
72		if (ops & 0x02) {
73		// Encode to IDs
74		std::vector<int> ids;
75		g_processor->Encode(text, &ids);
76
77		// Decode back from IDs
78		if (!ids.empty()) {
79		std::string decoded;
80		g_processor->Decode(ids, &decoded);
81		}
82		}
83
84		// === Advanced encoding operations ===
85		if (ops & 0x04) {
86		// NBest encoding - use small nbest to avoid being too slow
87		std::vector<std::vector<std::string>> nbest_pieces;
88		g_processor->NBestEncode(text, 3, &nbest_pieces);
89
90		// NBest encode to IDs
91		std::vector<std::vector<int>> nbest_ids;
92		g_processor->NBestEncode(text, 3, &nbest_ids);
93		}
94
95		if (ops & 0x08) {
96		// Sample encoding with various alpha values
97		std::vector<std::string> sampled;
98		g_processor->SampleEncode(text, -1, 0.5, &sampled);
99
100		std::vector<int> sampled_ids;
101		g_processor->SampleEncode(text, -1, 0.1, &sampled_ids);
102		}
103
104		if (ops & 0x10) {
105		// Encode as serialized proto (exercises protobuf serialization path)
106		auto serialized = g_processor->EncodeAsSerializedProto(text);
107		(void)serialized;
108
109		auto nb_serialized = g_processor->NBestEncodeAsSerializedProto(text, 2);
110		(void)nb_serialized;
111
112		auto sample_serialized = g_processor->SampleEncodeAsSerializedProto(text, -1, 0.5);
113		(void)sample_serialized;
114		}
115
116		// === Normalization ===
117		if (ops & 0x20) {
118		std::string normalized;
119		g_processor->Normalize(text, &normalized);
120
121		// Normalize with alignment info
122		std::string normalized2;
123		std::vector<size_t> norm_to_orig;
124		g_processor->Normalize(text, &normalized2, &norm_to_orig);
125		}
126
127		// === Vocabulary operations ===
128		if (ops & 0x40) {
129		// PieceToId with fuzz text
130		g_processor->PieceToId(text);
131
132		// Try splitting text into substrings and looking them up
133		if (text.size() > 2) {
134		for (size_t i = 0; i < text.size() && i < 8; i++) {
135		std::string sub = text.substr(0, i + 1);
136		int id = g_processor->PieceToId(sub);
137		if (id >= 0 && id < g_processor->GetPieceSize()) {
138		g_processor->IdToPiece(id);
139		g_processor->GetScore(id);
140		g_processor->IsUnknown(id);
141		g_processor->IsControl(id);
142		g_processor->IsUnused(id);
143		g_processor->IsByte(id);
144		}
145		}
146		}
147		}
148
149		// === Entropy calculation ===
150		if (ops & 0x80) {
151		if (text.size() > 0 && text.size() < 256) {
152		float entropy = g_processor->CalculateEntropy(text, 0.5);
153		(void)entropy;
154		}
155		}
156
157		return 0;
158		}

Coverage Report

Created: 2026-04-30 06:29