Coverage Report

Created: 2026-03-31 06:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/processor_text_fuzzer.cc
Line
Count
Source
1
// Copyright 2026 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// Fuzzer that loads a valid pre-built sentencepiece model (embedded in
16
// the binary as a byte array) and then fuzzes all encoding, decoding,
17
// normalization, and vocabulary operations with fuzz-derived text input.
18
19
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
25
26
#include <fuzzer/FuzzedDataProvider.h>
27
#include "sentencepiece_processor.h"
28
29
// Generated at build time by: xxd -i processor_text_fuzzer_model
30
#include "embedded_model.h"
31
32
static std::unique_ptr<sentencepiece::SentencePieceProcessor> g_processor;
33
34
1
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
35
1
  g_processor = std::make_unique<sentencepiece::SentencePieceProcessor>();
36
37
  // Load the model from the embedded byte array
38
1
  std::string model_data(
39
1
      reinterpret_cast<const char *>(kEmbeddedModelData),
40
1
      kEmbeddedModelSize);
41
1
  auto status = g_processor->LoadFromSerializedProto(model_data);
42
1
  if (!status.ok()) {
43
0
    fprintf(stderr, "Failed to load embedded model: %s\n",
44
0
            status.ToString().c_str());
45
0
    abort();
46
0
  }
47
48
1
  return 0;
49
1
}
50
51
0
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
52
0
  if (!g_processor || size < 2)
53
0
    return 0;
54
55
0
  FuzzedDataProvider fdp(data, size);
56
0
  uint8_t ops = fdp.ConsumeIntegral<uint8_t>();
57
0
  std::string text = fdp.ConsumeRemainingBytesAsString();
58
59
  // === Core encoding operations ===
60
0
  if (ops & 0x01) {
61
    // Encode to pieces (strings)
62
0
    std::vector<std::string> pieces;
63
0
    g_processor->Encode(text, &pieces);
64
65
    // Decode back from pieces
66
0
    if (!pieces.empty()) {
67
0
      std::string decoded;
68
0
      g_processor->Decode(pieces, &decoded);
69
0
    }
70
0
  }
71
72
0
  if (ops & 0x02) {
73
    // Encode to IDs
74
0
    std::vector<int> ids;
75
0
    g_processor->Encode(text, &ids);
76
77
    // Decode back from IDs
78
0
    if (!ids.empty()) {
79
0
      std::string decoded;
80
0
      g_processor->Decode(ids, &decoded);
81
0
    }
82
0
  }
83
84
  // === Advanced encoding operations ===
85
0
  if (ops & 0x04) {
86
    // NBest encoding - use small nbest to avoid being too slow
87
0
    std::vector<std::vector<std::string>> nbest_pieces;
88
0
    g_processor->NBestEncode(text, 3, &nbest_pieces);
89
90
    // NBest encode to IDs
91
0
    std::vector<std::vector<int>> nbest_ids;
92
0
    g_processor->NBestEncode(text, 3, &nbest_ids);
93
0
  }
94
95
0
  if (ops & 0x08) {
96
    // Sample encoding with various alpha values
97
0
    std::vector<std::string> sampled;
98
0
    g_processor->SampleEncode(text, -1, 0.5, &sampled);
99
100
0
    std::vector<int> sampled_ids;
101
0
    g_processor->SampleEncode(text, -1, 0.1, &sampled_ids);
102
0
  }
103
104
0
  if (ops & 0x10) {
105
    // Encode as serialized proto (exercises protobuf serialization path)
106
0
    auto serialized = g_processor->EncodeAsSerializedProto(text);
107
0
    (void)serialized;
108
109
0
    auto nb_serialized = g_processor->NBestEncodeAsSerializedProto(text, 2);
110
0
    (void)nb_serialized;
111
112
0
    auto sample_serialized = g_processor->SampleEncodeAsSerializedProto(text, -1, 0.5);
113
0
    (void)sample_serialized;
114
0
  }
115
116
  // === Normalization ===
117
0
  if (ops & 0x20) {
118
0
    std::string normalized;
119
0
    g_processor->Normalize(text, &normalized);
120
121
    // Normalize with alignment info
122
0
    std::string normalized2;
123
0
    std::vector<size_t> norm_to_orig;
124
0
    g_processor->Normalize(text, &normalized2, &norm_to_orig);
125
0
  }
126
127
  // === Vocabulary operations ===
128
0
  if (ops & 0x40) {
129
    // PieceToId with fuzz text
130
0
    g_processor->PieceToId(text);
131
132
    // Try splitting text into substrings and looking them up
133
0
    if (text.size() > 2) {
134
0
      for (size_t i = 0; i < text.size() && i < 8; i++) {
135
0
        std::string sub = text.substr(0, i + 1);
136
0
        int id = g_processor->PieceToId(sub);
137
0
        if (id >= 0 && id < g_processor->GetPieceSize()) {
138
0
          g_processor->IdToPiece(id);
139
0
          g_processor->GetScore(id);
140
0
          g_processor->IsUnknown(id);
141
0
          g_processor->IsControl(id);
142
0
          g_processor->IsUnused(id);
143
0
          g_processor->IsByte(id);
144
0
        }
145
0
      }
146
0
    }
147
0
  }
148
149
  // === Entropy calculation ===
150
0
  if (ops & 0x80) {
151
0
    if (text.size() > 0 && text.size() < 256) {
152
0
      float entropy = g_processor->CalculateEntropy(text, 0.5);
153
0
      (void)entropy;
154
0
    }
155
0
  }
156
157
0
  return 0;
158
0
}