Coverage Report

Created: 2026-04-30 06:29

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/processor_text_fuzzer.cc
Line
Count
Source
1
// Copyright 2026 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// Fuzzer that loads a valid pre-built sentencepiece model (embedded in
16
// the binary as a byte array) and then fuzzes all encoding, decoding,
17
// normalization, and vocabulary operations with fuzz-derived text input.
18
19
#include <cstddef>
20
#include <cstdint>
21
#include <cstdlib>
22
#include <memory>
23
#include <string>
24
#include <vector>
25
26
#include <fuzzer/FuzzedDataProvider.h>
27
#include "sentencepiece_processor.h"
28
29
// Generated at build time by: xxd -i processor_text_fuzzer_model
30
#include "embedded_model.h"
31
32
// File-local processor shared by the fuzzer entry points: it is created and
// loaded with the embedded model in LLVMFuzzerInitialize and then used by
// LLVMFuzzerTestOneInput, which returns early while this pointer is null.
static std::unique_ptr<sentencepiece::SentencePieceProcessor> g_processor;
33
34
1
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
35
1
  g_processor = std::make_unique<sentencepiece::SentencePieceProcessor>();
36
37
  // Load the model from the embedded byte array
38
1
  std::string model_data(
39
1
      reinterpret_cast<const char *>(kEmbeddedModelData),
40
1
      kEmbeddedModelSize);
41
1
  auto status = g_processor->LoadFromSerializedProto(model_data);
42
1
  if (!status.ok()) {
43
0
    fprintf(stderr, "Failed to load embedded model: %s\n",
44
0
            status.ToString().c_str());
45
0
    abort();
46
0
  }
47
48
1
  return 0;
49
1
}
50
51
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
52
  if (!g_processor || size < 2)
53
    return 0;
54
55
  FuzzedDataProvider fdp(data, size);
56
  uint8_t ops = fdp.ConsumeIntegral<uint8_t>();
57
  std::string text = fdp.ConsumeRemainingBytesAsString();
58
59
  // === Core encoding operations ===
60
  if (ops & 0x01) {
61
    // Encode to pieces (strings)
62
    std::vector<std::string> pieces;
63
    g_processor->Encode(text, &pieces);
64
65
    // Decode back from pieces
66
    if (!pieces.empty()) {
67
      std::string decoded;
68
      g_processor->Decode(pieces, &decoded);
69
    }
70
  }
71
72
  if (ops & 0x02) {
73
    // Encode to IDs
74
    std::vector<int> ids;
75
    g_processor->Encode(text, &ids);
76
77
    // Decode back from IDs
78
    if (!ids.empty()) {
79
      std::string decoded;
80
      g_processor->Decode(ids, &decoded);
81
    }
82
  }
83
84
  // === Advanced encoding operations ===
85
  if (ops & 0x04) {
86
    // NBest encoding - use small nbest to avoid being too slow
87
    std::vector<std::vector<std::string>> nbest_pieces;
88
    g_processor->NBestEncode(text, 3, &nbest_pieces);
89
90
    // NBest encode to IDs
91
    std::vector<std::vector<int>> nbest_ids;
92
    g_processor->NBestEncode(text, 3, &nbest_ids);
93
  }
94
95
  if (ops & 0x08) {
96
    // Sample encoding with various alpha values
97
    std::vector<std::string> sampled;
98
    g_processor->SampleEncode(text, -1, 0.5, &sampled);
99
100
    std::vector<int> sampled_ids;
101
    g_processor->SampleEncode(text, -1, 0.1, &sampled_ids);
102
  }
103
104
  if (ops & 0x10) {
105
    // Encode as serialized proto (exercises protobuf serialization path)
106
    auto serialized = g_processor->EncodeAsSerializedProto(text);
107
    (void)serialized;
108
109
    auto nb_serialized = g_processor->NBestEncodeAsSerializedProto(text, 2);
110
    (void)nb_serialized;
111
112
    auto sample_serialized = g_processor->SampleEncodeAsSerializedProto(text, -1, 0.5);
113
    (void)sample_serialized;
114
  }
115
116
  // === Normalization ===
117
  if (ops & 0x20) {
118
    std::string normalized;
119
    g_processor->Normalize(text, &normalized);
120
121
    // Normalize with alignment info
122
    std::string normalized2;
123
    std::vector<size_t> norm_to_orig;
124
    g_processor->Normalize(text, &normalized2, &norm_to_orig);
125
  }
126
127
  // === Vocabulary operations ===
128
  if (ops & 0x40) {
129
    // PieceToId with fuzz text
130
    g_processor->PieceToId(text);
131
132
    // Try splitting text into substrings and looking them up
133
    if (text.size() > 2) {
134
      for (size_t i = 0; i < text.size() && i < 8; i++) {
135
        std::string sub = text.substr(0, i + 1);
136
        int id = g_processor->PieceToId(sub);
137
        if (id >= 0 && id < g_processor->GetPieceSize()) {
138
          g_processor->IdToPiece(id);
139
          g_processor->GetScore(id);
140
          g_processor->IsUnknown(id);
141
          g_processor->IsControl(id);
142
          g_processor->IsUnused(id);
143
          g_processor->IsByte(id);
144
        }
145
      }
146
    }
147
  }
148
149
  // === Entropy calculation ===
150
  if (ops & 0x80) {
151
    if (text.size() > 0 && text.size() < 256) {
152
      float entropy = g_processor->CalculateEntropy(text, 0.5);
153
      (void)entropy;
154
    }
155
  }
156
157
  return 0;
158
}