Coverage Report

Created: 2026-05-04 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/model_load_fuzzer.cc
Line
Count
Source
1
// Copyright 2026 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// Fuzzer for SentencePiece model loading and post-load operations.
16
// Feeds arbitrary binary data as a serialized ModelProto, then exercises
17
// encoding/decoding if the model loads successfully.
18
19
#include <cstddef>
20
#include <cstdint>
21
#include <string>
22
#include <vector>
23
24
#include <fuzzer/FuzzedDataProvider.h>
25
#include "sentencepiece_processor.h"
26
27
7.70k
// libFuzzer entry point. Splits the input into a candidate serialized model
// and a test string; if the model loads, calls the processor's encode, decode,
// normalize, vocabulary and sampling functions on that string. The results of
// those calls are discarded. Returns 0 on every path.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
28
7.70k
  if (size < 4)
29
2
    return 0;
30
31
7.70k
  FuzzedDataProvider fdp(data, size);
32
33
  // Split the input: a variable-length prefix (capped at `size` bytes) is used
  // as the serialized model; whatever is left over becomes the test text.
  // NOTE(review): the split point is picked by FuzzedDataProvider, so "most
  // goes to the model" is not guaranteed and test_text may be empty.
34
7.70k
  std::string model_data = fdp.ConsumeRandomLengthString(size);
35
7.70k
  std::string test_text = fdp.ConsumeRemainingBytesAsString();
36
37
7.70k
  sentencepiece::SentencePieceProcessor processor;
38
39
  // Try loading the fuzz data as a serialized model proto; if loading fails
  // there is nothing further to exercise, so return early.
40
7.70k
  auto status = processor.LoadFromSerializedProto(model_data);
41
7.70k
  if (!status.ok())
42
5.67k
    return 0;
43
44
  // Model loaded successfully - exercise the processor API below. The return
  // value of every call from here on is ignored (presumably only crashes and
  // sanitizer reports are of interest in this harness).
45
46
  // Encode the test text into string pieces
47
2.02k
  std::vector<std::string> pieces;
48
2.02k
  processor.Encode(test_text, &pieces);
49
50
  // Encode the same text into integer IDs
51
2.02k
  std::vector<int> ids;
52
2.02k
  processor.Encode(test_text, &ids);
53
54
  // Decode the pieces back into text (skipped when encoding produced none)
55
2.02k
  if (!pieces.empty()) {
56
1.92k
    std::string decoded;
57
1.92k
    processor.Decode(pieces, &decoded);
58
1.92k
  }
59
60
  // Decode the IDs back into text (skipped when encoding produced none)
61
2.02k
  if (!ids.empty()) {
62
1.92k
    std::string decoded;
63
1.92k
    processor.Decode(ids, &decoded);
64
1.92k
  }
65
66
  // Run normalization on the test text
67
2.02k
  std::string normalized;
68
2.02k
  processor.Normalize(test_text, &normalized);
69
70
  // Vocabulary operations (only when the model reports at least one piece)
71
2.02k
  int vocab_size = processor.GetPieceSize();
72
2.02k
  if (vocab_size > 0) {
73
    // For the first min(vocab_size, 10) IDs: map ID -> piece -> ID, then query
    // the score and the unknown/control/unused/byte flags for that ID.
74
12.3k
    for (int i = 0; i < vocab_size && i < 10; i++) {
75
10.3k
      std::string piece = processor.IdToPiece(i);
76
10.3k
      processor.PieceToId(piece);
77
10.3k
      processor.GetScore(i);
78
10.3k
      processor.IsUnknown(i);
79
10.3k
      processor.IsControl(i);
80
10.3k
      processor.IsUnused(i);
81
10.3k
      processor.IsByte(i);
82
10.3k
    }
83
84
    // Look up the raw test text as if it were a vocabulary piece
85
2.02k
    processor.PieceToId(test_text);
86
2.02k
  }
87
88
  // Query the special token IDs (unk, bos, eos, pad)
89
2.02k
  processor.unk_id();
90
2.02k
  processor.bos_id();
91
2.02k
  processor.eos_id();
92
2.02k
  processor.pad_id();
93
94
  // NBest encoding (with a small nbest_size of 2 to avoid slowness)
95
2.02k
  std::vector<std::vector<std::string>> nbest_pieces;
96
2.02k
  processor.NBestEncode(test_text, 2, &nbest_pieces);
97
98
  // Sample encoding with fixed arguments 1 and 0.5.
  // NOTE(review): presumably these are nbest_size and alpha - confirm against
  // the SentencePieceProcessor API.
99
2.02k
  std::vector<std::string> sampled;
100
2.02k
  processor.SampleEncode(test_text, 1, 0.5, &sampled);
101
102
  // Same encode / sample-encode calls, via the *AsSerializedProto variants
103
2.02k
  processor.EncodeAsSerializedProto(test_text);
104
2.02k
  processor.SampleEncodeAsSerializedProto(test_text, 1, 0.5);
105
106
  // Fetch the serialized model proto from the loaded processor
107
2.02k
  processor.serialized_model_proto();
108
109
2.02k
  return 0;
110
7.70k
}