Coverage Report

Created: 2026-03-31 06:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/model_load_fuzzer.cc
Line
Count
Source
1
// Copyright 2026 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// Fuzzer for SentencePiece model loading and post-load operations.
16
// Feeds arbitrary binary data as a serialized ModelProto, then exercises
17
// encoding/decoding if the model loads successfully.
18
19
#include <cstddef>
20
#include <cstdint>
21
#include <string>
22
#include <vector>
23
24
#include <fuzzer/FuzzedDataProvider.h>
25
#include "sentencepiece_processor.h"
26
27
4.72k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
28
4.72k
  if (size < 4)
29
2
    return 0;
30
31
4.72k
  FuzzedDataProvider fdp(data, size);
32
33
  // Split data: most goes to model, some to test text
34
4.72k
  std::string model_data = fdp.ConsumeRandomLengthString(size);
35
4.72k
  std::string test_text = fdp.ConsumeRemainingBytesAsString();
36
37
4.72k
  sentencepiece::SentencePieceProcessor processor;
38
39
  // Try loading fuzz data as a serialized model proto
40
4.72k
  auto status = processor.LoadFromSerializedProto(model_data);
41
4.72k
  if (!status.ok())
42
3.16k
    return 0;
43
44
  // Model loaded successfully - exercise all major operations
45
46
  // Basic encoding
47
1.55k
  std::vector<std::string> pieces;
48
1.55k
  processor.Encode(test_text, &pieces);
49
50
  // Encode to IDs
51
1.55k
  std::vector<int> ids;
52
1.55k
  processor.Encode(test_text, &ids);
53
54
  // Decode from pieces
55
1.55k
  if (!pieces.empty()) {
56
1.49k
    std::string decoded;
57
1.49k
    processor.Decode(pieces, &decoded);
58
1.49k
  }
59
60
  // Decode from IDs
61
1.55k
  if (!ids.empty()) {
62
1.49k
    std::string decoded;
63
1.49k
    processor.Decode(ids, &decoded);
64
1.49k
  }
65
66
  // Normalization
67
1.55k
  std::string normalized;
68
1.55k
  processor.Normalize(test_text, &normalized);
69
70
  // Vocabulary operations
71
1.55k
  int vocab_size = processor.GetPieceSize();
72
1.55k
  if (vocab_size > 0) {
73
    // PieceToId / IdToPiece round-trip
74
11.4k
    for (int i = 0; i < vocab_size && i < 10; i++) {
75
9.94k
      std::string piece = processor.IdToPiece(i);
76
9.94k
      processor.PieceToId(piece);
77
9.94k
      processor.GetScore(i);
78
9.94k
      processor.IsUnknown(i);
79
9.94k
      processor.IsControl(i);
80
9.94k
      processor.IsUnused(i);
81
9.94k
      processor.IsByte(i);
82
9.94k
    }
83
84
    // Try lookup with test_text as a piece
85
1.55k
    processor.PieceToId(test_text);
86
1.55k
  }
87
88
  // Special token IDs
89
1.55k
  processor.unk_id();
90
1.55k
  processor.bos_id();
91
1.55k
  processor.eos_id();
92
1.55k
  processor.pad_id();
93
94
  // NBest encoding (with small nbest_size to avoid slowness)
95
1.55k
  std::vector<std::vector<std::string>> nbest_pieces;
96
1.55k
  processor.NBestEncode(test_text, 2, &nbest_pieces);
97
98
  // Sample encoding
99
1.55k
  std::vector<std::string> sampled;
100
1.55k
  processor.SampleEncode(test_text, 1, 0.5, &sampled);
101
102
  // Encode as serialized proto
103
1.55k
  processor.EncodeAsSerializedProto(test_text);
104
1.55k
  processor.SampleEncodeAsSerializedProto(test_text, 1, 0.5);
105
106
  // Get serialized model
107
1.55k
  processor.serialized_model_proto();
108
109
1.55k
  return 0;
110
4.72k
}