Coverage Report

Created: 2025-12-31 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/fuzz/misc.cpp
Line
Count
Source
1
#include <cstddef>
2
#include <cstdint>
3
#include <ranges>
4
5
#include "helpers/common.h"
6
#include "simdutf.h"
7
8
850
void autodetect(std::span<const char> chardata) {
9
850
  std::vector<simdutf::encoding_type> results;
10
850
  const auto implementations = get_supported_implementations();
11
2.55k
  for (const simdutf::implementation* impl : implementations) {
12
2.55k
    results.push_back(
13
2.55k
        impl->autodetect_encoding(chardata.data(), chardata.size()));
14
2.55k
  }
15
1.70k
  auto neq = [](const auto& a, const auto& b) { return a != b; };
16
850
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
17
0
    std::cerr << "output differs between implementations\n";
18
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
19
0
      std::cerr << "implementation " << implementations[i] << " gave "
20
0
                << results.at(i) << '\n';
21
0
    }
22
0
    std::abort();
23
0
  }
24
850
}
25
26
651
void detect(std::span<const char> chardata) {
27
651
  std::vector<int> results;
28
651
  const auto implementations = get_supported_implementations();
29
1.95k
  for (const simdutf::implementation* impl : implementations) {
30
1.95k
    results.push_back(impl->detect_encodings(chardata.data(), chardata.size()));
31
1.95k
  }
32
1.30k
  auto neq = [](const auto& a, const auto& b) { return a != b; };
33
651
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
34
0
    std::cerr << "in detect_encodings(const char*, std::size_t):\n";
35
0
    std::cerr << "output differs between implementations\n";
36
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
37
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
38
0
                << results.at(i) << '\n';
39
0
    }
40
0
    std::cerr << " std::vector<unsigned char> data{";
41
0
    for (unsigned char x : chardata) {
42
0
      std::cerr << +x << ", ";
43
0
    };
44
0
    std::cerr << "};\n";
45
0
    std::abort();
46
0
  }
47
651
}
48
49
200
void validate_ascii(std::span<const char> chardata) {
50
  // use int, not bool to avoid vector<bool>
51
200
  std::vector<int> results;
52
200
  const auto implementations = get_supported_implementations();
53
600
  for (const simdutf::implementation* impl : implementations) {
54
600
    results.push_back(+impl->validate_ascii(chardata.data(), chardata.size()));
55
600
  }
56
400
  auto neq = [](const auto& a, const auto& b) { return a != b; };
57
200
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
58
0
    std::cerr << "in validate_ascii(const char*, std::size_t):\n";
59
0
    std::cerr << "output differs between implementations\n";
60
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
61
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
62
0
                << results.at(i) << '\n';
63
0
    }
64
0
    std::cerr << " std::vector<unsigned char> data{";
65
0
    for (unsigned char x : chardata) {
66
0
      std::cerr << +x << ", ";
67
0
    };
68
0
    std::cerr << "};\n";
69
0
    std::abort();
70
0
  }
71
200
}
72
73
260
void validate_ascii_with_err(std::span<const char> chardata) {
74
  // use int, not bool to avoid vector<bool>
75
260
  std::vector<simdutf::result> results;
76
260
  const auto implementations = get_supported_implementations();
77
780
  for (const simdutf::implementation* impl : implementations) {
78
780
    results.push_back(
79
780
        impl->validate_ascii_with_errors(chardata.data(), chardata.size()));
80
780
  }
81
520
  auto neq = [](const auto& a, const auto& b) { return a != b; };
82
260
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
83
0
    std::cerr << "in validate_ascii(const char*, std::size_t):\n";
84
0
    std::cerr << "output differs between implementations\n";
85
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
86
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
87
0
                << results.at(i) << '\n';
88
0
    }
89
0
    std::cerr << " std::vector<unsigned char> data{";
90
0
    for (unsigned char x : chardata) {
91
0
      std::cerr << +x << ", ";
92
0
    };
93
0
    std::cerr << "};\n";
94
0
    std::abort();
95
0
  }
96
260
}
97
98
125
void utf16_endianess(std::span<const char16_t> data) {
99
125
  std::vector<std::string> results;
100
125
  const auto implementations = get_supported_implementations();
101
375
  for (const simdutf::implementation* impl : implementations) {
102
375
    std::vector<char16_t> out(data.size());
103
375
    impl->change_endianness_utf16(data.data(), data.size(), out.data());
104
375
    results.push_back(FNV1A_hash::as_str(out));
105
375
  }
106
250
  auto neq = [](const auto& a, const auto& b) { return a != b; };
107
125
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
108
0
    std::cerr << "in utf16_endianess(const char*, std::size_t):\n";
109
0
    std::cerr << "output differs between implementations\n";
110
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
111
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
112
0
                << results.at(i) << '\n';
113
0
    }
114
0
    std::cerr << " std::vector<char16_t> data{";
115
0
    for (int x : data) {
116
0
      std::cerr << +x << ", ";
117
0
    };
118
0
    std::cerr << "};\n";
119
0
    std::abort();
120
0
  }
121
125
}
122
123
void convert_latin1_to_utf8_safe(std::span<const char> chardata,
124
290
                                 const std::size_t outputsize) {
125
  // convert with a limited output buffer
126
290
  std::vector<char> limited_output(outputsize);
127
290
  const auto limited_ret = simdutf::convert_latin1_to_utf8_safe(
128
290
      chardata.data(), chardata.size(), limited_output.data(), outputsize);
129
130
  // convert with a sufficiently large output buffer
131
290
  std::vector<char> large_output(2 * chardata.size());
132
290
  const auto large_ret = simdutf::convert_latin1_to_utf8(
133
290
      chardata.data(), chardata.size(), large_output.data());
134
135
290
  if (large_ret != 0) {
136
    // conversion was possible with a large buffer.
137
256
    if (large_ret <= outputsize) {
138
      // the limited buffer was large enough, ensure we got the same result
139
132
      assert(limited_ret == large_ret);
140
132
      assert(std::ranges::equal(limited_output | std::views::take(large_ret),
141
132
                                large_output | std::views::take(large_ret)));
142
132
    } else {
143
      // the number of written bytes for a limited buffer must not exceed what
144
      // the large buffer got.
145
124
      assert(limited_ret <= large_ret);
146
      // the written data should be equal
147
124
      assert(std::ranges::equal(limited_output | std::views::take(limited_ret),
148
124
                                large_output | std::views::take(limited_ret)));
149
124
    }
150
256
  } else {
151
    // conversion with a big buffer failed - is there anything we can check or
152
    // assert for the limited buffer? I don't think so.
153
34
  }
154
290
}
155
156
2.56k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
157
  // pick one of the functions, based on the fuzz data.
158
  // the first byte is which action to take. step forward
159
  // several bytes so the input is aligned.
160
2.56k
  if (size < 4) {
161
2
    return 0;
162
2
  }
163
2.55k
  constexpr auto Ncases = 9u;
164
2.55k
  constexpr auto actionmask = std::bit_ceil(Ncases) - 1;
165
2.55k
  const auto action = data[0] & actionmask;
166
167
2.55k
  const std::uint16_t u16 = data[1] + (data[2] << 8);
168
169
2.55k
  data += 4;
170
2.55k
  size -= 4;
171
172
2.55k
  const std::span<const char> chardata{(const char*)data, size};
173
2.55k
  const std::span<const char16_t> u16data{(const char16_t*)data,
174
2.55k
                                          size / sizeof(char16_t)};
175
176
2.55k
  switch (action) {
177
850
  case 0:
178
850
    autodetect(chardata);
179
850
    break;
180
651
  case 1:
181
651
    detect(chardata);
182
651
    break;
183
200
  case 2:
184
200
    validate_ascii(chardata);
185
200
    break;
186
260
  case 3:
187
260
    validate_ascii_with_err(chardata);
188
260
    break;
189
125
  case 4:
190
125
    utf16_endianess(u16data);
191
125
    break;
192
24
  case 5: {
193
24
    [[maybe_unused]] auto ret =
194
24
        simdutf::trim_partial_utf16le(u16data.data(), u16data.size());
195
24
    assert(ret == u16data.size() || ret + 1 == u16data.size());
196
24
  } break;
197
46
  case 6: {
198
46
    [[maybe_unused]] auto ret =
199
46
        simdutf::trim_partial_utf16be(u16data.data(), u16data.size());
200
46
    assert(ret == u16data.size() || ret + 1 == u16data.size());
201
46
  } break;
202
111
  case 7: {
203
111
    [[maybe_unused]] const std::size_t N = chardata.size();
204
111
    [[maybe_unused]] const auto ret =
205
111
        simdutf::trim_partial_utf8(chardata.data(), chardata.size());
206
111
    if ((ret + 3 < N) || (ret > N)) {
207
0
      std::cerr << "ret=" << ret << " N=" << N << '\n';
208
0
      std::abort();
209
0
    }
210
111
  } break;
211
290
  case 8:
212
290
    convert_latin1_to_utf8_safe(chardata, u16);
213
290
    break;
214
2.55k
  }
215
2.55k
  return 0;
216
2.55k
}