Coverage Report

Created: 2026-03-30 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/fuzz/misc.cpp
Line
Count
Source
1
#include <cstddef>
2
#include <cstdint>
3
#include <ranges>
4
5
#include "helpers/common.h"
6
#include "simdutf.h"
7
8
841
// Differential test for autodetect_encoding(): runs it on the same input with
// every implementation returned by get_supported_implementations() and aborts
// the process if two implementations report different encodings.
//
// chardata: raw fuzzer bytes, handed unchanged to each implementation.
void autodetect(std::span<const char> chardata) {
  std::vector<simdutf::encoding_type> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    results.push_back(
        impl->autodetect_encoding(chardata.data(), chardata.size()));
  }
  // adjacent_find with a "not equal" predicate finds the first neighbouring
  // pair that differs, so it hits iff the results are not all identical.
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    std::cerr << "in autodetect_encoding(const char*, std::size_t):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      // report the implementation by name; streaming the pointer itself would
      // only print an address
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // dump the input as a C++ initializer so the failure can be reproduced
    std::cerr << " std::vector<unsigned char> data{";
    for (unsigned char x : chardata) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}
25
26
679
// Differential test for detect_encodings(): every supported implementation is
// asked about the same bytes; if the answers are not all equal, the per
// implementation answers and the offending input are printed and the process
// aborts.
//
// chardata: raw fuzzer bytes, handed unchanged to each implementation.
void detect(std::span<const char> chardata) {
  const auto implementations = get_supported_implementations();

  std::vector<int> answers;
  for (const simdutf::implementation* candidate : implementations) {
    const int detected =
        candidate->detect_encodings(chardata.data(), chardata.size());
    answers.push_back(detected);
  }

  // A neighbouring pair that differs means the answers are not all equal.
  const auto mismatch = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };
  if (std::ranges::adjacent_find(answers, mismatch) == answers.end()) {
    return;  // all implementations agree
  }

  std::cerr << "in detect_encodings(const char*, std::size_t):\n"
            << "output differs between implementations\n";
  std::size_t idx = 0;
  for (const simdutf::implementation* candidate : implementations) {
    std::cerr << "implementation " << candidate->name() << " gave "
              << answers.at(idx) << '\n';
    ++idx;
  }

  // Print the input as a C++ initializer list of byte values.
  std::cerr << " std::vector<unsigned char> data{";
  for (const char raw : chardata) {
    const int byte_value = static_cast<unsigned char>(raw);
    std::cerr << byte_value << ", ";
  }
  std::cerr << "};\n";
  std::abort();
}
48
49
214
// Differential test for validate_ascii(): all supported implementations must
// return the same verdict for the same bytes. On disagreement the verdicts and
// the input are printed and the process aborts.
//
// chardata: raw fuzzer bytes, handed unchanged to each implementation.
void validate_ascii(std::span<const char> chardata) {
  const auto implementations = get_supported_implementations();

  // Verdicts are stored as int on purpose: a vector<bool> would be the packed
  // proxy specialization.
  std::vector<int> verdicts;
  for (const simdutf::implementation* candidate : implementations) {
    const bool is_ascii =
        candidate->validate_ascii(chardata.data(), chardata.size());
    verdicts.push_back(static_cast<int>(is_ascii));
  }

  // A neighbouring pair that differs means the verdicts are not all equal.
  const auto mismatch = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };
  if (std::ranges::adjacent_find(verdicts, mismatch) == verdicts.end()) {
    return;  // all implementations agree
  }

  std::cerr << "in validate_ascii(const char*, std::size_t):\n"
            << "output differs between implementations\n";
  std::size_t idx = 0;
  for (const simdutf::implementation* candidate : implementations) {
    std::cerr << "implementation " << candidate->name() << " gave "
              << verdicts.at(idx) << '\n';
    ++idx;
  }

  // Print the input as a C++ initializer list of byte values.
  std::cerr << " std::vector<unsigned char> data{";
  for (const char raw : chardata) {
    const int byte_value = static_cast<unsigned char>(raw);
    std::cerr << byte_value << ", ";
  }
  std::cerr << "};\n";
  std::abort();
}
72
73
266
// Differential test for validate_ascii_with_errors(): runs it on the same
// input with every supported implementation and aborts if two implementations
// return results that compare unequal.
//
// chardata: raw fuzzer bytes, handed unchanged to each implementation.
//
// NOTE(review): comparing and streaming simdutf::result relies on operator!=
// and operator<< being declared elsewhere (presumably helpers/common.h).
void validate_ascii_with_err(std::span<const char> chardata) {
  // one simdutf::result per implementation, in implementation order
  std::vector<simdutf::result> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    results.push_back(
        impl->validate_ascii_with_errors(chardata.data(), chardata.size()));
  }
  // adjacent_find with a "not equal" predicate finds the first neighbouring
  // pair that differs, so it hits iff the results are not all identical.
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    // name the function that was actually exercised
    std::cerr << "in validate_ascii_with_errors(const char*, std::size_t):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // dump the input as a C++ initializer so the failure can be reproduced
    std::cerr << " std::vector<unsigned char> data{";
    for (unsigned char x : chardata) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}
97
98
137
// Differential test for change_endianness_utf16(): every supported
// implementation converts the same input into its own output buffer, and the
// string digests of those buffers (FNV1A_hash::as_str) must all be equal.
// On disagreement the digests and the input are printed and the process
// aborts.
//
// data: fuzzer input viewed as UTF-16 code units.
void utf16_endianness(std::span<const char16_t> data) {
  // one digest string per implementation, in implementation order
  std::vector<std::string> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    // fresh, zero-initialized buffer per implementation so outputs cannot
    // leak from one run into the next
    std::vector<char16_t> out(data.size());
    impl->change_endianness_utf16(data.data(), data.size(), out.data());
    results.push_back(FNV1A_hash::as_str(out));
  }
  // adjacent_find with a "not equal" predicate finds the first neighbouring
  // pair that differs, so it hits iff the digests are not all identical.
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    // name the library call under test with its real signature
    std::cerr << "in change_endianness_utf16(const char16_t*, std::size_t, "
                 "char16_t*):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // dump the input as a C++ initializer so the failure can be reproduced;
    // iterating as int prints numeric code unit values
    std::cerr << " std::vector<char16_t> data{";
    for (int x : data) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}
122
123
// Checks that validate_utf16le_as_ascii and validate_utf16be_as_ascii agree
124
// across all implementations, and that a true result implies valid UTF-16.
125
0
void validate_utf16_as_ascii(std::span<const char16_t> data) {
126
0
  const auto implementations = get_supported_implementations();
127
  // use int, not bool to avoid vector<bool>
128
0
  std::vector<int> le_results, be_results;
129
0
  le_results.reserve(implementations.size());
130
0
  be_results.reserve(implementations.size());
131
0
  for (const simdutf::implementation* impl : implementations) {
132
0
    le_results.push_back(
133
0
        +impl->validate_utf16le_as_ascii(data.data(), data.size()));
134
0
    be_results.push_back(
135
0
        +impl->validate_utf16be_as_ascii(data.data(), data.size()));
136
0
  }
137
0
  auto neq = [](const auto& a, const auto& b) { return a != b; };
138
0
  if (std::ranges::adjacent_find(le_results, neq) != le_results.end()) {
139
0
    std::cerr << "validate_utf16le_as_ascii: output differs between "
140
0
                 "implementations\n";
141
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
142
0
      std::cerr << "  " << implementations[i]->name() << " gave "
143
0
                << le_results[i] << '\n';
144
0
    }
145
0
    std::abort();
146
0
  }
147
0
  if (std::ranges::adjacent_find(be_results, neq) != be_results.end()) {
148
0
    std::cerr << "validate_utf16be_as_ascii: output differs between "
149
0
                 "implementations\n";
150
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
151
0
      std::cerr << "  " << implementations[i]->name() << " gave "
152
0
                << be_results[i] << '\n';
153
0
    }
154
0
    std::abort();
155
0
  }
156
  // If LE validates as ASCII, it must also validate as UTF-16LE (ASCII is a
157
  // subset).
158
0
  if (le_results[0]) {
159
0
    for (const simdutf::implementation* impl : implementations) {
160
0
      if (!impl->validate_utf16le(data.data(), data.size())) {
161
0
        std::cerr << "validate_utf16le_as_ascii returned true but "
162
0
                     "validate_utf16le returned false"
163
0
                  << " impl=" << impl->name() << "\n";
164
0
        std::abort();
165
0
      }
166
0
    }
167
0
  }
168
  // Same for BE.
169
0
  if (be_results[0]) {
170
0
    for (const simdutf::implementation* impl : implementations) {
171
0
      if (!impl->validate_utf16be(data.data(), data.size())) {
172
0
        std::cerr << "validate_utf16be_as_ascii returned true but "
173
0
                     "validate_utf16be returned false"
174
0
                  << " impl=" << impl->name() << "\n";
175
0
        std::abort();
176
0
      }
177
0
    }
178
0
  }
179
0
}
180
181
// Checks that to_well_formed_utf16le / to_well_formed_utf16be:
182
// 1. All implementations agree on the output.
183
// 2. The output is always valid UTF-16LE / UTF-16BE.
184
// 3. When the input is already valid UTF-16, the output equals the input.
185
0
void to_well_formed_utf16(std::span<const char16_t> data) {
186
0
  const auto implementations = get_supported_implementations();
187
0
  if (implementations.empty()) {
188
0
    return;
189
0
  }
190
191
  // Check LE variant
192
0
  {
193
0
    std::vector<std::vector<char16_t>> le_outputs;
194
0
    le_outputs.reserve(implementations.size());
195
0
    for (const simdutf::implementation* impl : implementations) {
196
0
      std::vector<char16_t> out(data.size());
197
0
      impl->to_well_formed_utf16le(data.data(), data.size(), out.data());
198
0
      le_outputs.push_back(std::move(out));
199
0
    }
200
0
    auto neq = [](const auto& a, const auto& b) { return a != b; };
201
0
    if (std::ranges::adjacent_find(le_outputs, neq) != le_outputs.end()) {
202
0
      std::cerr
203
0
          << "to_well_formed_utf16le: outputs differ between implementations\n";
204
0
      for (std::size_t i = 0; i < implementations.size(); ++i) {
205
0
        std::cerr << "  " << implementations[i]->name()
206
0
                  << ": hash=" << FNV1A_hash::as_str(le_outputs[i]) << "\n";
207
0
      }
208
0
      std::abort();
209
0
    }
210
    // Output must be valid UTF-16LE.
211
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
212
0
      if (!implementations[i]->validate_utf16le(le_outputs[i].data(),
213
0
                                                le_outputs[i].size())) {
214
0
        std::cerr << "to_well_formed_utf16le: output is not valid UTF-16LE"
215
0
                  << " impl=" << implementations[i]->name() << "\n";
216
0
        std::abort();
217
0
      }
218
0
    }
219
    // If input was already valid UTF-16LE, output must equal input.
220
0
    if (implementations[0]->validate_utf16le(data.data(), data.size())) {
221
0
      if (!std::ranges::equal(le_outputs[0], data)) {
222
0
        std::cerr << "to_well_formed_utf16le: valid input was modified\n";
223
0
        std::abort();
224
0
      }
225
0
    }
226
0
  }
227
228
  // Check BE variant
229
0
  {
230
0
    std::vector<std::vector<char16_t>> be_outputs;
231
0
    be_outputs.reserve(implementations.size());
232
0
    for (const simdutf::implementation* impl : implementations) {
233
0
      std::vector<char16_t> out(data.size());
234
0
      impl->to_well_formed_utf16be(data.data(), data.size(), out.data());
235
0
      be_outputs.push_back(std::move(out));
236
0
    }
237
0
    auto neq = [](const auto& a, const auto& b) { return a != b; };
238
0
    if (std::ranges::adjacent_find(be_outputs, neq) != be_outputs.end()) {
239
0
      std::cerr
240
0
          << "to_well_formed_utf16be: outputs differ between implementations\n";
241
0
      for (std::size_t i = 0; i < implementations.size(); ++i) {
242
0
        std::cerr << "  " << implementations[i]->name()
243
0
                  << ": hash=" << FNV1A_hash::as_str(be_outputs[i]) << "\n";
244
0
      }
245
0
      std::abort();
246
0
    }
247
    // Output must be valid UTF-16BE.
248
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
249
0
      if (!implementations[i]->validate_utf16be(be_outputs[i].data(),
250
0
                                                be_outputs[i].size())) {
251
0
        std::cerr << "to_well_formed_utf16be: output is not valid UTF-16BE"
252
0
                  << " impl=" << implementations[i]->name() << "\n";
253
0
        std::abort();
254
0
      }
255
0
    }
256
    // If input was already valid UTF-16BE, output must equal input.
257
0
    if (implementations[0]->validate_utf16be(data.data(), data.size())) {
258
0
      if (!std::ranges::equal(be_outputs[0], data)) {
259
0
        std::cerr << "to_well_formed_utf16be: valid input was modified\n";
260
0
        std::abort();
261
0
      }
262
0
    }
263
0
  }
264
0
}
265
266
// Cross-checks simdutf::convert_latin1_to_utf8_safe (bounded output) against
// simdutf::convert_latin1_to_utf8 (unbounded output) on the same input.
//
// chardata:   input bytes to convert.
// outputsize: capacity, in bytes, of the limited output buffer.
//
// The checks use assert(), so they are only active when assertions are
// compiled in.
void convert_latin1_to_utf8_safe(std::span<const char> chardata,
                                 const std::size_t outputsize) {
  // convert with a limited output buffer
  std::vector<char> limited_output(outputsize);
  [[maybe_unused]] const auto limited_ret =
      simdutf::convert_latin1_to_utf8_safe(chardata.data(), chardata.size(),
                                           limited_output.data(), outputsize);

  // the bounded variant must never report more bytes than its buffer holds
  assert(limited_ret <= outputsize);

  // convert with a sufficiently large output buffer (two output bytes per
  // input byte)
  std::vector<char> large_output(2 * chardata.size());
  const auto large_ret = simdutf::convert_latin1_to_utf8(
      chardata.data(), chardata.size(), large_output.data());

  if (large_ret != 0) {
    // conversion was possible with a large buffer.
    if (large_ret <= outputsize) {
      // the limited buffer was large enough, ensure we got the same result
      assert(limited_ret == large_ret);
      assert(std::ranges::equal(limited_output | std::views::take(large_ret),
                                large_output | std::views::take(large_ret)));
    } else {
      // the number of written bytes for a limited buffer must not exceed what
      // the large buffer got.
      assert(limited_ret <= large_ret);
      // the written data should be equal
      assert(std::ranges::equal(limited_output | std::views::take(limited_ret),
                                large_output | std::views::take(limited_ret)));
    }
  } else {
    // conversion with a big buffer failed - is there anything we can check or
    // assert for the limited buffer? I don't think so.
  }
}
298
299
2.61k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
300
  // pick one of the functions, based on the fuzz data.
301
  // the first byte is which action to take. step forward
302
  // several bytes so the input is aligned.
303
2.61k
  if (size < 4) {
304
2
    return 0;
305
2
  }
306
2.61k
  constexpr auto Ncases = 11u;
307
2.61k
  constexpr auto actionmask = std::bit_ceil(Ncases) - 1;
308
2.61k
  const auto action = data[0] & actionmask;
309
310
2.61k
  const std::uint16_t u16 = data[1] + (data[2] << 8);
311
312
2.61k
  data += 4;
313
2.61k
  size -= 4;
314
315
2.61k
  const std::span<const char> chardata{(const char*)data, size};
316
2.61k
  const std::span<const char16_t> u16data{(const char16_t*)data,
317
2.61k
                                          size / sizeof(char16_t)};
318
319
2.61k
  switch (action) {
320
841
  case 0:
321
841
    autodetect(chardata);
322
841
    break;
323
679
  case 1:
324
679
    detect(chardata);
325
679
    break;
326
214
  case 2:
327
214
    validate_ascii(chardata);
328
214
    break;
329
266
  case 3:
330
266
    validate_ascii_with_err(chardata);
331
266
    break;
332
137
  case 4:
333
137
    utf16_endianness(u16data);
334
137
    break;
335
48
  case 5: {
336
48
    [[maybe_unused]] auto ret =
337
48
        simdutf::trim_partial_utf16le(u16data.data(), u16data.size());
338
48
    assert(ret == u16data.size() || ret + 1 == u16data.size());
339
48
  } break;
340
48
  case 6: {
341
39
    [[maybe_unused]] auto ret =
342
39
        simdutf::trim_partial_utf16be(u16data.data(), u16data.size());
343
39
    assert(ret == u16data.size() || ret + 1 == u16data.size());
344
39
  } break;
345
111
  case 7: {
346
111
    [[maybe_unused]] const std::size_t N = chardata.size();
347
111
    [[maybe_unused]] const auto ret =
348
111
        simdutf::trim_partial_utf8(chardata.data(), chardata.size());
349
111
    if ((ret + 3 < N) || (ret > N)) {
350
0
      std::cerr << "ret=" << ret << " N=" << N << '\n';
351
0
      std::abort();
352
0
    }
353
111
  } break;
354
279
  case 8:
355
279
    convert_latin1_to_utf8_safe(chardata, u16);
356
279
    break;
357
0
  case 9:
358
0
    validate_utf16_as_ascii(u16data);
359
0
    break;
360
0
  case 10:
361
0
    to_well_formed_utf16(u16data);
362
0
    break;
363
2.61k
  }
364
2.61k
  return 0;
365
2.61k
}