Coverage Report

Created: 2025-11-16 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/fuzz/misc.cpp
Line
Count
Source
1
#include <cstddef>
2
#include <cstdint>
3
#include <ranges>
4
5
#include "helpers/common.h"
6
#include "simdutf.h"
7
8
784
/// Differential check of autodetect_encoding(): every supported
/// implementation is run on the same input and all of them must return the
/// same encoding.
///
/// On disagreement the result of each implementation and the input bytes are
/// written to stderr, then the process is aborted.
///
/// @param chardata raw input bytes handed to each implementation
void autodetect(std::span<const char> chardata) {
  std::vector<simdutf::encoding_type> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    results.push_back(
        impl->autodetect_encoding(chardata.data(), chardata.size()));
  }
  // any adjacent pair that differs means the implementations disagree
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    std::cerr << "in autodetect_encoding(const char*, std::size_t):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      // print the implementation's name rather than the pointer value
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // dump the input in a form that can be pasted into a reproducer
    std::cerr << " std::vector<unsigned char> data{";
    for (unsigned char x : chardata) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}
25
26
650
/// Runs detect_encodings() on every supported implementation and aborts the
/// process if the implementations do not all return the same value.
///
/// @param chardata raw input bytes handed to each implementation
void detect(std::span<const char> chardata) {
  std::vector<int> answers;
  const auto impls = get_supported_implementations();
  for (const simdutf::implementation* candidate : impls) {
    const auto detected =
        candidate->detect_encodings(chardata.data(), chardata.size());
    answers.push_back(detected);
  }

  const auto differs = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };
  const bool consistent =
      std::ranges::adjacent_find(answers, differs) == answers.end();
  if (consistent) {
    return;
  }

  // report what each implementation returned, then the input itself
  std::cerr << "in detect_encodings(const char*, std::size_t):\n"
            << "output differs between implementations\n";
  for (std::size_t idx = 0; idx != impls.size(); ++idx) {
    std::cerr << "implementation " << impls[idx]->name() << " gave "
              << answers.at(idx) << '\n';
  }
  std::cerr << " std::vector<unsigned char> data{";
  for (const unsigned char byte : chardata) {
    std::cerr << +byte << ", ";
  }
  std::cerr << "};\n";
  std::abort();
}
48
49
157
/// Runs validate_ascii() on every supported implementation and aborts the
/// process if the implementations do not all return the same verdict.
///
/// @param chardata raw input bytes handed to each implementation
void validate_ascii(std::span<const char> chardata) {
  // verdicts are stored as int on purpose, to stay away from vector<bool>
  std::vector<int> verdicts;
  const auto impls = get_supported_implementations();
  for (const simdutf::implementation* candidate : impls) {
    verdicts.push_back(static_cast<int>(
        candidate->validate_ascii(chardata.data(), chardata.size())));
  }

  const auto differs = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };
  const bool consistent =
      std::ranges::adjacent_find(verdicts, differs) == verdicts.end();
  if (consistent) {
    return;
  }

  // report what each implementation returned, then the input itself
  std::cerr << "in validate_ascii(const char*, std::size_t):\n"
            << "output differs between implementations\n";
  for (std::size_t idx = 0; idx != impls.size(); ++idx) {
    std::cerr << "implementation " << impls[idx]->name() << " gave "
              << verdicts.at(idx) << '\n';
  }
  std::cerr << " std::vector<unsigned char> data{";
  for (const unsigned char byte : chardata) {
    std::cerr << +byte << ", ";
  }
  std::cerr << "};\n";
  std::abort();
}
72
73
265
/// Differential check of validate_ascii_with_errors(): every supported
/// implementation is run on the same input and all of them must return the
/// same simdutf::result.
///
/// On disagreement the result of each implementation and the input bytes are
/// written to stderr, then the process is aborted.
///
/// @param chardata raw input bytes handed to each implementation
void validate_ascii_with_err(std::span<const char> chardata) {
  std::vector<simdutf::result> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    results.push_back(
        impl->validate_ascii_with_errors(chardata.data(), chardata.size()));
  }
  // any adjacent pair that differs means the implementations disagree
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    // name the function that was actually called
    std::cerr << "in validate_ascii_with_errors(const char*, std::size_t):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // dump the input in a form that can be pasted into a reproducer
    std::cerr << " std::vector<unsigned char> data{";
    for (unsigned char x : chardata) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}
97
98
126
/// Differential check of change_endianness_utf16(): every supported
/// implementation converts the same input into its own output buffer, and
/// the strings returned by FNV1A_hash::as_str() for those buffers must all be
/// equal.
///
/// On disagreement the string obtained for each implementation and the input
/// code units are written to stderr, then the process is aborted.
///
/// @param data UTF-16 code units handed to each implementation
void utf16_endianess(std::span<const char16_t> data) {
  std::vector<std::string> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    // each implementation writes into a fresh buffer of the input's length
    std::vector<char16_t> out(data.size());
    impl->change_endianness_utf16(data.data(), data.size(), out.data());
    results.push_back(FNV1A_hash::as_str(out));
  }
  // any adjacent pair that differs means the implementations disagree
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    // name the function under test with its real parameter types
    std::cerr << "in change_endianness_utf16(const char16_t*, std::size_t, "
                 "char16_t*):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // dump the input in a form that can be pasted into a reproducer
    std::cerr << " std::vector<char16_t> data{";
    for (int x : data) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}
122
123
/// Cross-checks simdutf::convert_latin1_to_utf8_safe(), which is given an
/// output buffer of `outputsize` bytes, against
/// simdutf::convert_latin1_to_utf8() writing into a buffer twice the size of
/// the input.
///
/// @param chardata   input bytes to convert
/// @param outputsize size in bytes of the buffer given to the _safe variant
void convert_latin1_to_utf8_safe(std::span<const char> chardata,
                                 const std::size_t outputsize) {
  // run the size-limited conversion first
  std::vector<char> bounded(outputsize);
  const auto bounded_len = simdutf::convert_latin1_to_utf8_safe(
      chardata.data(), chardata.size(), bounded.data(), outputsize);

  // then the reference conversion with a sufficiently large buffer
  std::vector<char> roomy(2 * chardata.size());
  const auto roomy_len = simdutf::convert_latin1_to_utf8(
      chardata.data(), chardata.size(), roomy.data());

  if (roomy_len == 0) {
    // the reference conversion reported nothing written; no property of the
    // size-limited run is asserted in this case.
    return;
  }

  // true when the first `count` bytes of both output buffers are equal
  [[maybe_unused]] const auto same_prefix = [&](const auto count) {
    return std::ranges::equal(bounded | std::views::take(count),
                              roomy | std::views::take(count));
  };

  if (roomy_len <= outputsize) {
    // the limited buffer had enough room: both runs must agree completely
    assert(bounded_len == roomy_len);
    assert(same_prefix(roomy_len));
  } else {
    // the limited buffer was too small: it may hold fewer bytes, never more,
    // and whatever it holds must match the start of the reference output
    assert(bounded_len <= roomy_len);
    assert(same_prefix(bounded_len));
  }
}
155
156
2.43k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
157
  // pick one of the functions, based on the fuzz data.
158
  // the first byte is which action to take. step forward
159
  // several bytes so the input is aligned.
160
2.43k
  if (size < 4) {
161
2
    return 0;
162
2
  }
163
2.43k
  constexpr auto Ncases = 9u;
164
2.43k
  constexpr auto actionmask = std::bit_ceil(Ncases) - 1;
165
2.43k
  const auto action = data[0] & actionmask;
166
167
2.43k
  const std::uint16_t u16 = data[1] + (data[2] << 8);
168
169
2.43k
  data += 4;
170
2.43k
  size -= 4;
171
172
2.43k
  const std::span<const char> chardata{(const char*)data, size};
173
2.43k
  const std::span<const char16_t> u16data{(const char16_t*)data,
174
2.43k
                                          size / sizeof(char16_t)};
175
176
2.43k
  switch (action) {
177
784
  case 0:
178
784
    autodetect(chardata);
179
784
    break;
180
650
  case 1:
181
650
    detect(chardata);
182
650
    break;
183
157
  case 2:
184
157
    validate_ascii(chardata);
185
157
    break;
186
265
  case 3:
187
265
    validate_ascii_with_err(chardata);
188
265
    break;
189
126
  case 4:
190
126
    utf16_endianess(u16data);
191
126
    break;
192
42
  case 5: {
193
42
    [[maybe_unused]] auto ret =
194
42
        simdutf::trim_partial_utf16le(u16data.data(), u16data.size());
195
42
    assert(ret == u16data.size() || ret + 1 == u16data.size());
196
42
  } break;
197
47
  case 6: {
198
47
    [[maybe_unused]] auto ret =
199
47
        simdutf::trim_partial_utf16be(u16data.data(), u16data.size());
200
47
    assert(ret == u16data.size() || ret + 1 == u16data.size());
201
47
  } break;
202
90
  case 7: {
203
90
    [[maybe_unused]] const std::size_t N = chardata.size();
204
90
    [[maybe_unused]] const auto ret =
205
90
        simdutf::trim_partial_utf8(chardata.data(), chardata.size());
206
90
    if ((ret + 3 < N) || (ret > N)) {
207
0
      std::cerr << "ret=" << ret << " N=" << N << '\n';
208
0
      std::abort();
209
0
    }
210
90
  } break;
211
273
  case 8:
212
273
    convert_latin1_to_utf8_safe(chardata, u16);
213
273
    break;
214
2.43k
  }
215
2.43k
  return 0;
216
2.43k
}