Coverage Report

Created: 2025-07-23 06:28

/src/simdutf/fuzz/misc.cpp
Line
Count
Source (jump to first uncovered line)
1
#include <cstddef>
2
#include <cstdint>
3
#include <ranges>
4
5
#include "helpers/common.h"
6
#include "simdutf.h"
7
8
875
void autodetect(std::span<const char> chardata) {
9
875
  std::vector<simdutf::encoding_type> results;
10
875
  const auto implementations = get_supported_implementations();
11
2.62k
  for (const simdutf::implementation* impl : implementations) {
12
2.62k
    results.push_back(
13
2.62k
        impl->autodetect_encoding(chardata.data(), chardata.size()));
14
2.62k
  }
15
1.75k
  auto neq = [](const auto& a, const auto& b) { return a != b; };
16
875
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
17
0
    std::cerr << "output differs between implementations\n";
18
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
19
0
      std::cerr << "implementation " << implementations[i] << " gave "
20
0
                << results.at(i) << '\n';
21
0
    }
22
0
    std::abort();
23
0
  }
24
875
}
25
26
700
void detect(std::span<const char> chardata) {
27
700
  std::vector<int> results;
28
700
  const auto implementations = get_supported_implementations();
29
2.10k
  for (const simdutf::implementation* impl : implementations) {
30
2.10k
    results.push_back(impl->detect_encodings(chardata.data(), chardata.size()));
31
2.10k
  }
32
1.40k
  auto neq = [](const auto& a, const auto& b) { return a != b; };
33
700
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
34
0
    std::cerr << "in detect_encodings(const char*, std::size_t):\n";
35
0
    std::cerr << "output differs between implementations\n";
36
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
37
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
38
0
                << results.at(i) << '\n';
39
0
    }
40
0
    std::cerr << " std::vector<unsigned char> data{";
41
0
    for (unsigned char x : chardata) {
42
0
      std::cerr << +x << ", ";
43
0
    };
44
0
    std::cerr << "};\n";
45
0
    std::abort();
46
0
  }
47
700
}
48
49
163
void validate_ascii(std::span<const char> chardata) {
50
  // use int, not bool to avoid vector<bool>
51
163
  std::vector<int> results;
52
163
  const auto implementations = get_supported_implementations();
53
489
  for (const simdutf::implementation* impl : implementations) {
54
489
    results.push_back(+impl->validate_ascii(chardata.data(), chardata.size()));
55
489
  }
56
326
  auto neq = [](const auto& a, const auto& b) { return a != b; };
57
163
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
58
0
    std::cerr << "in validate_ascii(const char*, std::size_t):\n";
59
0
    std::cerr << "output differs between implementations\n";
60
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
61
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
62
0
                << results.at(i) << '\n';
63
0
    }
64
0
    std::cerr << " std::vector<unsigned char> data{";
65
0
    for (unsigned char x : chardata) {
66
0
      std::cerr << +x << ", ";
67
0
    };
68
0
    std::cerr << "};\n";
69
0
    std::abort();
70
0
  }
71
163
}
72
73
259
void validate_ascii_with_err(std::span<const char> chardata) {
74
  // use int, not bool to avoid vector<bool>
75
259
  std::vector<simdutf::result> results;
76
259
  const auto implementations = get_supported_implementations();
77
777
  for (const simdutf::implementation* impl : implementations) {
78
777
    results.push_back(
79
777
        impl->validate_ascii_with_errors(chardata.data(), chardata.size()));
80
777
  }
81
518
  auto neq = [](const auto& a, const auto& b) { return a != b; };
82
259
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
83
0
    std::cerr << "in validate_ascii(const char*, std::size_t):\n";
84
0
    std::cerr << "output differs between implementations\n";
85
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
86
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
87
0
                << results.at(i) << '\n';
88
0
    }
89
0
    std::cerr << " std::vector<unsigned char> data{";
90
0
    for (unsigned char x : chardata) {
91
0
      std::cerr << +x << ", ";
92
0
    };
93
0
    std::cerr << "};\n";
94
0
    std::abort();
95
0
  }
96
259
}
97
98
137
void utf16_endianess(std::span<const char16_t> data) {
99
137
  std::vector<std::string> results;
100
137
  const auto implementations = get_supported_implementations();
101
411
  for (const simdutf::implementation* impl : implementations) {
102
411
    std::vector<char16_t> out(data.size());
103
411
    impl->change_endianness_utf16(data.data(), data.size(), out.data());
104
411
    results.push_back(FNV1A_hash::as_str(out));
105
411
  }
106
274
  auto neq = [](const auto& a, const auto& b) { return a != b; };
107
137
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
108
0
    std::cerr << "in utf16_endianess(const char*, std::size_t):\n";
109
0
    std::cerr << "output differs between implementations\n";
110
0
    for (std::size_t i = 0; i < implementations.size(); ++i) {
111
0
      std::cerr << "implementation " << implementations[i]->name() << " gave "
112
0
                << results.at(i) << '\n';
113
0
    }
114
0
    std::cerr << " std::vector<char16_t> data{";
115
0
    for (int x : data) {
116
0
      std::cerr << +x << ", ";
117
0
    };
118
0
    std::cerr << "};\n";
119
0
    std::abort();
120
0
  }
121
137
}
122
123
void convert_latin1_to_utf8_safe(std::span<const char> chardata,
124
311
                                 const std::size_t outputsize) {
125
  // convert with a limited output buffer
126
311
  std::vector<char> limited_output(outputsize);
127
311
  const auto limited_ret = simdutf::convert_latin1_to_utf8_safe(
128
311
      chardata.data(), chardata.size(), limited_output.data(), outputsize);
129
130
  // convert with a sufficiently large output buffer
131
311
  std::vector<char> large_output(2 * chardata.size());
132
311
  const auto large_ret = simdutf::convert_latin1_to_utf8(
133
311
      chardata.data(), chardata.size(), large_output.data());
134
135
311
  if (large_ret != 0) {
136
    // conversion was possible with a large buffer.
137
280
    if (large_ret <= outputsize) {
138
      // the limited buffer was large enough, ensure we got the same result
139
131
      assert(limited_ret == large_ret);
140
131
      assert(std::ranges::equal(limited_output | std::views::take(large_ret),
141
131
                                large_output | std::views::take(large_ret)));
142
149
    } else {
143
      // the number of written bytes for a limited buffer must not exceed what
144
      // the large buffer got.
145
149
      assert(limited_ret <= large_ret);
146
      // the written data should be equal
147
149
      assert(std::ranges::equal(limited_output | std::views::take(limited_ret),
148
149
                                large_output | std::views::take(limited_ret)));
149
149
    }
150
280
  } else {
151
    // conversion with a big buffer failed - is there anything we can check or
152
    // assert for the limited buffer? I don't think so.
153
31
  }
154
311
}
155
156
2.61k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
157
  // pick one of the functions, based on the fuzz data.
158
  // the first byte is which action to take. step forward
159
  // several bytes so the input is aligned.
160
2.61k
  if (size < 4) {
161
2
    return 0;
162
2
  }
163
2.61k
  constexpr auto Ncases = 9u;
164
2.61k
  constexpr auto actionmask = std::bit_ceil(Ncases) - 1;
165
2.61k
  const auto action = data[0] & actionmask;
166
167
2.61k
  const std::uint16_t u16 = data[1] + (data[2] << 8);
168
169
2.61k
  data += 4;
170
2.61k
  size -= 4;
171
172
2.61k
  const std::span<const char> chardata{(const char*)data, size};
173
2.61k
  const std::span<const char16_t> u16data{(const char16_t*)data,
174
2.61k
                                          size / sizeof(char16_t)};
175
176
2.61k
  switch (action) {
177
875
  case 0:
178
875
    autodetect(chardata);
179
875
    break;
180
700
  case 1:
181
700
    detect(chardata);
182
700
    break;
183
163
  case 2:
184
163
    validate_ascii(chardata);
185
163
    break;
186
259
  case 3:
187
259
    validate_ascii_with_err(chardata);
188
259
    break;
189
137
  case 4:
190
137
    utf16_endianess(u16data);
191
137
    break;
192
48
  case 5: {
193
48
    [[maybe_unused]] auto ret =
194
48
        simdutf::trim_partial_utf16le(u16data.data(), u16data.size());
195
48
    assert(ret == u16data.size() || ret + 1 == u16data.size());
196
48
  } break;
197
48
  case 6: {
198
40
    [[maybe_unused]] auto ret =
199
40
        simdutf::trim_partial_utf16be(u16data.data(), u16data.size());
200
40
    assert(ret == u16data.size() || ret + 1 == u16data.size());
201
40
  } break;
202
81
  case 7: {
203
81
    [[maybe_unused]] const std::size_t N = chardata.size();
204
81
    [[maybe_unused]] const auto ret =
205
81
        simdutf::trim_partial_utf8(chardata.data(), chardata.size());
206
81
    if ((ret + 3 < N) || (ret > N)) {
207
0
      std::cerr << "ret=" << ret << " N=" << N << '\n';
208
0
      std::abort();
209
0
    }
210
81
  } break;
211
311
  case 8:
212
311
    convert_latin1_to_utf8_safe(chardata, u16);
213
311
    break;
214
2.61k
  }
215
2.61k
  return 0;
216
2.61k
}