/src/simdutf/fuzz/misc.cpp

Source
#include <cstddef>
#include <cstdint>
#include <ranges>

#include "helpers/common.h"
#include "simdutf.h"

// Runs autodetect_encoding() on the same input with every supported
// implementation and aborts the process if they do not all return the same
// encoding. On a mismatch, the per-implementation results and the offending
// input are printed to stderr first.
void autodetect(std::span<const char> chardata) {
  std::vector<simdutf::encoding_type> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    results.push_back(
        impl->autodetect_encoding(chardata.data(), chardata.size()));
  }
  // adjacent_find with "not equal" locates the first neighbouring pair that
  // differs; reaching end() means every implementation agreed.
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    std::cerr << "in autodetect_encoding(const char*, std::size_t):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      // Print the implementation's name; streaming the pointer itself would
      // only show an address.
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // Dump the input as a C++ initializer so the case can be reproduced.
    std::cerr << " std::vector<unsigned char> data{";
    for (unsigned char x : chardata) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}

// Cross-checks detect_encodings(): every supported implementation is run on
// the same bytes and the process aborts if any two neighbouring results
// differ. Before aborting, the results and the input are written to stderr.
void detect(std::span<const char> chardata) {
  const auto implementations = get_supported_implementations();
  std::vector<int> results;
  for (const simdutf::implementation* impl : implementations) {
    results.push_back(impl->detect_encodings(chardata.data(), chardata.size()));
  }

  const auto differs = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };
  if (std::ranges::adjacent_find(results, differs) == results.end()) {
    // all implementations agree, nothing to report
    return;
  }

  std::cerr << "in detect_encodings(const char*, std::size_t):\n"
            << "output differs between implementations\n";
  std::size_t idx = 0;
  for (const simdutf::implementation* impl : implementations) {
    std::cerr << "implementation " << impl->name() << " gave "
              << results.at(idx) << '\n';
    ++idx;
  }
  // Emit the input as a C++ initializer list of byte values.
  std::cerr << " std::vector<unsigned char> data{";
  for (const char c : chardata) {
    std::cerr << static_cast<int>(static_cast<unsigned char>(c)) << ", ";
  }
  std::cerr << "};\n";
  std::abort();
}

// Cross-checks validate_ascii(): runs it with every supported implementation
// and aborts if the verdicts are not all identical. The verdicts and the
// input are printed to stderr before aborting.
void validate_ascii(std::span<const char> chardata) {
  const auto implementations = get_supported_implementations();
  // Verdicts are stored as int rather than bool so that the container is not
  // the std::vector<bool> specialization.
  std::vector<int> verdicts;
  for (const simdutf::implementation* impl : implementations) {
    const int verdict =
        +impl->validate_ascii(chardata.data(), chardata.size());
    verdicts.push_back(verdict);
  }

  const auto differs = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };
  const bool all_agree =
      std::ranges::adjacent_find(verdicts, differs) == verdicts.end();
  if (all_agree) {
    return;
  }

  std::cerr << "in validate_ascii(const char*, std::size_t):\n"
            << "output differs between implementations\n";
  std::size_t idx = 0;
  for (const simdutf::implementation* impl : implementations) {
    std::cerr << "implementation " << impl->name() << " gave "
              << verdicts.at(idx) << '\n';
    ++idx;
  }
  // Emit the input as a C++ initializer list of byte values.
  std::cerr << " std::vector<unsigned char> data{";
  for (const char c : chardata) {
    std::cerr << static_cast<int>(static_cast<unsigned char>(c)) << ", ";
  }
  std::cerr << "};\n";
  std::abort();
}

// Runs validate_ascii_with_errors() on the same input with every supported
// implementation and aborts the process if the returned simdutf::result
// values are not all equal. On a mismatch, the per-implementation results and
// the offending input are printed to stderr first.
void validate_ascii_with_err(std::span<const char> chardata) {
  std::vector<simdutf::result> results;
  const auto implementations = get_supported_implementations();
  for (const simdutf::implementation* impl : implementations) {
    results.push_back(
        impl->validate_ascii_with_errors(chardata.data(), chardata.size()));
  }
  // adjacent_find with "not equal" locates the first neighbouring pair that
  // differs; reaching end() means every implementation agreed.
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(results, neq) != results.end()) {
    // Name the function that was actually exercised so that crash reports are
    // not confused with those of the plain validate_ascii check.
    std::cerr << "in validate_ascii_with_errors(const char*, std::size_t):\n";
    std::cerr << "output differs between implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      std::cerr << "implementation " << implementations[i]->name() << " gave "
                << results.at(i) << '\n';
    }
    // Dump the input as a C++ initializer so the case can be reproduced.
    std::cerr << " std::vector<unsigned char> data{";
    for (unsigned char x : chardata) {
      std::cerr << +x << ", ";
    }
    std::cerr << "};\n";
    std::abort();
  }
}

// Cross-checks change_endianness_utf16(): each supported implementation
// converts the input into its own buffer, the buffers are reduced to a string
// via FNV1A_hash::as_str, and the process aborts if those strings are not all
// identical. The strings and the input are printed to stderr before aborting.
void utf16_endianness(std::span<const char16_t> data) {
  const auto implementations = get_supported_implementations();
  std::vector<std::string> digests;
  for (const simdutf::implementation* impl : implementations) {
    std::vector<char16_t> swapped(data.size());
    impl->change_endianness_utf16(data.data(), data.size(), swapped.data());
    digests.push_back(FNV1A_hash::as_str(swapped));
  }

  const auto differs = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };
  if (std::ranges::adjacent_find(digests, differs) == digests.end()) {
    // every implementation produced the same output
    return;
  }

  std::cerr << "in utf16_endianness(const char*, std::size_t):\n"
            << "output differs between implementations\n";
  std::size_t idx = 0;
  for (const simdutf::implementation* impl : implementations) {
    std::cerr << "implementation " << impl->name() << " gave "
              << digests.at(idx) << '\n';
    ++idx;
  }
  // Emit the input as a C++ initializer list of code unit values.
  std::cerr << " std::vector<char16_t> data{";
  for (const char16_t unit : data) {
    std::cerr << static_cast<int>(unit) << ", ";
  }
  std::cerr << "};\n";
  std::abort();
}

// Checks that validate_utf16le_as_ascii and validate_utf16be_as_ascii agree
// across all implementations, and that a true result implies valid UTF-16.
// Any violation is reported on stderr and the process is aborted. Does
// nothing when no implementation is available.
void validate_utf16_as_ascii(std::span<const char16_t> data) {
  const auto implementations = get_supported_implementations();
  // The consistency checks below read element 0 of the result vectors, so
  // there must be at least one implementation.
  if (implementations.empty()) {
    return;
  }
  // use int, not bool to avoid vector<bool>
  std::vector<int> le_results, be_results;
  le_results.reserve(implementations.size());
  be_results.reserve(implementations.size());
  for (const simdutf::implementation* impl : implementations) {
    le_results.push_back(
        +impl->validate_utf16le_as_ascii(data.data(), data.size()));
    be_results.push_back(
        +impl->validate_utf16be_as_ascii(data.data(), data.size()));
  }
  // adjacent_find with "not equal" locates the first neighbouring pair that
  // differs; reaching end() means every implementation agreed.
  auto neq = [](const auto& a, const auto& b) { return a != b; };
  if (std::ranges::adjacent_find(le_results, neq) != le_results.end()) {
    std::cerr << "validate_utf16le_as_ascii: output differs between "
                 "implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      std::cerr << "  " << implementations[i]->name() << " gave "
                << le_results[i] << '\n';
    }
    std::abort();
  }
  if (std::ranges::adjacent_find(be_results, neq) != be_results.end()) {
    std::cerr << "validate_utf16be_as_ascii: output differs between "
                 "implementations\n";
    for (std::size_t i = 0; i < implementations.size(); ++i) {
      std::cerr << "  " << implementations[i]->name() << " gave "
                << be_results[i] << '\n';
    }
    std::abort();
  }
  // If LE validates as ASCII, it must also validate as UTF-16LE (ASCII is a
  // subset). All results are equal at this point, so element 0 speaks for
  // every implementation.
  if (le_results[0]) {
    for (const simdutf::implementation* impl : implementations) {
      if (!impl->validate_utf16le(data.data(), data.size())) {
        std::cerr << "validate_utf16le_as_ascii returned true but "
                     "validate_utf16le returned false"
                  << " impl=" << impl->name() << "\n";
        std::abort();
      }
    }
  }
  // Same for BE.
  if (be_results[0]) {
    for (const simdutf::implementation* impl : implementations) {
      if (!impl->validate_utf16be(data.data(), data.size())) {
        std::cerr << "validate_utf16be_as_ascii returned true but "
                     "validate_utf16be returned false"
                  << " impl=" << impl->name() << "\n";
        std::abort();
      }
    }
  }
}

// Fuzz check for to_well_formed_utf16le / to_well_formed_utf16be. For each
// endianness it verifies, aborting on the first violation, that:
//   - every implementation writes the same output,
//   - each implementation accepts its own output as valid UTF-16 of that
//     endianness,
//   - input that the first implementation already considers valid comes back
//     unchanged.
// Returns immediately when there are no implementations to compare.
void to_well_formed_utf16(std::span<const char16_t> data) {
  const auto implementations = get_supported_implementations();
  if (implementations.empty()) {
    return;
  }

  const std::size_t impl_count = implementations.size();
  const auto differs = [](const auto& lhs, const auto& rhs) {
    return lhs != rhs;
  };

  // Little-endian pass.
  {
    std::vector<std::vector<char16_t>> fixed;
    fixed.reserve(impl_count);
    for (const simdutf::implementation* impl : implementations) {
      // one output buffer per implementation, same length as the input
      auto& buffer = fixed.emplace_back(data.size());
      impl->to_well_formed_utf16le(data.data(), data.size(), buffer.data());
    }

    const bool disagreement =
        std::ranges::adjacent_find(fixed, differs) != fixed.end();
    if (disagreement) {
      std::cerr
          << "to_well_formed_utf16le: outputs differ between implementations\n";
      for (std::size_t k = 0; k < impl_count; ++k) {
        std::cerr << "  " << implementations[k]->name()
                  << ": hash=" << FNV1A_hash::as_str(fixed[k]) << "\n";
      }
      std::abort();
    }

    // Each implementation must accept what it produced.
    for (std::size_t k = 0; k < impl_count; ++k) {
      const auto& produced = fixed[k];
      const bool accepted =
          implementations[k]->validate_utf16le(produced.data(),
                                               produced.size());
      if (!accepted) {
        std::cerr << "to_well_formed_utf16le: output is not valid UTF-16LE"
                  << " impl=" << implementations[k]->name() << "\n";
        std::abort();
      }
    }

    // Already-valid input must pass through untouched.
    if (implementations[0]->validate_utf16le(data.data(), data.size()) &&
        !std::ranges::equal(fixed[0], data)) {
      std::cerr << "to_well_formed_utf16le: valid input was modified\n";
      std::abort();
    }
  }

  // Big-endian pass.
  {
    std::vector<std::vector<char16_t>> fixed;
    fixed.reserve(impl_count);
    for (const simdutf::implementation* impl : implementations) {
      // one output buffer per implementation, same length as the input
      auto& buffer = fixed.emplace_back(data.size());
      impl->to_well_formed_utf16be(data.data(), data.size(), buffer.data());
    }

    const bool disagreement =
        std::ranges::adjacent_find(fixed, differs) != fixed.end();
    if (disagreement) {
      std::cerr
          << "to_well_formed_utf16be: outputs differ between implementations\n";
      for (std::size_t k = 0; k < impl_count; ++k) {
        std::cerr << "  " << implementations[k]->name()
                  << ": hash=" << FNV1A_hash::as_str(fixed[k]) << "\n";
      }
      std::abort();
    }

    // Each implementation must accept what it produced.
    for (std::size_t k = 0; k < impl_count; ++k) {
      const auto& produced = fixed[k];
      const bool accepted =
          implementations[k]->validate_utf16be(produced.data(),
                                               produced.size());
      if (!accepted) {
        std::cerr << "to_well_formed_utf16be: output is not valid UTF-16BE"
                  << " impl=" << implementations[k]->name() << "\n";
        std::abort();
      }
    }

    // Already-valid input must pass through untouched.
    if (implementations[0]->validate_utf16be(data.data(), data.size()) &&
        !std::ranges::equal(fixed[0], data)) {
      std::cerr << "to_well_formed_utf16be: valid input was modified\n";
      std::abort();
    }
  }
}

// Compares simdutf::convert_latin1_to_utf8_safe, writing into a buffer of
// `outputsize` bytes, against simdutf::convert_latin1_to_utf8 writing into a
// buffer of twice the input length. The expectations are enforced with
// assert().
void convert_latin1_to_utf8_safe(std::span<const char> chardata,
                                 const std::size_t outputsize) {
  // Size-limited conversion under test.
  std::vector<char> bounded(outputsize);
  [[maybe_unused]] const auto bounded_written =
      simdutf::convert_latin1_to_utf8_safe(chardata.data(), chardata.size(),
                                           bounded.data(), outputsize);

  // Reference conversion into a buffer of twice the input length.
  std::vector<char> reference(2 * chardata.size());
  const auto reference_written = simdutf::convert_latin1_to_utf8(
      chardata.data(), chardata.size(), reference.data());

  if (reference_written == 0) {
    // The reference conversion reported nothing written; there is no known
    // property to check for the bounded conversion in that case.
    return;
  }

  // True when the first `count` bytes of both buffers are identical.
  [[maybe_unused]] const auto same_prefix = [&](const std::size_t count) {
    return std::ranges::equal(bounded | std::views::take(count),
                              reference | std::views::take(count));
  };

  if (reference_written <= outputsize) {
    // The bounded buffer had enough room: both conversions must agree on
    // length and content.
    assert(bounded_written == reference_written);
    assert(same_prefix(reference_written));
  } else {
    // The bounded buffer was too small: it may not report more bytes than
    // the reference, and whatever it wrote must match the reference.
    assert(bounded_written <= reference_written);
    assert(same_prefix(bounded_written));
  }
}

// Fuzzer entry point. The first four bytes of the input form a header:
// byte 0 selects which check to run, bytes 1-2 form a little-endian 16-bit
// value used as the output buffer size for the latin1 conversion check, and
// byte 3 is not read. The payload after the header is handed to the selected
// check, viewed either as chars or as char16_t code units. Always returns 0.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  // Skipping a four byte header (rather than one) is intended to keep the
  // payload aligned.
  constexpr std::size_t header_size = 4;
  if (size < header_size) {
    return 0;
  }

  constexpr auto Ncases = 11u;
  // Mask wide enough to cover every case; selector values that have no case
  // label below simply do nothing.
  constexpr auto actionmask = std::bit_ceil(Ncases) - 1;
  const auto action = data[0] & actionmask;

  const std::uint16_t u16 =
      static_cast<std::uint16_t>(data[1] + (data[2] << 8));

  const uint8_t* const payload = data + header_size;
  const size_t payload_size = size - header_size;

  const std::span<const char> chardata{
      reinterpret_cast<const char*>(payload), payload_size};
  const std::span<const char16_t> u16data{
      reinterpret_cast<const char16_t*>(payload),
      payload_size / sizeof(char16_t)};

  switch (action) {
  case 0:
    autodetect(chardata);
    break;
  case 1:
    detect(chardata);
    break;
  case 2:
    validate_ascii(chardata);
    break;
  case 3:
    validate_ascii_with_err(chardata);
    break;
  case 4:
    utf16_endianness(u16data);
    break;
  case 5: {
    // trimming may drop at most one trailing code unit
    [[maybe_unused]] const auto kept =
        simdutf::trim_partial_utf16le(u16data.data(), u16data.size());
    assert(kept == u16data.size() || kept + 1 == u16data.size());
    break;
  }
  case 6: {
    // trimming may drop at most one trailing code unit
    [[maybe_unused]] const auto kept =
        simdutf::trim_partial_utf16be(u16data.data(), u16data.size());
    assert(kept == u16data.size() || kept + 1 == u16data.size());
    break;
  }
  case 7: {
    const std::size_t total = chardata.size();
    const auto kept =
        simdutf::trim_partial_utf8(chardata.data(), chardata.size());
    // the trimmed length must lie within [total - 3, total]
    const bool plausible = (kept + 3 >= total) && (kept <= total);
    if (!plausible) {
      std::cerr << "ret=" << kept << " N=" << total << '\n';
      std::abort();
    }
    break;
  }
  case 8:
    convert_latin1_to_utf8_safe(chardata, u16);
    break;
  case 9:
    validate_utf16_as_ascii(u16data);
    break;
  case 10:
    to_well_formed_utf16(u16data);
    break;
  }
  return 0;
}

Coverage Report

Created: 2026-03-30 06:51

Line	Count	Source
1		#include <cstddef>
2		#include <cstdint>
3		#include <ranges>
4
5		#include "helpers/common.h"
6		#include "simdutf.h"
7
8	841	void autodetect(std::span<const char> chardata) {
9	841	std::vector<simdutf::encoding_type> results;
10	841	const auto implementations = get_supported_implementations();
11	2.52k	for (const simdutf::implementation* impl : implementations) {
12	2.52k	results.push_back(
13	2.52k	impl->autodetect_encoding(chardata.data(), chardata.size()));
14	2.52k	}
15	1.68k	auto neq = [](const auto& a, const auto& b) { return a != b; };
16	841	if (std::ranges::adjacent_find(results, neq) != results.end()) {
17	0	std::cerr << "output differs between implementations\n";
18	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
19	0	std::cerr << "implementation " << implementations[i] << " gave "
20	0	<< results.at(i) << '\n';
21	0	}
22	0	std::abort();
23	0	}
24	841	}
25
26	679	void detect(std::span<const char> chardata) {
27	679	std::vector<int> results;
28	679	const auto implementations = get_supported_implementations();
29	2.03k	for (const simdutf::implementation* impl : implementations) {
30	2.03k	results.push_back(impl->detect_encodings(chardata.data(), chardata.size()));
31	2.03k	}
32	1.35k	auto neq = [](const auto& a, const auto& b) { return a != b; };
33	679	if (std::ranges::adjacent_find(results, neq) != results.end()) {
34	0	std::cerr << "in detect_encodings(const char*, std::size_t):\n";
35	0	std::cerr << "output differs between implementations\n";
36	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
37	0	std::cerr << "implementation " << implementations[i]->name() << " gave "
38	0	<< results.at(i) << '\n';
39	0	}
40	0	std::cerr << " std::vector<unsigned char> data{";
41	0	for (unsigned char x : chardata) {
42	0	std::cerr << +x << ", ";
43	0	};
44	0	std::cerr << "};\n";
45	0	std::abort();
46	0	}
47	679	}
48
49	214	void validate_ascii(std::span<const char> chardata) {
50		// use int, not bool to avoid vector<bool>
51	214	std::vector<int> results;
52	214	const auto implementations = get_supported_implementations();
53	642	for (const simdutf::implementation* impl : implementations) {
54	642	results.push_back(+impl->validate_ascii(chardata.data(), chardata.size()));
55	642	}
56	428	auto neq = [](const auto& a, const auto& b) { return a != b; };
57	214	if (std::ranges::adjacent_find(results, neq) != results.end()) {
58	0	std::cerr << "in validate_ascii(const char*, std::size_t):\n";
59	0	std::cerr << "output differs between implementations\n";
60	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
61	0	std::cerr << "implementation " << implementations[i]->name() << " gave "
62	0	<< results.at(i) << '\n';
63	0	}
64	0	std::cerr << " std::vector<unsigned char> data{";
65	0	for (unsigned char x : chardata) {
66	0	std::cerr << +x << ", ";
67	0	};
68	0	std::cerr << "};\n";
69	0	std::abort();
70	0	}
71	214	}
72
73	266	void validate_ascii_with_err(std::span<const char> chardata) {
74		// use int, not bool to avoid vector<bool>
75	266	std::vector<simdutf::result> results;
76	266	const auto implementations = get_supported_implementations();
77	798	for (const simdutf::implementation* impl : implementations) {
78	798	results.push_back(
79	798	impl->validate_ascii_with_errors(chardata.data(), chardata.size()));
80	798	}
81	532	auto neq = [](const auto& a, const auto& b) { return a != b; };
82	266	if (std::ranges::adjacent_find(results, neq) != results.end()) {
83	0	std::cerr << "in validate_ascii(const char*, std::size_t):\n";
84	0	std::cerr << "output differs between implementations\n";
85	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
86	0	std::cerr << "implementation " << implementations[i]->name() << " gave "
87	0	<< results.at(i) << '\n';
88	0	}
89	0	std::cerr << " std::vector<unsigned char> data{";
90	0	for (unsigned char x : chardata) {
91	0	std::cerr << +x << ", ";
92	0	};
93	0	std::cerr << "};\n";
94	0	std::abort();
95	0	}
96	266	}
97
98	137	void utf16_endianness(std::span<const char16_t> data) {
99	137	std::vector<std::string> results;
100	137	const auto implementations = get_supported_implementations();
101	411	for (const simdutf::implementation* impl : implementations) {
102	411	std::vector<char16_t> out(data.size());
103	411	impl->change_endianness_utf16(data.data(), data.size(), out.data());
104	411	results.push_back(FNV1A_hash::as_str(out));
105	411	}
106	274	auto neq = [](const auto& a, const auto& b) { return a != b; };
107	137	if (std::ranges::adjacent_find(results, neq) != results.end()) {
108	0	std::cerr << "in utf16_endianness(const char*, std::size_t):\n";
109	0	std::cerr << "output differs between implementations\n";
110	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
111	0	std::cerr << "implementation " << implementations[i]->name() << " gave "
112	0	<< results.at(i) << '\n';
113	0	}
114	0	std::cerr << " std::vector<char16_t> data{";
115	0	for (int x : data) {
116	0	std::cerr << +x << ", ";
117	0	};
118	0	std::cerr << "};\n";
119	0	std::abort();
120	0	}
121	137	}
122
123		// Checks that validate_utf16le_as_ascii and validate_utf16be_as_ascii agree
124		// across all implementations, and that a true result implies valid UTF-16.
125	0	void validate_utf16_as_ascii(std::span<const char16_t> data) {
126	0	const auto implementations = get_supported_implementations();
127		// use int, not bool to avoid vector<bool>
128	0	std::vector<int> le_results, be_results;
129	0	le_results.reserve(implementations.size());
130	0	be_results.reserve(implementations.size());
131	0	for (const simdutf::implementation* impl : implementations) {
132	0	le_results.push_back(
133	0	+impl->validate_utf16le_as_ascii(data.data(), data.size()));
134	0	be_results.push_back(
135	0	+impl->validate_utf16be_as_ascii(data.data(), data.size()));
136	0	}
137	0	auto neq = [](const auto& a, const auto& b) { return a != b; };
138	0	if (std::ranges::adjacent_find(le_results, neq) != le_results.end()) {
139	0	std::cerr << "validate_utf16le_as_ascii: output differs between "
140	0	"implementations\n";
141	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
142	0	std::cerr << " " << implementations[i]->name() << " gave "
143	0	<< le_results[i] << '\n';
144	0	}
145	0	std::abort();
146	0	}
147	0	if (std::ranges::adjacent_find(be_results, neq) != be_results.end()) {
148	0	std::cerr << "validate_utf16be_as_ascii: output differs between "
149	0	"implementations\n";
150	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
151	0	std::cerr << " " << implementations[i]->name() << " gave "
152	0	<< be_results[i] << '\n';
153	0	}
154	0	std::abort();
155	0	}
156		// If LE validates as ASCII, it must also validate as UTF-16LE (ASCII is a
157		// subset).
158	0	if (le_results[0]) {
159	0	for (const simdutf::implementation* impl : implementations) {
160	0	if (!impl->validate_utf16le(data.data(), data.size())) {
161	0	std::cerr << "validate_utf16le_as_ascii returned true but "
162	0	"validate_utf16le returned false"
163	0	<< " impl=" << impl->name() << "\n";
164	0	std::abort();
165	0	}
166	0	}
167	0	}
168		// Same for BE.
169	0	if (be_results[0]) {
170	0	for (const simdutf::implementation* impl : implementations) {
171	0	if (!impl->validate_utf16be(data.data(), data.size())) {
172	0	std::cerr << "validate_utf16be_as_ascii returned true but "
173	0	"validate_utf16be returned false"
174	0	<< " impl=" << impl->name() << "\n";
175	0	std::abort();
176	0	}
177	0	}
178	0	}
179	0	}
180
181		// Checks that to_well_formed_utf16le / to_well_formed_utf16be:
182		// 1. All implementations agree on the output.
183		// 2. The output is always valid UTF-16LE / UTF-16BE.
184		// 3. When the input is already valid UTF-16, the output equals the input.
185	0	void to_well_formed_utf16(std::span<const char16_t> data) {
186	0	const auto implementations = get_supported_implementations();
187	0	if (implementations.empty()) {
188	0	return;
189	0	}
190
191		// Check LE variant
192	0	{
193	0	std::vector<std::vector<char16_t>> le_outputs;
194	0	le_outputs.reserve(implementations.size());
195	0	for (const simdutf::implementation* impl : implementations) {
196	0	std::vector<char16_t> out(data.size());
197	0	impl->to_well_formed_utf16le(data.data(), data.size(), out.data());
198	0	le_outputs.push_back(std::move(out));
199	0	}
200	0	auto neq = [](const auto& a, const auto& b) { return a != b; };
201	0	if (std::ranges::adjacent_find(le_outputs, neq) != le_outputs.end()) {
202	0	std::cerr
203	0	<< "to_well_formed_utf16le: outputs differ between implementations\n";
204	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
205	0	std::cerr << " " << implementations[i]->name()
206	0	<< ": hash=" << FNV1A_hash::as_str(le_outputs[i]) << "\n";
207	0	}
208	0	std::abort();
209	0	}
210		// Output must be valid UTF-16LE.
211	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
212	0	if (!implementations[i]->validate_utf16le(le_outputs[i].data(),
213	0	le_outputs[i].size())) {
214	0	std::cerr << "to_well_formed_utf16le: output is not valid UTF-16LE"
215	0	<< " impl=" << implementations[i]->name() << "\n";
216	0	std::abort();
217	0	}
218	0	}
219		// If input was already valid UTF-16LE, output must equal input.
220	0	if (implementations[0]->validate_utf16le(data.data(), data.size())) {
221	0	if (!std::ranges::equal(le_outputs[0], data)) {
222	0	std::cerr << "to_well_formed_utf16le: valid input was modified\n";
223	0	std::abort();
224	0	}
225	0	}
226	0	}
227
228		// Check BE variant
229	0	{
230	0	std::vector<std::vector<char16_t>> be_outputs;
231	0	be_outputs.reserve(implementations.size());
232	0	for (const simdutf::implementation* impl : implementations) {
233	0	std::vector<char16_t> out(data.size());
234	0	impl->to_well_formed_utf16be(data.data(), data.size(), out.data());
235	0	be_outputs.push_back(std::move(out));
236	0	}
237	0	auto neq = [](const auto& a, const auto& b) { return a != b; };
238	0	if (std::ranges::adjacent_find(be_outputs, neq) != be_outputs.end()) {
239	0	std::cerr
240	0	<< "to_well_formed_utf16be: outputs differ between implementations\n";
241	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
242	0	std::cerr << " " << implementations[i]->name()
243	0	<< ": hash=" << FNV1A_hash::as_str(be_outputs[i]) << "\n";
244	0	}
245	0	std::abort();
246	0	}
247		// Output must be valid UTF-16BE.
248	0	for (std::size_t i = 0; i < implementations.size(); ++i) {
249	0	if (!implementations[i]->validate_utf16be(be_outputs[i].data(),
250	0	be_outputs[i].size())) {
251	0	std::cerr << "to_well_formed_utf16be: output is not valid UTF-16BE"
252	0	<< " impl=" << implementations[i]->name() << "\n";
253	0	std::abort();
254	0	}
255	0	}
256		// If input was already valid UTF-16BE, output must equal input.
257	0	if (implementations[0]->validate_utf16be(data.data(), data.size())) {
258	0	if (!std::ranges::equal(be_outputs[0], data)) {
259	0	std::cerr << "to_well_formed_utf16be: valid input was modified\n";
260	0	std::abort();
261	0	}
262	0	}
263	0	}
264	0	}
265
266		void convert_latin1_to_utf8_safe(std::span<const char> chardata,
267	279	const std::size_t outputsize) {
268		// convert with a limited output buffer
269	279	std::vector<char> limited_output(outputsize);
270	279	const auto limited_ret = simdutf::convert_latin1_to_utf8_safe(
271	279	chardata.data(), chardata.size(), limited_output.data(), outputsize);
272
273		// convert with a sufficiently large output buffer
274	279	std::vector<char> large_output(2 * chardata.size());
275	279	const auto large_ret = simdutf::convert_latin1_to_utf8(
276	279	chardata.data(), chardata.size(), large_output.data());
277
278	279	if (large_ret != 0) {
279		// conversion was possible with a large buffer.
280	245	if (large_ret <= outputsize) {
281		// the limited buffer was large enough, ensure we got the same result
282	132	assert(limited_ret == large_ret);
283	132	assert(std::ranges::equal(limited_output \| std::views::take(large_ret),
284	132	large_output \| std::views::take(large_ret)));
285	132	} else {
286		// the number of written bytes for a limited buffer must not exceed what
287		// the large buffer got.
288	113	assert(limited_ret <= large_ret);
289		// the written data should be equal
290	113	assert(std::ranges::equal(limited_output \| std::views::take(limited_ret),
291	113	large_output \| std::views::take(limited_ret)));
292	113	}
293	245	} else {
294		// conversion with a big buffer failed - is there anything we can check or
295		// assert for the limited buffer? I don't think so.
296	34	}
297	279	}
298
299	2.61k	extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
300		// pick one of the functions, based on the fuzz data.
301		// the first byte is which action to take. step forward
302		// several bytes so the input is aligned.
303	2.61k	if (size < 4) {
304	2	return 0;
305	2	}
306	2.61k	constexpr auto Ncases = 11u;
307	2.61k	constexpr auto actionmask = std::bit_ceil(Ncases) - 1;
308	2.61k	const auto action = data[0] & actionmask;
309
310	2.61k	const std::uint16_t u16 = data[1] + (data[2] << 8);
311
312	2.61k	data += 4;
313	2.61k	size -= 4;
314
315	2.61k	const std::span<const char> chardata{(const char*)data, size};
316	2.61k	const std::span<const char16_t> u16data{(const char16_t*)data,
317	2.61k	size / sizeof(char16_t)};
318
319	2.61k	switch (action) {
320	841	case 0:
321	841	autodetect(chardata);
322	841	break;
323	679	case 1:
324	679	detect(chardata);
325	679	break;
326	214	case 2:
327	214	validate_ascii(chardata);
328	214	break;
329	266	case 3:
330	266	validate_ascii_with_err(chardata);
331	266	break;
332	137	case 4:
333	137	utf16_endianness(u16data);
334	137	break;
335	48	case 5: {
336	48	[[maybe_unused]] auto ret =
337	48	simdutf::trim_partial_utf16le(u16data.data(), u16data.size());
338	48	assert(ret == u16data.size() \|\| ret + 1 == u16data.size());
339	48	} break;
340	48	case 6: {
341	39	[[maybe_unused]] auto ret =
342	39	simdutf::trim_partial_utf16be(u16data.data(), u16data.size());
343	39	assert(ret == u16data.size() \|\| ret + 1 == u16data.size());
344	39	} break;
345	111	case 7: {
346	111	[[maybe_unused]] const std::size_t N = chardata.size();
347	111	[[maybe_unused]] const auto ret =
348	111	simdutf::trim_partial_utf8(chardata.data(), chardata.size());
349	111	if ((ret + 3 < N) \|\| (ret > N)) {
350	0	std::cerr << "ret=" << ret << " N=" << N << '\n';
351	0	std::abort();
352	0	}
353	111	} break;
354	279	case 8:
355	279	convert_latin1_to_utf8_safe(chardata, u16);
356	279	break;
357	0	case 9:
358	0	validate_utf16_as_ascii(u16data);
359	0	break;
360	0	case 10:
361	0	to_well_formed_utf16(u16data);
362	0	break;
363	2.61k	}
364	2.61k	return 0;
365	2.61k	}