/proc/self/cwd/test/common/json/json_sanitizer_test_util.cc

Source (jump to first uncovered line)
#include "test/common/json/json_sanitizer_test_util.h"

#include <string>

#include "source/common/common/utility.h" // for IntervalSet.

#include "test/common/json/utf8.h"

#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_format.h"

namespace Envoy {
namespace Json {
namespace TestUtil {

namespace {

// Number of characters in one complete Unicode escape: the two-character
// prefix plus the four hex digits that follow it.
constexpr uint32_t UnicodeEscapeLength = 6; // "\u1234"
// The two literal characters (a backslash and 'u') that introduce a Unicode
// escape.
constexpr absl::string_view UnicodeEscapePrefix = "\\u";

class InvalidUnicodeSet {
public:
  InvalidUnicodeSet() {
    // Workaround limitations in protobuf serialization by skipping certain
    // unicodes from differential fuzzing. See
    // https://github.com/protocolbuffers/protobuf/issues/9729

    // The invalid intervals are generated with the command:
    // bazel -c opt run test/common/json:gen_excluded_unicodes |& grep -v 'contains invalid UTF-8'

    // Avoid ranges where the protobuf serialization fails, returning an empty
    // string. Nlohmann also fails (throws exceptions) in this range but
    // sanitizer() will catch that an do simple escapes on the string.
    invalid_3byte_intervals_.insert(0xd800, 0xe000);

    // Avoid differential testing of Unicode ranges generated from 4-byte utf-8
    // where protobuf serialization generates two small Unicode values instead
    // of the correct one. This must be a protobuf serialization issue.
    invalid_4byte_intervals_.insert(0x1d173, 0x1d17b);
    invalid_4byte_intervals_.insert(0xe0001, 0xe0002);
    invalid_4byte_intervals_.insert(0xe0020, 0xe0080);
  }

  // Helper functions to see if the specified Unicode is in the 3-byte utf-8
  // exclusion set or the 4-byte utf-8 exclusion-set.
  bool isInvalid3Byte(uint32_t unicode) const { return invalid_3byte_intervals_.test(unicode); }
  bool isInvalid4Byte(uint32_t unicode) const { return invalid_4byte_intervals_.test(unicode); }

private:
  IntervalSetImpl<uint32_t> invalid_3byte_intervals_;
  IntervalSetImpl<uint32_t> invalid_4byte_intervals_;
};

// Accessor for the InvalidUnicodeSet used by the checks in this file. The
// object's creation is delegated to the CONSTRUCT_ON_FIRST_USE macro, which is
// defined outside this file.
// NOTE(review): judging by the macro's name, the set is built once on the
// first call and reused afterwards -- confirm against the macro definition.
const InvalidUnicodeSet& invalidUnicodeSet() { CONSTRUCT_ON_FIRST_USE(InvalidUnicodeSet); }

} // namespace

// Scans `in` and reports whether it is suitable for comparison against
// protobuf serialization. Bytes with the high bit clear are skipped one at a
// time. Every other position is decoded with Utf8::decode(); the scan returns
// false if the decode consumed anything other than 2, 3, or 4 bytes, or if a
// 3-byte / 4-byte value is in the matching exclusion set. Returns true once
// the whole input has been consumed.
bool isProtoSerializableUtf8(absl::string_view in) {
  const uint8_t* cursor = reinterpret_cast<const uint8_t*>(in.data());
  for (uint32_t remaining = in.size(); remaining != 0;) {
    // Single-byte character: nothing to validate.
    if ((*cursor & 0x80) == 0) {
      ++cursor;
      --remaining;
      continue;
    }

    auto [unicode, consumed] = Utf8::decode(cursor, remaining);
    cursor += consumed;
    remaining -= consumed;

    if (consumed == 2) {
      // No 2-byte values are excluded.
      continue;
    }
    if (consumed == 3) {
      if (invalidUnicodeSet().isInvalid3Byte(unicode)) {
        return false;
      }
      continue;
    }
    if (consumed == 4) {
      if (invalidUnicodeSet().isInvalid4Byte(unicode)) {
        return false;
      }
      continue;
    }
    // Any other consumed count (including zero) is rejected.
    return false;
  }
  return true;
}

// Decodes a Unicode hex escape such as \u1234 into 0x1234.
//
// @param str text expected to begin with a 6-character escape; anything after
//        the escape is ignored.
// @param hex_value receives the decoded value. It is written only when the
//        function returns true.
// @return true if `str` starts with "\u" followed by exactly four hex digits
//         (upper or lower case), false otherwise.
bool parseUnicode(absl::string_view str, uint32_t& hex_value) {
  if (str.size() < UnicodeEscapeLength || !absl::StartsWith(str, UnicodeEscapePrefix)) {
    return false;
  }

  // Decode the digits by hand rather than with strtoul(): strtoul() also
  // accepts leading whitespace, a sign, and a "0x" prefix, and a nul byte
  // inside the escape would cut the parse short while still looking like a
  // complete match. None of those are valid \uXXXX escapes.
  // NOTE(review): absl::SimpleHexAtoi is documented as tolerating surrounding
  // whitespace and an optional "0x" prefix too, so it would not be a strict
  // replacement either -- confirm before switching to it.
  const absl::string_view digits = str.substr(
      UnicodeEscapePrefix.size(), UnicodeEscapeLength - UnicodeEscapePrefix.size());
  uint32_t value = 0;
  for (const char c : digits) {
    uint32_t nibble;
    if (c >= '0' && c <= '9') {
      nibble = c - '0';
    } else if (c >= 'a' && c <= 'f') {
      nibble = c - 'a' + 10;
    } else if (c >= 'A' && c <= 'F') {
      nibble = c - 'A' + 10;
    } else {
      return false;
    }
    value = (value << 4) | nibble;
  }
  hex_value = value;
  return true;
}

// Checks whether `escaped` begins with a Unicode escape (e.g. \u1234) whose
// value equals the value decoded from the front of `utf8`.
//
// On a match, both views are advanced past the matched text (the 6-character
// escape and the decoded utf-8 bytes respectively) and true is returned.
// Otherwise both views are left unchanged and false is returned.
bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_view& utf8) {
  uint32_t value_from_escape = 0;
  if (!parseUnicode(escaped, value_from_escape)) {
    return false;
  }

  // `escaped` does start with an escape, so decode the other side as utf-8. A
  // consumed count of zero is treated as "no match".
  auto [value_from_utf8, num_bytes] = Utf8::decode(utf8);
  if (num_bytes == 0 || value_from_utf8 != value_from_escape) {
    return false;
  }

  utf8 = utf8.substr(num_bytes);
  escaped = escaped.substr(UnicodeEscapeLength);
  return true;
}

// Determines whether two strings differ only in whether they have
// literal utf-8 or escaped 3-byte Unicode. We do this equivalence
// comparison to enable differential fuzzing between sanitize() and
// protobuf JSON serialization. The protobuf implementation has made
// some hard-to-understand decisions about what to encode via Unicode
// escapes versus what to pass through as utf-8.
//
// @param a first string to compare.
// @param b second string to compare.
// @param diffs receives a human-readable description of the first mismatch.
//        It is written only when the function returns false.
// @return true if the strings are equivalent, false otherwise.
bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs) {
  // Keep the full inputs so that a mismatch can be reported with its offset.
  const absl::string_view all_a = a;
  const absl::string_view all_b = b;
  while (true) {
    if (a.empty() && b.empty()) {
      return true;
    }
    if (a.empty() || b.empty()) {
      // One side ran out first; `a` and `b` are the unmatched remainders here.
      diffs = absl::StrFormat("`%s` and `%s` have different lengths", a, b);
      return false;
    }
    if (a[0] == b[0]) {
      a.remove_prefix(1);
      b.remove_prefix(1);
      continue;
    }
    // The leading characters differ: accept only if one side is a Unicode
    // escape for the character that the other side holds as utf-8. A
    // successful comparison advances both views.
    if (!compareUnicodeEscapeAgainstUtf8(a, b) && !compareUnicodeEscapeAgainstUtf8(b, a)) {
      diffs = absl::StrFormat("%s != %s, [%d]%c(0x%02x, \\%03o) != [%d]%c(0x%02x, \\%03o)", all_a,
                              all_b, a.data() - all_a.data(), a[0], a[0], a[0],
                              b.data() - all_b.data(), b[0], b[0], b[0]);
      return false;
    }
  }
}

} // namespace TestUtil
} // namespace Json
} // namespace Envoy

Line	Count	Source (jump to first uncovered line)
1		#include "test/common/json/json_sanitizer_test_util.h"
2
3		#include <string>
4
5		#include "source/common/common/utility.h" // for IntervalSet.
6
7		#include "test/common/json/utf8.h"
8
9		#include "absl/strings/match.h"
10		#include "absl/strings/numbers.h"
11		#include "absl/strings/str_format.h"
12
13		namespace Envoy {
14		namespace Json {
15		namespace TestUtil {
16
17		namespace {
18
19		constexpr uint32_t UnicodeEscapeLength = 6; // "\u1234"
20		constexpr absl::string_view UnicodeEscapePrefix = "\\u";
21
22		class InvalidUnicodeSet {
23		public:
24	1	InvalidUnicodeSet() {
25		// Workaround limitations in protobuf serialization by skipping certain
26		// unicodes from differential fuzzing. See
27		// https://github.com/protocolbuffers/protobuf/issues/9729
28
29		// The invalid intervals are generated with the command:
30		// bazel -c opt run test/common/json:gen_excluded_unicodes \|& grep -v 'contains invalid UTF-8'
31
32		// Avoid ranges where the protobuf serialization fails, returning an empty
33		// string. Nlohmann also fails (throws exceptions) in this range but
34		// sanitizer() will catch that and do simple escapes on the string.
35	1	invalid_3byte_intervals_.insert(0xd800, 0xe000);
36
37		// Avoid differential testing of Unicode ranges generated from 4-byte utf-8
38		// where protobuf serialization generates two small Unicode values instead
39		// of the correct one. This must be a protobuf serialization issue.
40	1	invalid_4byte_intervals_.insert(0x1d173, 0x1d17b);
41	1	invalid_4byte_intervals_.insert(0xe0001, 0xe0002);
42	1	invalid_4byte_intervals_.insert(0xe0020, 0xe0080);
43	1	}
44
45		// Helper functions to see if the specified Unicode is in the 3-byte utf-8
46		// exclusion set or the 4-byte utf-8 exclusion-set.
47	52	bool isInvalid3Byte(uint32_t unicode) const { return invalid_3byte_intervals_.test(unicode); }
48	203	bool isInvalid4Byte(uint32_t unicode) const { return invalid_4byte_intervals_.test(unicode); }
49
50		private:
51		IntervalSetImpl<uint32_t> invalid_3byte_intervals_;
52		IntervalSetImpl<uint32_t> invalid_4byte_intervals_;
53		};
54
55	255	const InvalidUnicodeSet& invalidUnicodeSet() { CONSTRUCT_ON_FIRST_USE(InvalidUnicodeSet); }
56
57		} // namespace
58
59	484k	bool isProtoSerializableUtf8(absl::string_view in) {
60	484k	const uint8_t* data = reinterpret_cast<const uint8_t*>(in.data());
61	484k	uint32_t size = in.size();
62	10.6M	while (size != 0) {
63	10.1M	if ((*data & 0x80) == 0) {
64	10.1M	++data;
65	10.1M	--size;
66	10.1M	} else {
67	2.10k	auto [unicode, consumed] = Utf8::decode(data, size);
68	2.10k	data += consumed;
69	2.10k	size -= consumed;
70
71	2.10k	switch (consumed) {
72	83	case 2:
73	83	break;
74	52	case 3:
75	52	if (invalidUnicodeSet().isInvalid3Byte(unicode)) {
76	0	return false;
77	0	}
78	52	break;
79	203	case 4:
80	203	if (invalidUnicodeSet().isInvalid4Byte(unicode)) {
81	10	return false;
82	10	}
83	193	break;
84	1.76k	default:
85	1.76k	return false;
86	2.10k	}
87	2.10k	}
88	10.1M	}
89	483k	return true;
90	484k	}
91
92		// Decodes unicode hex escape \u1234 into 0x1234, returning success.
93	396	bool parseUnicode(absl::string_view str, uint32_t& hex_value) {
94	396	if (absl::StartsWith(str, UnicodeEscapePrefix) && str.size() >= UnicodeEscapeLength) {
95		// TODO(jmarantz): Github master,
96		// https://github.com/abseil/abseil-cpp/blob/master/absl/strings/numbers.h
97		// has absl::SimpleHexAtoi, enabling this impl to be
98		// return absl::SimpleHexAtoi(str.substr(2, 4), &hex_value);
99		// In the meantime we must nul-terminate.
100	198	std::string hex_str(str.data() + 2, 4);
101	198	char* str_end;
102	198	hex_value = strtoul(hex_str.c_str(), &str_end, 16);
103	198	return str_end != nullptr && *str_end == '\0';
104	198	}
105	198	return false;
106	396	}
107
108		// Compares a string that's possibly an escaped Unicode, e.g. \u1234, to
109		// one that is utf8-encoded.
110	396	bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_view& utf8) {
111	396	uint32_t escaped_unicode;
112	396	if (parseUnicode(escaped, escaped_unicode)) {
113		// If one side of the comparison is a Unicode escape, decode the other side as utf-8.
114	198	auto [unicode, consumed] = Utf8::decode(utf8);
115	198	if (consumed != 0 && unicode == escaped_unicode) {
116	198	utf8 = utf8.substr(consumed, utf8.size() - consumed);
117	198	escaped = escaped.substr(UnicodeEscapeLength, escaped.size() - UnicodeEscapeLength);
118	198	return true;
119	198	}
120	198	}
121	198	return false;
122	396	}
123
124		// Determines whether two strings differ only in whether they have
125		// literal utf-8 or escaped 3-byte Unicode. We do this equivalence
126		// comparison to enable differential fuzzing between sanitize() and
127		// protobuf JSON serialization. The protobuf implementation has made
128		// some hard-to-understand decisions about what to encode via Unicode
129		// escapes versus what to pass through as utf-8.
130	483k	bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs) {
131	483k	absl::string_view all_a = a;
132	483k	absl::string_view all_b = b;
133	43.1M	while (true) {
134	43.1M	if (a.empty() && b.empty()) {
135	483k	return true;
136	42.6M	} else if (a.empty() \|\| b.empty()) {
137	0	diffs = absl::StrFormat("`%s' and `%s` have different lengths", a, b);
138	0	return false;
139	42.6M	} else if (a[0] == b[0]) {
140	42.6M	a = a.substr(1, a.size() - 1);
141	42.6M	b = b.substr(1, b.size() - 1);
142	42.6M	} else if (!compareUnicodeEscapeAgainstUtf8(a, b) && !compareUnicodeEscapeAgainstUtf8(b, a)) {
143	0	diffs = absl::StrFormat("%s != %s, [%d]%c(0x02%x, \\%03o) != [%d] %c(0x02%x, \\%03o)", all_a,
144	0	all_b, a.data() - all_a.data(), a[0], a[0], a[0],
145	0	b.data() - all_b.data(), b[0], b[0], b[0]);
146	0	return false;
147	0	}
148	43.1M	}
149	483k	}
150
151		} // namespace TestUtil
152		} // namespace Json
153		} // namespace Envoy

Coverage Report

Created: 2023-11-12 09:30