/proc/self/cwd/test/common/json/json_sanitizer_test_util.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include "test/common/json/json_sanitizer_test_util.h" |
2 | | |
3 | | #include <string> |
4 | | |
5 | | #include "source/common/common/utility.h" // for IntervalSet. |
6 | | |
7 | | #include "test/common/json/utf8.h" |
8 | | |
9 | | #include "absl/strings/match.h" |
10 | | #include "absl/strings/numbers.h" |
11 | | #include "absl/strings/str_format.h" |
12 | | |
13 | | namespace Envoy { |
14 | | namespace Json { |
15 | | namespace TestUtil { |
16 | | |
17 | | namespace { |
18 | | |
19 | | constexpr uint32_t UnicodeEscapeLength = 6; // "\u1234" |
20 | | constexpr absl::string_view UnicodeEscapePrefix = "\\u"; |
21 | | |
22 | | class InvalidUnicodeSet { |
23 | | public: |
24 | 1 | InvalidUnicodeSet() { |
25 | | // Workaround limitations in protobuf serialization by skipping certain |
26 | | // unicodes from differential fuzzing. See |
27 | | // https://github.com/protocolbuffers/protobuf/issues/9729 |
28 | | |
29 | | // The invalid intervals are generated with the command: |
30 | | // bazel -c opt run test/common/json:gen_excluded_unicodes |& grep -v 'contains invalid UTF-8' |
31 | | |
32 | | // Avoid ranges where the protobuf serialization fails, returning an empty |
33 | | // string. Nlohmann also fails (throws exceptions) in this range but |
34 | | // sanitizer() will catch that an do simple escapes on the string. |
35 | 1 | invalid_3byte_intervals_.insert(0xd800, 0xe000); |
36 | | |
37 | | // Avoid differential testing of Unicode ranges generated from 4-byte utf-8 |
38 | | // where protobuf serialization generates two small Unicode values instead |
39 | | // of the correct one. This must be a protobuf serialization issue. |
40 | 1 | invalid_4byte_intervals_.insert(0x1d173, 0x1d17b); |
41 | 1 | invalid_4byte_intervals_.insert(0xe0001, 0xe0002); |
42 | 1 | invalid_4byte_intervals_.insert(0xe0020, 0xe0080); |
43 | 1 | } |
44 | | |
45 | | // Helper functions to see if the specified Unicode is in the 3-byte utf-8 |
46 | | // exclusion set or the 4-byte utf-8 exclusion-set. |
47 | 52 | bool isInvalid3Byte(uint32_t unicode) const { return invalid_3byte_intervals_.test(unicode); } |
48 | 203 | bool isInvalid4Byte(uint32_t unicode) const { return invalid_4byte_intervals_.test(unicode); } |
49 | | |
50 | | private: |
51 | | IntervalSetImpl<uint32_t> invalid_3byte_intervals_; |
52 | | IntervalSetImpl<uint32_t> invalid_4byte_intervals_; |
53 | | }; |
54 | | |
55 | 255 | const InvalidUnicodeSet& invalidUnicodeSet() { CONSTRUCT_ON_FIRST_USE(InvalidUnicodeSet); } |
56 | | |
57 | | } // namespace |
58 | | |
59 | 484k | bool isProtoSerializableUtf8(absl::string_view in) { |
60 | 484k | const uint8_t* data = reinterpret_cast<const uint8_t*>(in.data()); |
61 | 484k | uint32_t size = in.size(); |
62 | 10.6M | while (size != 0) { |
63 | 10.1M | if ((*data & 0x80) == 0) { |
64 | 10.1M | ++data; |
65 | 10.1M | --size; |
66 | 10.1M | } else { |
67 | 2.10k | auto [unicode, consumed] = Utf8::decode(data, size); |
68 | 2.10k | data += consumed; |
69 | 2.10k | size -= consumed; |
70 | | |
71 | 2.10k | switch (consumed) { |
72 | 83 | case 2: |
73 | 83 | break; |
74 | 52 | case 3: |
75 | 52 | if (invalidUnicodeSet().isInvalid3Byte(unicode)) { |
76 | 0 | return false; |
77 | 0 | } |
78 | 52 | break; |
79 | 203 | case 4: |
80 | 203 | if (invalidUnicodeSet().isInvalid4Byte(unicode)) { |
81 | 10 | return false; |
82 | 10 | } |
83 | 193 | break; |
84 | 1.76k | default: |
85 | 1.76k | return false; |
86 | 2.10k | } |
87 | 2.10k | } |
88 | 10.1M | } |
89 | 483k | return true; |
90 | 484k | } |
91 | | |
92 | | // Decodes unicode hex escape \u1234 into 0x1234, returning success. |
93 | 396 | bool parseUnicode(absl::string_view str, uint32_t& hex_value) { |
94 | 396 | if (absl::StartsWith(str, UnicodeEscapePrefix) && str.size() >= UnicodeEscapeLength) { |
95 | | // TODO(jmarantz): Github master, |
96 | | // https://github.com/abseil/abseil-cpp/blob/master/absl/strings/numbers.h |
97 | | // has absl::SimpleHexAtoi, enabling this impl to be |
98 | | // return absl::SimpleHexAtoi(str.substr(2, 4), &hex_value); |
99 | | // In the meantime we must nul-terminate. |
100 | 198 | std::string hex_str(str.data() + 2, 4); |
101 | 198 | char* str_end; |
102 | 198 | hex_value = strtoul(hex_str.c_str(), &str_end, 16); |
103 | 198 | return str_end != nullptr && *str_end == '\0'; |
104 | 198 | } |
105 | 198 | return false; |
106 | 396 | } |
107 | | |
108 | | // Compares a string that's possibly an escaped Unicode, e.g. \u1234, to |
109 | | // one that is utf8-encoded. |
110 | 396 | bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_view& utf8) { |
111 | 396 | uint32_t escaped_unicode; |
112 | 396 | if (parseUnicode(escaped, escaped_unicode)) { |
113 | | // If one side of the comparison is a Unicode escape, |
114 | 198 | auto [unicode, consumed] = Utf8::decode(utf8); |
115 | 198 | if (consumed != 0 && unicode == escaped_unicode) { |
116 | 198 | utf8 = utf8.substr(consumed, utf8.size() - consumed); |
117 | 198 | escaped = escaped.substr(UnicodeEscapeLength, escaped.size() - UnicodeEscapeLength); |
118 | 198 | return true; |
119 | 198 | } |
120 | 198 | } |
121 | 198 | return false; |
122 | 396 | } |
123 | | |
124 | | // Determines whether two strings differ only in whether they have |
125 | | // literal utf-8 or escaped 3-byte Unicode. We do this equivalence |
126 | | // comparison to enable differential fuzzing between sanitize() and |
127 | | // protobuf JSON serialization. The protobuf implementation has made |
128 | | // some hard-to-understand decisions about what to encode via Unicode |
129 | | // escapes versus what to pass through as utf-8. |
130 | 483k | bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs) { |
131 | 483k | absl::string_view all_a = a; |
132 | 483k | absl::string_view all_b = b; |
133 | 43.1M | while (true) { |
134 | 43.1M | if (a.empty() && b.empty()) { |
135 | 483k | return true; |
136 | 42.6M | } else if (a.empty() || b.empty()) { |
137 | 0 | diffs = absl::StrFormat("`%s' and `%s` have different lengths", a, b); |
138 | 0 | return false; |
139 | 42.6M | } else if (a[0] == b[0]) { |
140 | 42.6M | a = a.substr(1, a.size() - 1); |
141 | 42.6M | b = b.substr(1, b.size() - 1); |
142 | 42.6M | } else if (!compareUnicodeEscapeAgainstUtf8(a, b) && !compareUnicodeEscapeAgainstUtf8(b, a)) { |
143 | 0 | diffs = absl::StrFormat("%s != %s, [%d]%c(0x02%x, \\%03o) != [%d] %c(0x02%x, \\%03o)", all_a, |
144 | 0 | all_b, a.data() - all_a.data(), a[0], a[0], a[0], |
145 | 0 | b.data() - all_b.data(), b[0], b[0], b[0]); |
146 | 0 | return false; |
147 | 0 | } |
148 | 43.1M | } |
149 | 483k | } |
150 | | |
151 | | } // namespace TestUtil |
152 | | } // namespace Json |
153 | | } // namespace Envoy |