Coverage Report

Created: 2023-11-12 09:30

/proc/self/cwd/test/common/json/json_sanitizer_test_util.cc
Line
Count
Source (jump to first uncovered line)
1
#include "test/common/json/json_sanitizer_test_util.h"
2
3
#include <string>
4
5
#include "source/common/common/utility.h" // for IntervalSet.
6
7
#include "test/common/json/utf8.h"
8
9
#include "absl/strings/match.h"
10
#include "absl/strings/numbers.h"
11
#include "absl/strings/str_format.h"
12
13
namespace Envoy {
14
namespace Json {
15
namespace TestUtil {
16
17
namespace {
18
19
// Total number of characters in one escaped Unicode sequence: the two-character
// prefix followed by four hex digits.
constexpr uint32_t UnicodeEscapeLength = 6; // "\u1234"
20
// The two characters (a backslash followed by 'u') that introduce an escaped
// Unicode sequence.
constexpr absl::string_view UnicodeEscapePrefix = "\\u";
21
22
class InvalidUnicodeSet {
23
public:
24
1
  InvalidUnicodeSet() {
25
    // Workaround limitations in protobuf serialization by skipping certain
26
    // unicodes from differential fuzzing. See
27
    // https://github.com/protocolbuffers/protobuf/issues/9729
28
29
    // The invalid intervals are generated with the command:
30
    // bazel -c opt run test/common/json:gen_excluded_unicodes |& grep -v 'contains invalid UTF-8'
31
32
    // Avoid ranges where the protobuf serialization fails, returning an empty
33
    // string. Nlohmann also fails (throws exceptions) in this range but
34
    // sanitizer() will catch that an do simple escapes on the string.
35
1
    invalid_3byte_intervals_.insert(0xd800, 0xe000);
36
37
    // Avoid differential testing of Unicode ranges generated from 4-byte utf-8
38
    // where protobuf serialization generates two small Unicode values instead
39
    // of the correct one. This must be a protobuf serialization issue.
40
1
    invalid_4byte_intervals_.insert(0x1d173, 0x1d17b);
41
1
    invalid_4byte_intervals_.insert(0xe0001, 0xe0002);
42
1
    invalid_4byte_intervals_.insert(0xe0020, 0xe0080);
43
1
  }
44
45
  // Helper functions to see if the specified Unicode is in the 3-byte utf-8
46
  // exclusion set or the 4-byte utf-8 exclusion-set.
47
52
  bool isInvalid3Byte(uint32_t unicode) const { return invalid_3byte_intervals_.test(unicode); }
48
203
  bool isInvalid4Byte(uint32_t unicode) const { return invalid_4byte_intervals_.test(unicode); }
49
50
private:
51
  IntervalSetImpl<uint32_t> invalid_3byte_intervals_;
52
  IntervalSetImpl<uint32_t> invalid_4byte_intervals_;
53
};
54
55
255
// Accessor for the InvalidUnicodeSet consulted by isProtoSerializableUtf8().
// NOTE(review): CONSTRUCT_ON_FIRST_USE is defined outside this file; presumably
// it lazily builds one instance and returns a reference to it -- confirm.
const InvalidUnicodeSet& invalidUnicodeSet() { CONSTRUCT_ON_FIRST_USE(InvalidUnicodeSet); }
56
57
} // namespace
58
59
484k
// Scans `in` and reports whether every multi-byte utf-8 sequence in it decodes
// successfully and lies outside the excluded Unicode ranges. Bytes without the
// high bit set are accepted as-is.
bool isProtoSerializableUtf8(absl::string_view in) {
  const uint8_t* cursor = reinterpret_cast<const uint8_t*>(in.data());
  uint32_t remaining = in.size();

  while (remaining > 0) {
    // Single-byte (7-bit) character: nothing to validate.
    if (*cursor < 0x80) {
      ++cursor;
      --remaining;
      continue;
    }

    auto [code_point, num_bytes] = Utf8::decode(cursor, remaining);
    cursor += num_bytes;
    remaining -= num_bytes;

    if (num_bytes == 2) {
      // No 2-byte sequences are excluded.
      continue;
    }
    if (num_bytes == 3) {
      if (invalidUnicodeSet().isInvalid3Byte(code_point)) {
        return false;
      }
      continue;
    }
    if (num_bytes == 4) {
      if (invalidUnicodeSet().isInvalid4Byte(code_point)) {
        return false;
      }
      continue;
    }

    // Any other consumed length (including a decode that consumed nothing)
    // is treated as non-serializable.
    return false;
  }
  return true;
}
91
92
// Decodes unicode hex escape \u1234 into 0x1234, returning success.
93
396
bool parseUnicode(absl::string_view str, uint32_t& hex_value) {
94
396
  if (absl::StartsWith(str, UnicodeEscapePrefix) && str.size() >= UnicodeEscapeLength) {
95
    // TODO(jmarantz): Github master,
96
    // https://github.com/abseil/abseil-cpp/blob/master/absl/strings/numbers.h
97
    // has absl::SimpleHexAtoi, enabling this impl to be
98
    //   return absl::SimpleHexAtoi(str.substr(2, 4), &hex_value);
99
    // In the meantime we must nul-terminate.
100
198
    std::string hex_str(str.data() + 2, 4);
101
198
    char* str_end;
102
198
    hex_value = strtoul(hex_str.c_str(), &str_end, 16);
103
198
    return str_end != nullptr && *str_end == '\0';
104
198
  }
105
198
  return false;
106
396
}
107
108
// Compares a string that's possibly an escaped Unicode, e.g. \u1234, to
109
// one that is utf8-encoded.
110
396
bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_view& utf8) {
111
396
  uint32_t escaped_unicode;
112
396
  if (parseUnicode(escaped, escaped_unicode)) {
113
    // If one side of the comparison is a Unicode escape,
114
198
    auto [unicode, consumed] = Utf8::decode(utf8);
115
198
    if (consumed != 0 && unicode == escaped_unicode) {
116
198
      utf8 = utf8.substr(consumed, utf8.size() - consumed);
117
198
      escaped = escaped.substr(UnicodeEscapeLength, escaped.size() - UnicodeEscapeLength);
118
198
      return true;
119
198
    }
120
198
  }
121
198
  return false;
122
396
}
123
124
// Determines whether two strings differ only in whether they have
125
// literal utf-8 or escaped 3-byte Unicode. We do this equivalence
126
// comparison to enable differential fuzzing between sanitize() and
127
// protobuf JSON serialization. The protobuf implementation has made
128
// some hard-to-understand decisions about what to encode via Unicode
129
// escapes versus what to pass through as utf-8.
130
483k
bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs) {
131
483k
  absl::string_view all_a = a;
132
483k
  absl::string_view all_b = b;
133
43.1M
  while (true) {
134
43.1M
    if (a.empty() && b.empty()) {
135
483k
      return true;
136
42.6M
    } else if (a.empty() || b.empty()) {
137
0
      diffs = absl::StrFormat("`%s' and `%s` have different lengths", a, b);
138
0
      return false;
139
42.6M
    } else if (a[0] == b[0]) {
140
42.6M
      a = a.substr(1, a.size() - 1);
141
42.6M
      b = b.substr(1, b.size() - 1);
142
42.6M
    } else if (!compareUnicodeEscapeAgainstUtf8(a, b) && !compareUnicodeEscapeAgainstUtf8(b, a)) {
143
0
      diffs = absl::StrFormat("%s != %s, [%d]%c(0x02%x, \\%03o) != [%d] %c(0x02%x, \\%03o)", all_a,
144
0
                              all_b, a.data() - all_a.data(), a[0], a[0], a[0],
145
0
                              b.data() - all_b.data(), b[0], b[0], b[0]);
146
0
      return false;
147
0
    }
148
43.1M
  }
149
483k
}
150
151
} // namespace TestUtil
152
} // namespace Json
153
} // namespace Envoy