/proc/self/cwd/source/common/json/json_sanitizer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include "source/common/json/json_sanitizer.h" |
2 | | |
3 | | #include "source/common/common/assert.h" |
4 | | #include "source/common/common/thread.h" |
5 | | #include "source/common/json/json_internal.h" |
6 | | |
7 | | #include "absl/strings/str_format.h" |
8 | | #include "utf8_validity.h" |
9 | | |
10 | | namespace Envoy { |
11 | | namespace Json { |
12 | | |
13 | | // clang-format off |
14 | | // SPELLCHECKER(off) |
15 | | // Lookup table indexed by unsigned byte value: 1 if the byte forces the slow |
16 | | // sanitizer path, 0 if it can be passed through. Performance benchmarks show |
17 | | // this is slightly faster as an array of uint32_t rather than an array of bool. |
18 | | static constexpr uint32_t needs_slow_sanitizer[256] = { |
19 | | // Control-characters 0-31 all require escapes. |
20 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
21 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
22 | | |
23 | | // Pass through printable characters starting with space. Double-quote and |
24 | | // backslash require an escape. |
25 | | 0, 0, 1 /* " */, 0, 0, 0, 0, 0, // !"#$%&' |
26 | | 0, 0, 0, 0, 0, 0, 0, 0, // ()*+,-./ |
27 | | 0, 0, 0, 0, 0, 0, 0, 0, // 01234567 |
28 | | 0, 0, 0, 0, 0, 0, 0, 0, // 89:;<=>? |
29 | | 0, 0, 0, 0, 0, 0, 0, 0, // @ABCDEFG |
30 | | 0, 0, 0, 0, 0, 0, 0, 0, // HIJKLMNO |
31 | | 0, 0, 0, 0, 0, 0, 0, 0, // PQRSTUVW |
32 | | 0, 0, 0, 0, 1 /* backslash */, 0, 0, 0, // XYZ[\]^_ |
33 | | 0, 0, 0, 0, 0, 0, 0, 0, // `abcdefg |
34 | | 0, 0, 0, 0, 0, 0, 0, 0, // hijklmno |
35 | | 0, 0, 0, 0, 0, 0, 0, 0, // pqrstuvw |
36 | | 0, 0, 0, 0, 0, 0, 0, 1, // xyz{|}~\177 |
37 | | |
38 | | // 0x80-0xff, all of which require calling the slow sanitizer. |
39 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
40 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
41 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
42 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
43 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
44 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
45 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
46 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
47 | | }; |
48 | | // SPELLCHECKER(on) |
49 | | // clang-format on |
50 | | |
51 | 492k | absl::string_view sanitize(std::string& buffer, absl::string_view str) { |
52 | | // Fast-path to see whether any escapes or utf-encoding are needed. If str has |
53 | | // only unescaped ascii characters, we can simply return it. |
54 | | // |
55 | | // Benchmarks show it's faster to just rip through the string with no |
56 | | // conditionals, so we only check the arithmetically ORed condition after the |
57 | | // loop. This avoids branches and allows simpler loop unrolling by the |
58 | | // compiler. |
59 | 492k | static_assert(ARRAY_SIZE(needs_slow_sanitizer) == 256); |
60 | 492k | uint32_t need_slow = 0; |
61 | 38.8M | for (char c : str) { |
62 | | // We need to escape control characters, characters >= 127, and double-quote |
63 | | // and backslash. |
64 | 38.8M | need_slow |= needs_slow_sanitizer[static_cast<uint8_t>(c)]; |
65 | 38.8M | } |
66 | 492k | if (need_slow == 0) { |
67 | 481k | return str; // Fast path, should be executed most of the time. |
68 | 481k | } |
69 | 10.7k | if (utf8_range::IsStructurallyValid(str)) { |
70 | | // The Nlohmann JSON library supports serialization and is not too slow. A |
71 | | // hand-rolled sanitizer can be a little over 2x faster at the cost of added |
72 | | // production complexity. The main drawback is that this code cannot be used |
73 | | // in the data plane as it throws exceptions. Should this become an issue, |
74 | | // #20428 can be revived which is faster and doesn't throw exceptions, but |
75 | | // adds complexity to the production code base. |
76 | 2.77k | buffer = Nlohmann::Factory::serialize(str); |
77 | 2.77k | return stripDoubleQuotes(buffer); |
78 | 7.96k | } else { |
79 | | // If Nlohmann throws an error, emit a hex escape for any character |
80 | | // requiring it. This can occur for invalid utf-8 sequences, and we don't |
81 | | // want to crash the server if such a sequence makes its way into a string |
82 | | // we need to serialize. For example, if admin endpoint /stats?format=json |
83 | | // is called, and a stat name was synthesized from dynamic content such as a |
84 | | // gRPC method. |
85 | | // |
86 | | // Note that JSON string escapes are always 4 digit hex. 3 digit octal would |
87 | | // be more compact, and is legal JavaScript, but not legal JSON. See |
88 | | // https://www.json.org/json-en.html for details. |
89 | | // |
90 | | // TODO(jmarantz): It would better to use the compact JSON escapes for |
91 | | // quotes, slashes, backspace, form-feed, linefeed, CR, and tab, in which |
92 | | // case we'd also need to modify jsonEquivalentStrings in |
93 | | // test/common/json/json_sanitizer_test_util.h. We don't expect to hit this |
94 | | // often, so it isn't a priority to use these more compact encodings. |
95 | 7.96k | buffer.clear(); |
96 | 15.5M | for (char c : str) { |
97 | 15.5M | if (needs_slow_sanitizer[static_cast<uint8_t>(c)]) { |
98 | 15.0M | buffer.append(absl::StrFormat("\\u%04x", c)); |
99 | 15.0M | } else { |
100 | 489k | buffer.append(1, c); |
101 | 489k | } |
102 | 15.5M | } |
103 | 7.96k | } |
104 | | |
105 | 7.96k | return buffer; |
106 | 10.7k | } |
107 | | |
108 | 486k | absl::string_view stripDoubleQuotes(absl::string_view str) { |
109 | 486k | if (str.size() >= 2 && str[0] == '"' && str[str.size() - 1] == '"') { |
110 | 486k | str = str.substr(1, str.size() - 2); |
111 | 486k | } else { |
112 | 0 | ASSERT(false, |
113 | 0 | absl::StrCat("stripDoubleQuotes called on a str that lacks double-quotes: ", str)); |
114 | 0 | } |
115 | 486k | return str; |
116 | 486k | } |
117 | | |
118 | | } // namespace Json |
119 | | } // namespace Envoy |