1
#include "source/common/json/json_sanitizer.h"
2

            
3
#include "source/common/common/assert.h"
4
#include "source/common/common/thread.h"
5
#include "source/common/json/json_internal.h"
6

            
7
#include "absl/strings/str_format.h"
8
#include "third_party/utf8_range/utf8_validity.h"
9

            
10
namespace Envoy {
11
namespace Json {
12

            
13
// clang-format off
// SPELLCHECKER(off)
//
// Lookup table indexed by byte value (0-255). A non-zero entry marks a byte
// that cannot be copied verbatim into a JSON string, so its presence forces
// the slow sanitizer path; a zero entry marks a byte that passes through
// unchanged.
//
// Performance benchmarks show this is slightly faster as an array of uint32_t
// rather than an array of bool.
static constexpr uint32_t needs_slow_sanitizer[256] = {
  // Control-characters 0-31 all require escapes.
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  // Pass through printable characters starting with space. Double-quote and
  // backslash require an escape. Each row covers 8 consecutive byte values; the
  // trailing comment lists the characters for that row.
  0, 0, 1 /* " */, 0, 0, 0, 0, 0,         //  !"#$%&'
  0, 0, 0, 0, 0, 0, 0, 0,                 // ()*+,-./
  0, 0, 0, 0, 0, 0, 0, 0,                 // 01234567
  0, 0, 0, 0, 0, 0, 0, 0,                 // 89:;<=>?
  0, 0, 0, 0, 0, 0, 0, 0,                 // @ABCDEFG
  0, 0, 0, 0, 0, 0, 0, 0,                 // HIJKLMNO
  0, 0, 0, 0, 0, 0, 0, 0,                 // PQRSTUVW
  0, 0, 0, 0, 1 /* backslash */, 0, 0, 0, // XYZ[\]^_
  0, 0, 0, 0, 0, 0, 0, 0,                 // `abcdefg
  0, 0, 0, 0, 0, 0, 0, 0,                 // hijklmno
  0, 0, 0, 0, 0, 0, 0, 0,                 // pqrstuvw
  0, 0, 0, 0, 0, 0, 0, 1,                 // xyz{|}~\177

  // 0x80-0xff, all of which require calling the slow sanitizer.
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
// SPELLCHECKER(on)
// clang-format on
50

            
51
4268566
// Returns a view of `str` with every byte flagged in needs_slow_sanitizer
// escaped so the result can be placed inside a JSON string.
//
// When nothing needs escaping the returned view is `str` itself and `buffer`
// is left untouched. Otherwise the escaped text is written into `buffer` and
// the returned view refers to `buffer`'s contents.
absl::string_view sanitize(std::string& buffer, absl::string_view str) {
  // Fast-path to see whether any escapes or utf-encoding are needed. If str has
  // only unescaped ascii characters, we can simply return it.
  //
  // Benchmarks show it's faster to just rip through the string with no
  // conditionals, so we only check the arithmetically ORed condition after the
  // loop. This avoids branches and allows simpler loop unrolling by the
  // compiler.
  static_assert(ARRAY_SIZE(needs_slow_sanitizer) == 256);
  uint32_t need_slow = 0;
  for (char c : str) {
    // We need to escape control characters, characters >= 127, and double-quote
    // and backslash.
    need_slow |= needs_slow_sanitizer[static_cast<uint8_t>(c)];
  }
  if (need_slow == 0) {
    return str; // Fast path, should be executed most of the time.
  }

  if (utf8_range::IsStructurallyValid(str)) {
    // The Nlohmann JSON library supports serialization and is not too slow. A
    // hand-rolled sanitizer can be a little over 2x faster at the cost of added
    // production complexity. The main drawback is that this code cannot be used
    // in the data plane as it throws exceptions. Should this become an issue,
    // #20428 can be revived which is faster and doesn't throw exceptions, but
    // adds complexity to the production code base.
    buffer = Nlohmann::Factory::serialize(str);
    return stripDoubleQuotes(buffer);
  }

  // str is not structurally valid utf-8, so Nlohmann is bypassed and a hex
  // escape is emitted for every byte flagged in needs_slow_sanitizer. We don't
  // want to crash the server if such a sequence makes its way into a string we
  // need to serialize. For example, if admin endpoint /stats?format=json is
  // called, and a stat name was synthesized from dynamic content such as a gRPC
  // method.
  //
  // Note that JSON string escapes are always 4 digit hex. 3 digit octal would
  // be more compact, and is legal JavaScript, but not legal JSON. See
  // https://www.json.org/json-en.html for details.
  //
  // TODO(jmarantz): It would be better to use the compact JSON escapes for
  // quotes, slashes, backspace, form-feed, linefeed, CR, and tab, in which
  // case we'd also need to modify jsonEquivalentStrings in
  // test/common/json/json_sanitizer_test_util.h. We don't expect to hit this
  // often, so it isn't a priority to use these more compact encodings.
  buffer.clear();
  for (char c : str) {
    // Work with the unsigned byte value so both the table index and the
    // formatted escape stay in the range 0x00-0xff.
    const uint8_t byte = static_cast<uint8_t>(c);
    if (needs_slow_sanitizer[byte]) {
      // Append in place rather than formatting into a temporary string first.
      absl::StrAppendFormat(&buffer, "\\u%04x", byte);
    } else {
      buffer.push_back(c);
    }
  }
  return buffer;
}
107

            
108
2224007
// Returns `str` without its enclosing pair of double-quotes. If `str` is not
// wrapped in double-quotes, an ASSERT is raised and `str` is returned as-is.
absl::string_view stripDoubleQuotes(absl::string_view str) {
  const bool is_quoted = str.size() >= 2 && str.front() == '"' && str.back() == '"';
  if (!is_quoted) {
    ASSERT(false,
           absl::StrCat("stripDoubleQuotes called on a str that lacks double-quotes: ", str));
    return str;
  }

  // Drop one character from each end, leaving only the text between the quotes.
  str.remove_prefix(1);
  str.remove_suffix(1);
  return str;
}
117

            
118
} // namespace Json
119
} // namespace Envoy