Coverage Report

Created: 2024-09-19 09:45

/proc/self/cwd/source/common/json/json_sanitizer.cc
Line
Count
Source (jump to first uncovered line)
1
#include "source/common/json/json_sanitizer.h"
2
3
#include "source/common/common/assert.h"
4
#include "source/common/common/thread.h"
5
#include "source/common/json/json_internal.h"
6
7
#include "absl/strings/str_format.h"
8
#include "utf8_validity.h"
9
10
namespace Envoy {
11
namespace Json {
12
13
// clang-format off
14
// SPELLCHECKER(off)
15
// Byte-indexed table: nonzero means a string containing that byte must take the slow path.
16
// Performance benchmarks show this is slightly faster as an array of uint32_t
17
// rather than an array of bool.
18
static constexpr uint32_t needs_slow_sanitizer[256] = {
19
  // Control-characters 0-31 all require escapes.
20
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22
23
  // Pass through printable characters starting with space. Double-quote and
24
  // backslash require an escape.
25
  0, 0, 1 /* " */, 0, 0, 0, 0, 0,         //  !"#$%&'
26
  0, 0, 0, 0, 0, 0, 0, 0,                 // ()*+,-./
27
  0, 0, 0, 0, 0, 0, 0, 0,                 // 01234567
28
  0, 0, 0, 0, 0, 0, 0, 0,                 // 89:;<=>?
29
  0, 0, 0, 0, 0, 0, 0, 0,                 // @ABCDEFG
30
  0, 0, 0, 0, 0, 0, 0, 0,                 // HIJKLMNO
31
  0, 0, 0, 0, 0, 0, 0, 0,                 // PQRSTUVW
32
  0, 0, 0, 0, 1 /* backslash */, 0, 0, 0, // XYZ[\]^_
33
  0, 0, 0, 0, 0, 0, 0, 0,                 // `abcdefg
34
  0, 0, 0, 0, 0, 0, 0, 0,                 // hijklmno
35
  0, 0, 0, 0, 0, 0, 0, 0,                 // pqrstuvw
36
  0, 0, 0, 0, 0, 0, 0, 1,                 // xyz{|}~ and DEL (\177), which is flagged
37
38
  // 0x80-0xff, all of which require calling the slow sanitizer.
39
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
43
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
44
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
45
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
46
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
47
};
48
// SPELLCHECKER(on)
49
// clang-format on
50
51
492k
// Returns a version of str that can be placed between the double-quotes of a JSON string.
// If no byte of str is flagged in needs_slow_sanitizer, str itself is returned and buffer is
// left untouched. Otherwise the result is built in buffer and the returned view refers to
// buffer's contents.
absl::string_view sanitize(std::string& buffer, absl::string_view str) {
52
  // Fast-path to see whether any escapes or utf-encoding are needed. If str has
53
  // only unescaped ascii characters, we can simply return it.
54
  //
55
  // Benchmarks show it's faster to just rip through the string with no
56
  // conditionals, so we only check the arithmetically ORed condition after the
57
  // loop. This avoids branches and allows simpler loop unrolling by the
58
  // compiler.
59
492k
  static_assert(ARRAY_SIZE(needs_slow_sanitizer) == 256);
60
492k
  uint32_t need_slow = 0;
61
38.8M
  for (char c : str) {
62
    // We need to escape control characters, characters >= 127, and double-quote
63
    // and backslash. The uint8_t cast keeps the table index in the range 0-255.
64
38.8M
    need_slow |= needs_slow_sanitizer[static_cast<uint8_t>(c)];
65
38.8M
  }
66
492k
  if (need_slow == 0) {
67
481k
    return str; // Fast path, should be executed most of the time.
68
481k
  }
69
10.7k
  if (utf8_range::IsStructurallyValid(str)) {
70
    // The Nlohmann JSON library supports serialization and is not too slow. A
71
    // hand-rolled sanitizer can be a little over 2x faster at the cost of added
72
    // production complexity. The main drawback is that this code cannot be used
73
    // in the data plane as it throws exceptions. Should this become an issue,
74
    // #20428 can be revived which is faster and doesn't throw exceptions, but
75
    // adds complexity to the production code base.
76
2.77k
    buffer = Nlohmann::Factory::serialize(str);
77
2.77k
    return stripDoubleQuotes(buffer);
78
7.96k
  } else {
79
    // For input that is not structurally valid UTF-8, emit a hex escape for any character
80
    // requiring it. This can occur for invalid utf-8 sequences, and we don't
81
    // want to crash the server if such a sequence makes its way into a string
82
    // we need to serialize. For example, if admin endpoint /stats?format=json
83
    // is called, and a stat name was synthesized from dynamic content such as a
84
    // gRPC method.
85
    //
86
    // Note that JSON string escapes are always 4 digit hex. 3 digit octal would
87
    // be more compact, and is legal JavaScript, but not legal JSON. See
88
    // https://www.json.org/json-en.html for details.
89
    //
90
    // TODO(jmarantz): It would be better to use the compact JSON escapes for
91
    // quotes, slashes, backspace, form-feed, linefeed, CR, and tab, in which
92
    // case we'd also need to modify jsonEquivalentStrings in
93
    // test/common/json/json_sanitizer_test_util.h. We don't expect to hit this
94
    // often, so it isn't a priority to use these more compact encodings.
95
7.96k
    buffer.clear();
96
15.5M
    for (char c : str) {
97
15.5M
      if (needs_slow_sanitizer[static_cast<uint8_t>(c)]) {
98
15.0M
        buffer.append(absl::StrFormat("\\u%04x", c));
99
15.0M
      } else {
100
489k
        buffer.append(1, c);
101
489k
      }
102
15.5M
    }
103
7.96k
  }
104
105
7.96k
  return buffer;
106
10.7k
}
107
108
486k
// Returns str without its first and last characters when str has at least two characters and
// both of those are double-quotes. Any other input trips the ASSERT below; if execution
// continues past it (presumably when ASSERT is compiled out -- confirm), str is returned
// unchanged.
absl::string_view stripDoubleQuotes(absl::string_view str) {
109
486k
  if (str.size() >= 2 && str[0] == '"' && str[str.size() - 1] == '"') {
110
486k
    str = str.substr(1, str.size() - 2);
111
486k
  } else {
112
0
    ASSERT(false,
113
0
           absl::StrCat("stripDoubleQuotes called on a str that lacks double-quotes: ", str));
114
0
  }
115
486k
  return str;
116
486k
}
117
118
} // namespace Json
119
} // namespace Envoy