Coverage Report

Created: 2024-09-19 09:45

/proc/self/cwd/test/common/json/utf8.cc
Line
Count
Source
1
#include "test/common/json/utf8.h"
2
3
namespace Envoy {
4
namespace Json {
5
namespace Utf8 {
6
7
12.8k
/**
 * Decodes a single UTF-8 encoded code point from the front of a byte buffer.
 *
 * The encoding forms follow the table in the "Encoding" section of
 * https://en.wikipedia.org/wiki/UTF-8. Alternatives that were considered:
 * https://en.cppreference.com/w/cpp/locale/codecvt_utf8 (deprecated), and the
 * DFA decoder at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which may be
 * faster but is harder to read and, as published, appears to read past the end
 * of malformed input.
 *
 * Each form is only attempted when `size` is large enough for it, and the size
 * test is evaluated before any byte of that form is read, so no byte at index
 * >= size is ever accessed.
 *
 * Over-long encodings (a value that would have fit in a shorter form) are
 * rejected, as are 4-byte values above 0x10ffff. Note that the 3-byte form is
 * only checked against its lower bound (0x800); no check for the UTF-16
 * surrogate range is performed here.
 *
 * @param bytes pointer to the first byte to decode.
 * @param size the number of readable bytes at `bytes`.
 * @return {code point, number of bytes consumed} on success, or {0, 0} if the
 *         input does not start with an accepted encoding.
 */
UnicodeSizePair decode(const uint8_t* bytes, uint32_t size) {
  // Reports whether the byte at `index` carries the continuation-byte marker.
  const auto is_continuation = [bytes](uint32_t index) -> bool {
    return (bytes[index] & Utf8::ContinueMask) == Utf8::ContinuePattern;
  };

  // Shifts `code_point` left to make room, then appends the payload bits of
  // the continuation byte at `index`.
  const auto append_payload = [bytes](uint32_t code_point, uint32_t index) -> uint32_t {
    return (code_point << Utf8::Shift) | (bytes[index] & ~Utf8::ContinueMask);
  };

  // 1-byte form: the payload is the lead byte with its marker bits cleared.
  if (size >= 1 && (bytes[0] & Utf8::Mask1Byte) == Utf8::Pattern1Byte) {
    const uint32_t code_point = bytes[0] & ~Utf8::Mask1Byte;
    return {code_point, 1};
  }

  // 2-byte form.
  if (size >= 2 && (bytes[0] & Utf8::Mask2Byte) == Utf8::Pattern2Byte && is_continuation(1)) {
    const uint32_t code_point = append_payload(bytes[0] & ~Utf8::Mask2Byte, 1);
    if (code_point < 0x80) { // 2-byte starts at 0x80; anything lower is over-long.
      return {0, 0};
    }
    return {code_point, 2};
  }

  // 3-byte form.
  if (size >= 3 && (bytes[0] & Utf8::Mask3Byte) == Utf8::Pattern3Byte && is_continuation(1) &&
      is_continuation(2)) {
    uint32_t code_point = append_payload(bytes[0] & ~Utf8::Mask3Byte, 1);
    code_point = append_payload(code_point, 2);
    if (code_point < 0x800) { // 3-byte starts at 0x800; anything lower is over-long.
      return {0, 0};
    }
    return {code_point, 3};
  }

  // 4-byte form.
  if (size >= 4 && (bytes[0] & Utf8::Mask4Byte) == Utf8::Pattern4Byte && is_continuation(1) &&
      is_continuation(2) && is_continuation(3)) {
    uint32_t code_point = append_payload(bytes[0] & ~Utf8::Mask4Byte, 1);
    code_point = append_payload(code_point, 2);
    code_point = append_payload(code_point, 3);

    // 4-byte starts at 0x10000.
    //
    // The earlier RFC2279 allowed UTF-8 encoding through code point 0x7FFFFFF,
    // but the current RFC3629 section 3 limits UTF-8 encoding through code
    // point 0x10FFFF, to match the limits of UTF-16.
    if (code_point < 0x10000 || code_point > 0x10ffff) {
      return {0, 0};
    }
    return {code_point, 4};
  }

  // Empty input, truncated sequence, or a lead/continuation byte that matches
  // none of the accepted forms.
  return {0, 0};
}
68
69
} // namespace Utf8
70
} // namespace Json
71
} // namespace Envoy