/proc/self/cwd/test/common/json/utf8.cc
Line | Count | Source |
1 | | #include "test/common/json/utf8.h" |
2 | | |
3 | | namespace Envoy { |
4 | | namespace Json { |
5 | | namespace Utf8 { |
6 | | |
7 | 12.8k | UnicodeSizePair decode(const uint8_t* bytes, uint32_t size) { |
8 | 12.8k | uint32_t unicode = 0; |
9 | 12.8k | uint32_t consumed = 0; |
10 | | |
11 | | // See table in https://en.wikipedia.org/wiki/UTF-8, "Encoding" section. |
12 | | // |
13 | | // See also https://en.cppreference.com/w/cpp/locale/codecvt_utf8 which is |
14 | | // marked as deprecated. There is also support in Windows libraries and Boost, |
15 | | // which can be discovered on StackOverflow. I could not find a usable OSS |
16 | | // implementation. However it's easily derived from the spec on Wikipedia. |
17 | | // |
18 | | // Note that the code below could be optimized a bit, e.g. by factoring out |
19 | | // repeated lookups of the same index in the bytes array and using SSE |
20 | | // instructions for the multi-word bit hacking. |
21 | | // |
22 | | // See also http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ which might be a lot |
23 | | // faster, though less readable. As coded, though, it looks like it would read |
24 | | // past the end of the input if the input is malformed. |
25 | 12.8k | if (size >= 1 && (bytes[0] & Utf8::Mask1Byte) == Utf8::Pattern1Byte) { |
26 | 1.29k | unicode = bytes[0] & ~Utf8::Mask1Byte; |
27 | 1.29k | consumed = 1; |
28 | 11.5k | } else if (size >= 2 && (bytes[0] & Utf8::Mask2Byte) == Utf8::Pattern2Byte && |
29 | 11.5k | (bytes[1] & Utf8::ContinueMask) == Utf8::ContinuePattern) { |
30 | 1.06k | unicode = bytes[0] & ~Utf8::Mask2Byte; |
31 | 1.06k | unicode = (unicode << Utf8::Shift) | (bytes[1] & ~Utf8::ContinueMask); |
32 | 1.06k | if (unicode < 0x80) { |
33 | 26 | return {0, 0}; |
34 | 26 | } |
35 | 1.03k | consumed = 2; |
36 | 10.4k | } else if (size >= 3 && (bytes[0] & Utf8::Mask3Byte) == Utf8::Pattern3Byte && |
37 | 10.4k | (bytes[1] & Utf8::ContinueMask) == Utf8::ContinuePattern && |
38 | 10.4k | (bytes[2] & Utf8::ContinueMask) == Utf8::ContinuePattern) { |
39 | 2.06k | unicode = bytes[0] & ~Utf8::Mask3Byte; |
40 | 2.06k | unicode = (unicode << Utf8::Shift) | (bytes[1] & ~Utf8::ContinueMask); |
41 | 2.06k | unicode = (unicode << Utf8::Shift) | (bytes[2] & ~Utf8::ContinueMask); |
42 | 2.06k | if (unicode < 0x800) { // 3-byte starts at 0x800 |
43 | 596 | return {0, 0}; |
44 | 596 | } |
45 | 1.46k | consumed = 3; |
46 | 8.43k | } else if (size >= 4 && (bytes[0] & Utf8::Mask4Byte) == Utf8::Pattern4Byte && |
47 | 8.43k | (bytes[1] & Utf8::ContinueMask) == Utf8::ContinuePattern && |
48 | 8.43k | (bytes[2] & Utf8::ContinueMask) == Utf8::ContinuePattern && |
49 | 8.43k | (bytes[3] & Utf8::ContinueMask) == Utf8::ContinuePattern) { |
50 | 1.72k | unicode = bytes[0] & ~Utf8::Mask4Byte; |
51 | 1.72k | unicode = (unicode << Utf8::Shift) | (bytes[1] & ~Utf8::ContinueMask); |
52 | 1.72k | unicode = (unicode << Utf8::Shift) | (bytes[2] & ~Utf8::ContinueMask); |
53 | 1.72k | unicode = (unicode << Utf8::Shift) | (bytes[3] & ~Utf8::ContinueMask); |
54 | | |
55 | | // 4-byte starts at 0x10000 |
56 | | // |
57 | | // Note from https://en.wikipedia.org/wiki/UTF-8: |
58 | | // The earlier RFC2279 allowed UTF-8 encoding through code point 0x7FFFFFF. |
59 | | // But the current RFC3629 section 3 limits UTF-8 encoding through code |
60 | | // point 0x10FFFF, to match the limits of UTF-16. |
61 | 1.72k | if (unicode < 0x10000 || unicode > 0x10ffff) { |
62 | 421 | return {0, 0}; |
63 | 421 | } |
64 | 1.30k | consumed = 4; |
65 | 1.30k | } |
66 | 11.8k | return {unicode, consumed}; |
67 | 12.8k | } |
68 | | |
69 | | } // namespace Utf8 |
70 | | } // namespace Json |
71 | | } // namespace Envoy |