/proc/self/cwd/test/common/json/utf8.cc

Source
#include "test/common/json/utf8.h"

namespace Envoy {
namespace Json {
namespace Utf8 {

// Decodes the single UTF-8 encoded code point that starts at bytes[0].
//
// @param bytes the encoded input; no index at or beyond `size` is read.
// @param size the number of readable bytes at `bytes`.
// @return {code point, number of bytes consumed} when a 1, 2, 3 or 4 byte
//         sequence is recognized. {0, 0} is returned when no form matches
//         (empty or truncated input, unrecognized lead byte, bad continuation
//         byte), when a multi-byte sequence is an overlong encoding, or when
//         a 4-byte sequence decodes to a value above 0x10ffff. Failure is
//         signalled by a consumed count of 0.
//
// NOTE(review): the 3-byte form is only checked against its 0x800 lower
// bound, so values in the UTF-16 surrogate range 0xD800-0xDFFF are not
// rejected here.
//
// The bit layout follows the table in the "Encoding" section of
// https://en.wikipedia.org/wiki/UTF-8.
//
// Alternatives that were considered: std::codecvt_utf8
// (https://en.cppreference.com/w/cpp/locale/codecvt_utf8) is deprecated, and
// the DFA decoder at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ may be a lot
// faster but is less readable and, as coded, looks like it would read past the
// end of malformed input. This implementation could still be tuned, e.g. by
// caching repeated loads from the bytes array or using SSE instructions.
UnicodeSizePair decode(const uint8_t* bytes, uint32_t size) {
  // True when bytes[index] carries the continuation-byte marker bits.
  const auto is_continuation = [bytes](uint32_t index) -> bool {
    return (bytes[index] & Utf8::ContinueMask) == Utf8::ContinuePattern;
  };

  // Shifts the bits gathered so far and appends the payload of the
  // continuation byte at bytes[index].
  const auto append_payload = [bytes](uint32_t gathered, uint32_t index) -> uint32_t {
    return (gathered << Utf8::Shift) | (bytes[index] & ~Utf8::ContinueMask);
  };

  const UnicodeSizePair invalid{0, 0};

  // Each guard tests `size` first so that short-circuit evaluation prevents
  // any read beyond the end of the input.

  // 1-byte form.
  if (size >= 1 && (bytes[0] & Utf8::Mask1Byte) == Utf8::Pattern1Byte) {
    const uint32_t code_point = bytes[0] & ~Utf8::Mask1Byte;
    return {code_point, 1};
  }

  // 2-byte form; values below 0x80 are overlong.
  if (size >= 2 && (bytes[0] & Utf8::Mask2Byte) == Utf8::Pattern2Byte && is_continuation(1)) {
    const uint32_t lead_bits = bytes[0] & ~Utf8::Mask2Byte;
    const uint32_t code_point = append_payload(lead_bits, 1);
    if (code_point < 0x80) {
      return invalid;
    }
    return {code_point, 2};
  }

  // 3-byte form; values below 0x800 are overlong.
  if (size >= 3 && (bytes[0] & Utf8::Mask3Byte) == Utf8::Pattern3Byte && is_continuation(1) &&
      is_continuation(2)) {
    const uint32_t lead_bits = bytes[0] & ~Utf8::Mask3Byte;
    const uint32_t code_point = append_payload(append_payload(lead_bits, 1), 2);
    if (code_point < 0x800) {
      return invalid;
    }
    return {code_point, 3};
  }

  // 4-byte form; values below 0x10000 are overlong.
  //
  // The earlier RFC2279 allowed UTF-8 encoding through code point 0x7FFFFFF,
  // but the current RFC3629 section 3 limits it to 0x10FFFF to match the
  // limits of UTF-16, so anything above that is rejected as well.
  if (size >= 4 && (bytes[0] & Utf8::Mask4Byte) == Utf8::Pattern4Byte && is_continuation(1) &&
      is_continuation(2) && is_continuation(3)) {
    const uint32_t lead_bits = bytes[0] & ~Utf8::Mask4Byte;
    const uint32_t code_point =
        append_payload(append_payload(append_payload(lead_bits, 1), 2), 3);
    if (code_point < 0x10000 || code_point > 0x10ffff) {
      return invalid;
    }
    return {code_point, 4};
  }

  // No encoding form matched.
  return invalid;
}

} // namespace Utf8
} // namespace Json
} // namespace Envoy

Line	Count	Source
1		#include "test/common/json/utf8.h"
2
3		namespace Envoy {
4		namespace Json {
5		namespace Utf8 {
6
7	12.8k	UnicodeSizePair decode(const uint8_t* bytes, uint32_t size) {
8	12.8k	uint32_t unicode = 0;
9	12.8k	uint32_t consumed = 0;
10
11		// See table in https://en.wikipedia.org/wiki/UTF-8, "Encoding" section.
12		//
13		// See also https://en.cppreference.com/w/cpp/locale/codecvt_utf8 which is
14		// marked as deprecated. There is also support in Windows libraries and Boost,
15		// which can be discovered on StackOverflow. I could not find a usable OSS
16		// implementation. However it's easily derived from the spec on Wikipedia.
17		//
18		// Note that the code below could be optimized a bit, e.g. by factoring out
19		// repeated lookups of the same index in the bytes array and using SSE
20		// instructions for the multi-word bit hacking.
21		//
22		// See also http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ which might be a lot
23		// faster, though less readable. As coded, though, it looks like it would read
24		// past the end of the input if the input is malformed.
25	12.8k	if (size >= 1 && (bytes[0] & Utf8::Mask1Byte) == Utf8::Pattern1Byte) {
26	1.29k	unicode = bytes[0] & ~Utf8::Mask1Byte;
27	1.29k	consumed = 1;
28	11.5k	} else if (size >= 2 && (bytes[0] & Utf8::Mask2Byte) == Utf8::Pattern2Byte &&
29	11.5k	(bytes[1] & Utf8::ContinueMask) == Utf8::ContinuePattern) {
30	1.06k	unicode = bytes[0] & ~Utf8::Mask2Byte;
31	1.06k	unicode = (unicode << Utf8::Shift) \| (bytes[1] & ~Utf8::ContinueMask);
32	1.06k	if (unicode < 0x80) {
33	26	return {0, 0};
34	26	}
35	1.03k	consumed = 2;
36	10.4k	} else if (size >= 3 && (bytes[0] & Utf8::Mask3Byte) == Utf8::Pattern3Byte &&
37	10.4k	(bytes[1] & Utf8::ContinueMask) == Utf8::ContinuePattern &&
38	10.4k	(bytes[2] & Utf8::ContinueMask) == Utf8::ContinuePattern) {
39	2.06k	unicode = bytes[0] & ~Utf8::Mask3Byte;
40	2.06k	unicode = (unicode << Utf8::Shift) \| (bytes[1] & ~Utf8::ContinueMask);
41	2.06k	unicode = (unicode << Utf8::Shift) \| (bytes[2] & ~Utf8::ContinueMask);
42	2.06k	if (unicode < 0x800) { // 3-byte starts at 0x800
43	596	return {0, 0};
44	596	}
45	1.46k	consumed = 3;
46	8.43k	} else if (size >= 4 && (bytes[0] & Utf8::Mask4Byte) == Utf8::Pattern4Byte &&
47	8.43k	(bytes[1] & Utf8::ContinueMask) == Utf8::ContinuePattern &&
48	8.43k	(bytes[2] & Utf8::ContinueMask) == Utf8::ContinuePattern &&
49	8.43k	(bytes[3] & Utf8::ContinueMask) == Utf8::ContinuePattern) {
50	1.72k	unicode = bytes[0] & ~Utf8::Mask4Byte;
51	1.72k	unicode = (unicode << Utf8::Shift) \| (bytes[1] & ~Utf8::ContinueMask);
52	1.72k	unicode = (unicode << Utf8::Shift) \| (bytes[2] & ~Utf8::ContinueMask);
53	1.72k	unicode = (unicode << Utf8::Shift) \| (bytes[3] & ~Utf8::ContinueMask);
54
55		// 4-byte starts at 0x10000
56		//
57		// Note from https://en.wikipedia.org/wiki/UTF-8:
58		// The earlier RFC2279 allowed UTF-8 encoding through code point 0x7FFFFFF.
59		// But the current RFC3629 section 3 limits UTF-8 encoding through code
60		// point 0x10FFFF, to match the limits of UTF-16.
61	1.72k	if (unicode < 0x10000 \|\| unicode > 0x10ffff) {
62	421	return {0, 0};
63	421	}
64	1.30k	consumed = 4;
65	1.30k	}
66	11.8k	return {unicode, consumed};
67	12.8k	}
68
69		} // namespace Utf8
70		} // namespace Json
71		} // namespace Envoy

Coverage Report

Created: 2024-09-19 09:45