/src/boost/boost/json/detail/utf8.hpp
Line | Count | Source (jump to first uncovered line) |
1 | | // |
2 | | // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com) |
3 | | // |
4 | | // Distributed under the Boost Software License, Version 1.0. (See accompanying |
5 | | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
6 | | // |
7 | | // Official repository: https://github.com/boostorg/json |
8 | | // |
9 | | |
10 | | #ifndef BOOST_JSON_DETAIL_UTF8_HPP |
11 | | #define BOOST_JSON_DETAIL_UTF8_HPP |
12 | | |
13 | | #include <boost/endian/conversion.hpp> |
14 | | #include <boost/json/detail/config.hpp> |
15 | | |
16 | | #include <cstddef> |
17 | | #include <cstring> |
18 | | #include <cstdint> |
19 | | |
20 | | namespace boost { |
21 | | namespace json { |
22 | | namespace detail { |
23 | | |
24 | | template<int N> |
25 | | std::uint32_t |
26 | | load_little_endian(void const* p) |
27 | 1.03M | { |
28 | 1.03M | std::uint32_t v = 0; |
29 | 1.03M | std::memcpy(&v, p, N); |
30 | 1.03M | endian::little_to_native_inplace(v); |
31 | 1.03M | return v; |
32 | 1.03M | } unsigned int boost::json::detail::load_little_endian<2>(void const*) Line | Count | Source | 27 | 99.0k | { | 28 | 99.0k | std::uint32_t v = 0; | 29 | 99.0k | std::memcpy(&v, p, N); | 30 | 99.0k | endian::little_to_native_inplace(v); | 31 | 99.0k | return v; | 32 | 99.0k | } |
unsigned int boost::json::detail::load_little_endian<3>(void const*) Line | Count | Source | 27 | 16.0k | { | 28 | 16.0k | std::uint32_t v = 0; | 29 | 16.0k | std::memcpy(&v, p, N); | 30 | 16.0k | endian::little_to_native_inplace(v); | 31 | 16.0k | return v; | 32 | 16.0k | } |
unsigned int boost::json::detail::load_little_endian<4>(void const*) Line | Count | Source | 27 | 917k | { | 28 | 917k | std::uint32_t v = 0; | 29 | 917k | std::memcpy(&v, p, N); | 30 | 917k | endian::little_to_native_inplace(v); | 31 | 917k | return v; | 32 | 917k | } |
|
33 | | |
/** Classify the lead byte of a multi-byte UTF-8 sequence.

    The result packs two fields:
    - low byte:  total length of the sequence in bytes (2, 3 or 4),
    - high byte: a class id that selects the permitted range of the
                 second byte (see the list inside the function).

    A result of 0 means the byte cannot start a multi-byte sequence.

    The table is indexed with `c & 0x7F`, i.e. it describes the bytes
    0x80..0xFF. A byte with the high bit clear aliases the entry of the
    corresponding high-bit byte, so callers are presumably expected to
    pass only bytes >= 0x80.

    @param c Candidate lead byte.
    @return  Packed class/length code, or 0 if `c` is not a valid lead.
*/
inline
std::uint16_t
classify_utf8(char c)
{
    // 0x000 = invalid
    // 0x102 = 2 bytes, second byte [80, BF]
    // 0x203 = 3 bytes, second byte [A0, BF]
    // 0x303 = 3 bytes, second byte [80, BF]
    // 0x403 = 3 bytes, second byte [80, 9F]
    // 0x504 = 4 bytes, second byte [90, BF]
    // 0x604 = 4 bytes, second byte [80, BF]
    // 0x704 = 4 bytes, second byte [80, 8F]
    static constexpr std::uint16_t first[128]
    {
        // 0x80..0xBF: continuation bytes, never a lead
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

        // 0xC0..0xDF: C0 and C1 are invalid, the rest lead 2 bytes
        0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        // 0xE0..0xEF: 3-byte leads (E0 and ED restrict the second byte)
        0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
        0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
        // 0xF0..0xFF: 4-byte leads F0..F4, everything above is invalid
        0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    };
    return first[static_cast<unsigned char>(c & 0x7F)];
}
68 | | |
69 | | inline |
70 | | bool |
71 | | is_valid_utf8(const char* p, uint16_t first) |
72 | 154k | { |
73 | 154k | uint32_t v; |
74 | 154k | switch(first >> 8) |
75 | 154k | { |
76 | 303 | default: |
77 | 303 | return false; |
78 | | |
79 | | // 2 bytes, second byte [80, BF] |
80 | 99.0k | case 1: |
81 | 99.0k | v = load_little_endian<2>(p); |
82 | 99.0k | return (v & 0xC000) == 0x8000; |
83 | | |
84 | | // 3 bytes, second byte [A0, BF] |
85 | 10.0k | case 2: |
86 | 10.0k | v = load_little_endian<3>(p); |
87 | 10.0k | return (v & 0xC0E000) == 0x80A000; |
88 | | |
89 | | // 3 bytes, second byte [80, BF] |
90 | 3.47k | case 3: |
91 | 3.47k | v = load_little_endian<3>(p); |
92 | 3.47k | return (v & 0xC0C000) == 0x808000; |
93 | | |
94 | | // 3 bytes, second byte [80, 9F] |
95 | 2.55k | case 4: |
96 | 2.55k | v = load_little_endian<3>(p); |
97 | 2.55k | return (v & 0xC0E000) == 0x808000; |
98 | | |
99 | | // 4 bytes, second byte [90, BF] |
100 | 20.7k | case 5: |
101 | 20.7k | v = load_little_endian<4>(p); |
102 | 20.7k | return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00; |
103 | | |
104 | | // 4 bytes, second byte [80, BF] |
105 | 17.1k | case 6: |
106 | 17.1k | v = load_little_endian<4>(p); |
107 | 17.1k | return (v & 0xC0C0C000) == 0x80808000; |
108 | | |
109 | | // 4 bytes, second byte [80, 8F] |
110 | 1.68k | case 7: |
111 | 1.68k | v = load_little_endian<4>(p); |
112 | 1.68k | return (v & 0xC0C0F000) == 0x80808000; |
113 | 154k | } |
114 | 154k | } |
115 | | |
116 | | class utf8_sequence |
117 | | { |
118 | | char seq_[4]; |
119 | | uint16_t first_; |
120 | | uint8_t size_; |
121 | | |
122 | | public: |
123 | | void |
124 | | save( |
125 | | const char* p, |
126 | | std::size_t remain) noexcept |
127 | 1.07k | { |
128 | 1.07k | first_ = classify_utf8(*p ); |
129 | 1.07k | if(remain >= length()) |
130 | 996 | size_ = length(); |
131 | 75 | else |
132 | 75 | size_ = static_cast<uint8_t>(remain); |
133 | 1.07k | std::memcpy(seq_, p, size_); |
134 | 1.07k | } |
135 | | |
136 | | uint8_t |
137 | | length() const noexcept |
138 | 3.18k | { |
139 | 3.18k | return first_ & 0xFF; |
140 | 3.18k | } |
141 | | |
142 | | bool |
143 | | complete() const noexcept |
144 | 1.07k | { |
145 | 1.07k | return size_ >= length(); |
146 | 1.07k | } |
147 | | |
148 | | // returns true if complete |
149 | | bool |
150 | | append( |
151 | | const char* p, |
152 | | std::size_t remain) noexcept |
153 | 16 | { |
154 | 16 | if(BOOST_JSON_UNLIKELY(needed() == 0)) |
155 | 0 | return true; |
156 | 16 | if(BOOST_JSON_LIKELY(remain >= needed())) |
157 | 0 | { |
158 | 0 | std::memcpy( |
159 | 0 | seq_ + size_, p, needed()); |
160 | 0 | size_ = length(); |
161 | 0 | return true; |
162 | 0 | } |
163 | 16 | if(BOOST_JSON_LIKELY(remain > 0)) |
164 | 0 | { |
165 | 0 | std::memcpy(seq_ + size_, p, remain); |
166 | 0 | size_ += static_cast<uint8_t>(remain); |
167 | 0 | } |
168 | 16 | return false; |
169 | 16 | } |
170 | | |
171 | | const char* |
172 | | data() const noexcept |
173 | 0 | { |
174 | 0 | return seq_; |
175 | 0 | } |
176 | | |
177 | | uint8_t |
178 | | needed() const noexcept |
179 | 48 | { |
180 | 48 | return length() - size_; |
181 | 48 | } |
182 | | |
183 | | bool |
184 | | valid() const noexcept |
185 | 0 | { |
186 | 0 | BOOST_ASSERT(size_ >= length()); |
187 | 0 | return is_valid_utf8(seq_, first_); |
188 | 0 | } |
189 | | }; |
190 | | |
191 | | } // detail |
192 | | } // namespace json |
193 | | } // namespace boost |
194 | | |
195 | | #endif |