/proc/self/cwd/internal/utf8.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2021 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include "internal/utf8.h" |
16 | | |
17 | | #include <algorithm> |
18 | | #include <cstdint> |
19 | | #include <string> |
20 | | |
21 | | #include "absl/base/macros.h" |
22 | | #include "absl/base/optimization.h" |
23 | | #include "internal/unicode.h" |
24 | | |
25 | | // Implementation is based on |
26 | | // https://go.googlesource.com/go/+/refs/heads/master/src/unicode/utf8/utf8.go |
27 | | // but adapted for C++. |
28 | | |
29 | | namespace cel::internal { |
30 | | |
31 | | namespace { |
32 | | |
// Bytes below this value are single-byte (ASCII) code points.
constexpr uint8_t kUtf8RuneSelf{0x80};
// Maximum number of bytes in one encoded code point; also the minimum
// capacity reserved for CordReader's scratch buffer.
constexpr size_t kUtf8Max{4};

// Inclusive bounds of an ordinary continuation byte (10xxxxxx).
constexpr uint8_t kLow{0x80};
constexpr uint8_t kHigh{0xbf};

// Payload masks: kMaskX for continuation bytes, kMask2/3/4 for the leading
// byte of a 2-, 3- and 4-byte sequence respectively.
constexpr uint8_t kMaskX{0x3f};
constexpr uint8_t kMask2{0x1f};
constexpr uint8_t kMask3{0xf};
constexpr uint8_t kMask4{0x7};

// Tag bits OR-ed into encoded bytes: kTX for continuation bytes, kT2/3/4 for
// the leading byte of a 2-, 3- and 4-byte sequence respectively.
constexpr uint8_t kTX{0x80};
constexpr uint8_t kT2{0xc0};
constexpr uint8_t kT3{0xe0};
constexpr uint8_t kT4{0xf0};

// Entries of kLeading. The low three bits hold the total length of the
// sequence in bytes (decoders compute `(value & 7) - 1` continuation bytes) and
// the high nibble is the index into kAccept that bounds the first continuation
// byte.
constexpr uint8_t kXX{0xf1};  // Invalid leading byte.
constexpr uint8_t kAS{0xf0};  // ASCII; never consulted, ASCII is handled first.
constexpr uint8_t kS1{0x02};  // kAccept[0], 2 bytes.
constexpr uint8_t kS2{0x13};  // kAccept[1], 3 bytes.
constexpr uint8_t kS3{0x03};  // kAccept[0], 3 bytes.
constexpr uint8_t kS4{0x23};  // kAccept[2], 3 bytes.
constexpr uint8_t kS5{0x34};  // kAccept[3], 4 bytes.
constexpr uint8_t kS6{0x04};  // kAccept[0], 4 bytes.
constexpr uint8_t kS7{0x44};  // kAccept[4], 4 bytes.
58 | | |
59 | | // NOLINTBEGIN |
60 | | // clang-format off |
61 | | constexpr uint8_t kLeading[256] = { |
62 | | // 1 2 3 4 5 6 7 8 9 A B C D E F |
63 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x00-0x0F |
64 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x10-0x1F |
65 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x20-0x2F |
66 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x30-0x3F |
67 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x40-0x4F |
68 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x50-0x5F |
69 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x60-0x6F |
70 | | kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x70-0x7F |
71 | | // 1 2 3 4 5 6 7 8 9 A B C D E F |
72 | | kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0x80-0x8F |
73 | | kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0x90-0x9F |
74 | | kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xA0-0xAF |
75 | | kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xB0-0xBF |
76 | | kXX, kXX, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, // 0xC0-0xCF |
77 | | kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, // 0xD0-0xDF |
78 | | kS2, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS4, kS3, kS3, // 0xE0-0xEF |
79 | | kS5, kS6, kS6, kS6, kS7, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xF0-0xFF |
80 | | }; |
81 | | // clang-format on |
82 | | // NOLINTEND |
83 | | |
84 | | constexpr std::pair<uint8_t, uint8_t> kAccept[16] = { |
85 | | {kLow, kHigh}, {0xa0, kHigh}, {kLow, 0x9f}, {0x90, kHigh}, |
86 | | {kLow, 0x8f}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, |
87 | | {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, |
88 | | {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, |
89 | | }; |
90 | | |
91 | | class StringReader final { |
92 | | public: |
93 | 495k | constexpr explicit StringReader(absl::string_view input) : input_(input) {} |
94 | | |
95 | 6.64k | size_t Remaining() const { return input_.size(); } |
96 | | |
97 | 5.35M | bool HasRemaining() const { return !input_.empty(); } |
98 | | |
99 | 6.64k | absl::string_view Peek(size_t n) { |
100 | 6.64k | ABSL_ASSERT(n <= Remaining()); |
101 | 6.64k | return input_.substr(0, n); |
102 | 6.64k | } |
103 | | |
104 | 4.86M | char Read() { |
105 | 4.86M | ABSL_ASSERT(HasRemaining()); |
106 | 4.86M | char value = input_.front(); |
107 | 4.86M | input_.remove_prefix(1); |
108 | 4.86M | return value; |
109 | 4.86M | } |
110 | | |
111 | 6.64k | void Advance(size_t n) { |
112 | 6.64k | ABSL_ASSERT(n <= Remaining()); |
113 | 6.64k | input_.remove_prefix(n); |
114 | 6.64k | } |
115 | | |
116 | 0 | void Reset(absl::string_view input) { input_ = input; } |
117 | | |
118 | | private: |
119 | | absl::string_view input_; |
120 | | }; |
121 | | |
122 | | class CordReader final { |
123 | | public: |
124 | | explicit CordReader(const absl::Cord& input) |
125 | 0 | : input_(input), size_(input_.size()), buffer_(), index_(0) {} |
126 | | |
127 | 0 | size_t Remaining() const { return size_; } |
128 | | |
129 | 0 | bool HasRemaining() const { return size_ != 0; } |
130 | | |
131 | 0 | absl::string_view Peek(size_t n) { |
132 | 0 | ABSL_ASSERT(n <= Remaining()); |
133 | 0 | if (n == 0) { |
134 | 0 | return absl::string_view(); |
135 | 0 | } |
136 | 0 | if (n <= buffer_.size() - index_) { |
137 | | // Enough data remaining in temporary buffer. |
138 | 0 | return absl::string_view(buffer_.data() + index_, n); |
139 | 0 | } |
140 | | // We do not have enough data. See if we can fit it without allocating by |
141 | | // shifting data back to the beginning of the buffer. |
142 | 0 | if (buffer_.capacity() >= n) { |
143 | | // It will fit in the current capacity, see if we need to shift the |
144 | | // existing data to make it fit. |
145 | 0 | if (buffer_.capacity() - buffer_.size() < n && index_ != 0) { |
146 | | // We need to shift. |
147 | 0 | buffer_.erase(buffer_.begin(), buffer_.begin() + index_); |
148 | 0 | index_ = 0; |
149 | 0 | } |
150 | 0 | } |
151 | | // Ensure we never reserve less than kUtf8Max. |
152 | 0 | buffer_.reserve(std::max(buffer_.size() + n, kUtf8Max)); |
153 | 0 | size_t to_copy = n - (buffer_.size() - index_); |
154 | 0 | absl::CopyCordToString(input_.Subcord(0, to_copy), &buffer_); |
155 | 0 | input_.RemovePrefix(to_copy); |
156 | 0 | return absl::string_view(buffer_.data() + index_, n); |
157 | 0 | } |
158 | | |
159 | 0 | char Read() { |
160 | 0 | char value = Peek(1).front(); |
161 | 0 | Advance(1); |
162 | 0 | return value; |
163 | 0 | } |
164 | | |
165 | 0 | void Advance(size_t n) { |
166 | 0 | ABSL_ASSERT(n <= Remaining()); |
167 | 0 | if (n == 0) { |
168 | 0 | return; |
169 | 0 | } |
170 | 0 | if (index_ < buffer_.size()) { |
171 | 0 | size_t count = std::min(n, buffer_.size() - index_); |
172 | 0 | index_ += count; |
173 | 0 | n -= count; |
174 | 0 | size_ -= count; |
175 | 0 | if (index_ < buffer_.size()) { |
176 | 0 | return; |
177 | 0 | } |
178 | | // Temporary buffer is empty, clear it. |
179 | 0 | buffer_.clear(); |
180 | 0 | index_ = 0; |
181 | 0 | } |
182 | 0 | input_.RemovePrefix(n); |
183 | 0 | size_ -= n; |
184 | 0 | } |
185 | | |
186 | 0 | void Reset(const absl::Cord& input) { |
187 | 0 | input_ = input; |
188 | 0 | size_ = input_.size(); |
189 | 0 | buffer_.clear(); |
190 | 0 | index_ = 0; |
191 | 0 | } |
192 | | |
193 | | private: |
194 | | absl::Cord input_; |
195 | | size_t size_; |
196 | | std::string buffer_; |
197 | | size_t index_; |
198 | | }; |
199 | | |
200 | | template <typename BufferedByteReader> |
201 | 495k | bool Utf8IsValidImpl(BufferedByteReader* reader) { |
202 | 5.35M | while (reader->HasRemaining()) { |
203 | 4.86M | const auto b = static_cast<uint8_t>(reader->Read()); |
204 | 4.86M | if (b < kUtf8RuneSelf) { |
205 | 4.85M | continue; |
206 | 4.85M | } |
207 | 6.64k | const auto leading = kLeading[b]; |
208 | 6.64k | if (leading == kXX) { |
209 | 0 | return false; |
210 | 0 | } |
211 | 6.64k | const auto size = static_cast<size_t>(leading & 7) - 1; |
212 | 6.64k | if (size > reader->Remaining()) { |
213 | 0 | return false; |
214 | 0 | } |
215 | 6.64k | const absl::string_view segment = reader->Peek(size); |
216 | 6.64k | const auto& accept = kAccept[leading >> 4]; |
217 | 6.64k | if (static_cast<uint8_t>(segment[0]) < accept.first || |
218 | 6.64k | static_cast<uint8_t>(segment[0]) > accept.second) { |
219 | 0 | return false; |
220 | 6.64k | } else if (size == 1) { |
221 | 5.16k | } else if (static_cast<uint8_t>(segment[1]) < kLow || |
222 | 5.16k | static_cast<uint8_t>(segment[1]) > kHigh) { |
223 | 0 | return false; |
224 | 5.16k | } else if (size == 2) { |
225 | 3.78k | } else if (static_cast<uint8_t>(segment[2]) < kLow || |
226 | 1.38k | static_cast<uint8_t>(segment[2]) > kHigh) { |
227 | 0 | return false; |
228 | 0 | } |
229 | 6.64k | reader->Advance(size); |
230 | 6.64k | } |
231 | 495k | return true; |
232 | 495k | } utf8.cc:bool cel::internal::(anonymous namespace)::Utf8IsValidImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*) Line | Count | Source | 201 | 495k | bool Utf8IsValidImpl(BufferedByteReader* reader) { | 202 | 5.35M | while (reader->HasRemaining()) { | 203 | 4.86M | const auto b = static_cast<uint8_t>(reader->Read()); | 204 | 4.86M | if (b < kUtf8RuneSelf) { | 205 | 4.85M | continue; | 206 | 4.85M | } | 207 | 6.64k | const auto leading = kLeading[b]; | 208 | 6.64k | if (leading == kXX) { | 209 | 0 | return false; | 210 | 0 | } | 211 | 6.64k | const auto size = static_cast<size_t>(leading & 7) - 1; | 212 | 6.64k | if (size > reader->Remaining()) { | 213 | 0 | return false; | 214 | 0 | } | 215 | 6.64k | const absl::string_view segment = reader->Peek(size); | 216 | 6.64k | const auto& accept = kAccept[leading >> 4]; | 217 | 6.64k | if (static_cast<uint8_t>(segment[0]) < accept.first || | 218 | 6.64k | static_cast<uint8_t>(segment[0]) > accept.second) { | 219 | 0 | return false; | 220 | 6.64k | } else if (size == 1) { | 221 | 5.16k | } else if (static_cast<uint8_t>(segment[1]) < kLow || | 222 | 5.16k | static_cast<uint8_t>(segment[1]) > kHigh) { | 223 | 0 | return false; | 224 | 5.16k | } else if (size == 2) { | 225 | 3.78k | } else if (static_cast<uint8_t>(segment[2]) < kLow || | 226 | 1.38k | static_cast<uint8_t>(segment[2]) > kHigh) { | 227 | 0 | return false; | 228 | 0 | } | 229 | 6.64k | reader->Advance(size); | 230 | 6.64k | } | 231 | 495k | return true; | 232 | 495k | } |
Unexecuted instantiation: utf8.cc:bool cel::internal::(anonymous namespace)::Utf8IsValidImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*) |
233 | | |
234 | | template <typename BufferedByteReader> |
235 | 0 | size_t Utf8CodePointCountImpl(BufferedByteReader* reader) { |
236 | 0 | size_t count = 0; |
237 | 0 | while (reader->HasRemaining()) { |
238 | 0 | count++; |
239 | 0 | const auto b = static_cast<uint8_t>(reader->Read()); |
240 | 0 | if (b < kUtf8RuneSelf) { |
241 | 0 | continue; |
242 | 0 | } |
243 | 0 | const auto leading = kLeading[b]; |
244 | 0 | if (leading == kXX) { |
245 | 0 | continue; |
246 | 0 | } |
247 | 0 | auto size = static_cast<size_t>(leading & 7) - 1; |
248 | 0 | if (size > reader->Remaining()) { |
249 | 0 | continue; |
250 | 0 | } |
251 | 0 | const absl::string_view segment = reader->Peek(size); |
252 | 0 | const auto& accept = kAccept[leading >> 4]; |
253 | 0 | if (static_cast<uint8_t>(segment[0]) < accept.first || |
254 | 0 | static_cast<uint8_t>(segment[0]) > accept.second) { |
255 | 0 | size = 0; |
256 | 0 | } else if (size == 1) { |
257 | 0 | } else if (static_cast<uint8_t>(segment[1]) < kLow || |
258 | 0 | static_cast<uint8_t>(segment[1]) > kHigh) { |
259 | 0 | size = 0; |
260 | 0 | } else if (size == 2) { |
261 | 0 | } else if (static_cast<uint8_t>(segment[2]) < kLow || |
262 | 0 | static_cast<uint8_t>(segment[2]) > kHigh) { |
263 | 0 | size = 0; |
264 | 0 | } |
265 | 0 | reader->Advance(size); |
266 | 0 | } |
267 | 0 | return count; |
268 | 0 | } Unexecuted instantiation: utf8.cc:unsigned long cel::internal::(anonymous namespace)::Utf8CodePointCountImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*) Unexecuted instantiation: utf8.cc:unsigned long cel::internal::(anonymous namespace)::Utf8CodePointCountImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*) |
269 | | |
270 | | template <typename BufferedByteReader> |
271 | 0 | std::pair<size_t, bool> Utf8ValidateImpl(BufferedByteReader* reader) { |
272 | 0 | size_t count = 0; |
273 | 0 | while (reader->HasRemaining()) { |
274 | 0 | const auto b = static_cast<uint8_t>(reader->Read()); |
275 | 0 | if (b < kUtf8RuneSelf) { |
276 | 0 | count++; |
277 | 0 | continue; |
278 | 0 | } |
279 | 0 | const auto leading = kLeading[b]; |
280 | 0 | if (leading == kXX) { |
281 | 0 | return {count, false}; |
282 | 0 | } |
283 | 0 | const auto size = static_cast<size_t>(leading & 7) - 1; |
284 | 0 | if (size > reader->Remaining()) { |
285 | 0 | return {count, false}; |
286 | 0 | } |
287 | 0 | const absl::string_view segment = reader->Peek(size); |
288 | 0 | const auto& accept = kAccept[leading >> 4]; |
289 | 0 | if (static_cast<uint8_t>(segment[0]) < accept.first || |
290 | 0 | static_cast<uint8_t>(segment[0]) > accept.second) { |
291 | 0 | return {count, false}; |
292 | 0 | } else if (size == 1) { |
293 | 0 | count++; |
294 | 0 | } else if (static_cast<uint8_t>(segment[1]) < kLow || |
295 | 0 | static_cast<uint8_t>(segment[1]) > kHigh) { |
296 | 0 | return {count, false}; |
297 | 0 | } else if (size == 2) { |
298 | 0 | count++; |
299 | 0 | } else if (static_cast<uint8_t>(segment[2]) < kLow || |
300 | 0 | static_cast<uint8_t>(segment[2]) > kHigh) { |
301 | 0 | return {count, false}; |
302 | 0 | } else { |
303 | 0 | count++; |
304 | 0 | } |
305 | 0 | reader->Advance(size); |
306 | 0 | } |
307 | 0 | return {count, true}; |
308 | 0 | } Unexecuted instantiation: utf8.cc:std::__1::pair<unsigned long, bool> cel::internal::(anonymous namespace)::Utf8ValidateImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*) Unexecuted instantiation: utf8.cc:std::__1::pair<unsigned long, bool> cel::internal::(anonymous namespace)::Utf8ValidateImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*) |
309 | | |
310 | | } // namespace |
311 | | |
312 | 495k | bool Utf8IsValid(absl::string_view str) { |
313 | 495k | StringReader reader(str); |
314 | 495k | bool valid = Utf8IsValidImpl(&reader); |
315 | 495k | ABSL_ASSERT((reader.Reset(str), valid == Utf8ValidateImpl(&reader).second)); |
316 | 495k | return valid; |
317 | 495k | } |
318 | | |
319 | 0 | bool Utf8IsValid(const absl::Cord& str) { |
320 | 0 | CordReader reader(str); |
321 | 0 | bool valid = Utf8IsValidImpl(&reader); |
322 | 0 | ABSL_ASSERT((reader.Reset(str), valid == Utf8ValidateImpl(&reader).second)); |
323 | 0 | return valid; |
324 | 0 | } |
325 | | |
326 | 0 | size_t Utf8CodePointCount(absl::string_view str) { |
327 | 0 | StringReader reader(str); |
328 | 0 | return Utf8CodePointCountImpl(&reader); |
329 | 0 | } |
330 | | |
331 | 0 | size_t Utf8CodePointCount(const absl::Cord& str) { |
332 | 0 | CordReader reader(str); |
333 | 0 | return Utf8CodePointCountImpl(&reader); |
334 | 0 | } |
335 | | |
336 | 0 | std::pair<size_t, bool> Utf8Validate(absl::string_view str) { |
337 | 0 | StringReader reader(str); |
338 | 0 | auto result = Utf8ValidateImpl(&reader); |
339 | 0 | ABSL_ASSERT((reader.Reset(str), result.second == Utf8IsValidImpl(&reader))); |
340 | 0 | return result; |
341 | 0 | } |
342 | | |
343 | 0 | std::pair<size_t, bool> Utf8Validate(const absl::Cord& str) { |
344 | 0 | CordReader reader(str); |
345 | 0 | auto result = Utf8ValidateImpl(&reader); |
346 | 0 | ABSL_ASSERT((reader.Reset(str), result.second == Utf8IsValidImpl(&reader))); |
347 | 0 | return result; |
348 | 0 | } |
349 | | |
350 | 100M | std::pair<char32_t, size_t> Utf8Decode(absl::string_view str) { |
351 | 100M | ABSL_ASSERT(!str.empty()); |
352 | 100M | const auto b = static_cast<uint8_t>(str.front()); |
353 | 100M | str.remove_prefix(1); |
354 | 100M | if (b < kUtf8RuneSelf) { |
355 | 99.9M | return {static_cast<char32_t>(b), 1}; |
356 | 99.9M | } |
357 | 19.8k | const auto leading = kLeading[b]; |
358 | 19.8k | if (leading == kXX) { |
359 | 163 | return {kUnicodeReplacementCharacter, 1}; |
360 | 163 | } |
361 | 19.6k | auto size = static_cast<size_t>(leading & 7) - 1; |
362 | 19.6k | if (size > str.size()) { |
363 | 25 | return {kUnicodeReplacementCharacter, 1}; |
364 | 25 | } |
365 | 19.6k | const auto& accept = kAccept[leading >> 4]; |
366 | 19.6k | const auto b1 = static_cast<uint8_t>(str.front()); |
367 | 19.6k | str.remove_prefix(1); |
368 | 19.6k | if (b1 < accept.first || b1 > accept.second) { |
369 | 78 | return {kUnicodeReplacementCharacter, 1}; |
370 | 78 | } |
371 | 19.5k | if (size <= 1) { |
372 | 7.36k | return {(static_cast<char32_t>(b & kMask2) << 6) | |
373 | 7.36k | static_cast<char32_t>(b1 & kMaskX), |
374 | 7.36k | 2}; |
375 | 7.36k | } |
376 | 12.1k | const auto b2 = static_cast<uint8_t>(str.front()); |
377 | 12.1k | str.remove_prefix(1); |
378 | 12.1k | if (b2 < kLow || b2 > kHigh) { |
379 | 17 | return {kUnicodeReplacementCharacter, 1}; |
380 | 17 | } |
381 | 12.1k | if (size <= 2) { |
382 | 7.12k | return {(static_cast<char32_t>(b & kMask3) << 12) | |
383 | 7.12k | (static_cast<char32_t>(b1 & kMaskX) << 6) | |
384 | 7.12k | static_cast<char32_t>(b2 & kMaskX), |
385 | 7.12k | 3}; |
386 | 7.12k | } |
387 | 5.03k | const auto b3 = static_cast<uint8_t>(str.front()); |
388 | 5.03k | str.remove_prefix(1); |
389 | 5.03k | if (b3 < kLow || b3 > kHigh) { |
390 | 12 | return {kUnicodeReplacementCharacter, 1}; |
391 | 12 | } |
392 | 5.02k | return {(static_cast<char32_t>(b & kMask4) << 18) | |
393 | 5.02k | (static_cast<char32_t>(b1 & kMaskX) << 12) | |
394 | 5.02k | (static_cast<char32_t>(b2 & kMaskX) << 6) | |
395 | 5.02k | static_cast<char32_t>(b3 & kMaskX), |
396 | 5.02k | 4}; |
397 | 5.03k | } |
398 | | |
399 | 14.6M | std::string& Utf8Encode(std::string* buffer, char32_t code_point) { |
400 | 14.6M | ABSL_ASSERT(buffer != nullptr); |
401 | 14.6M | if (!UnicodeIsValid(code_point)) { |
402 | 0 | code_point = kUnicodeReplacementCharacter; |
403 | 0 | } |
404 | 14.6M | if (code_point <= 0x7f) { |
405 | 14.6M | buffer->push_back(static_cast<char>(static_cast<uint8_t>(code_point))); |
406 | 14.6M | } else if (code_point <= 0x7ff) { |
407 | 7.12k | buffer->push_back( |
408 | 7.12k | static_cast<char>(kT2 | static_cast<uint8_t>(code_point >> 6))); |
409 | 7.12k | buffer->push_back( |
410 | 7.12k | static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX))); |
411 | 11.8k | } else if (code_point <= 0xffff) { |
412 | 7.09k | buffer->push_back( |
413 | 7.09k | static_cast<char>(kT3 | static_cast<uint8_t>(code_point >> 12))); |
414 | 7.09k | buffer->push_back(static_cast<char>( |
415 | 7.09k | kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX))); |
416 | 7.09k | buffer->push_back( |
417 | 7.09k | static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX))); |
418 | 7.09k | } else { |
419 | 4.73k | buffer->push_back( |
420 | 4.73k | static_cast<char>(kT4 | static_cast<uint8_t>(code_point >> 18))); |
421 | 4.73k | buffer->push_back(static_cast<char>( |
422 | 4.73k | kTX | (static_cast<uint8_t>(code_point >> 12) & kMaskX))); |
423 | 4.73k | buffer->push_back(static_cast<char>( |
424 | 4.73k | kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX))); |
425 | 4.73k | buffer->push_back( |
426 | 4.73k | static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX))); |
427 | 4.73k | } |
428 | 14.6M | return *buffer; |
429 | 14.6M | } |
430 | | |
431 | | } // namespace cel::internal |