/proc/self/cwd/internal/utf8.cc
Line | Count | Source |
1 | | // Copyright 2021 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include "internal/utf8.h" |
16 | | |
17 | | #include <algorithm> |
18 | | #include <cstdint> |
19 | | #include <cstring> |
20 | | #include <string> |
21 | | #include <utility> |
22 | | |
23 | | #include "absl/base/macros.h" |
24 | | #include "absl/base/nullability.h" |
25 | | #include "absl/base/optimization.h" |
26 | | #include "absl/log/absl_check.h" |
27 | | #include "absl/strings/cord.h" |
28 | | #include "absl/strings/string_view.h" |
29 | | #include "internal/unicode.h" |
30 | | |
31 | | // Implementation is based on |
32 | | // https://go.googlesource.com/go/+/refs/heads/master/src/unicode/utf8/utf8.go |
33 | | // but adapted for C++. |
34 | | |
35 | | namespace cel::internal { |
36 | | |
37 | | namespace { |
38 | | |
// Bytes below this value encode themselves: they are complete one-byte code
// points and never part of a multi-byte sequence.
constexpr uint8_t kUtf8RuneSelf{0x80};
// Largest number of bytes used to encode a single code point.
constexpr size_t kUtf8Max{4};

// Inclusive range of a generic continuation byte.
constexpr uint8_t kLow{0x80};
constexpr uint8_t kHigh{0xbf};

// Payload masks: kMaskX for continuation bytes, kMask2/3/4 for the leading
// byte of a two-, three- and four-byte sequence respectively.
constexpr uint8_t kMaskX{0x3f};
constexpr uint8_t kMask2{0x1f};
constexpr uint8_t kMask3{0xf};
constexpr uint8_t kMask4{0x7};

// Tag bits OR-ed into encoded bytes: kTX for continuation bytes, kT2/3/4 for
// the leading byte of a two-, three- and four-byte sequence respectively.
constexpr uint8_t kTX{0x80};
constexpr uint8_t kT2{0xc0};
constexpr uint8_t kT3{0xe0};
constexpr uint8_t kT4{0xf0};

// Entries of the kLeading table. The low three bits hold the total length of
// the sequence in bytes and the high four bits hold the index into kAccept
// that constrains the second byte. kXX marks a byte that can never start a
// sequence; kAS marks a single-byte (ASCII) code point.
constexpr uint8_t kXX{0xf1};
constexpr uint8_t kAS{0xf0};
constexpr uint8_t kS1{0x02};
constexpr uint8_t kS2{0x13};
constexpr uint8_t kS3{0x03};
constexpr uint8_t kS4{0x23};
constexpr uint8_t kS5{0x34};
constexpr uint8_t kS6{0x04};
constexpr uint8_t kS7{0x44};
64 | | |
// Classification of every possible first byte of a UTF-8 sequence, indexed by
// the byte value. For an entry `e` other than kXX:
//   - `(e & 7) - 1` is the number of bytes that must follow the first byte;
//   - `e >> 4` is the index into kAccept giving the inclusive range allowed
//     for the second byte.
// kXX marks bytes that cannot start a sequence (stray continuation bytes
// 0x80-0xBF, and 0xC0, 0xC1, 0xF5-0xFF). kAS marks single-byte code points;
// callers in this file handle those before consulting the table.
// NOLINTBEGIN
// clang-format off
constexpr uint8_t kLeading[256] = {
  // Column headers give the low nibble of the byte: 0 through F.
  //0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x00-0x0F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x10-0x1F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x20-0x2F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x30-0x3F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x40-0x4F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x50-0x5F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x60-0x6F
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x70-0x7F
  //0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0x80-0x8F
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0x90-0x9F
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xA0-0xAF
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xB0-0xBF
  kXX, kXX, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, // 0xC0-0xCF
  kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, // 0xD0-0xDF
  kS2, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS4, kS3, kS3, // 0xE0-0xEF
  kS5, kS6, kS6, kS6, kS7, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xF0-0xFF
};
// clang-format on
// NOLINTEND
89 | | |
90 | | constexpr std::pair<const uint8_t, const uint8_t> kAccept[16] = { |
91 | | {kLow, kHigh}, {0xa0, kHigh}, {kLow, 0x9f}, {0x90, kHigh}, |
92 | | {kLow, 0x8f}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, |
93 | | {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, |
94 | | {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, |
95 | | }; |
96 | | |
97 | | class StringReader final { |
98 | | public: |
99 | 392k | constexpr explicit StringReader(absl::string_view input) : input_(input) {} |
100 | | |
101 | 1.77k | size_t Remaining() const { return input_.size(); } |
102 | | |
103 | 2.21M | bool HasRemaining() const { return !input_.empty(); } |
104 | | |
105 | 1.77k | absl::string_view Peek(size_t n) { |
106 | 1.77k | ABSL_ASSERT(n <= Remaining()); |
107 | 1.77k | return input_.substr(0, n); |
108 | 1.77k | } |
109 | | |
110 | 1.82M | char Read() { |
111 | 1.82M | ABSL_ASSERT(HasRemaining()); |
112 | 1.82M | char value = input_.front(); |
113 | 1.82M | input_.remove_prefix(1); |
114 | 1.82M | return value; |
115 | 1.82M | } |
116 | | |
117 | 1.77k | void Advance(size_t n) { |
118 | 1.77k | ABSL_ASSERT(n <= Remaining()); |
119 | 1.77k | input_.remove_prefix(n); |
120 | 1.77k | } |
121 | | |
122 | 0 | void Reset(absl::string_view input) { input_ = input; } |
123 | | |
124 | | private: |
125 | | absl::string_view input_; |
126 | | }; |
127 | | |
128 | | class CordReader final { |
129 | | public: |
130 | | explicit CordReader(const absl::Cord& input) |
131 | 0 | : input_(input), size_(input_.size()), buffer_(), index_(0) {} |
132 | | |
133 | 0 | size_t Remaining() const { return size_; } |
134 | | |
135 | 0 | bool HasRemaining() const { return size_ != 0; } |
136 | | |
137 | 0 | absl::string_view Peek(size_t n) { |
138 | 0 | ABSL_ASSERT(n <= Remaining()); |
139 | 0 | if (n == 0) { |
140 | 0 | return absl::string_view(); |
141 | 0 | } |
142 | 0 | if (n <= buffer_.size() - index_) { |
143 | | // Enough data remaining in temporary buffer. |
144 | 0 | return absl::string_view(buffer_.data() + index_, n); |
145 | 0 | } |
146 | | // We do not have enough data. See if we can fit it without allocating by |
147 | | // shifting data back to the beginning of the buffer. |
148 | 0 | if (buffer_.capacity() >= n) { |
149 | | // It will fit in the current capacity, see if we need to shift the |
150 | | // existing data to make it fit. |
151 | 0 | if (buffer_.capacity() - buffer_.size() < n && index_ != 0) { |
152 | | // We need to shift. |
153 | 0 | buffer_.erase(buffer_.begin(), buffer_.begin() + index_); |
154 | 0 | index_ = 0; |
155 | 0 | } |
156 | 0 | } |
157 | | // Ensure we never reserve less than kUtf8Max. |
158 | 0 | buffer_.reserve(std::max(buffer_.size() + n, kUtf8Max)); |
159 | 0 | size_t to_copy = n - (buffer_.size() - index_); |
160 | 0 | absl::CopyCordToString(input_.Subcord(0, to_copy), &buffer_); |
161 | 0 | input_.RemovePrefix(to_copy); |
162 | 0 | return absl::string_view(buffer_.data() + index_, n); |
163 | 0 | } |
164 | | |
165 | 0 | char Read() { |
166 | 0 | char value = Peek(1).front(); |
167 | 0 | Advance(1); |
168 | 0 | return value; |
169 | 0 | } |
170 | | |
171 | 0 | void Advance(size_t n) { |
172 | 0 | ABSL_ASSERT(n <= Remaining()); |
173 | 0 | if (n == 0) { |
174 | 0 | return; |
175 | 0 | } |
176 | 0 | if (index_ < buffer_.size()) { |
177 | 0 | size_t count = std::min(n, buffer_.size() - index_); |
178 | 0 | index_ += count; |
179 | 0 | n -= count; |
180 | 0 | size_ -= count; |
181 | 0 | if (index_ < buffer_.size()) { |
182 | 0 | return; |
183 | 0 | } |
184 | | // Temporary buffer is empty, clear it. |
185 | 0 | buffer_.clear(); |
186 | 0 | index_ = 0; |
187 | 0 | } |
188 | 0 | input_.RemovePrefix(n); |
189 | 0 | size_ -= n; |
190 | 0 | } |
191 | | |
192 | 0 | void Reset(const absl::Cord& input) { |
193 | 0 | input_ = input; |
194 | 0 | size_ = input_.size(); |
195 | 0 | buffer_.clear(); |
196 | 0 | index_ = 0; |
197 | 0 | } |
198 | | |
199 | | private: |
200 | | absl::Cord input_; |
201 | | size_t size_; |
202 | | std::string buffer_; |
203 | | size_t index_; |
204 | | }; |
205 | | |
206 | | template <typename BufferedByteReader> |
207 | 392k | bool Utf8IsValidImpl(BufferedByteReader* reader) { |
208 | 2.21M | while (reader->HasRemaining()) { |
209 | 1.82M | const auto b = static_cast<uint8_t>(reader->Read()); |
210 | 1.82M | if (b < kUtf8RuneSelf) { |
211 | 1.82M | continue; |
212 | 1.82M | } |
213 | 1.77k | const auto leading = kLeading[b]; |
214 | 1.77k | if (leading == kXX) { |
215 | 0 | return false; |
216 | 0 | } |
217 | 1.77k | const auto size = static_cast<size_t>(leading & 7) - 1; |
218 | 1.77k | if (size > reader->Remaining()) { |
219 | 0 | return false; |
220 | 0 | } |
221 | 1.77k | const absl::string_view segment = reader->Peek(size); |
222 | 1.77k | const auto& accept = kAccept[leading >> 4]; |
223 | 1.77k | if (static_cast<uint8_t>(segment[0]) < accept.first || |
224 | 1.77k | static_cast<uint8_t>(segment[0]) > accept.second) { |
225 | 0 | return false; |
226 | 1.77k | } else if (size == 1) { |
227 | 984 | } else if (static_cast<uint8_t>(segment[1]) < kLow || |
228 | 984 | static_cast<uint8_t>(segment[1]) > kHigh) { |
229 | 0 | return false; |
230 | 984 | } else if (size == 2) { |
231 | 670 | } else if (static_cast<uint8_t>(segment[2]) < kLow || |
232 | 314 | static_cast<uint8_t>(segment[2]) > kHigh) { |
233 | 0 | return false; |
234 | 0 | } |
235 | 1.77k | reader->Advance(size); |
236 | 1.77k | } |
237 | 392k | return true; |
238 | 392k | } utf8.cc:bool cel::internal::(anonymous namespace)::Utf8IsValidImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*) Line | Count | Source | 207 | 392k | bool Utf8IsValidImpl(BufferedByteReader* reader) { | 208 | 2.21M | while (reader->HasRemaining()) { | 209 | 1.82M | const auto b = static_cast<uint8_t>(reader->Read()); | 210 | 1.82M | if (b < kUtf8RuneSelf) { | 211 | 1.82M | continue; | 212 | 1.82M | } | 213 | 1.77k | const auto leading = kLeading[b]; | 214 | 1.77k | if (leading == kXX) { | 215 | 0 | return false; | 216 | 0 | } | 217 | 1.77k | const auto size = static_cast<size_t>(leading & 7) - 1; | 218 | 1.77k | if (size > reader->Remaining()) { | 219 | 0 | return false; | 220 | 0 | } | 221 | 1.77k | const absl::string_view segment = reader->Peek(size); | 222 | 1.77k | const auto& accept = kAccept[leading >> 4]; | 223 | 1.77k | if (static_cast<uint8_t>(segment[0]) < accept.first || | 224 | 1.77k | static_cast<uint8_t>(segment[0]) > accept.second) { | 225 | 0 | return false; | 226 | 1.77k | } else if (size == 1) { | 227 | 984 | } else if (static_cast<uint8_t>(segment[1]) < kLow || | 228 | 984 | static_cast<uint8_t>(segment[1]) > kHigh) { | 229 | 0 | return false; | 230 | 984 | } else if (size == 2) { | 231 | 670 | } else if (static_cast<uint8_t>(segment[2]) < kLow || | 232 | 314 | static_cast<uint8_t>(segment[2]) > kHigh) { | 233 | 0 | return false; | 234 | 0 | } | 235 | 1.77k | reader->Advance(size); | 236 | 1.77k | } | 237 | 392k | return true; | 238 | 392k | } |
Unexecuted instantiation: utf8.cc:bool cel::internal::(anonymous namespace)::Utf8IsValidImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*) |
239 | | |
240 | | template <typename BufferedByteReader> |
241 | 0 | size_t Utf8CodePointCountImpl(BufferedByteReader* reader) { |
242 | 0 | size_t count = 0; |
243 | 0 | while (reader->HasRemaining()) { |
244 | 0 | count++; |
245 | 0 | const auto b = static_cast<uint8_t>(reader->Read()); |
246 | 0 | if (b < kUtf8RuneSelf) { |
247 | 0 | continue; |
248 | 0 | } |
249 | 0 | const auto leading = kLeading[b]; |
250 | 0 | if (leading == kXX) { |
251 | 0 | continue; |
252 | 0 | } |
253 | 0 | auto size = static_cast<size_t>(leading & 7) - 1; |
254 | 0 | if (size > reader->Remaining()) { |
255 | 0 | continue; |
256 | 0 | } |
257 | 0 | const absl::string_view segment = reader->Peek(size); |
258 | 0 | const auto& accept = kAccept[leading >> 4]; |
259 | 0 | if (static_cast<uint8_t>(segment[0]) < accept.first || |
260 | 0 | static_cast<uint8_t>(segment[0]) > accept.second) { |
261 | 0 | size = 0; |
262 | 0 | } else if (size == 1) { |
263 | 0 | } else if (static_cast<uint8_t>(segment[1]) < kLow || |
264 | 0 | static_cast<uint8_t>(segment[1]) > kHigh) { |
265 | 0 | size = 0; |
266 | 0 | } else if (size == 2) { |
267 | 0 | } else if (static_cast<uint8_t>(segment[2]) < kLow || |
268 | 0 | static_cast<uint8_t>(segment[2]) > kHigh) { |
269 | 0 | size = 0; |
270 | 0 | } |
271 | 0 | reader->Advance(size); |
272 | 0 | } |
273 | 0 | return count; |
274 | 0 | } Unexecuted instantiation: utf8.cc:unsigned long cel::internal::(anonymous namespace)::Utf8CodePointCountImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*) Unexecuted instantiation: utf8.cc:unsigned long cel::internal::(anonymous namespace)::Utf8CodePointCountImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*) |
275 | | |
276 | | template <typename BufferedByteReader> |
277 | 0 | std::pair<size_t, bool> Utf8ValidateImpl(BufferedByteReader* reader) { |
278 | 0 | size_t count = 0; |
279 | 0 | while (reader->HasRemaining()) { |
280 | 0 | const auto b = static_cast<uint8_t>(reader->Read()); |
281 | 0 | if (b < kUtf8RuneSelf) { |
282 | 0 | count++; |
283 | 0 | continue; |
284 | 0 | } |
285 | 0 | const auto leading = kLeading[b]; |
286 | 0 | if (leading == kXX) { |
287 | 0 | return {count, false}; |
288 | 0 | } |
289 | 0 | const auto size = static_cast<size_t>(leading & 7) - 1; |
290 | 0 | if (size > reader->Remaining()) { |
291 | 0 | return {count, false}; |
292 | 0 | } |
293 | 0 | const absl::string_view segment = reader->Peek(size); |
294 | 0 | const auto& accept = kAccept[leading >> 4]; |
295 | 0 | if (static_cast<uint8_t>(segment[0]) < accept.first || |
296 | 0 | static_cast<uint8_t>(segment[0]) > accept.second) { |
297 | 0 | return {count, false}; |
298 | 0 | } else if (size == 1) { |
299 | 0 | count++; |
300 | 0 | } else if (static_cast<uint8_t>(segment[1]) < kLow || |
301 | 0 | static_cast<uint8_t>(segment[1]) > kHigh) { |
302 | 0 | return {count, false}; |
303 | 0 | } else if (size == 2) { |
304 | 0 | count++; |
305 | 0 | } else if (static_cast<uint8_t>(segment[2]) < kLow || |
306 | 0 | static_cast<uint8_t>(segment[2]) > kHigh) { |
307 | 0 | return {count, false}; |
308 | 0 | } else { |
309 | 0 | count++; |
310 | 0 | } |
311 | 0 | reader->Advance(size); |
312 | 0 | } |
313 | 0 | return {count, true}; |
314 | 0 | } Unexecuted instantiation: utf8.cc:std::__1::pair<unsigned long, bool> cel::internal::(anonymous namespace)::Utf8ValidateImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*) Unexecuted instantiation: utf8.cc:std::__1::pair<unsigned long, bool> cel::internal::(anonymous namespace)::Utf8ValidateImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*) |
315 | | |
316 | | } // namespace |
317 | | |
318 | 392k | bool Utf8IsValid(absl::string_view str) { |
319 | 392k | StringReader reader(str); |
320 | 392k | bool valid = Utf8IsValidImpl(&reader); |
321 | 392k | ABSL_ASSERT((reader.Reset(str), valid == Utf8ValidateImpl(&reader).second)); |
322 | 392k | return valid; |
323 | 392k | } |
324 | | |
325 | 0 | bool Utf8IsValid(const absl::Cord& str) { |
326 | 0 | CordReader reader(str); |
327 | 0 | bool valid = Utf8IsValidImpl(&reader); |
328 | 0 | ABSL_ASSERT((reader.Reset(str), valid == Utf8ValidateImpl(&reader).second)); |
329 | 0 | return valid; |
330 | 0 | } |
331 | | |
332 | 0 | size_t Utf8CodePointCount(absl::string_view str) { |
333 | 0 | StringReader reader(str); |
334 | 0 | return Utf8CodePointCountImpl(&reader); |
335 | 0 | } |
336 | | |
337 | 0 | size_t Utf8CodePointCount(const absl::Cord& str) { |
338 | 0 | CordReader reader(str); |
339 | 0 | return Utf8CodePointCountImpl(&reader); |
340 | 0 | } |
341 | | |
342 | 0 | std::pair<size_t, bool> Utf8Validate(absl::string_view str) { |
343 | 0 | StringReader reader(str); |
344 | 0 | auto result = Utf8ValidateImpl(&reader); |
345 | 0 | ABSL_ASSERT((reader.Reset(str), result.second == Utf8IsValidImpl(&reader))); |
346 | 0 | return result; |
347 | 0 | } |
348 | | |
349 | 0 | std::pair<size_t, bool> Utf8Validate(const absl::Cord& str) { |
350 | 0 | CordReader reader(str); |
351 | 0 | auto result = Utf8ValidateImpl(&reader); |
352 | 0 | ABSL_ASSERT((reader.Reset(str), result.second == Utf8IsValidImpl(&reader))); |
353 | 0 | return result; |
354 | 0 | } |
355 | | |
356 | | namespace { |
357 | | |
358 | | size_t Utf8DecodeImpl(uint8_t b, uint8_t leading, size_t size, |
359 | | absl::string_view str, |
360 | 339k | char32_t* absl_nullable code_point) { |
361 | 339k | const auto& accept = kAccept[leading >> 4]; |
362 | 339k | const auto b1 = static_cast<uint8_t>(str.front()); |
363 | 339k | if (ABSL_PREDICT_FALSE(b1 < accept.first || b1 > accept.second)) { |
364 | 5 | if (code_point != nullptr) { |
365 | 5 | *code_point = kUnicodeReplacementCharacter; |
366 | 5 | } |
367 | 5 | return 1; |
368 | 5 | } |
369 | 339k | if (size <= 1) { |
370 | 18.2k | if (code_point != nullptr) { |
371 | 18.2k | *code_point = (static_cast<char32_t>(b & kMask2) << 6) | |
372 | 18.2k | static_cast<char32_t>(b1 & kMaskX); |
373 | 18.2k | } |
374 | 18.2k | return 2; |
375 | 18.2k | } |
376 | 321k | str.remove_prefix(1); |
377 | 321k | const auto b2 = static_cast<uint8_t>(str.front()); |
378 | 321k | if (ABSL_PREDICT_FALSE(b2 < kLow || b2 > kHigh)) { |
379 | 5 | if (code_point != nullptr) { |
380 | 5 | *code_point = kUnicodeReplacementCharacter; |
381 | 5 | } |
382 | 5 | return 1; |
383 | 5 | } |
384 | 321k | if (size <= 2) { |
385 | 306k | if (code_point != nullptr) { |
386 | 306k | *code_point = (static_cast<char32_t>(b & kMask3) << 12) | |
387 | 306k | (static_cast<char32_t>(b1 & kMaskX) << 6) | |
388 | 306k | static_cast<char32_t>(b2 & kMaskX); |
389 | 306k | } |
390 | 306k | return 3; |
391 | 306k | } |
392 | 15.0k | str.remove_prefix(1); |
393 | 15.0k | const auto b3 = static_cast<uint8_t>(str.front()); |
394 | 15.0k | if (ABSL_PREDICT_FALSE(b3 < kLow || b3 > kHigh)) { |
395 | 3 | if (code_point != nullptr) { |
396 | 3 | *code_point = kUnicodeReplacementCharacter; |
397 | 3 | } |
398 | 3 | return 1; |
399 | 3 | } |
400 | 15.0k | if (code_point != nullptr) { |
401 | 15.0k | *code_point = (static_cast<char32_t>(b & kMask4) << 18) | |
402 | 15.0k | (static_cast<char32_t>(b1 & kMaskX) << 12) | |
403 | 15.0k | (static_cast<char32_t>(b2 & kMaskX) << 6) | |
404 | 15.0k | static_cast<char32_t>(b3 & kMaskX); |
405 | 15.0k | } |
406 | 15.0k | return 4; |
407 | 15.0k | } |
408 | | |
409 | | } // namespace |
410 | | |
411 | 293M | size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point) { |
412 | 293M | ABSL_DCHECK(!str.empty()); |
413 | 293M | const auto b = static_cast<uint8_t>(str.front()); |
414 | 293M | if (b < kUtf8RuneSelf) { |
415 | 293M | if (code_point != nullptr) { |
416 | 293M | *code_point = static_cast<char32_t>(b); |
417 | 293M | } |
418 | 293M | return 1; |
419 | 293M | } |
420 | 339k | const auto leading = kLeading[b]; |
421 | 339k | if (ABSL_PREDICT_FALSE(leading == kXX)) { |
422 | 17 | if (code_point != nullptr) { |
423 | 17 | *code_point = kUnicodeReplacementCharacter; |
424 | 17 | } |
425 | 17 | return 1; |
426 | 17 | } |
427 | 339k | auto size = static_cast<size_t>(leading & 7) - 1; |
428 | 339k | str.remove_prefix(1); |
429 | 339k | if (ABSL_PREDICT_FALSE(size > str.size())) { |
430 | 2 | if (code_point != nullptr) { |
431 | 2 | *code_point = kUnicodeReplacementCharacter; |
432 | 2 | } |
433 | 2 | return 1; |
434 | 2 | } |
435 | 339k | return Utf8DecodeImpl(b, leading, size, str, code_point); |
436 | 339k | } |
437 | | |
438 | | size_t Utf8Decode(const absl::Cord::CharIterator& it, |
439 | 0 | char32_t* absl_nullable code_point) { |
440 | 0 | absl::string_view str = absl::Cord::ChunkRemaining(it); |
441 | 0 | ABSL_DCHECK(!str.empty()); |
442 | 0 | const auto b = static_cast<uint8_t>(str.front()); |
443 | 0 | if (b < kUtf8RuneSelf) { |
444 | 0 | if (code_point != nullptr) { |
445 | 0 | *code_point = static_cast<char32_t>(b); |
446 | 0 | } |
447 | 0 | return 1; |
448 | 0 | } |
449 | 0 | const auto leading = kLeading[b]; |
450 | 0 | if (ABSL_PREDICT_FALSE(leading == kXX)) { |
451 | 0 | if (code_point != nullptr) { |
452 | 0 | *code_point = kUnicodeReplacementCharacter; |
453 | 0 | } |
454 | 0 | return 1; |
455 | 0 | } |
456 | 0 | auto size = static_cast<size_t>(leading & 7) - 1; |
457 | 0 | str.remove_prefix(1); |
458 | 0 | if (ABSL_PREDICT_TRUE(size <= str.size())) { |
459 | | // Fast path. |
460 | 0 | return Utf8DecodeImpl(b, leading, size, str, code_point); |
461 | 0 | } |
462 | 0 | absl::Cord::CharIterator current = it; |
463 | 0 | absl::Cord::Advance(¤t, 1); |
464 | 0 | char buffer[3]; |
465 | 0 | size_t buffer_len = 0; |
466 | 0 | while (buffer_len < size) { |
467 | 0 | str = absl::Cord::ChunkRemaining(current); |
468 | 0 | if (ABSL_PREDICT_FALSE(str.empty())) { |
469 | 0 | if (code_point != nullptr) { |
470 | 0 | *code_point = kUnicodeReplacementCharacter; |
471 | 0 | } |
472 | 0 | return 1; |
473 | 0 | } |
474 | 0 | size_t to_copy = std::min(size_t{3} - buffer_len, str.size()); |
475 | 0 | std::memcpy(buffer + buffer_len, str.data(), to_copy); |
476 | 0 | buffer_len += to_copy; |
477 | 0 | absl::Cord::Advance(¤t, to_copy); |
478 | 0 | } |
479 | 0 | return Utf8DecodeImpl(b, leading, size, absl::string_view(buffer, buffer_len), |
480 | 0 | code_point); |
481 | 0 | } |
482 | | |
483 | 608M | size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer) { |
484 | 608M | ABSL_DCHECK(buffer != nullptr); |
485 | | |
486 | 608M | char storage[4]; |
487 | 608M | size_t storage_len = Utf8Encode(code_point, storage); |
488 | 608M | buffer->append(storage, storage_len); |
489 | 608M | return storage_len; |
490 | 608M | } |
491 | | |
492 | 608M | size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer) { |
493 | 608M | ABSL_DCHECK(buffer != nullptr); |
494 | | |
495 | 608M | if (ABSL_PREDICT_FALSE(!UnicodeIsValid(code_point))) { |
496 | 0 | code_point = kUnicodeReplacementCharacter; |
497 | 0 | } |
498 | 608M | size_t storage_len = 0; |
499 | 608M | if (code_point <= 0x7f) { |
500 | 597M | buffer[storage_len++] = static_cast<char>(static_cast<uint8_t>(code_point)); |
501 | 597M | } else if (code_point <= 0x7ff) { |
502 | 51.4k | buffer[storage_len++] = |
503 | 51.4k | static_cast<char>(kT2 | static_cast<uint8_t>(code_point >> 6)); |
504 | 51.4k | buffer[storage_len++] = |
505 | 51.4k | static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX)); |
506 | 10.3M | } else if (code_point <= 0xffff) { |
507 | 10.2M | buffer[storage_len++] = |
508 | 10.2M | static_cast<char>(kT3 | static_cast<uint8_t>(code_point >> 12)); |
509 | 10.2M | buffer[storage_len++] = static_cast<char>( |
510 | 10.2M | kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX)); |
511 | 10.2M | buffer[storage_len++] = |
512 | 10.2M | static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX)); |
513 | 10.2M | } else { |
514 | 47.5k | buffer[storage_len++] = |
515 | 47.5k | static_cast<char>(kT4 | static_cast<uint8_t>(code_point >> 18)); |
516 | 47.5k | buffer[storage_len++] = static_cast<char>( |
517 | 47.5k | kTX | (static_cast<uint8_t>(code_point >> 12) & kMaskX)); |
518 | 47.5k | buffer[storage_len++] = static_cast<char>( |
519 | 47.5k | kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX)); |
520 | 47.5k | buffer[storage_len++] = |
521 | 47.5k | static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX)); |
522 | 47.5k | } |
523 | 608M | return storage_len; |
524 | 608M | } |
525 | | |
526 | | } // namespace cel::internal |