/proc/self/cwd/internal/utf8.h
Line | Count | Source |
1 | | // Copyright 2021 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #ifndef THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_ |
16 | | #define THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_ |
17 | | |
18 | | #include <cstddef> |
19 | | #include <string> |
20 | | #include <utility> |
21 | | |
22 | | #include "absl/base/attributes.h" |
23 | | #include "absl/base/nullability.h" |
24 | | #include "absl/strings/cord.h" |
25 | | #include "absl/strings/string_view.h" |
26 | | |
27 | | namespace cel::internal { |
28 | | |
29 | | // Returns true if the given UTF-8 encoded string is not malformed, false |
30 | | // otherwise. |
31 | | bool Utf8IsValid(absl::string_view str); |
32 | | bool Utf8IsValid(const absl::Cord& str); |
33 | | |
34 | | // Returns the number of Unicode code points in the UTF-8 encoded string. |
35 | | // |
36 | | // If there are any invalid bytes, they will each be counted as an invalid code |
37 | | // point. |
38 | | size_t Utf8CodePointCount(absl::string_view str); |
39 | | size_t Utf8CodePointCount(const absl::Cord& str); |
40 | | |
41 | | // Validates the given UTF-8 encoded string. The first return value is the |
42 | | // number of code points and its meaning depends on the second return value. If |
43 | | // the second return value is true, the entire string is well-formed and the |
44 | | // first return value is the total number of code points. If the second return |
45 | | // value is false, the string is malformed and the first return value is the |
46 | | // number of code points decoded before the malformed sequence was encountered. |
47 | | std::pair<size_t, bool> Utf8Validate(absl::string_view str); |
48 | | std::pair<size_t, bool> Utf8Validate(const absl::Cord& str); |
49 | | |
50 | | // Decodes the next code point, returning the decoded code point and the number |
51 | | // of code units (a.k.a. bytes) consumed. In the event that an invalid code unit |
52 | | // sequence is encountered, the replacement character, U+FFFD, is returned with |
53 | | // a code unit count of 1. As U+FFFD requires 3 code units when encoded, this |
54 | | // can be used to differentiate valid input from malformed input. |
55 | | size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point); |
56 | | size_t Utf8Decode(const absl::Cord::CharIterator& it, |
57 | | char32_t* absl_nullable code_point); |
58 | 293M | inline std::pair<char32_t, size_t> Utf8Decode(absl::string_view str) { |
59 | 293M | char32_t code_point; |
60 | 293M | size_t code_units = Utf8Decode(str, &code_point); |
61 | 293M | return std::pair{code_point, code_units}; |
62 | 293M | } |
63 | | inline std::pair<char32_t, size_t> Utf8Decode( |
64 | 0 | const absl::Cord::CharIterator& it) { |
65 | 0 | char32_t code_point; |
66 | 0 | size_t code_units = Utf8Decode(it, &code_point); |
67 | 0 | return std::pair{code_point, code_units}; |
68 | 0 | } |
69 | | |
70 | | // Encodes the given code point and appends it to the buffer. If the code point |
71 | | // is an unpaired surrogate or outside of the valid Unicode range, it is |
72 | | // replaced with the replacement character, U+FFFD. |
73 | | size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer); |
74 | | size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer); |
75 | | ABSL_DEPRECATED("Use other overload") |
76 | 608M | inline size_t Utf8Encode(std::string& buffer, char32_t code_point) { |
77 | 608M | return Utf8Encode(code_point, &buffer); |
78 | 608M | } |
79 | | |
80 | | } // namespace cel::internal |
81 | | |
82 | | #endif // THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_ |