/src/abseil-cpp/absl/strings/internal/utf8.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2017 The Abseil Authors. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | // UTF8 utilities, implemented to reduce dependencies. |
16 | | |
17 | | #include "absl/strings/internal/utf8.h" |
18 | | |
19 | | #include <cstddef> |
20 | | #include <cstdint> |
21 | | #include <limits> |
22 | | |
23 | | #include "absl/base/config.h" |
24 | | |
25 | | namespace absl { |
26 | | ABSL_NAMESPACE_BEGIN |
27 | | namespace strings_internal { |
28 | | |
// Encodes the code point `utf8_char` as UTF-8 into `buffer` and returns the
// number of bytes written (1 to 4).
//
// `buffer` must have room for at least four bytes; no terminator is appended.
// No range validation is performed: the length is chosen purely from the
// magnitude of `utf8_char`, and any value above 0xFFFF takes the four-byte
// form.
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
  // Lead byte: the length marker OR-ed with whatever bits of the code point
  // remain above `shift`.
  const auto lead_byte = [utf8_char](unsigned marker, int shift) {
    return static_cast<char>(marker | (utf8_char >> shift));
  };
  // Continuation byte (10xxxxxx): six payload bits starting at bit `shift`.
  const auto continuation_byte = [utf8_char](int shift) {
    return static_cast<char>(0x80 | ((utf8_char >> shift) & 0x3F));
  };

  // U+0000..U+007F: plain ASCII, stored as-is.
  if (utf8_char <= 0x7F) {
    buffer[0] = static_cast<char>(utf8_char);
    return 1;
  }

  // U+0080..U+07FF: 110xxxxx 10xxxxxx.
  if (utf8_char <= 0x7FF) {
    buffer[0] = lead_byte(0xC0, 6);
    buffer[1] = continuation_byte(0);
    return 2;
  }

  // U+0800..U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx.
  if (utf8_char <= 0xFFFF) {
    buffer[0] = lead_byte(0xE0, 12);
    buffer[1] = continuation_byte(6);
    buffer[2] = continuation_byte(0);
    return 3;
  }

  // Everything else: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  buffer[0] = lead_byte(0xF0, 18);
  buffer[1] = continuation_byte(12);
  buffer[2] = continuation_byte(6);
  buffer[3] = continuation_byte(0);
  return 4;
}
56 | | |
57 | 0 | size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) { |
58 | | // Reinterpret the output buffer `buf` as `unsigned char*` for subsequent |
59 | | // bitwise operations. This ensures well-defined behavior for bit |
60 | | // manipulations (avoiding issues with signed `char`) and is safe under C++ |
61 | | // aliasing rules, as `unsigned char` can alias any type. |
62 | 0 | auto* ubuf = reinterpret_cast<unsigned char*>(buf); |
63 | 0 | const uint32_t v = static_cast<uint32_t>(wc); |
64 | 0 | constexpr size_t kError = static_cast<size_t>(-1); |
65 | |
|
66 | 0 | if (v <= 0x007F) { |
67 | | // 1-byte sequence (U+0000 to U+007F). |
68 | | // 0xxxxxxx. |
69 | 0 | ubuf[0] = (0b0111'1111 & v); |
70 | 0 | s = {}; // Reset surrogate state. |
71 | 0 | return 1; |
72 | 0 | } else if (0x0080 <= v && v <= 0x07FF) { |
73 | | // 2-byte sequence (U+0080 to U+07FF). |
74 | | // 110xxxxx 10xxxxxx. |
75 | 0 | ubuf[0] = 0b1100'0000 | (0b0001'1111 & (v >> 6)); |
76 | 0 | ubuf[1] = 0b1000'0000 | (0b0011'1111 & v); |
77 | 0 | s = {}; // Reset surrogate state. |
78 | 0 | return 2; |
79 | 0 | } else if ((0x0800 <= v && v <= 0xD7FF) || (0xE000 <= v && v <= 0xFFFF)) { |
80 | | // 3-byte sequence (U+0800 to U+D7FF or U+E000 to U+FFFF). |
81 | | // Excludes surrogate code points U+D800-U+DFFF. |
82 | | // 1110xxxx 10xxxxxx 10xxxxxx. |
83 | 0 | ubuf[0] = 0b1110'0000 | (0b0000'1111 & (v >> 12)); |
84 | 0 | ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 6)); |
85 | 0 | ubuf[2] = 0b1000'0000 | (0b0011'1111 & v); |
86 | 0 | s = {}; // Reset surrogate state. |
87 | 0 | return 3; |
88 | 0 | } else if (0xD800 <= v && v <= 0xDBFF) { |
89 | | // High Surrogate (U+D800 to U+DBFF). |
90 | | // This part forms the first two bytes of an eventual 4-byte UTF-8 sequence. |
91 | 0 | const unsigned char high_bits_val = (0b0000'1111 & (v >> 6)) + 1; |
92 | | |
93 | | // First byte of the 4-byte UTF-8 sequence (11110xxx). |
94 | 0 | ubuf[0] = 0b1111'0000 | (0b0000'0111 & (high_bits_val >> 2)); |
95 | | // Second byte of the 4-byte UTF-8 sequence (10xxxxxx). |
96 | 0 | ubuf[1] = 0b1000'0000 | // |
97 | 0 | (0b0011'0000 & (high_bits_val << 4)) | // |
98 | 0 | (0b0000'1111 & (v >> 2)); |
99 | | // Set state for high surrogate after writing to buffer. |
100 | 0 | s = {true, static_cast<unsigned char>(0b0000'0011 & v)}; |
101 | 0 | return 2; // Wrote 2 bytes, expecting 2 more from a low surrogate. |
102 | 0 | } else if (0xDC00 <= v && v <= 0xDFFF) { |
103 | | // Low Surrogate (U+DC00 to U+DFFF). |
104 | | // This part forms the last two bytes of a 4-byte UTF-8 sequence, |
105 | | // using state from a preceding high surrogate. |
106 | 0 | if (!s.saw_high_surrogate) { |
107 | | // Error: Isolated low surrogate without a preceding high surrogate. |
108 | | // s remains in its current (problematic) state. |
109 | | // Caller should handle error. |
110 | 0 | return kError; |
111 | 0 | } |
112 | | |
113 | | // Third byte of the 4-byte UTF-8 sequence (10xxxxxx). |
114 | 0 | ubuf[0] = 0b1000'0000 | // |
115 | 0 | (0b0011'0000 & (s.bits << 4)) | // |
116 | 0 | (0b0000'1111 & (v >> 6)); |
117 | | // Fourth byte of the 4-byte UTF-8 sequence (10xxxxxx). |
118 | 0 | ubuf[1] = 0b1000'0000 | (0b0011'1111 & v); |
119 | |
|
120 | 0 | s = {}; // Reset surrogate state, pair complete. |
121 | 0 | return 2; // Wrote 2 more bytes, completing the 4-byte sequence. |
122 | 0 | } else if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) { |
123 | | // Conditionally compile the 4-byte direct conversion branch. |
124 | | // This block is compiled only if wchar_t can represent values > 0xFFFF. |
125 | | // It's placed after surrogate checks to ensure surrogates are handled by |
126 | | // their specific logic. This inner 'if' is the runtime check for the 4-byte |
127 | | // range. At this point, v is known not to be in the 1, 2, or 3-byte BMP |
128 | | // ranges, nor is it a surrogate code point. |
129 | 0 | if (0x10000 <= v && v <= 0x10FFFF) { |
130 | | // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. |
131 | 0 | ubuf[0] = 0b1111'0000 | (0b0000'0111 & (v >> 18)); |
132 | 0 | ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 12)); |
133 | 0 | ubuf[2] = 0b1000'0000 | (0b0011'1111 & (v >> 6)); |
134 | 0 | ubuf[3] = 0b1000'0000 | (0b0011'1111 & v); |
135 | 0 | s = {}; // Reset surrogate state. |
136 | 0 | return 4; |
137 | 0 | } |
138 | 0 | } |
139 | | |
140 | | // Invalid wchar_t value (e.g., out of Unicode range, or unhandled after all |
141 | | // checks). |
142 | 0 | s = {}; // Reset surrogate state. |
143 | 0 | return kError; |
144 | 0 | } |
145 | | |
146 | | } // namespace strings_internal |
147 | | ABSL_NAMESPACE_END |
148 | | } // namespace absl |