/src/abseil-cpp/absl/strings/internal/utf8.cc
Line  | Count  | Source  | 
1  |  | // Copyright 2017 The Abseil Authors.  | 
2  |  | //  | 
3  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
4  |  | // you may not use this file except in compliance with the License.  | 
5  |  | // You may obtain a copy of the License at  | 
6  |  | //  | 
7  |  | //      https://www.apache.org/licenses/LICENSE-2.0  | 
8  |  | //  | 
9  |  | // Unless required by applicable law or agreed to in writing, software  | 
10  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
11  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
12  |  | // See the License for the specific language governing permissions and  | 
13  |  | // limitations under the License.  | 
14  |  |  | 
15  |  | // UTF8 utilities, implemented to reduce dependencies.  | 
16  |  |  | 
17  |  | #include "absl/strings/internal/utf8.h"  | 
18  |  |  | 
19  |  | #include <cstddef>  | 
20  |  | #include <cstdint>  | 
21  |  | #include <limits>  | 
22  |  |  | 
23  |  | #include "absl/base/config.h"  | 
24  |  |  | 
25  |  | namespace absl { | 
26  |  | ABSL_NAMESPACE_BEGIN  | 
27  |  | namespace strings_internal { | 
28  |  |  | 
// Writes the UTF-8 encoding of the code point `utf8_char` into `buffer` and
// returns the number of bytes written (1, 2, 3 or 4).
//
// The sequence length is chosen purely from the magnitude of `utf8_char`:
//   <= 0x7F    -> 1 byte   0xxxxxxx
//   <= 0x7FF   -> 2 bytes  110xxxxx 10xxxxxx
//   <= 0xFFFF  -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
//   otherwise  -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//
// No validation is performed: surrogate code points and values above 0x10FFFF
// are encoded with the same bit layout rather than rejected. `buffer` must
// have room for the returned number of bytes; no terminator is written.
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
  // Number of bytes in the sequence and the marker bits of its lead byte.
  size_t length = 0;
  char32_t lead_marker = 0;
  if (utf8_char <= 0x7F) {
    length = 1;
    lead_marker = 0x00;
  } else if (utf8_char <= 0x7FF) {
    length = 2;
    lead_marker = 0xC0;
  } else if (utf8_char <= 0xFFFF) {
    length = 3;
    lead_marker = 0xE0;
  } else {
    length = 4;
    lead_marker = 0xF0;
  }

  // Emit the continuation bytes from the last one backwards. Each carries the
  // next six low-order payload bits, which are then shifted out.
  for (size_t index = length - 1; index > 0; --index) {
    buffer[index] = static_cast<char>(0x80 | (utf8_char & 0x3F));
    utf8_char >>= 6;
  }

  // Whatever payload is left belongs in the lead byte.
  buffer[0] = static_cast<char>(lead_marker | utf8_char);
  return length;
}
56  |  |  | 
57  | 0  | size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) { | 
58  |  |   // Reinterpret the output buffer `buf` as `unsigned char*` for subsequent  | 
59  |  |   // bitwise operations. This ensures well-defined behavior for bit  | 
60  |  |   // manipulations (avoiding issues with signed `char`) and is safe under C++  | 
61  |  |   // aliasing rules, as `unsigned char` can alias any type.  | 
62  | 0  |   auto* ubuf = reinterpret_cast<unsigned char*>(buf);  | 
63  | 0  |   const uint32_t v = static_cast<uint32_t>(wc);  | 
64  | 0  |   constexpr size_t kError = static_cast<size_t>(-1);  | 
65  |  | 
  | 
66  | 0  |   if (v <= 0x007F) { | 
67  |  |     // 1-byte sequence (U+0000 to U+007F).  | 
68  |  |     // 0xxxxxxx.  | 
69  | 0  |     ubuf[0] = (0b0111'1111 & v);  | 
70  | 0  |     s = {};  // Reset surrogate state. | 
71  | 0  |     return 1;  | 
72  | 0  |   } else if (0x0080 <= v && v <= 0x07FF) { | 
73  |  |     // 2-byte sequence (U+0080 to U+07FF).  | 
74  |  |     // 110xxxxx 10xxxxxx.  | 
75  | 0  |     ubuf[0] = 0b1100'0000 | (0b0001'1111 & (v >> 6));  | 
76  | 0  |     ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);  | 
77  | 0  |     s = {};  // Reset surrogate state. | 
78  | 0  |     return 2;  | 
79  | 0  |   } else if ((0x0800 <= v && v <= 0xD7FF) || (0xE000 <= v && v <= 0xFFFF)) { | 
80  |  |     // 3-byte sequence (U+0800 to U+D7FF or U+E000 to U+FFFF).  | 
81  |  |     // Excludes surrogate code points U+D800-U+DFFF.  | 
82  |  |     // 1110xxxx 10xxxxxx 10xxxxxx.  | 
83  | 0  |     ubuf[0] = 0b1110'0000 | (0b0000'1111 & (v >> 12));  | 
84  | 0  |     ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 6));  | 
85  | 0  |     ubuf[2] = 0b1000'0000 | (0b0011'1111 & v);  | 
86  | 0  |     s = {};  // Reset surrogate state. | 
87  | 0  |     return 3;  | 
88  | 0  |   } else if (0xD800 <= v && v <= 0xDBFF) { | 
89  |  |     // High Surrogate (U+D800 to U+DBFF).  | 
90  |  |     // This part forms the first two bytes of an eventual 4-byte UTF-8 sequence.  | 
91  | 0  |     const unsigned char high_bits_val = (0b0000'1111 & (v >> 6)) + 1;  | 
92  |  |  | 
93  |  |     // First byte of the 4-byte UTF-8 sequence (11110xxx).  | 
94  | 0  |     ubuf[0] = 0b1111'0000 | (0b0000'0111 & (high_bits_val >> 2));  | 
95  |  |     // Second byte of the 4-byte UTF-8 sequence (10xxxxxx).  | 
96  | 0  |     ubuf[1] = 0b1000'0000 |                           //  | 
97  | 0  |               (0b0011'0000 & (high_bits_val << 4)) |  //  | 
98  | 0  |               (0b0000'1111 & (v >> 2));  | 
99  |  |     // Set state for high surrogate after writing to buffer.  | 
100  | 0  |     s = {true, static_cast<unsigned char>(0b0000'0011 & v)}; | 
101  | 0  |     return 2;  // Wrote 2 bytes, expecting 2 more from a low surrogate.  | 
102  | 0  |   } else if (0xDC00 <= v && v <= 0xDFFF) { | 
103  |  |     // Low Surrogate (U+DC00 to U+DFFF).  | 
104  |  |     // This part forms the last two bytes of a 4-byte UTF-8 sequence,  | 
105  |  |     // using state from a preceding high surrogate.  | 
106  | 0  |     if (!s.saw_high_surrogate) { | 
107  |  |       // Error: Isolated low surrogate without a preceding high surrogate.  | 
108  |  |       // s remains in its current (problematic) state.  | 
109  |  |       // Caller should handle error.  | 
110  | 0  |       return kError;  | 
111  | 0  |     }  | 
112  |  |  | 
113  |  |     // Third byte of the 4-byte UTF-8 sequence (10xxxxxx).  | 
114  | 0  |     ubuf[0] = 0b1000'0000 |                    //  | 
115  | 0  |               (0b0011'0000 & (s.bits << 4)) |  //  | 
116  | 0  |               (0b0000'1111 & (v >> 6));  | 
117  |  |     // Fourth byte of the 4-byte UTF-8 sequence (10xxxxxx).  | 
118  | 0  |     ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);  | 
119  |  | 
  | 
120  | 0  |     s = {};    // Reset surrogate state, pair complete. | 
121  | 0  |     return 2;  // Wrote 2 more bytes, completing the 4-byte sequence.  | 
122  | 0  |   } else if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) { | 
123  |  |     // Conditionally compile the 4-byte direct conversion branch.  | 
124  |  |     // This block is compiled only if wchar_t can represent values > 0xFFFF.  | 
125  |  |     // It's placed after surrogate checks to ensure surrogates are handled by  | 
126  |  |     // their specific logic. This inner 'if' is the runtime check for the 4-byte  | 
127  |  |     // range. At this point, v is known not to be in the 1, 2, or 3-byte BMP  | 
128  |  |     // ranges, nor is it a surrogate code point.  | 
129  | 0  |     if (0x10000 <= v && v <= 0x10FFFF) { | 
130  |  |       // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.  | 
131  | 0  |       ubuf[0] = 0b1111'0000 | (0b0000'0111 & (v >> 18));  | 
132  | 0  |       ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 12));  | 
133  | 0  |       ubuf[2] = 0b1000'0000 | (0b0011'1111 & (v >> 6));  | 
134  | 0  |       ubuf[3] = 0b1000'0000 | (0b0011'1111 & v);  | 
135  | 0  |       s = {};  // Reset surrogate state. | 
136  | 0  |       return 4;  | 
137  | 0  |     }  | 
138  | 0  |   }  | 
139  |  |  | 
140  |  |   // Invalid wchar_t value (e.g., out of Unicode range, or unhandled after all  | 
141  |  |   // checks).  | 
142  | 0  |   s = {};  // Reset surrogate state. | 
143  | 0  |   return kError;  | 
144  | 0  | }  | 
145  |  |  | 
146  |  | }  // namespace strings_internal  | 
147  |  | ABSL_NAMESPACE_END  | 
148  |  | }  // namespace absl  |