/src/mozilla-central/xpcom/string/nsUTF8Utils.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | #ifndef nsUTF8Utils_h_ |
7 | | #define nsUTF8Utils_h_ |
8 | | |
9 | | // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this |
10 | | // file will provide signatures for the Mozilla abstract string types. It will |
11 | | // use XPCOM assertion/debugging macros, etc. |
12 | | |
13 | | #include "nscore.h" |
14 | | #include "mozilla/Assertions.h" |
15 | | #include "mozilla/EndianUtils.h" |
16 | | #include "mozilla/TypeTraits.h" |
17 | | |
18 | | #include "nsCharTraits.h" |
19 | | |
20 | | #ifdef MOZILLA_INTERNAL_API |
21 | | #define UTF8UTILS_WARNING(msg) NS_WARNING(msg) |
22 | | #else |
23 | | #define UTF8UTILS_WARNING(msg) |
24 | | #endif |
25 | | |
26 | | class UTF8traits |
27 | | { |
28 | | public: |
29 | | static bool isASCII(char aChar) |
30 | | { |
31 | | return (aChar & 0x80) == 0x00; |
32 | | } |
33 | | static bool isInSeq(char aChar) |
34 | | { |
35 | | return (aChar & 0xC0) == 0x80; |
36 | | } |
37 | | static bool is2byte(char aChar) |
38 | | { |
39 | | return (aChar & 0xE0) == 0xC0; |
40 | | } |
41 | | static bool is3byte(char aChar) |
42 | | { |
43 | | return (aChar & 0xF0) == 0xE0; |
44 | | } |
45 | | static bool is4byte(char aChar) |
46 | | { |
47 | | return (aChar & 0xF8) == 0xF0; |
48 | | } |
49 | | static bool is5byte(char aChar) |
50 | 0 | { |
51 | 0 | return (aChar & 0xFC) == 0xF8; |
52 | 0 | } |
53 | | static bool is6byte(char aChar) |
54 | 0 | { |
55 | 0 | return (aChar & 0xFE) == 0xFC; |
56 | 0 | } |
57 | | // return the number of bytes in a sequence beginning with aChar |
58 | | static int bytes(char aChar) |
59 | | { |
60 | | if (isASCII(aChar)) { |
61 | | return 1; |
62 | | } |
63 | | if (is2byte(aChar)) { |
64 | | return 2; |
65 | | } |
66 | | if (is3byte(aChar)) { |
67 | | return 3; |
68 | | } |
69 | | if (is4byte(aChar)) { |
70 | | return 4; |
71 | | } |
72 | | MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters"); |
73 | | return 1; |
74 | | } |
75 | | }; |
76 | | |
77 | | /** |
78 | | * Extract the next Unicode scalar value from the buffer and return it. The |
79 | | * pointer passed in is advanced to the start of the next character in the |
80 | | * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced |
81 | | * over the maximal valid prefix and *aErr is set to true (if aErr is not |
82 | | * null). |
83 | | * |
84 | | * Note: This method never sets *aErr to false to allow error accumulation |
85 | | * across multiple calls. |
86 | | * |
87 | | * Precondition: *aBuffer < aEnd |
88 | | */ |
89 | | class UTF8CharEnumerator |
90 | | { |
91 | | public: |
92 | | static inline char32_t NextChar(const char** aBuffer, |
93 | | const char* aEnd, |
94 | | bool* aErr = nullptr) |
95 | 210k | { |
96 | 210k | MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); |
97 | 210k | MOZ_ASSERT(aEnd, "null end pointer"); |
98 | 210k | |
99 | 210k | const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer); |
100 | 210k | const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd); |
101 | 210k | |
102 | 210k | MOZ_ASSERT(p, "null buffer"); |
103 | 210k | MOZ_ASSERT(p < end, "Bogus range"); |
104 | 210k | |
105 | 210k | unsigned char first = *p++; |
106 | 210k | |
107 | 210k | if (MOZ_LIKELY(first < 0x80U)) { |
108 | 210k | *aBuffer = reinterpret_cast<const char*>(p); |
109 | 210k | return first; |
110 | 210k | } |
111 | 0 | |
112 | 0 | // Unsigned underflow is defined behavior |
113 | 0 | if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) { |
114 | 0 | *aBuffer = reinterpret_cast<const char*>(p); |
115 | 0 | if (aErr) { |
116 | 0 | *aErr = true; |
117 | 0 | } |
118 | 0 | return 0xFFFDU; |
119 | 0 | } |
120 | 0 |
|
121 | 0 | unsigned char second = *p; |
122 | 0 |
|
123 | 0 | if (first < 0xE0U) { |
124 | 0 | // Two-byte |
125 | 0 | if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) { |
126 | 0 | *aBuffer = reinterpret_cast<const char*>(++p); |
127 | 0 | return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU); |
128 | 0 | } |
129 | 0 | *aBuffer = reinterpret_cast<const char*>(p); |
130 | 0 | if (aErr) { |
131 | 0 | *aErr = true; |
132 | 0 | } |
133 | 0 | return 0xFFFDU; |
134 | 0 | } |
135 | 0 |
|
136 | 0 | if (MOZ_LIKELY(first < 0xF0U)) { |
137 | 0 | // Three-byte |
138 | 0 | unsigned char lower = 0x80U; |
139 | 0 | unsigned char upper = 0xBFU; |
140 | 0 | if (first == 0xE0U) { |
141 | 0 | lower = 0xA0U; |
142 | 0 | } else if (first == 0xEDU) { |
143 | 0 | upper = 0x9FU; |
144 | 0 | } |
145 | 0 | if (MOZ_LIKELY(second >= lower && second <= upper)) { |
146 | 0 | if (MOZ_LIKELY(p != end)) { |
147 | 0 | unsigned char third = *++p; |
148 | 0 | if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) { |
149 | 0 | *aBuffer = reinterpret_cast<const char*>(++p); |
150 | 0 | return ((uint32_t(first) & 0xFU) << 12) | |
151 | 0 | ((uint32_t(second) & 0x3FU) << 6) | |
152 | 0 | (uint32_t(third) & 0x3FU); |
153 | 0 | } |
154 | 0 | } |
155 | 0 | } |
156 | 0 | *aBuffer = reinterpret_cast<const char*>(p); |
157 | 0 | if (aErr) { |
158 | 0 | *aErr = true; |
159 | 0 | } |
160 | 0 | return 0xFFFDU; |
161 | 0 | } |
162 | 0 |
|
163 | 0 | // Four-byte |
164 | 0 | unsigned char lower = 0x80U; |
165 | 0 | unsigned char upper = 0xBFU; |
166 | 0 | if (first == 0xF0U) { |
167 | 0 | lower = 0x90U; |
168 | 0 | } else if (first == 0xF4U) { |
169 | 0 | upper = 0x8FU; |
170 | 0 | } |
171 | 0 | if (MOZ_LIKELY(second >= lower && second <= upper)) { |
172 | 0 | if (MOZ_LIKELY(p != end)) { |
173 | 0 | unsigned char third = *++p; |
174 | 0 | if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) { |
175 | 0 | if (MOZ_LIKELY(p != end)) { |
176 | 0 | unsigned char fourth = *++p; |
177 | 0 | if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) { |
178 | 0 | *aBuffer = reinterpret_cast<const char*>(++p); |
179 | 0 | return ((uint32_t(first) & 0x7U) << 18) | |
180 | 0 | ((uint32_t(second) & 0x3FU) << 12) | |
181 | 0 | ((uint32_t(third) & 0x3FU) << 6) | |
182 | 0 | (uint32_t(fourth) & 0x3FU); |
183 | 0 | } |
184 | 0 | } |
185 | 0 | } |
186 | 0 | } |
187 | 0 | } |
188 | 0 | *aBuffer = reinterpret_cast<const char*>(p); |
189 | 0 | if (aErr) { |
190 | 0 | *aErr = true; |
191 | 0 | } |
192 | 0 | return 0xFFFDU; |
193 | 0 | } |
194 | | }; |
195 | | |
196 | | /** |
197 | | * Extract the next Unicode scalar value from the buffer and return it. The |
198 | | * pointer passed in is advanced to the start of the next character in the |
199 | | * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over |
200 | | * the unpaired surrogate and *aErr is set to true (if aErr is not null). |
201 | | * |
202 | | * Note: This method never sets *aErr to false to allow error accumulation |
203 | | * across multiple calls. |
204 | | * |
205 | | * Precondition: *aBuffer < aEnd |
206 | | */ |
207 | | class UTF16CharEnumerator |
208 | | { |
209 | | public: |
210 | | static inline char32_t NextChar(const char16_t** aBuffer, |
211 | | const char16_t* aEnd, |
212 | | bool* aErr = nullptr) |
213 | 105k | { |
214 | 105k | MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); |
215 | 105k | MOZ_ASSERT(aEnd, "null end pointer"); |
216 | 105k | |
217 | 105k | const char16_t* p = *aBuffer; |
218 | 105k | |
219 | 105k | MOZ_ASSERT(p, "null buffer"); |
220 | 105k | MOZ_ASSERT(p < aEnd, "Bogus range"); |
221 | 105k | |
222 | 105k | char16_t c = *p++; |
223 | 105k | |
224 | 105k | // Let's use encoding_rs-style code golf here. |
225 | 105k | // Unsigned underflow is defined behavior |
226 | 105k | char16_t cMinusSurrogateStart = c - 0xD800U; |
227 | 105k | if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) { |
228 | 105k | *aBuffer = p; |
229 | 105k | return c; |
230 | 105k | } |
231 | 0 | if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) { |
232 | 0 | // High surrogate |
233 | 0 | if (MOZ_LIKELY(p != aEnd)) { |
234 | 0 | char16_t second = *p; |
235 | 0 | // Unsigned underflow is defined behavior |
236 | 0 | if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) { |
237 | 0 | *aBuffer = ++p; |
238 | 0 | return (uint32_t(c) << 10) + uint32_t(second) - |
239 | 0 | (((0xD800U << 10) - 0x10000U) + 0xDC00U); |
240 | 0 | } |
241 | 0 | } |
242 | 0 | } |
243 | 0 | // Unpaired surrogate |
244 | 0 | *aBuffer = p; |
245 | 0 | if (aErr) { |
246 | 0 | *aErr = true; |
247 | 0 | } |
248 | 0 | return 0xFFFDU; |
249 | 0 | } |
250 | | }; |
251 | | |
252 | | template<typename Char, typename UnsignedT> |
253 | | inline UnsignedT |
254 | | RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index) |
255 | | { |
256 | | static_assert(mozilla::IsSame<Char, char>::value || |
257 | | mozilla::IsSame<Char, unsigned char>::value || |
258 | | mozilla::IsSame<Char, signed char>::value, |
259 | | "UTF-8 data must be in 8-bit units"); |
260 | | static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned"); |
261 | | while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80) |
262 | | --index; |
263 | | |
264 | | return index; |
265 | | } |
266 | | |
267 | | #undef UTF8UTILS_WARNING |
268 | | |
269 | | #endif /* !defined(nsUTF8Utils_h_) */ |