/work/obj-fuzz/dist/include/nsUTF8Utils.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | #ifndef nsUTF8Utils_h_ |
7 | | #define nsUTF8Utils_h_ |
8 | | |
9 | | // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this |
10 | | // file will provide signatures for the Mozilla abstract string types. It will |
11 | | // use XPCOM assertion/debugging macros, etc. |
12 | | |
13 | | #include "nscore.h" |
14 | | #include "mozilla/Assertions.h" |
15 | | #include "mozilla/EndianUtils.h" |
16 | | #include "mozilla/TypeTraits.h" |
17 | | |
18 | | #include "nsCharTraits.h" |
19 | | |
20 | | #ifdef MOZILLA_INTERNAL_API |
21 | | #define UTF8UTILS_WARNING(msg) NS_WARNING(msg) |
22 | | #else |
23 | | #define UTF8UTILS_WARNING(msg) |
24 | | #endif |
25 | | |
26 | | class UTF8traits |
27 | | { |
28 | | public: |
29 | | static bool isASCII(char aChar) |
30 | 0 | { |
31 | 0 | return (aChar & 0x80) == 0x00; |
32 | 0 | } |
33 | | static bool isInSeq(char aChar) |
34 | 0 | { |
35 | 0 | return (aChar & 0xC0) == 0x80; |
36 | 0 | } |
37 | | static bool is2byte(char aChar) |
38 | 0 | { |
39 | 0 | return (aChar & 0xE0) == 0xC0; |
40 | 0 | } |
41 | | static bool is3byte(char aChar) |
42 | 0 | { |
43 | 0 | return (aChar & 0xF0) == 0xE0; |
44 | 0 | } |
45 | | static bool is4byte(char aChar) |
46 | 0 | { |
47 | 0 | return (aChar & 0xF8) == 0xF0; |
48 | 0 | } |
49 | | static bool is5byte(char aChar) |
50 | | { |
51 | | return (aChar & 0xFC) == 0xF8; |
52 | | } |
53 | | static bool is6byte(char aChar) |
54 | | { |
55 | | return (aChar & 0xFE) == 0xFC; |
56 | | } |
57 | | // return the number of bytes in a sequence beginning with aChar |
58 | | static int bytes(char aChar) |
59 | 0 | { |
60 | 0 | if (isASCII(aChar)) { |
61 | 0 | return 1; |
62 | 0 | } |
63 | 0 | if (is2byte(aChar)) { |
64 | 0 | return 2; |
65 | 0 | } |
66 | 0 | if (is3byte(aChar)) { |
67 | 0 | return 3; |
68 | 0 | } |
69 | 0 | if (is4byte(aChar)) { |
70 | 0 | return 4; |
71 | 0 | } |
72 | 0 | MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters"); |
73 | 0 | return 1; |
74 | 0 | } |
75 | | }; |
76 | | |
77 | | /** |
78 | | * Extract the next Unicode scalar value from the buffer and return it. The |
79 | | * pointer passed in is advanced to the start of the next character in the |
80 | | * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced |
81 | | * over the maximal valid prefix and *aErr is set to true (if aErr is not |
82 | | * null). |
83 | | * |
84 | | * Note: This method never sets *aErr to false to allow error accumulation |
85 | | * across multiple calls. |
86 | | * |
87 | | * Precondition: *aBuffer < aEnd |
88 | | */ |
89 | | class UTF8CharEnumerator |
90 | | { |
91 | | public: |
92 | | static inline char32_t NextChar(const char** aBuffer, |
93 | | const char* aEnd, |
94 | | bool* aErr = nullptr) |
95 | | { |
96 | | MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); |
97 | | MOZ_ASSERT(aEnd, "null end pointer"); |
98 | | |
99 | | const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer); |
100 | | const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd); |
101 | | |
102 | | MOZ_ASSERT(p, "null buffer"); |
103 | | MOZ_ASSERT(p < end, "Bogus range"); |
104 | | |
105 | | unsigned char first = *p++; |
106 | | |
107 | | if (MOZ_LIKELY(first < 0x80U)) { |
108 | | *aBuffer = reinterpret_cast<const char*>(p); |
109 | | return first; |
110 | | } |
111 | | |
112 | | // Unsigned underflow is defined behavior |
113 | | if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) { |
114 | | *aBuffer = reinterpret_cast<const char*>(p); |
115 | | if (aErr) { |
116 | | *aErr = true; |
117 | | } |
118 | | return 0xFFFDU; |
119 | | } |
120 | | |
121 | | unsigned char second = *p; |
122 | | |
123 | | if (first < 0xE0U) { |
124 | | // Two-byte |
125 | | if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) { |
126 | | *aBuffer = reinterpret_cast<const char*>(++p); |
127 | | return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU); |
128 | | } |
129 | | *aBuffer = reinterpret_cast<const char*>(p); |
130 | | if (aErr) { |
131 | | *aErr = true; |
132 | | } |
133 | | return 0xFFFDU; |
134 | | } |
135 | | |
136 | | if (MOZ_LIKELY(first < 0xF0U)) { |
137 | | // Three-byte |
138 | | unsigned char lower = 0x80U; |
139 | | unsigned char upper = 0xBFU; |
140 | | if (first == 0xE0U) { |
141 | | lower = 0xA0U; |
142 | | } else if (first == 0xEDU) { |
143 | | upper = 0x9FU; |
144 | | } |
145 | | if (MOZ_LIKELY(second >= lower && second <= upper)) { |
146 | | if (MOZ_LIKELY(p != end)) { |
147 | | unsigned char third = *++p; |
148 | | if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) { |
149 | | *aBuffer = reinterpret_cast<const char*>(++p); |
150 | | return ((uint32_t(first) & 0xFU) << 12) | |
151 | | ((uint32_t(second) & 0x3FU) << 6) | |
152 | | (uint32_t(third) & 0x3FU); |
153 | | } |
154 | | } |
155 | | } |
156 | | *aBuffer = reinterpret_cast<const char*>(p); |
157 | | if (aErr) { |
158 | | *aErr = true; |
159 | | } |
160 | | return 0xFFFDU; |
161 | | } |
162 | | |
163 | | // Four-byte |
164 | | unsigned char lower = 0x80U; |
165 | | unsigned char upper = 0xBFU; |
166 | | if (first == 0xF0U) { |
167 | | lower = 0x90U; |
168 | | } else if (first == 0xF4U) { |
169 | | upper = 0x8FU; |
170 | | } |
171 | | if (MOZ_LIKELY(second >= lower && second <= upper)) { |
172 | | if (MOZ_LIKELY(p != end)) { |
173 | | unsigned char third = *++p; |
174 | | if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) { |
175 | | if (MOZ_LIKELY(p != end)) { |
176 | | unsigned char fourth = *++p; |
177 | | if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) { |
178 | | *aBuffer = reinterpret_cast<const char*>(++p); |
179 | | return ((uint32_t(first) & 0x7U) << 18) | |
180 | | ((uint32_t(second) & 0x3FU) << 12) | |
181 | | ((uint32_t(third) & 0x3FU) << 6) | |
182 | | (uint32_t(fourth) & 0x3FU); |
183 | | } |
184 | | } |
185 | | } |
186 | | } |
187 | | } |
188 | | *aBuffer = reinterpret_cast<const char*>(p); |
189 | | if (aErr) { |
190 | | *aErr = true; |
191 | | } |
192 | | return 0xFFFDU; |
193 | | } |
194 | | }; |
195 | | |
196 | | /** |
197 | | * Extract the next Unicode scalar value from the buffer and return it. The |
198 | | * pointer passed in is advanced to the start of the next character in the |
199 | | * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over |
200 | | * the unpaired surrogate and *aErr is set to true (if aErr is not null). |
201 | | * |
202 | | * Note: This method never sets *aErr to false to allow error accumulation |
203 | | * across multiple calls. |
204 | | * |
205 | | * Precondition: *aBuffer < aEnd |
206 | | */ |
207 | | class UTF16CharEnumerator |
208 | | { |
209 | | public: |
210 | | static inline char32_t NextChar(const char16_t** aBuffer, |
211 | | const char16_t* aEnd, |
212 | | bool* aErr = nullptr) |
213 | | { |
214 | | MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); |
215 | | MOZ_ASSERT(aEnd, "null end pointer"); |
216 | | |
217 | | const char16_t* p = *aBuffer; |
218 | | |
219 | | MOZ_ASSERT(p, "null buffer"); |
220 | | MOZ_ASSERT(p < aEnd, "Bogus range"); |
221 | | |
222 | | char16_t c = *p++; |
223 | | |
224 | | // Let's use encoding_rs-style code golf here. |
225 | | // Unsigned underflow is defined behavior |
226 | | char16_t cMinusSurrogateStart = c - 0xD800U; |
227 | | if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) { |
228 | | *aBuffer = p; |
229 | | return c; |
230 | | } |
231 | | if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) { |
232 | | // High surrogate |
233 | | if (MOZ_LIKELY(p != aEnd)) { |
234 | | char16_t second = *p; |
235 | | // Unsigned underflow is defined behavior |
236 | | if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) { |
237 | | *aBuffer = ++p; |
238 | | return (uint32_t(c) << 10) + uint32_t(second) - |
239 | | (((0xD800U << 10) - 0x10000U) + 0xDC00U); |
240 | | } |
241 | | } |
242 | | } |
243 | | // Unpaired surrogate |
244 | | *aBuffer = p; |
245 | | if (aErr) { |
246 | | *aErr = true; |
247 | | } |
248 | | return 0xFFFDU; |
249 | | } |
250 | | }; |
251 | | |
252 | | template<typename Char, typename UnsignedT> |
253 | | inline UnsignedT |
254 | | RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index) |
255 | 0 | { |
256 | 0 | static_assert(mozilla::IsSame<Char, char>::value || |
257 | 0 | mozilla::IsSame<Char, unsigned char>::value || |
258 | 0 | mozilla::IsSame<Char, signed char>::value, |
259 | 0 | "UTF-8 data must be in 8-bit units"); |
260 | 0 | static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned"); |
261 | 0 | while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80) |
262 | 0 | --index; |
263 | 0 |
|
264 | 0 | return index; |
265 | 0 | } |
266 | | |
267 | | #undef UTF8UTILS_WARNING |
268 | | |
269 | | #endif /* !defined(nsUTF8Utils_h_) */ |