/work/obj-fuzz/dist/include/nsReadableUtils.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | // IWYU pragma: private, include "nsString.h" |
7 | | |
8 | | #ifndef nsReadableUtils_h___ |
9 | | #define nsReadableUtils_h___ |
10 | | |
11 | | /** |
12 | | * I guess the routines in this file are all mis-named. |
13 | | * According to our conventions, they should be |NS_xxx|. |
14 | | */ |
15 | | |
16 | | #include "mozilla/Assertions.h" |
17 | | #include "nsAString.h" |
18 | | |
19 | | #include "nsTArrayForwardDeclare.h" |
20 | | |
21 | | // Can't include mozilla/Encoding.h here. The implementations are in |
22 | | // the encoding_rs and encoding_glue crates. |
23 | | extern "C" { |
24 | | size_t |
25 | | encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len); |
26 | | |
27 | | bool |
28 | | encoding_mem_is_ascii(uint8_t const* buffer, size_t buffer_len); |
29 | | |
30 | | bool |
31 | | encoding_mem_is_basic_latin(char16_t const* buffer, size_t buffer_len); |
32 | | |
33 | | bool |
34 | | encoding_mem_is_utf8_latin1(uint8_t const* buffer, size_t buffer_len); |
35 | | |
36 | | bool |
37 | | encoding_mem_is_str_latin1(uint8_t const* buffer, size_t buffer_len); |
38 | | |
39 | | bool |
40 | | encoding_mem_is_utf16_latin1(char16_t const* buffer, size_t buffer_len); |
41 | | |
42 | | size_t |
43 | | encoding_mem_utf16_valid_up_to(char16_t const* buffer, size_t buffer_len); |
44 | | |
45 | | void |
46 | | encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t buffer_len); |
47 | | |
48 | | void |
49 | | encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, |
50 | | size_t src_len, |
51 | | char* dst, |
52 | | size_t dst_len); |
53 | | |
54 | | size_t |
55 | | encoding_mem_convert_utf8_to_latin1_lossy(const char* src, |
56 | | size_t src_len, |
57 | | char* dst, |
58 | | size_t dst_len); |
59 | | |
60 | | void |
61 | | encoding_mem_convert_latin1_to_utf16(const char* src, |
62 | | size_t src_len, |
63 | | char16_t* dst, |
64 | | size_t dst_len); |
65 | | |
66 | | size_t |
67 | | encoding_mem_convert_utf16_to_utf8(const char16_t* src, |
68 | | size_t src_len, |
69 | | char* dst, |
70 | | size_t dst_len); |
71 | | |
72 | | size_t |
73 | | encoding_mem_convert_utf8_to_utf16(const char* src, |
74 | | size_t src_len, |
75 | | char16_t* dst, |
76 | | size_t dst_len); |
77 | | } |
78 | | |
79 | | // From the nsstring crate |
80 | | extern "C" { |
81 | | bool |
82 | | nsstring_fallible_append_utf8_impl(nsAString* aThis, |
83 | | const char* aOther, |
84 | | size_t aOtherLen, |
85 | | size_t aOldLen); |
86 | | |
87 | | bool |
88 | | nsstring_fallible_append_latin1_impl(nsAString* aThis, |
89 | | const char* aOther, |
90 | | size_t aOtherLen, |
91 | | size_t aOldLen, |
92 | | bool aAllowShrinking); |
93 | | |
94 | | bool |
95 | | nscstring_fallible_append_utf16_to_utf8_impl(nsACString* aThis, |
96 | | const char16_t*, |
97 | | size_t aOtherLen, |
98 | | size_t aOldLen); |
99 | | |
100 | | bool |
101 | | nscstring_fallible_append_utf16_to_latin1_lossy_impl(nsACString* aThis, |
102 | | const char16_t*, |
103 | | size_t aOtherLen, |
104 | | size_t aOldLen, |
105 | | bool aAllowShrinking); |
106 | | |
107 | | bool |
108 | | nscstring_fallible_append_utf8_to_latin1_lossy_check(nsACString* aThis, |
109 | | const nsACString* aOther, |
110 | | size_t aOldLen); |
111 | | |
112 | | bool |
113 | | nscstring_fallible_append_latin1_to_utf8_check(nsACString* aThis, |
114 | | const nsACString* aOther, |
115 | | size_t aOldLen); |
116 | | } |
117 | | |
118 | | /** |
119 | | * If all the code points in the input are below U+0100, converts to Latin1, |
120 | | * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If |
121 | | * there are code points above U+00FF, produces garbage in a memory-safe way |
122 | | * and will likely start asserting in future debug builds. The nature of the |
123 | | * garbage depends on the CPU architecture and must not be relied upon. |
124 | | * |
125 | | * The length of aDest must not be less than the length of aSource. Nothing is returned; the pointers and lengths of both spans are forwarded unchanged to encoding_mem_convert_utf16_to_latin1_lossy(). |
126 | | */ |
127 | | inline void |
128 | | LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource, |
129 | | mozilla::Span<char> aDest) |
130 | | { |
131 | | encoding_mem_convert_utf16_to_latin1_lossy( |
132 | | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
133 | | } |
134 | | |
135 | | /** |
136 | | * If all the code points in the input are below U+0100, converts to Latin1, |
137 | | * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If |
138 | | * there are code points above U+00FF, asserts in debug builds and produces |
139 | | * garbage in a memory-safe way in release builds. The nature of the garbage |
140 | | * may depend on the CPU architecture and must not be relied upon. |
141 | | * |
142 | | * The length of aDest must not be less than the length of aSource. Returns the size_t produced by encoding_mem_convert_utf8_to_latin1_lossy() (presumably the number of bytes written to aDest -- TODO confirm against encoding_rs). |
143 | | */ |
144 | | inline size_t |
145 | | LossyConvertUTF8toLatin1(mozilla::Span<const char> aSource, |
146 | | mozilla::Span<char> aDest) |
147 | 3.24M | { |
148 | 3.24M | return encoding_mem_convert_utf8_to_latin1_lossy( |
149 | 3.24M | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
150 | 3.24M | } |
151 | | |
152 | | /** |
153 | | * Interprets each unsigned byte value of aSource as a Unicode scalar value |
154 | | * (i.e. not windows-1252!) and hands the conversion to encoding_mem_convert_latin1_to_utf16(). |
155 | | * |
156 | | * The length of aDest must not be less than the length of aSource. |
157 | | */ |
158 | | inline void |
159 | | ConvertLatin1toUTF16(mozilla::Span<const char> aSource, |
160 | | mozilla::Span<char16_t> aDest) |
161 | | { |
162 | | encoding_mem_convert_latin1_to_utf16( |
163 | | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
164 | | } |
165 | | |
166 | | /** |
167 | | * Lone (unpaired) surrogates are replaced with the REPLACEMENT CHARACTER. |
168 | | * |
169 | | * The length of aDest must be at least the length of aSource times three |
170 | | * _plus one_. |
171 | | * |
172 | | * Returns the number of code units written, as reported by encoding_mem_convert_utf16_to_utf8(); aDest is a span of char, so these are 8-bit units. |
173 | | */ |
174 | | inline size_t |
175 | | ConvertUTF16toUTF8(mozilla::Span<const char16_t> aSource, |
176 | | mozilla::Span<char> aDest) |
177 | | { |
178 | | return encoding_mem_convert_utf16_to_utf8( |
179 | | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
180 | | } |
181 | | |
182 | | /** |
183 | | * Malformed byte sequences are replaced with the REPLACEMENT CHARACTER. |
184 | | * |
185 | | * The length of aDest must be at least one greater than the length of aSource. |
186 | | * |
187 | | * Returns the number of code units written, as reported by encoding_mem_convert_utf8_to_utf16(); aDest is a span of char16_t, so these are 16-bit units. |
188 | | */ |
189 | | inline size_t |
190 | | ConvertUTF8toUTF16(mozilla::Span<const char> aSource, |
191 | | mozilla::Span<char16_t> aDest) |
192 | | { |
193 | | return encoding_mem_convert_utf8_to_utf16( |
194 | | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
195 | | } |
196 | | /* Returns aEnd.get() - aStart.get() as a size_t for char16_t reading iterators; MOZ_ASSERTs that aStart.get() is not past aEnd.get(). */ |
197 | | inline size_t |
198 | | Distance(const nsReadingIterator<char16_t>& aStart, |
199 | | const nsReadingIterator<char16_t>& aEnd) |
200 | 0 | { |
201 | 0 | MOZ_ASSERT(aStart.get() <= aEnd.get()); |
202 | 0 | return static_cast<size_t>(aEnd.get() - aStart.get()); |
203 | 0 | } |
204 | | /* Returns aEnd.get() - aStart.get() as a size_t for char reading iterators; MOZ_ASSERTs that aStart.get() is not past aEnd.get(). */ |
205 | | inline size_t |
206 | | Distance(const nsReadingIterator<char>& aStart, |
207 | | const nsReadingIterator<char>& aEnd) |
208 | 15 | { |
209 | 15 | MOZ_ASSERT(aStart.get() <= aEnd.get()); |
210 | 15 | return static_cast<size_t>(aEnd.get() - aStart.get()); |
211 | 15 | } |
212 | | |
213 | | // UTF-8 to UTF-16 |
214 | | // Invalid UTF-8 byte sequences are replaced with the REPLACEMENT CHARACTER. |
215 | | /* Fallible form: forwards aSource to nsstring_fallible_append_utf8_impl() with aOldLen = 0 and returns its bool result (marked MOZ_MUST_USE). */ |
216 | | inline MOZ_MUST_USE bool |
217 | | CopyUTF8toUTF16(mozilla::Span<const char> aSource, |
218 | | nsAString& aDest, |
219 | | const mozilla::fallible_t&) |
220 | 724 | { |
221 | 724 | return nsstring_fallible_append_utf8_impl( |
222 | 724 | &aDest, aSource.Elements(), aSource.Length(), 0); |
223 | 724 | } |
224 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aSource.Length()). */ |
225 | | inline void |
226 | | CopyUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
227 | 724 | { |
228 | 724 | if (MOZ_UNLIKELY(!CopyUTF8toUTF16(aSource, aDest, mozilla::fallible))) { |
229 | 0 | aDest.AllocFailed(aSource.Length()); |
230 | 0 | } |
231 | 724 | } |
232 | | /* Fallible form: forwards aSource to nsstring_fallible_append_utf8_impl() with aOldLen = aDest.Length() and returns its bool result (marked MOZ_MUST_USE). */ |
233 | | inline MOZ_MUST_USE bool |
234 | | AppendUTF8toUTF16(mozilla::Span<const char> aSource, |
235 | | nsAString& aDest, |
236 | | const mozilla::fallible_t&) |
237 | | { |
238 | | return nsstring_fallible_append_utf8_impl( |
239 | | &aDest, aSource.Elements(), aSource.Length(), aDest.Length()); |
240 | | } |
241 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aDest.Length() + aSource.Length()). */ |
242 | | inline void |
243 | | AppendUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
244 | | { |
245 | | if (MOZ_UNLIKELY(!AppendUTF8toUTF16(aSource, aDest, mozilla::fallible))) { |
246 | | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
247 | | } |
248 | | } |
249 | | |
250 | | // Latin1 to UTF-16 |
251 | | // Interpret each incoming unsigned byte value as a Unicode scalar value (not |
252 | | // windows-1252!). The function names say "ASCII" instead of "Latin1" for |
253 | | // legacy reasons. |
254 | | /* Fallible form: forwards aSource to nsstring_fallible_append_latin1_impl() with aOldLen = 0 and aAllowShrinking = true; returns its bool result (marked MOZ_MUST_USE). */ |
255 | | inline MOZ_MUST_USE bool |
256 | | CopyASCIItoUTF16(mozilla::Span<const char> aSource, |
257 | | nsAString& aDest, |
258 | | const mozilla::fallible_t&) |
259 | 496 | { |
260 | 496 | return nsstring_fallible_append_latin1_impl( |
261 | 496 | &aDest, aSource.Elements(), aSource.Length(), 0, true); |
262 | 496 | } |
263 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aSource.Length()). */ |
264 | | inline void |
265 | | CopyASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
266 | 496 | { |
267 | 496 | if (MOZ_UNLIKELY(!CopyASCIItoUTF16(aSource, aDest, mozilla::fallible))) { |
268 | 0 | aDest.AllocFailed(aSource.Length()); |
269 | 0 | } |
270 | 496 | } |
271 | | /* Fallible form: forwards aSource to nsstring_fallible_append_latin1_impl() with aOldLen = aDest.Length() and aAllowShrinking = false; returns its bool result (marked MOZ_MUST_USE). */ |
272 | | inline MOZ_MUST_USE bool |
273 | | AppendASCIItoUTF16(mozilla::Span<const char> aSource, |
274 | | nsAString& aDest, |
275 | | const mozilla::fallible_t&) |
276 | 5.87k | { |
277 | 5.87k | return nsstring_fallible_append_latin1_impl( |
278 | 5.87k | &aDest, aSource.Elements(), aSource.Length(), aDest.Length(), false); |
279 | 5.87k | } |
280 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aDest.Length() + aSource.Length()). */ |
281 | | inline void |
282 | | AppendASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
283 | 5.87k | { |
284 | 5.87k | if (MOZ_UNLIKELY(!AppendASCIItoUTF16(aSource, aDest, mozilla::fallible))) { |
285 | 0 | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
286 | 0 | } |
287 | 5.87k | } |
288 | | |
289 | | // UTF-16 to UTF-8 |
290 | | // Unpaired surrogates are replaced with the REPLACEMENT CHARACTER. |
291 | | /* Fallible form: forwards aSource to nscstring_fallible_append_utf16_to_utf8_impl() with aOldLen = 0 and returns its bool result (marked MOZ_MUST_USE). */ |
292 | | inline MOZ_MUST_USE bool |
293 | | CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource, |
294 | | nsACString& aDest, |
295 | | const mozilla::fallible_t&) |
296 | 41.0k | { |
297 | 41.0k | return nscstring_fallible_append_utf16_to_utf8_impl( |
298 | 41.0k | &aDest, aSource.Elements(), aSource.Length(), 0); |
299 | 41.0k | } |
300 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aSource.Length()) (the length is in UTF-16 units, not output bytes). */ |
301 | | inline void |
302 | | CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest) |
303 | 41.0k | { |
304 | 41.0k | if (MOZ_UNLIKELY(!CopyUTF16toUTF8(aSource, aDest, mozilla::fallible))) { |
305 | 0 | aDest.AllocFailed(aSource.Length()); |
306 | 0 | } |
307 | 41.0k | } |
308 | | /* Fallible form: forwards aSource to nscstring_fallible_append_utf16_to_utf8_impl() with aOldLen = aDest.Length() and returns its bool result (marked MOZ_MUST_USE). */ |
309 | | inline MOZ_MUST_USE bool |
310 | | AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource, |
311 | | nsACString& aDest, |
312 | | const mozilla::fallible_t&) |
313 | 3.04M | { |
314 | 3.04M | return nscstring_fallible_append_utf16_to_utf8_impl( |
315 | 3.04M | &aDest, aSource.Elements(), aSource.Length(), aDest.Length()); |
316 | 3.04M | } |
317 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aDest.Length() + aSource.Length()). */ |
318 | | inline void |
319 | | AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest) |
320 | 3.04M | { |
321 | 3.04M | if (MOZ_UNLIKELY(!AppendUTF16toUTF8(aSource, aDest, mozilla::fallible))) { |
322 | 0 | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
323 | 0 | } |
324 | 3.04M | } |
325 | | |
326 | | // UTF-16 to Latin1 |
327 | | // If all code points in the input are below U+0100, represents each scalar |
328 | | // value as an unsigned byte. (This is not windows-1252!) If there are code |
329 | | // points above U+00FF, memory-safely produces garbage and will likely start |
330 | | // asserting in future debug builds. The nature of the garbage may differ |
331 | | // based on CPU architecture and must not be relied upon. The names say |
332 | | // "ASCII" instead of "Latin1" for legacy reasons. |
333 | | /* Fallible form: forwards aSource to nscstring_fallible_append_utf16_to_latin1_lossy_impl() with aOldLen = 0 and aAllowShrinking = true; returns its bool result (marked MOZ_MUST_USE). */ |
334 | | inline MOZ_MUST_USE bool |
335 | | LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource, |
336 | | nsACString& aDest, |
337 | | const mozilla::fallible_t&) |
338 | 9.17k | { |
339 | 9.17k | return nscstring_fallible_append_utf16_to_latin1_lossy_impl( |
340 | 9.17k | &aDest, aSource.Elements(), aSource.Length(), 0, true); |
341 | 9.17k | } |
342 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aSource.Length()). */ |
343 | | inline void |
344 | | LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource, nsACString& aDest) |
345 | 9.17k | { |
346 | 9.17k | if (MOZ_UNLIKELY(!LossyCopyUTF16toASCII(aSource, aDest, mozilla::fallible))) { |
347 | 0 | aDest.AllocFailed(aSource.Length()); |
348 | 0 | } |
349 | 9.17k | } |
350 | | /* Fallible form: forwards aSource to nscstring_fallible_append_utf16_to_latin1_lossy_impl() with aOldLen = aDest.Length() and aAllowShrinking = false; returns its bool result (marked MOZ_MUST_USE). */ |
351 | | inline MOZ_MUST_USE bool |
352 | | LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource, |
353 | | nsACString& aDest, |
354 | | const mozilla::fallible_t&) |
355 | | { |
356 | | return nscstring_fallible_append_utf16_to_latin1_lossy_impl( |
357 | | &aDest, aSource.Elements(), aSource.Length(), aDest.Length(), false); |
358 | | } |
359 | | /* Infallible form: runs the fallible overload and, if it returns false, calls aDest.AllocFailed(aDest.Length() + aSource.Length()). */ |
360 | | inline void |
361 | | LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource, |
362 | | nsACString& aDest) |
363 | | { |
364 | | if (MOZ_UNLIKELY( |
365 | | !LossyAppendUTF16toASCII(aSource, aDest, mozilla::fallible))) { |
366 | | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
367 | | } |
368 | | } |
369 | | |
370 | | /** |
371 | | * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. |
372 | | * |
373 | | * Allocates and returns a new |char| buffer which you must free with |free|. |
374 | | * Performs a conversion with LossyConvertUTF16toLatin1() writing into the |
375 | | * newly-allocated buffer. |
376 | | * |
377 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
378 | | * contains embedded nulls. |
379 | | * |
380 | | * @param aSource a 16-bit wide string |
381 | | * @return a new |char| buffer you must free with |free|. |
382 | | */ |
383 | | char* ToNewCString(const nsAString& aSource); |
384 | | |
385 | | /** |
386 | | * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. |
387 | | * |
388 | | * Allocates and returns a new |char| buffer which you must free with |free|. |
389 | | * |
390 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
391 | | * contains embedded nulls. |
392 | | * |
393 | | * @param aSource an 8-bit wide string |
394 | | * @return a new |char| buffer you must free with |free|. |
395 | | */ |
396 | | char* ToNewCString(const nsACString& aSource); |
397 | | |
398 | | /** |
399 | | * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. |
400 | | * |
401 | | * Allocates and returns a new |char| buffer which you must free with |
402 | | * |free|. |
403 | | * Performs an encoding conversion from a UTF-16 string to a UTF-8 string with |
404 | | * unpaired surrogates replaced with the REPLACEMENT CHARACTER copying |
405 | | * |aSource| to your new buffer. |
406 | | * |
407 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
408 | | * contains embedded nulls. |
409 | | * |
410 | | * @param aSource a UTF-16 string (made of char16_t's) |
411 | | * @param aUTF8Count the number of 8-bit units that was returned |
412 | | * @return a new |char| buffer you must free with |free|. |
413 | | */ |
414 | | |
415 | | char* ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count = nullptr); |
416 | | |
417 | | |
418 | | /** |
419 | | * Returns a new |char16_t| buffer containing a zero-terminated copy of |
420 | | * |aSource|. |
421 | | * |
422 | | * Allocates and returns a new |char16_t| buffer which you must free with |
423 | | * |free|. |
424 | | * |
425 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
426 | | * contains embedded nulls. |
427 | | * |
428 | | * @param aSource a UTF-16 string |
429 | | * @return a new |char16_t| buffer you must free with |free|. |
430 | | */ |
431 | | char16_t* ToNewUnicode(const nsAString& aSource); |
432 | | |
433 | | |
434 | | /** |
435 | | * Returns a new |char16_t| buffer containing a zero-terminated copy of |
436 | | * |aSource|. |
437 | | * |
438 | | * Allocates and returns a new |char16_t| buffer which you must free with |
439 | | * |free|. |
440 | | * |
441 | | * Performs an encoding conversion by 0-padding 8-bit wide characters up to |
442 | | * 16-bits wide (i.e. Latin1 to UTF-16 conversion) while copying |aSource| |
443 | | * to your new buffer. |
444 | | * |
445 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
446 | | * contains embedded nulls. |
447 | | * |
448 | | * @param aSource a Latin1 string |
449 | | * @return a new |char16_t| buffer you must free with |free|. |
450 | | */ |
451 | | char16_t* ToNewUnicode(const nsACString& aSource); |
452 | | |
453 | | /** |
454 | | * Returns a new |char16_t| buffer containing a zero-terminated copy |
455 | | * of |aSource|. |
456 | | * |
457 | | * Allocates and returns a new |char16_t| buffer which you must free with |
458 | | * |free|. Performs an encoding conversion from UTF-8 to UTF-16 |
459 | | * while copying |aSource| to your new buffer. Malformed byte sequences |
460 | | * are replaced with the REPLACEMENT CHARACTER. |
461 | | * |
462 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
463 | | * contains embedded nulls. |
464 | | * |
465 | | * @param aSource an 8-bit wide string, UTF-8 encoded |
466 | | * @param aUTF16Count the number of 16-bit units that was returned |
467 | | * @return a new |char16_t| buffer you must free with |free|. |
468 | | * (UTF-16 encoded) |
469 | | */ |
470 | | char16_t* UTF8ToNewUnicode(const nsACString& aSource, |
471 | | uint32_t* aUTF16Count = nullptr); |
472 | | |
473 | | /** |
474 | | * Copies |aLength| 16-bit code units from the start of |aSource| to the |
475 | | * |char16_t| buffer |aDest|. |
476 | | * |
477 | | * After this operation |aDest| is not null terminated. |
478 | | * |
479 | | * @param aSource a UTF-16 string |
480 | | * @param aSrcOffset start offset in the source string |
481 | | * @param aDest a |char16_t| buffer |
482 | | * @param aLength the number of 16-bit code units to copy |
483 | | * @return pointer to destination buffer - identical to |aDest| |
484 | | */ |
485 | | char16_t* CopyUnicodeTo(const nsAString& aSource, |
486 | | uint32_t aSrcOffset, |
487 | | char16_t* aDest, |
488 | | uint32_t aLength); |
489 | | |
490 | | /** |
491 | | * Returns |true| if |aString| contains only ASCII characters, that is, |
492 | | * code units in the inclusive range [0x00, 0x7F]. An empty span yields |true|. |
493 | | * |
494 | | * @param aString a 16-bit wide string to scan |
495 | | */ |
496 | | inline bool |
497 | | IsASCII(mozilla::Span<const char16_t> aString) |
498 | 101k | { |
499 | 101k | size_t length = aString.Length(); |
500 | 101k | const char16_t* ptr = aString.Elements(); |
501 | 101k | // For short strings, calling into Rust is a pessimization, and the SIMD |
502 | 101k | // code won't have a chance to kick in anyway. |
503 | 101k | if (length < 16) { |
504 | 85.5k | char16_t accu = 0; |
505 | 437k | for (size_t i = 0; i < length; i++) { |
506 | 352k | accu |= ptr[i]; /* OR every unit together: any unit >= 0x80 leaves a bit at or above 0x80 set */ |
507 | 352k | } |
508 | 85.5k | return accu < 0x80U; |
509 | 85.5k | } |
510 | 15.9k | return encoding_mem_is_basic_latin(ptr, length); |
511 | 15.9k | } |
512 | | |
513 | | /** |
514 | | * Returns |true| if |aString| contains only ASCII characters, that is, |
515 | | * bytes in the inclusive range [0x00, 0x7F]. An empty span yields |true|. |
516 | | * |
517 | | * @param aString a 8-bit wide string to scan |
518 | | */ |
519 | | inline bool |
520 | | IsASCII(mozilla::Span<const char> aString) |
521 | 1.12M | { |
522 | 1.12M | size_t length = aString.Length(); |
523 | 1.12M | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
524 | 1.12M | // For short strings, calling into Rust is a pessimization, and the SIMD |
525 | 1.12M | // code won't have a chance to kick in anyway. |
526 | 1.12M | if (length < 16) { |
527 | 1.09M | uint8_t accu = 0; |
528 | 12.8M | for (size_t i = 0; i < length; i++) { |
529 | 11.7M | accu |= ptr[i]; /* OR every byte together: any byte >= 0x80 leaves the top bit set */ |
530 | 11.7M | } |
531 | 1.09M | return accu < 0x80U; |
532 | 1.09M | } |
533 | 33.8k | return encoding_mem_is_ascii(ptr, length); |
534 | 33.8k | } |
535 | | |
536 | | /** |
537 | | * Returns |true| if |aString| contains only Latin1 characters, that is, |
538 | | * code units in the inclusive range [U+0000, U+00FF]. An empty span yields |true|. |
539 | | * |
540 | | * @param aString a potentially-invalid UTF-16 string to scan |
541 | | */ |
542 | | inline bool |
543 | | IsUTF16Latin1(mozilla::Span<const char16_t> aString) |
544 | 0 | { |
545 | 0 | size_t length = aString.Length(); |
546 | 0 | const char16_t* ptr = aString.Elements(); |
547 | 0 | // For short strings, calling into Rust is a pessimization, and the SIMD |
548 | 0 | // code won't have a chance to kick in anyway. |
549 | 0 | if (length < 16) { |
550 | 0 | char16_t accu = 0; |
551 | 0 | for (size_t i = 0; i < length; i++) { |
552 | 0 | accu |= ptr[i]; /* OR every unit together: any unit >= 0x100 leaves a bit at or above 0x100 set */ |
553 | 0 | } |
554 | 0 | return accu < 0x100U; |
555 | 0 | } |
556 | 0 | return encoding_mem_is_utf16_latin1(ptr, length); |
557 | 0 | } |
558 | | |
559 | | /** |
560 | | * Returns |true| if |aString| contains only Latin1 characters, that is, |
561 | | * characters in the inclusive range [U+0000, U+00FF]. |
562 | | * |
563 | | * If you know that the argument is always absolutely guaranteed to be valid |
564 | | * UTF-8, use the faster UnsafeIsValidUTF8Latin1() instead. |
565 | | * |
566 | | * @param aString potentially-invalid UTF-8 string to scan; spans of 16 or more bytes go straight to encoding_mem_is_utf8_latin1() |
567 | | */ |
568 | | inline bool |
569 | | IsUTF8Latin1(mozilla::Span<const char> aString) |
570 | 3.24M | { |
571 | 3.24M | size_t length = aString.Length(); |
572 | 3.24M | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
573 | 3.24M | // For short strings, calling into Rust is a pessimization, and the SIMD |
574 | 3.24M | // code won't have a chance to kick in anyway. |
575 | 3.24M | if (length < 16) { |
576 | 25.3M | for (size_t i = 0; i < length; i++) { |
577 | 22.2M | if (ptr[i] >= 0x80U) { |
578 | 236k | ptr += i; |
579 | 236k | length -= i; |
580 | 236k | // This loop can't handle non-ASCII, but the Rust code can, so |
581 | 236k | // upon seeing non-ASCII, break the loop and let the Rust code |
582 | 236k | // handle the rest of the buffer (including the non-ASCII byte). |
583 | 236k | goto end; |
584 | 236k | } |
585 | 22.2M | } |
586 | 3.24M | return true; /* short input that is pure ASCII (or empty) */ |
587 | 238k | } |
588 | 238k | end: |
589 | 238k | return encoding_mem_is_utf8_latin1(ptr, length); |
590 | 3.24M | } |
591 | | |
592 | | /** |
593 | | * Returns |true| if |aString| contains only Latin1 characters, that is, |
594 | | * characters in the inclusive range [U+0000, U+00FF]. |
595 | | * |
596 | | * The argument MUST be valid UTF-8. If you are at all unsure, use IsUTF8Latin1 |
597 | | * instead! |
598 | | * |
599 | | * @param aString known-valid UTF-8 string to scan; spans of 16 or more bytes go straight to encoding_mem_is_str_latin1() |
600 | | */ |
601 | | inline bool |
602 | | UnsafeIsValidUTF8Latin1(mozilla::Span<const char> aString) |
603 | 0 | { |
604 | 0 | size_t length = aString.Length(); |
605 | 0 | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
606 | 0 | // For short strings, calling into Rust is a pessimization, and the SIMD |
607 | 0 | // code won't have a chance to kick in anyway. |
608 | 0 | if (length < 16) { |
609 | 0 | for (size_t i = 0; i < length; i++) { |
610 | 0 | if (ptr[i] >= 0x80U) { |
611 | 0 | ptr += i; /* drop the ASCII prefix already scanned ... */ |
612 | 0 | length -= i; /* ... so the Rust code starts at the first non-ASCII byte */ |
613 | 0 | goto end; |
614 | 0 | } |
615 | 0 | } |
616 | 0 | return true; /* short input that is pure ASCII (or empty) */ |
617 | 0 | } |
618 | 0 | end: |
619 | 0 | return encoding_mem_is_str_latin1(ptr, length); |
620 | 0 | } |
621 | | |
622 | | /** |
623 | | * Returns |true| if |aString| is a valid UTF-8 string. |
624 | | * |
625 | | * Note that this doesn't check whether the string might look like a valid |
626 | | * string in another encoding, too, e.g. ISO-2022-JP. |
627 | | * |
628 | | * @param aString an 8-bit wide string to scan; spans of 16 or more bytes go straight to encoding_utf8_valid_up_to() |
629 | | */ |
630 | | inline bool |
631 | | IsUTF8(mozilla::Span<const char> aString) |
632 | 27.4k | { |
633 | 27.4k | size_t length = aString.Length(); |
634 | 27.4k | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
635 | 27.4k | // For short strings, calling into Rust is a pessimization, and the SIMD |
636 | 27.4k | // code won't have a chance to kick in anyway. |
637 | 27.4k | if (length < 16) { |
638 | 26.0k | for (size_t i = 0; i < length; i++) { |
639 | 25.9k | if (ptr[i] >= 0x80U) { |
640 | 12.5k | ptr += i; /* drop the ASCII prefix already scanned ... */ |
641 | 12.5k | length -= i; /* ... so the Rust code starts at the first non-ASCII byte */ |
642 | 12.5k | goto end; |
643 | 12.5k | } |
644 | 25.9k | } |
645 | 12.6k | return true; /* short input that is pure ASCII (or empty) */ |
646 | 27.3k | } |
647 | 27.3k | end: |
648 | 27.3k | return length == encoding_utf8_valid_up_to(ptr, length); /* valid iff the valid prefix spans the whole remaining buffer */ |
649 | 27.4k | } |
650 | | |
651 | | /** |
652 | | * Returns the index of the first unpaired surrogate or |
653 | | * the length of the string if there are none. NOTE(review): encoding_mem_utf16_valid_up_to() is declared to return size_t; the value is implicitly narrowed to uint32_t here. |
654 | | */ |
655 | | inline uint32_t |
656 | | UTF16ValidUpTo(mozilla::Span<const char16_t> aString) |
657 | 0 | { |
658 | 0 | return encoding_mem_utf16_valid_up_to(aString.Elements(), aString.Length()); |
659 | 0 | } |
660 | | |
661 | | /** |
662 | | * Replaces unpaired surrogates with U+FFFD in the argument, in place, by handing the span's pointer and length to encoding_mem_ensure_utf16_validity(). |
663 | | */ |
664 | | inline void |
665 | | EnsureUTF16ValiditySpan(mozilla::Span<char16_t> aString) |
666 | 0 | { |
667 | 0 | encoding_mem_ensure_utf16_validity(aString.Elements(), aString.Length()); |
668 | 0 | } |
669 | | |
670 | | /** |
671 | | * Replaces unpaired surrogates with U+FFFD in the argument. |
672 | | * |
673 | | * Copies a shared string buffer or an otherwise read-only |
674 | | * buffer only if there are unpaired surrogates: BeginWriting() is called only after the validity scan has found one. |
675 | | */ |
676 | | inline void |
677 | | EnsureUTF16Validity(nsAString& aString) |
678 | 0 | { |
679 | 0 | uint32_t upTo = UTF16ValidUpTo(aString); |
680 | 0 | uint32_t len = aString.Length(); |
681 | 0 | if (upTo == len) { |
682 | 0 | return; /* nothing to fix; the string is never opened for writing */ |
683 | 0 | } |
684 | 0 | char16_t* ptr = aString.BeginWriting(); /* presumably the point where a shared/read-only buffer gets copied -- confirm against nsAString */ |
685 | 0 | auto span = mozilla::MakeSpan(ptr, len); |
686 | 0 | span[upTo] = 0xFFFD; /* upTo is the index UTF16ValidUpTo reported, i.e. the first unpaired surrogate */ |
687 | 0 | EnsureUTF16ValiditySpan(span.From(upTo + 1)); /* repair whatever follows that index */ |
688 | 0 | } |
689 | | |
690 | | bool ParseString(const nsACString& aAstring, char aDelimiter, |
691 | | nsTArray<nsCString>& aArray); |
692 | | |
693 | | /** |
694 | | * Converts case in place in the argument string. |
695 | | */ |
696 | | void ToUpperCase(nsACString&); |
697 | | /* NOTE(review): the ToUpperCase/ToLowerCase(nsACString&) declarations are each listed twice in this run; the repeats are redundant redeclarations. */ |
698 | | void ToLowerCase(nsACString&); |
699 | | |
700 | | void ToUpperCase(nsACString&); |
701 | | |
702 | | void ToLowerCase(nsACString&); |
703 | | |
704 | | /** |
705 | | * Converts case from string aSource to aDest. |
706 | | */ |
707 | | void ToUpperCase(const nsACString& aSource, nsACString& aDest); |
708 | | |
709 | | void ToLowerCase(const nsACString& aSource, nsACString& aDest); |
710 | | |
711 | | /** |
712 | | * Finds the leftmost occurrence of |aPattern|, if any in the range |
713 | | * |aSearchStart|..|aSearchEnd|. |
714 | | * |
715 | | * Returns |true| if a match was found, and adjusts |aSearchStart| and |
716 | | * |aSearchEnd| to point to the match. If no match was found, returns |false| |
717 | | * and makes |aSearchStart == aSearchEnd|. |
718 | | * |
719 | | * Currently, this is equivalent to the O(m*n) implementation previously on |
720 | | * |ns[C]String|. |
721 | | * |
722 | | * If we need something faster, then we can implement that later. |
723 | | */ |
724 | | |
725 | | bool FindInReadable(const nsAString& aPattern, nsAString::const_iterator&, |
726 | | nsAString::const_iterator&, |
727 | | const nsStringComparator& = nsDefaultStringComparator()); |
728 | | bool FindInReadable(const nsACString& aPattern, nsACString::const_iterator&, |
729 | | nsACString::const_iterator&, |
730 | | const nsCStringComparator& = nsDefaultCStringComparator()); |
731 | | |
732 | | /* Convenience overload for when we don't care where the string was, just whether we |
733 | | * found it or not: searches aSource from BeginReading() to EndReading() with the iterator-based FindInReadable and returns only its bool result. */ |
734 | | inline bool |
735 | | FindInReadable(const nsAString& aPattern, const nsAString& aSource, |
736 | | const nsStringComparator& aCompare = nsDefaultStringComparator()) |
737 | 0 | { |
738 | 0 | nsAString::const_iterator start, end; |
739 | 0 | aSource.BeginReading(start); |
740 | 0 | aSource.EndReading(end); |
741 | 0 | return FindInReadable(aPattern, start, end, aCompare); |
742 | 0 | } |
743 | | /* 8-bit convenience overload: searches aSource from BeginReading() to EndReading() with the iterator-based FindInReadable and returns only its bool result. */ |
744 | | inline bool |
745 | | FindInReadable(const nsACString& aPattern, const nsACString& aSource, |
746 | | const nsCStringComparator& aCompare = nsDefaultCStringComparator()) |
747 | 5 | { |
748 | 5 | nsACString::const_iterator start, end; |
749 | 5 | aSource.BeginReading(start); |
750 | 5 | aSource.EndReading(end); |
751 | 5 | return FindInReadable(aPattern, start, end, aCompare); |
752 | 5 | } |
753 | | |
754 | | |
755 | | bool CaseInsensitiveFindInReadable(const nsACString& aPattern, |
756 | | nsACString::const_iterator&, |
757 | | nsACString::const_iterator&); |
758 | | |
759 | | /** |
760 | | * Finds the rightmost occurrence of |aPattern| |
761 | | * Returns |true| if a match was found, and adjusts |aSearchStart| and |
762 | | * |aSearchEnd| to point to the match. If no match was found, returns |false| |
763 | | * and makes |aSearchStart == aSearchEnd|. |
764 | | */ |
765 | | bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&, |
766 | | nsAString::const_iterator&, |
767 | | const nsStringComparator& = nsDefaultStringComparator()); |
768 | | bool RFindInReadable(const nsACString& aPattern, nsACString::const_iterator&, |
769 | | nsACString::const_iterator&, |
770 | | const nsCStringComparator& = nsDefaultCStringComparator()); |
771 | | |
772 | | /** |
773 | | * Finds the leftmost occurrence of |aChar|, if any in the range |
774 | | * |aSearchStart|..|aSearchEnd|. |
775 | | * |
776 | | * Returns |true| if a match was found, and adjusts |aSearchStart| to |
777 | | * point to the match. If no match was found, returns |false| and |
778 | | * makes |aSearchStart == aSearchEnd|. |
779 | | */ |
780 | | bool FindCharInReadable(char16_t aChar, nsAString::const_iterator& aSearchStart, |
781 | | const nsAString::const_iterator& aSearchEnd); |
782 | | bool FindCharInReadable(char aChar, nsACString::const_iterator& aSearchStart, |
783 | | const nsACString::const_iterator& aSearchEnd); |
784 | | |
785 | | bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring); |
786 | | bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring, |
787 | | const nsStringComparator& aComparator); |
788 | | bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring); |
789 | | bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring, |
790 | | const nsCStringComparator& aComparator); |
791 | | bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring); |
792 | | bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring, |
793 | | const nsStringComparator& aComparator); |
794 | | bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring); |
795 | | bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring, |
796 | | const nsCStringComparator& aComparator); |
797 | | |
798 | | const nsString& EmptyString(); |
799 | | const nsCString& EmptyCString(); |
800 | | |
801 | | const nsString& VoidString(); |
802 | | const nsCString& VoidCString(); |
803 | | |
804 | | /** |
805 | | * Compare a UTF-8 string to an UTF-16 string. |
806 | | * |
807 | | * Returns 0 if the strings are equal, -1 if aUTF8String is less |
808 | | * than aUTF16String, and 1 in the reverse case. Errors are replaced |
809 | | * with U+FFFD and then the U+FFFD is compared as if it had occurred |
810 | | * in the input. If aErr is not nullptr, *aErr is set to true if |
811 | | * either string had malformed sequences. |
812 | | */ |
813 | | int32_t |
814 | | CompareUTF8toUTF16(const nsACString& aUTF8String, |
815 | | const nsAString& aUTF16String, |
816 | | bool* aErr = nullptr); |
817 | | |
818 | | void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest); |
819 | | |
820 | | #endif // !defined(nsReadableUtils_h___) |