/src/mozilla-central/xpcom/string/nsReadableUtils.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | // IWYU pragma: private, include "nsString.h" |
7 | | |
8 | | #ifndef nsReadableUtils_h___ |
9 | | #define nsReadableUtils_h___ |
10 | | |
11 | | /** |
12 | | * I guess the routines in this file are all mis-named. |
13 | | * According to our conventions, they should be |NS_xxx|. |
14 | | */ |
15 | | |
16 | | #include "mozilla/Assertions.h" |
17 | | #include "nsAString.h" |
18 | | |
19 | | #include "nsTArrayForwardDeclare.h" |
20 | | |
21 | | // Can't include mozilla/Encoding.h here. The implementations are in |
22 | | // the encoding_rs and encoding_glue crates. |
23 | | extern "C" { |
24 | | size_t |
25 | | encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len); |
26 | | |
27 | | bool |
28 | | encoding_mem_is_ascii(uint8_t const* buffer, size_t buffer_len); |
29 | | |
30 | | bool |
31 | | encoding_mem_is_basic_latin(char16_t const* buffer, size_t buffer_len); |
32 | | |
33 | | bool |
34 | | encoding_mem_is_utf8_latin1(uint8_t const* buffer, size_t buffer_len); |
35 | | |
36 | | bool |
37 | | encoding_mem_is_str_latin1(uint8_t const* buffer, size_t buffer_len); |
38 | | |
39 | | bool |
40 | | encoding_mem_is_utf16_latin1(char16_t const* buffer, size_t buffer_len); |
41 | | |
42 | | size_t |
43 | | encoding_mem_utf16_valid_up_to(char16_t const* buffer, size_t buffer_len); |
44 | | |
45 | | void |
46 | | encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t buffer_len); |
47 | | |
48 | | void |
49 | | encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, |
50 | | size_t src_len, |
51 | | char* dst, |
52 | | size_t dst_len); |
53 | | |
54 | | size_t |
55 | | encoding_mem_convert_utf8_to_latin1_lossy(const char* src, |
56 | | size_t src_len, |
57 | | char* dst, |
58 | | size_t dst_len); |
59 | | |
60 | | void |
61 | | encoding_mem_convert_latin1_to_utf16(const char* src, |
62 | | size_t src_len, |
63 | | char16_t* dst, |
64 | | size_t dst_len); |
65 | | |
66 | | size_t |
67 | | encoding_mem_convert_utf16_to_utf8(const char16_t* src, |
68 | | size_t src_len, |
69 | | char* dst, |
70 | | size_t dst_len); |
71 | | |
72 | | size_t |
73 | | encoding_mem_convert_utf8_to_utf16(const char* src, |
74 | | size_t src_len, |
75 | | char16_t* dst, |
76 | | size_t dst_len); |
77 | | } |
78 | | |
79 | | // From the nsstring crate |
80 | | extern "C" { |
81 | | bool |
82 | | nsstring_fallible_append_utf8_impl(nsAString* aThis, |
83 | | const char* aOther, |
84 | | size_t aOtherLen, |
85 | | size_t aOldLen); |
86 | | |
87 | | bool |
88 | | nsstring_fallible_append_latin1_impl(nsAString* aThis, |
89 | | const char* aOther, |
90 | | size_t aOtherLen, |
91 | | size_t aOldLen, |
92 | | bool aAllowShrinking); |
93 | | |
94 | | bool |
95 | | nscstring_fallible_append_utf16_to_utf8_impl(nsACString* aThis, |
96 | | const char16_t*, |
97 | | size_t aOtherLen, |
98 | | size_t aOldLen); |
99 | | |
100 | | bool |
101 | | nscstring_fallible_append_utf16_to_latin1_lossy_impl(nsACString* aThis, |
102 | | const char16_t*, |
103 | | size_t aOtherLen, |
104 | | size_t aOldLen, |
105 | | bool aAllowShrinking); |
106 | | |
107 | | bool |
108 | | nscstring_fallible_append_utf8_to_latin1_lossy_check(nsACString* aThis, |
109 | | const nsACString* aOther, |
110 | | size_t aOldLen); |
111 | | |
112 | | bool |
113 | | nscstring_fallible_append_latin1_to_utf8_check(nsACString* aThis, |
114 | | const nsACString* aOther, |
115 | | size_t aOldLen); |
116 | | } |
117 | | |
118 | | /** |
119 | | * If all the code points in the input are below U+0100, converts to Latin1, |
120 | | * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If |
121 | | * there are code points above U+00FF, produces garbage in a memory-safe way |
122 | | * and will likely start asserting in future debug builds. The nature of the |
123 | | * garbage depends on the CPU architecture and must not be relied upon. |
124 | | * |
125 | | * The length of aDest must be not be less than the length of aSource. |
126 | | */ |
127 | | inline void |
128 | | LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource, |
129 | | mozilla::Span<char> aDest) |
130 | 0 | { |
131 | 0 | encoding_mem_convert_utf16_to_latin1_lossy( |
132 | 0 | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
133 | 0 | } |
134 | | |
135 | | /** |
136 | | * If all the code points in the input are below U+0100, converts to Latin1, |
137 | | * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If |
138 | | * there are code points above U+00FF, asserts in debug builds and produces |
139 | | * garbage in memory-safe way in release builds. The nature of the garbage |
140 | | * may depend on the CPU architecture and must not be relied upon. |
141 | | * |
142 | | * The length of aDest must be not be less than the length of aSource. |
143 | | */ |
144 | | inline size_t |
145 | | LossyConvertUTF8toLatin1(mozilla::Span<const char> aSource, |
146 | | mozilla::Span<char> aDest) |
147 | | { |
148 | | return encoding_mem_convert_utf8_to_latin1_lossy( |
149 | | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
150 | | } |
151 | | |
152 | | /** |
153 | | * Interprets unsigned byte value as Unicode scalar value (i.e. not |
154 | | * windows-1252!). |
155 | | * |
156 | | * The length of aDest must be not be less than the length of aSource. |
157 | | */ |
158 | | inline void |
159 | | ConvertLatin1toUTF16(mozilla::Span<const char> aSource, |
160 | | mozilla::Span<char16_t> aDest) |
161 | 0 | { |
162 | 0 | encoding_mem_convert_latin1_to_utf16( |
163 | 0 | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
164 | 0 | } |
165 | | |
166 | | /** |
167 | | * Lone surrogates are replaced with the REPLACEMENT CHARACTER. |
168 | | * |
169 | | * The length of aDest must be at least the length of aSource times three |
170 | | * _plus one_. |
171 | | * |
172 | | * Returns the number of code units written. |
173 | | */ |
174 | | inline size_t |
175 | | ConvertUTF16toUTF8(mozilla::Span<const char16_t> aSource, |
176 | | mozilla::Span<char> aDest) |
177 | 0 | { |
178 | 0 | return encoding_mem_convert_utf16_to_utf8( |
179 | 0 | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
180 | 0 | } |
181 | | |
182 | | /** |
183 | | * Malformed byte sequences are replaced with the REPLACEMENT CHARACTER. |
184 | | * |
185 | | * The length of aDest must at least one greater than the length of aSource. |
186 | | * |
187 | | * Returns the number of code units written. |
188 | | */ |
189 | | inline size_t |
190 | | ConvertUTF8toUTF16(mozilla::Span<const char> aSource, |
191 | | mozilla::Span<char16_t> aDest) |
192 | 0 | { |
193 | 0 | return encoding_mem_convert_utf8_to_utf16( |
194 | 0 | aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); |
195 | 0 | } |
196 | | |
197 | | inline size_t |
198 | | Distance(const nsReadingIterator<char16_t>& aStart, |
199 | | const nsReadingIterator<char16_t>& aEnd) |
200 | | { |
201 | | MOZ_ASSERT(aStart.get() <= aEnd.get()); |
202 | | return static_cast<size_t>(aEnd.get() - aStart.get()); |
203 | | } |
204 | | |
205 | | inline size_t |
206 | | Distance(const nsReadingIterator<char>& aStart, |
207 | | const nsReadingIterator<char>& aEnd) |
208 | | { |
209 | | MOZ_ASSERT(aStart.get() <= aEnd.get()); |
210 | | return static_cast<size_t>(aEnd.get() - aStart.get()); |
211 | | } |
212 | | |
213 | | // UTF-8 to UTF-16 |
214 | | // Invalid UTF-8 byte sequences are replaced with the REPLACEMENT CHARACTER. |
215 | | |
216 | | inline MOZ_MUST_USE bool |
217 | | CopyUTF8toUTF16(mozilla::Span<const char> aSource, |
218 | | nsAString& aDest, |
219 | | const mozilla::fallible_t&) |
220 | | { |
221 | | return nsstring_fallible_append_utf8_impl( |
222 | | &aDest, aSource.Elements(), aSource.Length(), 0); |
223 | | } |
224 | | |
225 | | inline void |
226 | | CopyUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
227 | | { |
228 | | if (MOZ_UNLIKELY(!CopyUTF8toUTF16(aSource, aDest, mozilla::fallible))) { |
229 | | aDest.AllocFailed(aSource.Length()); |
230 | | } |
231 | | } |
232 | | |
233 | | inline MOZ_MUST_USE bool |
234 | | AppendUTF8toUTF16(mozilla::Span<const char> aSource, |
235 | | nsAString& aDest, |
236 | | const mozilla::fallible_t&) |
237 | 68.9k | { |
238 | 68.9k | return nsstring_fallible_append_utf8_impl( |
239 | 68.9k | &aDest, aSource.Elements(), aSource.Length(), aDest.Length()); |
240 | 68.9k | } |
241 | | |
242 | | inline void |
243 | | AppendUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
244 | 68.9k | { |
245 | 68.9k | if (MOZ_UNLIKELY(!AppendUTF8toUTF16(aSource, aDest, mozilla::fallible))) { |
246 | 0 | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
247 | 0 | } |
248 | 68.9k | } |
249 | | |
250 | | // Latin1 to UTF-16 |
251 | | // Interpret each incoming unsigned byte value as a Unicode scalar value (not |
252 | | // windows-1252!). The function names say "ASCII" instead of "Latin1" for |
253 | | // legacy reasons. |
254 | | |
255 | | inline MOZ_MUST_USE bool |
256 | | CopyASCIItoUTF16(mozilla::Span<const char> aSource, |
257 | | nsAString& aDest, |
258 | | const mozilla::fallible_t&) |
259 | | { |
260 | | return nsstring_fallible_append_latin1_impl( |
261 | | &aDest, aSource.Elements(), aSource.Length(), 0, true); |
262 | | } |
263 | | |
264 | | inline void |
265 | | CopyASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
266 | | { |
267 | | if (MOZ_UNLIKELY(!CopyASCIItoUTF16(aSource, aDest, mozilla::fallible))) { |
268 | | aDest.AllocFailed(aSource.Length()); |
269 | | } |
270 | | } |
271 | | |
272 | | inline MOZ_MUST_USE bool |
273 | | AppendASCIItoUTF16(mozilla::Span<const char> aSource, |
274 | | nsAString& aDest, |
275 | | const mozilla::fallible_t&) |
276 | | { |
277 | | return nsstring_fallible_append_latin1_impl( |
278 | | &aDest, aSource.Elements(), aSource.Length(), aDest.Length(), false); |
279 | | } |
280 | | |
281 | | inline void |
282 | | AppendASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest) |
283 | | { |
284 | | if (MOZ_UNLIKELY(!AppendASCIItoUTF16(aSource, aDest, mozilla::fallible))) { |
285 | | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
286 | | } |
287 | | } |
288 | | |
289 | | // UTF-16 to UTF-8 |
290 | | // Unpaired surrogates are replaced with the REPLACEMENT CHARACTER. |
291 | | |
292 | | inline MOZ_MUST_USE bool |
293 | | CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource, |
294 | | nsACString& aDest, |
295 | | const mozilla::fallible_t&) |
296 | | { |
297 | | return nscstring_fallible_append_utf16_to_utf8_impl( |
298 | | &aDest, aSource.Elements(), aSource.Length(), 0); |
299 | | } |
300 | | |
301 | | inline void |
302 | | CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest) |
303 | | { |
304 | | if (MOZ_UNLIKELY(!CopyUTF16toUTF8(aSource, aDest, mozilla::fallible))) { |
305 | | aDest.AllocFailed(aSource.Length()); |
306 | | } |
307 | | } |
308 | | |
309 | | inline MOZ_MUST_USE bool |
310 | | AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource, |
311 | | nsACString& aDest, |
312 | | const mozilla::fallible_t&) |
313 | | { |
314 | | return nscstring_fallible_append_utf16_to_utf8_impl( |
315 | | &aDest, aSource.Elements(), aSource.Length(), aDest.Length()); |
316 | | } |
317 | | |
318 | | inline void |
319 | | AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest) |
320 | | { |
321 | | if (MOZ_UNLIKELY(!AppendUTF16toUTF8(aSource, aDest, mozilla::fallible))) { |
322 | | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
323 | | } |
324 | | } |
325 | | |
326 | | // UTF-16 to Latin1 |
327 | | // If all code points in the input are below U+0100, represents each scalar |
328 | | // value as an unsigned byte. (This is not windows-1252!) If there are code |
329 | | // points above U+00FF, memory-safely produces garbage and will likely start |
330 | | // asserting in future debug builds. The nature of the garbage may differ |
331 | | // based on CPU architecture and must not be relied upon. The names say |
332 | | // "ASCII" instead of "Latin1" for legacy reasons. |
333 | | |
334 | | inline MOZ_MUST_USE bool |
335 | | LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource, |
336 | | nsACString& aDest, |
337 | | const mozilla::fallible_t&) |
338 | | { |
339 | | return nscstring_fallible_append_utf16_to_latin1_lossy_impl( |
340 | | &aDest, aSource.Elements(), aSource.Length(), 0, true); |
341 | | } |
342 | | |
343 | | inline void |
344 | | LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource, nsACString& aDest) |
345 | | { |
346 | | if (MOZ_UNLIKELY(!LossyCopyUTF16toASCII(aSource, aDest, mozilla::fallible))) { |
347 | | aDest.AllocFailed(aSource.Length()); |
348 | | } |
349 | | } |
350 | | |
351 | | inline MOZ_MUST_USE bool |
352 | | LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource, |
353 | | nsACString& aDest, |
354 | | const mozilla::fallible_t&) |
355 | 0 | { |
356 | 0 | return nscstring_fallible_append_utf16_to_latin1_lossy_impl( |
357 | 0 | &aDest, aSource.Elements(), aSource.Length(), aDest.Length(), false); |
358 | 0 | } |
359 | | |
360 | | inline void |
361 | | LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource, |
362 | | nsACString& aDest) |
363 | 0 | { |
364 | 0 | if (MOZ_UNLIKELY( |
365 | 0 | !LossyAppendUTF16toASCII(aSource, aDest, mozilla::fallible))) { |
366 | 0 | aDest.AllocFailed(aDest.Length() + aSource.Length()); |
367 | 0 | } |
368 | 0 | } |
369 | | |
370 | | /** |
371 | | * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. |
372 | | * |
373 | | * Allocates and returns a new |char| buffer which you must free with |free|. |
374 | | * Performs a conversion with LossyConvertUTF16toLatin1() writing into the |
375 | | * newly-allocated buffer. |
376 | | * |
377 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
378 | | * contains embedded nulls. |
379 | | * |
380 | | * @param aSource a 16-bit wide string |
381 | | * @return a new |char| buffer you must free with |free|. |
382 | | */ |
383 | | char* ToNewCString(const nsAString& aSource); |
384 | | |
385 | | /** |
386 | | * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. |
387 | | * |
388 | | * Allocates and returns a new |char| buffer which you must free with |free|. |
389 | | * |
390 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
391 | | * contains embedded nulls. |
392 | | * |
393 | | * @param aSource an 8-bit wide string |
394 | | * @return a new |char| buffer you must free with |free|. |
395 | | */ |
396 | | char* ToNewCString(const nsACString& aSource); |
397 | | |
398 | | /** |
399 | | * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. |
400 | | * |
401 | | * Allocates and returns a new |char| buffer which you must free with |
402 | | * |free|. |
403 | | * Performs an encoding conversion from a UTF-16 string to a UTF-8 string with |
404 | | * unpaired surrogates replaced with the REPLACEMENT CHARACTER copying |
405 | | * |aSource| to your new buffer. |
406 | | * |
407 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
408 | | * contains embedded nulls. |
409 | | * |
410 | | * @param aSource a UTF-16 string (made of char16_t's) |
411 | | * @param aUTF8Count the number of 8-bit units that was returned |
412 | | * @return a new |char| buffer you must free with |free|. |
413 | | */ |
414 | | |
415 | | char* ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count = nullptr); |
416 | | |
417 | | |
418 | | /** |
419 | | * Returns a new |char16_t| buffer containing a zero-terminated copy of |
420 | | * |aSource|. |
421 | | * |
422 | | * Allocates and returns a new |char16_t| buffer which you must free with |
423 | | * |free|. |
424 | | * |
425 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
426 | | * contains embedded nulls. |
427 | | * |
428 | | * @param aSource a UTF-16 string |
429 | | * @return a new |char16_t| buffer you must free with |free|. |
430 | | */ |
431 | | char16_t* ToNewUnicode(const nsAString& aSource); |
432 | | |
433 | | |
434 | | /** |
435 | | * Returns a new |char16_t| buffer containing a zero-terminated copy of |
436 | | * |aSource|. |
437 | | * |
438 | | * Allocates and returns a new |char16_t| buffer which you must free with |
439 | | * |free|. |
440 | | * |
441 | | * Performs an encoding conversion by 0-padding 8-bit wide characters up to |
442 | | * 16-bits wide (i.e. Latin1 to UTF-16 conversion) while copying |aSource| |
443 | | * to your new buffer. |
444 | | * |
445 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
446 | | * contains embedded nulls. |
447 | | * |
448 | | * @param aSource a Latin1 string |
449 | | * @return a new |char16_t| buffer you must free with |free|. |
450 | | */ |
451 | | char16_t* ToNewUnicode(const nsACString& aSource); |
452 | | |
453 | | /** |
454 | | * Returns a new |char16_t| buffer containing a zero-terminated copy |
455 | | * of |aSource|. |
456 | | * |
457 | | * Allocates and returns a new |char16_t| buffer which you must free with |
458 | | * |free|. Performs an encoding conversion from UTF-8 to UTF-16 |
459 | | * while copying |aSource| to your new buffer. Malformed byte sequences |
460 | | * are replaced with the REPLACEMENT CHARACTER. |
461 | | * |
462 | | * The new buffer is zero-terminated, but that may not help you if |aSource| |
463 | | * contains embedded nulls. |
464 | | * |
465 | | * @param aSource an 8-bit wide string, UTF-8 encoded |
466 | | * @param aUTF16Count the number of 16-bit units that was returned |
467 | | * @return a new |char16_t| buffer you must free with |free|. |
468 | | * (UTF-16 encoded) |
469 | | */ |
470 | | char16_t* UTF8ToNewUnicode(const nsACString& aSource, |
471 | | uint32_t* aUTF16Count = nullptr); |
472 | | |
473 | | /** |
474 | | * Copies |aLength| 16-bit code units from the start of |aSource| to the |
475 | | * |char16_t| buffer |aDest|. |
476 | | * |
477 | | * After this operation |aDest| is not null terminated. |
478 | | * |
479 | | * @param aSource a UTF-16 string |
480 | | * @param aSrcOffset start offset in the source string |
481 | | * @param aDest a |char16_t| buffer |
482 | | * @param aLength the number of 16-bit code units to copy |
483 | | * @return pointer to destination buffer - identical to |aDest| |
484 | | */ |
485 | | char16_t* CopyUnicodeTo(const nsAString& aSource, |
486 | | uint32_t aSrcOffset, |
487 | | char16_t* aDest, |
488 | | uint32_t aLength); |
489 | | |
490 | | /** |
491 | | * Returns |true| if |aString| contains only ASCII characters, that is, |
492 | | * characters in the range (0x00, 0x7F). |
493 | | * |
494 | | * @param aString a 16-bit wide string to scan |
495 | | */ |
496 | | inline bool |
497 | | IsASCII(mozilla::Span<const char16_t> aString) |
498 | | { |
499 | | size_t length = aString.Length(); |
500 | | const char16_t* ptr = aString.Elements(); |
501 | | // For short strings, calling into Rust is a pessimization, and the SIMD |
502 | | // code won't have a chance to kick in anyway. |
503 | | if (length < 16) { |
504 | | char16_t accu = 0; |
505 | | for (size_t i = 0; i < length; i++) { |
506 | | accu |= ptr[i]; |
507 | | } |
508 | | return accu < 0x80U; |
509 | | } |
510 | | return encoding_mem_is_basic_latin(ptr, length); |
511 | | } |
512 | | |
513 | | /** |
514 | | * Returns |true| if |aString| contains only ASCII characters, that is, |
515 | | * characters in the range (0x00, 0x7F). |
516 | | * |
517 | | * @param aString a 8-bit wide string to scan |
518 | | */ |
519 | | inline bool |
520 | | IsASCII(mozilla::Span<const char> aString) |
521 | | { |
522 | | size_t length = aString.Length(); |
523 | | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
524 | | // For short strings, calling into Rust is a pessimization, and the SIMD |
525 | | // code won't have a chance to kick in anyway. |
526 | | if (length < 16) { |
527 | | uint8_t accu = 0; |
528 | | for (size_t i = 0; i < length; i++) { |
529 | | accu |= ptr[i]; |
530 | | } |
531 | | return accu < 0x80U; |
532 | | } |
533 | | return encoding_mem_is_ascii(ptr, length); |
534 | | } |
535 | | |
536 | | /** |
537 | | * Returns |true| if |aString| contains only Latin1 characters, that is, |
538 | | * characters in the range (U+0000, U+00FF). |
539 | | * |
540 | | * @param aString a potentially-invalid UTF-16 string to scan |
541 | | */ |
542 | | inline bool |
543 | | IsUTF16Latin1(mozilla::Span<const char16_t> aString) |
544 | | { |
545 | | size_t length = aString.Length(); |
546 | | const char16_t* ptr = aString.Elements(); |
547 | | // For short strings, calling into Rust is a pessimization, and the SIMD |
548 | | // code won't have a chance to kick in anyway. |
549 | | if (length < 16) { |
550 | | char16_t accu = 0; |
551 | | for (size_t i = 0; i < length; i++) { |
552 | | accu |= ptr[i]; |
553 | | } |
554 | | return accu < 0x100U; |
555 | | } |
556 | | return encoding_mem_is_utf16_latin1(ptr, length); |
557 | | } |
558 | | |
559 | | /** |
560 | | * Returns |true| if |aString| contains only Latin1 characters, that is, |
561 | | * characters in the range (U+0000, U+00FF). |
562 | | * |
563 | | * If you know that the argument is always absolutely guaranteed to be valid |
564 | | * UTF-8, use the faster UnsafeIsValidUTF8Latin1() instead. |
565 | | * |
566 | | * @param aString potentially-invalid UTF-8 string to scan |
567 | | */ |
568 | | inline bool |
569 | | IsUTF8Latin1(mozilla::Span<const char> aString) |
570 | | { |
571 | | size_t length = aString.Length(); |
572 | | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
573 | | // For short strings, calling into Rust is a pessimization, and the SIMD |
574 | | // code won't have a chance to kick in anyway. |
575 | | if (length < 16) { |
576 | | for (size_t i = 0; i < length; i++) { |
577 | | if (ptr[i] >= 0x80U) { |
578 | | ptr += i; |
579 | | length -= i; |
580 | | // This loop can't handle non-ASCII, but the Rust code can, so |
581 | | // upon seeing non-ASCII, break the loop and let the Rust code |
582 | | // handle the rest of the buffer (including the non-ASCII byte). |
583 | | goto end; |
584 | | } |
585 | | } |
586 | | return true; |
587 | | } |
588 | | end: |
589 | | return encoding_mem_is_utf8_latin1(ptr, length); |
590 | | } |
591 | | |
592 | | /** |
593 | | * Returns |true| if |aString| contains only Latin1 characters, that is, |
594 | | * characters in the range (U+0000, U+00FF). |
595 | | * |
596 | | * The argument MUST be valid UTF-8. If you are at all unsure, use IsUTF8Latin1 |
597 | | * instead! |
598 | | * |
599 | | * @param aString known-valid UTF-8 string to scan |
600 | | */ |
601 | | inline bool |
602 | | UnsafeIsValidUTF8Latin1(mozilla::Span<const char> aString) |
603 | | { |
604 | | size_t length = aString.Length(); |
605 | | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
606 | | // For short strings, calling into Rust is a pessimization, and the SIMD |
607 | | // code won't have a chance to kick in anyway. |
608 | | if (length < 16) { |
609 | | for (size_t i = 0; i < length; i++) { |
610 | | if (ptr[i] >= 0x80U) { |
611 | | ptr += i; |
612 | | length -= i; |
613 | | goto end; |
614 | | } |
615 | | } |
616 | | return true; |
617 | | } |
618 | | end: |
619 | | return encoding_mem_is_str_latin1(ptr, length); |
620 | | } |
621 | | |
622 | | /** |
623 | | * Returns |true| if |aString| is a valid UTF-8 string. |
624 | | * |
625 | | * Note that this doesn't check whether the string might look like a valid |
626 | | * string in another encoding, too, e.g. ISO-2022-JP. |
627 | | * |
628 | | * @param aString an 8-bit wide string to scan |
629 | | */ |
630 | | inline bool |
631 | | IsUTF8(mozilla::Span<const char> aString) |
632 | | { |
633 | | size_t length = aString.Length(); |
634 | | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); |
635 | | // For short strings, calling into Rust is a pessimization, and the SIMD |
636 | | // code won't have a chance to kick in anyway. |
637 | | if (length < 16) { |
638 | | for (size_t i = 0; i < length; i++) { |
639 | | if (ptr[i] >= 0x80U) { |
640 | | ptr += i; |
641 | | length -= i; |
642 | | goto end; |
643 | | } |
644 | | } |
645 | | return true; |
646 | | } |
647 | | end: |
648 | | return length == encoding_utf8_valid_up_to(ptr, length); |
649 | | } |
650 | | |
651 | | /** |
652 | | * Returns the index of the first unpaired surrogate or |
653 | | * the length of the string if there are none. |
654 | | */ |
655 | | inline uint32_t |
656 | | UTF16ValidUpTo(mozilla::Span<const char16_t> aString) |
657 | | { |
658 | | return encoding_mem_utf16_valid_up_to(aString.Elements(), aString.Length()); |
659 | | } |
660 | | |
661 | | /** |
662 | | * Replaces unpaired surrogates with U+FFFD in the argument. |
663 | | */ |
664 | | inline void |
665 | | EnsureUTF16ValiditySpan(mozilla::Span<char16_t> aString) |
666 | | { |
667 | | encoding_mem_ensure_utf16_validity(aString.Elements(), aString.Length()); |
668 | | } |
669 | | |
670 | | /** |
671 | | * Replaces unpaired surrogates with U+FFFD in the argument. |
672 | | * |
673 | | * Copies a shared string buffer or an otherwise read-only |
674 | | * buffer only if there are unpaired surrogates. |
675 | | */ |
676 | | inline void |
677 | | EnsureUTF16Validity(nsAString& aString) |
678 | | { |
679 | | uint32_t upTo = UTF16ValidUpTo(aString); |
680 | | uint32_t len = aString.Length(); |
681 | | if (upTo == len) { |
682 | | return; |
683 | | } |
684 | | char16_t* ptr = aString.BeginWriting(); |
685 | | auto span = mozilla::MakeSpan(ptr, len); |
686 | | span[upTo] = 0xFFFD; |
687 | | EnsureUTF16ValiditySpan(span.From(upTo + 1)); |
688 | | } |
689 | | |
690 | | bool ParseString(const nsACString& aAstring, char aDelimiter, |
691 | | nsTArray<nsCString>& aArray); |
692 | | |
693 | | /** |
694 | | * Converts case in place in the argument string. |
695 | | */ |
696 | | void ToUpperCase(nsACString&); |
697 | | |
698 | | void ToLowerCase(nsACString&); |
699 | | |
700 | | void ToUpperCase(nsACString&); |
701 | | |
702 | | void ToLowerCase(nsACString&); |
703 | | |
704 | | /** |
705 | | * Converts case from string aSource to aDest. |
706 | | */ |
707 | | void ToUpperCase(const nsACString& aSource, nsACString& aDest); |
708 | | |
709 | | void ToLowerCase(const nsACString& aSource, nsACString& aDest); |
710 | | |
711 | | /** |
712 | | * Finds the leftmost occurrence of |aPattern|, if any in the range |
713 | | * |aSearchStart|..|aSearchEnd|. |
714 | | * |
715 | | * Returns |true| if a match was found, and adjusts |aSearchStart| and |
716 | | * |aSearchEnd| to point to the match. If no match was found, returns |false| |
717 | | * and makes |aSearchStart == aSearchEnd|. |
718 | | * |
719 | | * Currently, this is equivalent to the O(m*n) implementation previously on |
720 | | * |ns[C]String|. |
721 | | * |
722 | | * If we need something faster, then we can implement that later. |
723 | | */ |
724 | | |
725 | | bool FindInReadable(const nsAString& aPattern, nsAString::const_iterator&, |
726 | | nsAString::const_iterator&, |
727 | | const nsStringComparator& = nsDefaultStringComparator()); |
728 | | bool FindInReadable(const nsACString& aPattern, nsACString::const_iterator&, |
729 | | nsACString::const_iterator&, |
730 | | const nsCStringComparator& = nsDefaultCStringComparator()); |
731 | | |
732 | | /* sometimes we don't care about where the string was, just that we |
733 | | * found it or not */ |
734 | | inline bool |
735 | | FindInReadable(const nsAString& aPattern, const nsAString& aSource, |
736 | | const nsStringComparator& aCompare = nsDefaultStringComparator()) |
737 | | { |
738 | | nsAString::const_iterator start, end; |
739 | | aSource.BeginReading(start); |
740 | | aSource.EndReading(end); |
741 | | return FindInReadable(aPattern, start, end, aCompare); |
742 | | } |
743 | | |
744 | | inline bool |
745 | | FindInReadable(const nsACString& aPattern, const nsACString& aSource, |
746 | | const nsCStringComparator& aCompare = nsDefaultCStringComparator()) |
747 | | { |
748 | | nsACString::const_iterator start, end; |
749 | | aSource.BeginReading(start); |
750 | | aSource.EndReading(end); |
751 | | return FindInReadable(aPattern, start, end, aCompare); |
752 | | } |
753 | | |
754 | | |
755 | | bool CaseInsensitiveFindInReadable(const nsACString& aPattern, |
756 | | nsACString::const_iterator&, |
757 | | nsACString::const_iterator&); |
758 | | |
759 | | /** |
760 | | * Finds the rightmost occurrence of |aPattern| |
761 | | * Returns |true| if a match was found, and adjusts |aSearchStart| and |
762 | | * |aSearchEnd| to point to the match. If no match was found, returns |false| |
763 | | * and makes |aSearchStart == aSearchEnd|. |
764 | | */ |
765 | | bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&, |
766 | | nsAString::const_iterator&, |
767 | | const nsStringComparator& = nsDefaultStringComparator()); |
768 | | bool RFindInReadable(const nsACString& aPattern, nsACString::const_iterator&, |
769 | | nsACString::const_iterator&, |
770 | | const nsCStringComparator& = nsDefaultCStringComparator()); |
771 | | |
772 | | /** |
773 | | * Finds the leftmost occurrence of |aChar|, if any in the range |
774 | | * |aSearchStart|..|aSearchEnd|. |
775 | | * |
776 | | * Returns |true| if a match was found, and adjusts |aSearchStart| to |
777 | | * point to the match. If no match was found, returns |false| and |
778 | | * makes |aSearchStart == aSearchEnd|. |
779 | | */ |
780 | | bool FindCharInReadable(char16_t aChar, nsAString::const_iterator& aSearchStart, |
781 | | const nsAString::const_iterator& aSearchEnd); |
782 | | bool FindCharInReadable(char aChar, nsACString::const_iterator& aSearchStart, |
783 | | const nsACString::const_iterator& aSearchEnd); |
784 | | |
785 | | bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring); |
786 | | bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring, |
787 | | const nsStringComparator& aComparator); |
788 | | bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring); |
789 | | bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring, |
790 | | const nsCStringComparator& aComparator); |
791 | | bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring); |
792 | | bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring, |
793 | | const nsStringComparator& aComparator); |
794 | | bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring); |
795 | | bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring, |
796 | | const nsCStringComparator& aComparator); |
797 | | |
798 | | const nsString& EmptyString(); |
799 | | const nsCString& EmptyCString(); |
800 | | |
801 | | const nsString& VoidString(); |
802 | | const nsCString& VoidCString(); |
803 | | |
804 | | /** |
805 | | * Compare a UTF-8 string to an UTF-16 string. |
806 | | * |
807 | | * Returns 0 if the strings are equal, -1 if aUTF8String is less |
808 | | * than aUTF16String, and 1 in the reverse case. Errors are replaced |
809 | | * with U+FFFD and then the U+FFFD is compared as if it had occurred |
810 | | * in the input. If aErr is not nullptr, *aErr is set to true if |
811 | | * either string had malformed sequences. |
812 | | */ |
813 | | int32_t |
814 | | CompareUTF8toUTF16(const nsACString& aUTF8String, |
815 | | const nsAString& aUTF16String, |
816 | | bool* aErr = nullptr); |
817 | | |
818 | | void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest); |
819 | | |
820 | | #endif // !defined(nsReadableUtils_h___) |