Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/xpcom/string/nsReadableUtils.h
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
3
/* This Source Code Form is subject to the terms of the Mozilla Public
4
 * License, v. 2.0. If a copy of the MPL was not distributed with this
5
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
// IWYU pragma: private, include "nsString.h"
7
8
#ifndef nsReadableUtils_h___
9
#define nsReadableUtils_h___
10
11
/**
12
 * I guess the routines in this file are all mis-named.
13
 * According to our conventions, they should be |NS_xxx|.
14
 */
15
16
#include "mozilla/Assertions.h"
17
#include "nsAString.h"
18
19
#include "nsTArrayForwardDeclare.h"
20
21
// mozilla/Encoding.h can't be included here, so the functions this header
// needs are declared by hand. Their implementations are in the encoding_rs
// and encoding_glue crates.
extern "C" {
  // --- Checks over 8-bit buffers -------------------------------------------
  size_t encoding_utf8_valid_up_to(const uint8_t* buffer, size_t buffer_len);

  bool encoding_mem_is_ascii(const uint8_t* buffer, size_t buffer_len);

  bool encoding_mem_is_utf8_latin1(const uint8_t* buffer, size_t buffer_len);

  bool encoding_mem_is_str_latin1(const uint8_t* buffer, size_t buffer_len);

  // --- Checks over 16-bit buffers ------------------------------------------
  bool encoding_mem_is_basic_latin(const char16_t* buffer, size_t buffer_len);

  bool encoding_mem_is_utf16_latin1(const char16_t* buffer, size_t buffer_len);

  size_t encoding_mem_utf16_valid_up_to(const char16_t* buffer,
                                        size_t buffer_len);

  // --- In-place repair -----------------------------------------------------
  void encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t buffer_len);

  // --- Buffer-to-buffer conversions ----------------------------------------
  void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src,
                                                  size_t src_len,
                                                  char* dst,
                                                  size_t dst_len);

  size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src,
                                                   size_t src_len,
                                                   char* dst,
                                                   size_t dst_len);

  void encoding_mem_convert_latin1_to_utf16(const char* src,
                                            size_t src_len,
                                            char16_t* dst,
                                            size_t dst_len);

  size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src,
                                            size_t src_len,
                                            char* dst,
                                            size_t dst_len);

  size_t encoding_mem_convert_utf8_to_utf16(const char* src,
                                            size_t src_len,
                                            char16_t* dst,
                                            size_t dst_len);
}
78
79
// From the nsstring crate
80
extern "C" {
81
  bool
82
  nsstring_fallible_append_utf8_impl(nsAString* aThis,
83
                                     const char* aOther,
84
                                     size_t aOtherLen,
85
                                     size_t aOldLen);
86
87
  bool
88
  nsstring_fallible_append_latin1_impl(nsAString* aThis,
89
                                       const char* aOther,
90
                                       size_t aOtherLen,
91
                                       size_t aOldLen,
92
                                       bool aAllowShrinking);
93
94
  bool
95
  nscstring_fallible_append_utf16_to_utf8_impl(nsACString* aThis,
96
                                               const char16_t*,
97
                                               size_t aOtherLen,
98
                                               size_t aOldLen);
99
100
  bool
101
  nscstring_fallible_append_utf16_to_latin1_lossy_impl(nsACString* aThis,
102
                                                       const char16_t*,
103
                                                       size_t aOtherLen,
104
                                                       size_t aOldLen,
105
                                                       bool aAllowShrinking);
106
107
  bool
108
  nscstring_fallible_append_utf8_to_latin1_lossy_check(nsACString* aThis,
109
                                                       const nsACString* aOther,
110
                                                       size_t aOldLen);
111
112
  bool
113
  nscstring_fallible_append_latin1_to_utf8_check(nsACString* aThis,
114
                                                 const nsACString* aOther,
115
                                                 size_t aOldLen);
116
}
117
118
/**
119
 * If all the code points in the input are below U+0100, converts to Latin1,
120
 * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If
121
 * there are code points above U+00FF, produces garbage in a memory-safe way
122
 * and will likely start asserting in future debug builds. The nature of the
123
 * garbage depends on the CPU architecture and must not be relied upon.
124
 *
125
 * The length of aDest must be not be less than the length of aSource.
126
 */
127
inline void
128
LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource,
129
                          mozilla::Span<char> aDest)
130
0
{
131
0
  encoding_mem_convert_utf16_to_latin1_lossy(
132
0
    aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
133
0
}
134
135
/**
136
 * If all the code points in the input are below U+0100, converts to Latin1,
137
 * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If
138
 * there are code points above U+00FF, asserts in debug builds and produces
139
 * garbage in memory-safe way in release builds. The nature of the garbage
140
 * may depend on the CPU architecture and must not be relied upon.
141
 *
142
 * The length of aDest must be not be less than the length of aSource.
143
 */
144
inline size_t
145
LossyConvertUTF8toLatin1(mozilla::Span<const char> aSource,
146
                         mozilla::Span<char> aDest)
147
{
148
  return encoding_mem_convert_utf8_to_latin1_lossy(
149
    aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
150
}
151
152
/**
153
 * Interprets unsigned byte value as Unicode scalar value (i.e. not
154
 * windows-1252!).
155
 *
156
 * The length of aDest must be not be less than the length of aSource.
157
 */
158
inline void
159
ConvertLatin1toUTF16(mozilla::Span<const char> aSource,
160
                     mozilla::Span<char16_t> aDest)
161
0
{
162
0
  encoding_mem_convert_latin1_to_utf16(
163
0
    aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
164
0
}
165
166
/**
167
 * Lone surrogates are replaced with the REPLACEMENT CHARACTER.
168
 *
169
 * The length of aDest must be at least the length of aSource times three
170
 * _plus one_.
171
 *
172
 * Returns the number of code units written.
173
 */
174
inline size_t
175
ConvertUTF16toUTF8(mozilla::Span<const char16_t> aSource,
176
                   mozilla::Span<char> aDest)
177
0
{
178
0
  return encoding_mem_convert_utf16_to_utf8(
179
0
    aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
180
0
}
181
182
/**
183
 * Malformed byte sequences are replaced with the REPLACEMENT CHARACTER.
184
 *
185
 * The length of aDest must at least one greater than the length of aSource.
186
 *
187
 * Returns the number of code units written.
188
 */
189
inline size_t
190
ConvertUTF8toUTF16(mozilla::Span<const char> aSource,
191
                   mozilla::Span<char16_t> aDest)
192
0
{
193
0
  return encoding_mem_convert_utf8_to_utf16(
194
0
    aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
195
0
}
196
197
inline size_t
198
Distance(const nsReadingIterator<char16_t>& aStart,
199
         const nsReadingIterator<char16_t>& aEnd)
200
{
201
  MOZ_ASSERT(aStart.get() <= aEnd.get());
202
  return static_cast<size_t>(aEnd.get() - aStart.get());
203
}
204
205
inline size_t
206
Distance(const nsReadingIterator<char>& aStart,
207
         const nsReadingIterator<char>& aEnd)
208
{
209
  MOZ_ASSERT(aStart.get() <= aEnd.get());
210
  return static_cast<size_t>(aEnd.get() - aStart.get());
211
}
212
213
// UTF-8 to UTF-16
214
// Invalid UTF-8 byte sequences are replaced with the REPLACEMENT CHARACTER.
215
216
/**
 * Fallible UTF-8 to UTF-16 copy of aSource into aDest.
 *
 * Returns the result of nsstring_fallible_append_utf8_impl(); the
 * infallible overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
CopyUTF8toUTF16(mozilla::Span<const char> aSource,
                nsAString& aDest,
                const mozilla::fallible_t&)
{
  // "Copy" is expressed as an append with an old length of zero.
  const size_t oldLength = 0;
  return nsstring_fallible_append_utf8_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength);
}
224
225
inline void
226
CopyUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
227
{
228
  if (MOZ_UNLIKELY(!CopyUTF8toUTF16(aSource, aDest, mozilla::fallible))) {
229
    aDest.AllocFailed(aSource.Length());
230
  }
231
}
232
233
/**
 * Fallible append of UTF-8 aSource, converted to UTF-16, onto aDest.
 *
 * Returns the result of nsstring_fallible_append_utf8_impl(); the
 * infallible overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
AppendUTF8toUTF16(mozilla::Span<const char> aSource,
                  nsAString& aDest,
                  const mozilla::fallible_t&)
{
  // The current length of aDest is handed over as the old length.
  const size_t oldLength = aDest.Length();
  return nsstring_fallible_append_utf8_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength);
}
241
242
inline void
243
AppendUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
244
68.9k
{
245
68.9k
  if (MOZ_UNLIKELY(!AppendUTF8toUTF16(aSource, aDest, mozilla::fallible))) {
246
0
    aDest.AllocFailed(aDest.Length() + aSource.Length());
247
0
  }
248
68.9k
}
249
250
// Latin1 to UTF-16
251
// Interpret each incoming unsigned byte value as a Unicode scalar value (not
252
// windows-1252!). The function names say "ASCII" instead of "Latin1" for
253
// legacy reasons.
254
255
/**
 * Fallible Latin1 to UTF-16 copy of aSource into aDest. (The name says
 * "ASCII" for legacy reasons.)
 *
 * Returns the result of nsstring_fallible_append_latin1_impl(); the
 * infallible overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
CopyASCIItoUTF16(mozilla::Span<const char> aSource,
                 nsAString& aDest,
                 const mozilla::fallible_t&)
{
  // "Copy" is expressed as an append with an old length of zero; the copy
  // flavor passes |true| for aAllowShrinking.
  const size_t oldLength = 0;
  const bool allowShrinking = true;
  return nsstring_fallible_append_latin1_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength, allowShrinking);
}
263
264
inline void
265
CopyASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
266
{
267
  if (MOZ_UNLIKELY(!CopyASCIItoUTF16(aSource, aDest, mozilla::fallible))) {
268
    aDest.AllocFailed(aSource.Length());
269
  }
270
}
271
272
/**
 * Fallible append of Latin1 aSource, converted to UTF-16, onto aDest. (The
 * name says "ASCII" for legacy reasons.)
 *
 * Returns the result of nsstring_fallible_append_latin1_impl(); the
 * infallible overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
AppendASCIItoUTF16(mozilla::Span<const char> aSource,
                   nsAString& aDest,
                   const mozilla::fallible_t&)
{
  // The current length of aDest is handed over as the old length; the
  // append flavor passes |false| for aAllowShrinking.
  const size_t oldLength = aDest.Length();
  const bool allowShrinking = false;
  return nsstring_fallible_append_latin1_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength, allowShrinking);
}
280
281
inline void
282
AppendASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
283
{
284
  if (MOZ_UNLIKELY(!AppendASCIItoUTF16(aSource, aDest, mozilla::fallible))) {
285
    aDest.AllocFailed(aDest.Length() + aSource.Length());
286
  }
287
}
288
289
// UTF-16 to UTF-8
290
// Unpaired surrogates are replaced with the REPLACEMENT CHARACTER.
291
292
/**
 * Fallible UTF-16 to UTF-8 copy of aSource into aDest.
 *
 * Returns the result of nscstring_fallible_append_utf16_to_utf8_impl(); the
 * infallible overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource,
                nsACString& aDest,
                const mozilla::fallible_t&)
{
  // "Copy" is expressed as an append with an old length of zero.
  const size_t oldLength = 0;
  return nscstring_fallible_append_utf16_to_utf8_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength);
}
300
301
inline void
302
CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest)
303
{
304
  if (MOZ_UNLIKELY(!CopyUTF16toUTF8(aSource, aDest, mozilla::fallible))) {
305
    aDest.AllocFailed(aSource.Length());
306
  }
307
}
308
309
/**
 * Fallible append of UTF-16 aSource, converted to UTF-8, onto aDest.
 *
 * Returns the result of nscstring_fallible_append_utf16_to_utf8_impl(); the
 * infallible overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource,
                  nsACString& aDest,
                  const mozilla::fallible_t&)
{
  // The current length of aDest is handed over as the old length.
  const size_t oldLength = aDest.Length();
  return nscstring_fallible_append_utf16_to_utf8_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength);
}
317
318
inline void
319
AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest)
320
{
321
  if (MOZ_UNLIKELY(!AppendUTF16toUTF8(aSource, aDest, mozilla::fallible))) {
322
    aDest.AllocFailed(aDest.Length() + aSource.Length());
323
  }
324
}
325
326
// UTF-16 to Latin1
327
// If all code points in the input are below U+0100, represents each scalar
328
// value as an unsigned byte. (This is not windows-1252!) If there are code
329
// points above U+00FF, memory-safely produces garbage and will likely start
330
// asserting in future debug builds. The nature of the garbage may differ
331
// based on CPU architecture and must not be relied upon. The names say
332
// "ASCII" instead of "Latin1" for legacy reasons.
333
334
/**
 * Fallible lossy UTF-16 to Latin1 copy of aSource into aDest. (The name
 * says "ASCII" for legacy reasons.)
 *
 * Returns the result of
 * nscstring_fallible_append_utf16_to_latin1_lossy_impl(); the infallible
 * overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource,
                      nsACString& aDest,
                      const mozilla::fallible_t&)
{
  // "Copy" is expressed as an append with an old length of zero; the copy
  // flavor passes |true| for aAllowShrinking.
  const size_t oldLength = 0;
  const bool allowShrinking = true;
  return nscstring_fallible_append_utf16_to_latin1_lossy_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength, allowShrinking);
}
342
343
inline void
344
LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource, nsACString& aDest)
345
{
346
  if (MOZ_UNLIKELY(!LossyCopyUTF16toASCII(aSource, aDest, mozilla::fallible))) {
347
    aDest.AllocFailed(aSource.Length());
348
  }
349
}
350
351
/**
 * Fallible lossy append of UTF-16 aSource, converted to Latin1, onto aDest.
 * (The name says "ASCII" for legacy reasons.)
 *
 * Returns the result of
 * nscstring_fallible_append_utf16_to_latin1_lossy_impl(); the infallible
 * overload treats |false| as an allocation failure.
 */
inline MOZ_MUST_USE bool
LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource,
                        nsACString& aDest,
                        const mozilla::fallible_t&)
{
  // The current length of aDest is handed over as the old length; the
  // append flavor passes |false| for aAllowShrinking.
  const size_t oldLength = aDest.Length();
  const bool allowShrinking = false;
  return nscstring_fallible_append_utf16_to_latin1_lossy_impl(
    &aDest, aSource.Elements(), aSource.Length(), oldLength, allowShrinking);
}
359
360
inline void
361
LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource,
362
                        nsACString& aDest)
363
0
{
364
0
  if (MOZ_UNLIKELY(
365
0
        !LossyAppendUTF16toASCII(aSource, aDest, mozilla::fallible))) {
366
0
    aDest.AllocFailed(aDest.Length() + aSource.Length());
367
0
  }
368
0
}
369
370
/**
371
 * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
372
 *
373
 * Allocates and returns a new |char| buffer which you must free with |free|.
374
 * Performs a conversion with LossyConvertUTF16toLatin1() writing into the
375
 * newly-allocated buffer.
376
 *
377
 * The new buffer is zero-terminated, but that may not help you if |aSource|
378
 * contains embedded nulls.
379
 *
380
 * @param aSource a 16-bit wide string
381
 * @return a new |char| buffer you must free with |free|.
382
 */
383
char* ToNewCString(const nsAString& aSource);
384
385
/**
386
 * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
387
 *
388
 * Allocates and returns a new |char| buffer which you must free with |free|.
389
 *
390
 * The new buffer is zero-terminated, but that may not help you if |aSource|
391
 * contains embedded nulls.
392
 *
393
 * @param aSource an 8-bit wide string
394
 * @return a new |char| buffer you must free with |free|.
395
 */
396
char* ToNewCString(const nsACString& aSource);
397
398
/**
399
 * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
400
 *
401
 * Allocates and returns a new |char| buffer which you must free with
402
 * |free|.
403
 * Performs an encoding conversion from a UTF-16 string to a UTF-8 string with
404
 * unpaired surrogates replaced with the REPLACEMENT CHARACTER copying
405
 * |aSource| to your new buffer.
406
 *
407
 * The new buffer is zero-terminated, but that may not help you if |aSource|
408
 * contains embedded nulls.
409
 *
410
 * @param aSource a UTF-16 string (made of char16_t's)
411
 * @param aUTF8Count the number of 8-bit units that was returned
412
 * @return a new |char| buffer you must free with |free|.
413
 */
414
415
char* ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count = nullptr);
416
417
418
/**
419
 * Returns a new |char16_t| buffer containing a zero-terminated copy of
420
 * |aSource|.
421
 *
422
 * Allocates and returns a new |char16_t| buffer which you must free with
423
 * |free|.
424
 *
425
 * The new buffer is zero-terminated, but that may not help you if |aSource|
426
 * contains embedded nulls.
427
 *
428
 * @param aSource a UTF-16 string
429
 * @return a new |char16_t| buffer you must free with |free|.
430
 */
431
char16_t* ToNewUnicode(const nsAString& aSource);
432
433
434
/**
435
 * Returns a new |char16_t| buffer containing a zero-terminated copy of
436
 * |aSource|.
437
 *
438
 * Allocates and returns a new |char16_t| buffer which you must free with
439
 * |free|.
440
 *
441
 * Performs an encoding conversion by 0-padding 8-bit wide characters up to
442
 * 16-bits wide (i.e. Latin1 to UTF-16 conversion) while copying |aSource|
443
 * to your new buffer.
444
 *
445
 * The new buffer is zero-terminated, but that may not help you if |aSource|
446
 * contains embedded nulls.
447
 *
448
 * @param aSource a Latin1 string
449
 * @return a new |char16_t| buffer you must free with |free|.
450
 */
451
char16_t* ToNewUnicode(const nsACString& aSource);
452
453
/**
454
 * Returns a new |char16_t| buffer containing a zero-terminated copy
455
 * of |aSource|.
456
 *
457
 * Allocates and returns a new |char| buffer which you must free with
458
 * |free|.  Performs an encoding conversion from UTF-8 to UTF-16
459
 * while copying |aSource| to your new buffer.  Malformed byte sequences
460
 * are replaced with the REPLACEMENT CHARACTER.
461
 *
462
 * The new buffer is zero-terminated, but that may not help you if |aSource|
463
 * contains embedded nulls.
464
 *
465
 * @param aSource an 8-bit wide string, UTF-8 encoded
466
 * @param aUTF16Count the number of 16-bit units that was returned
467
 * @return a new |char16_t| buffer you must free with |free|.
468
 *         (UTF-16 encoded)
469
 */
470
char16_t* UTF8ToNewUnicode(const nsACString& aSource,
471
                           uint32_t* aUTF16Count = nullptr);
472
473
/**
474
 * Copies |aLength| 16-bit code units from the start of |aSource| to the
475
 * |char16_t| buffer |aDest|.
476
 *
477
 * After this operation |aDest| is not null terminated.
478
 *
479
 * @param aSource a UTF-16 string
480
 * @param aSrcOffset start offset in the source string
481
 * @param aDest a |char16_t| buffer
482
 * @param aLength the number of 16-bit code units to copy
483
 * @return pointer to destination buffer - identical to |aDest|
484
 */
485
char16_t* CopyUnicodeTo(const nsAString& aSource,
486
                        uint32_t aSrcOffset,
487
                        char16_t* aDest,
488
                        uint32_t aLength);
489
490
/**
491
 * Returns |true| if |aString| contains only ASCII characters, that is,
492
 * characters in the range (0x00, 0x7F).
493
 *
494
 * @param aString a 16-bit wide string to scan
495
 */
496
inline bool
497
IsASCII(mozilla::Span<const char16_t> aString)
498
{
499
  size_t length = aString.Length();
500
  const char16_t* ptr = aString.Elements();
501
  // For short strings, calling into Rust is a pessimization, and the SIMD
502
  // code won't have a chance to kick in anyway.
503
  if (length < 16) {
504
    char16_t accu = 0;
505
    for (size_t i = 0; i < length; i++) {
506
      accu |= ptr[i];
507
    }
508
    return accu < 0x80U;
509
  }
510
  return encoding_mem_is_basic_latin(ptr, length);
511
}
512
513
/**
514
 * Returns |true| if |aString| contains only ASCII characters, that is,
515
 * characters in the range (0x00, 0x7F).
516
 *
517
 * @param aString a 8-bit wide string to scan
518
 */
519
inline bool
520
IsASCII(mozilla::Span<const char> aString)
521
{
522
  size_t length = aString.Length();
523
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
524
  // For short strings, calling into Rust is a pessimization, and the SIMD
525
  // code won't have a chance to kick in anyway.
526
  if (length < 16) {
527
    uint8_t accu = 0;
528
    for (size_t i = 0; i < length; i++) {
529
      accu |= ptr[i];
530
    }
531
    return accu < 0x80U;
532
  }
533
  return encoding_mem_is_ascii(ptr, length);
534
}
535
536
/**
537
 * Returns |true| if |aString| contains only Latin1 characters, that is,
538
 * characters in the range (U+0000, U+00FF).
539
 *
540
 * @param aString a potentially-invalid UTF-16 string to scan
541
 */
542
inline bool
543
IsUTF16Latin1(mozilla::Span<const char16_t> aString)
544
{
545
  size_t length = aString.Length();
546
  const char16_t* ptr = aString.Elements();
547
  // For short strings, calling into Rust is a pessimization, and the SIMD
548
  // code won't have a chance to kick in anyway.
549
  if (length < 16) {
550
    char16_t accu = 0;
551
    for (size_t i = 0; i < length; i++) {
552
      accu |= ptr[i];
553
    }
554
    return accu < 0x100U;
555
  }
556
  return encoding_mem_is_utf16_latin1(ptr, length);
557
}
558
559
/**
560
 * Returns |true| if |aString| contains only Latin1 characters, that is,
561
 * characters in the range (U+0000, U+00FF).
562
 *
563
 * If you know that the argument is always absolutely guaranteed to be valid
564
 * UTF-8, use the faster UnsafeIsValidUTF8Latin1() instead.
565
 *
566
 * @param aString potentially-invalid UTF-8 string to scan
567
 */
568
inline bool
569
IsUTF8Latin1(mozilla::Span<const char> aString)
570
{
571
  size_t length = aString.Length();
572
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
573
  // For short strings, calling into Rust is a pessimization, and the SIMD
574
  // code won't have a chance to kick in anyway.
575
  if (length < 16) {
576
    for (size_t i = 0; i < length; i++) {
577
      if (ptr[i] >= 0x80U) {
578
        ptr += i;
579
        length -= i;
580
        // This loop can't handle non-ASCII, but the Rust code can, so
581
        // upon seeing non-ASCII, break the loop and let the Rust code
582
        // handle the rest of the buffer (including the non-ASCII byte).
583
        goto end;
584
      }
585
    }
586
    return true;
587
  }
588
end:
589
  return encoding_mem_is_utf8_latin1(ptr, length);
590
}
591
592
/**
593
 * Returns |true| if |aString| contains only Latin1 characters, that is,
594
 * characters in the range (U+0000, U+00FF).
595
 *
596
 * The argument MUST be valid UTF-8. If you are at all unsure, use IsUTF8Latin1
597
 * instead!
598
 *
599
 * @param aString known-valid UTF-8 string to scan
600
 */
601
inline bool
602
UnsafeIsValidUTF8Latin1(mozilla::Span<const char> aString)
603
{
604
  size_t length = aString.Length();
605
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
606
  // For short strings, calling into Rust is a pessimization, and the SIMD
607
  // code won't have a chance to kick in anyway.
608
  if (length < 16) {
609
    for (size_t i = 0; i < length; i++) {
610
      if (ptr[i] >= 0x80U) {
611
        ptr += i;
612
        length -= i;
613
        goto end;
614
      }
615
    }
616
    return true;
617
  }
618
end:
619
  return encoding_mem_is_str_latin1(ptr, length);
620
}
621
622
/**
623
 * Returns |true| if |aString| is a valid UTF-8 string.
624
 *
625
 * Note that this doesn't check whether the string might look like a valid
626
 * string in another encoding, too, e.g. ISO-2022-JP.
627
 *
628
 * @param aString an 8-bit wide string to scan
629
 */
630
inline bool
631
IsUTF8(mozilla::Span<const char> aString)
632
{
633
  size_t length = aString.Length();
634
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
635
  // For short strings, calling into Rust is a pessimization, and the SIMD
636
  // code won't have a chance to kick in anyway.
637
  if (length < 16) {
638
    for (size_t i = 0; i < length; i++) {
639
      if (ptr[i] >= 0x80U) {
640
        ptr += i;
641
        length -= i;
642
        goto end;
643
      }
644
    }
645
    return true;
646
  }
647
  end:
648
  return length == encoding_utf8_valid_up_to(ptr, length);
649
}
650
651
/**
652
 * Returns the index of the first unpaired surrogate or
653
 * the length of the string if there are none.
654
 */
655
inline uint32_t
656
UTF16ValidUpTo(mozilla::Span<const char16_t> aString)
657
{
658
  return encoding_mem_utf16_valid_up_to(aString.Elements(), aString.Length());
659
}
660
661
/**
662
 * Replaces unpaired surrogates with U+FFFD in the argument.
663
 */
664
inline void
665
EnsureUTF16ValiditySpan(mozilla::Span<char16_t> aString)
666
{
667
  encoding_mem_ensure_utf16_validity(aString.Elements(), aString.Length());
668
}
669
670
/**
671
 * Replaces unpaired surrogates with U+FFFD in the argument.
672
 *
673
 * Copies a shared string buffer or an otherwise read-only
674
 * buffer only if there are unpaired surrogates.
675
 */
676
inline void
677
EnsureUTF16Validity(nsAString& aString)
678
{
679
  uint32_t upTo = UTF16ValidUpTo(aString);
680
  uint32_t len = aString.Length();
681
  if (upTo == len) {
682
    return;
683
  }
684
  char16_t* ptr = aString.BeginWriting();
685
  auto span = mozilla::MakeSpan(ptr, len);
686
  span[upTo] = 0xFFFD;
687
  EnsureUTF16ValiditySpan(span.From(upTo + 1));
688
}
689
690
bool ParseString(const nsACString& aAstring, char aDelimiter,
691
                 nsTArray<nsCString>& aArray);
692
693
/**
694
 * Converts case in place in the argument string.
695
 */
696
void ToUpperCase(nsACString&);
697
698
void ToLowerCase(nsACString&);
699
700
void ToUpperCase(nsACString&);
701
702
void ToLowerCase(nsACString&);
703
704
/**
705
 * Converts case from string aSource to aDest.
706
 */
707
void ToUpperCase(const nsACString& aSource, nsACString& aDest);
708
709
void ToLowerCase(const nsACString& aSource, nsACString& aDest);
710
711
/**
712
 * Finds the leftmost occurrence of |aPattern|, if any in the range
713
 * |aSearchStart|..|aSearchEnd|.
714
 *
715
 * Returns |true| if a match was found, and adjusts |aSearchStart| and
716
 * |aSearchEnd| to point to the match.  If no match was found, returns |false|
717
 * and makes |aSearchStart == aSearchEnd|.
718
 *
719
 * Currently, this is equivalent to the O(m*n) implementation previously on
720
 * |ns[C]String|.
721
 *
722
 * If we need something faster, then we can implement that later.
723
 */
724
725
bool FindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
726
                    nsAString::const_iterator&,
727
                    const nsStringComparator& = nsDefaultStringComparator());
728
bool FindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
729
                    nsACString::const_iterator&,
730
                    const nsCStringComparator& = nsDefaultCStringComparator());
731
732
/* sometimes we don't care about where the string was, just that we
733
 * found it or not */
734
inline bool
735
FindInReadable(const nsAString& aPattern, const nsAString& aSource,
736
               const nsStringComparator& aCompare = nsDefaultStringComparator())
737
{
738
  nsAString::const_iterator start, end;
739
  aSource.BeginReading(start);
740
  aSource.EndReading(end);
741
  return FindInReadable(aPattern, start, end, aCompare);
742
}
743
744
inline bool
745
FindInReadable(const nsACString& aPattern, const nsACString& aSource,
746
               const nsCStringComparator& aCompare = nsDefaultCStringComparator())
747
{
748
  nsACString::const_iterator start, end;
749
  aSource.BeginReading(start);
750
  aSource.EndReading(end);
751
  return FindInReadable(aPattern, start, end, aCompare);
752
}
753
754
755
bool CaseInsensitiveFindInReadable(const nsACString& aPattern,
756
                                   nsACString::const_iterator&,
757
                                   nsACString::const_iterator&);
758
759
/**
760
 * Finds the rightmost occurrence of |aPattern|
761
 * Returns |true| if a match was found, and adjusts |aSearchStart| and
762
 * |aSearchEnd| to point to the match.  If no match was found, returns |false|
763
 * and makes |aSearchStart == aSearchEnd|.
764
 */
765
bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
766
                     nsAString::const_iterator&,
767
                     const nsStringComparator& = nsDefaultStringComparator());
768
bool RFindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
769
                     nsACString::const_iterator&,
770
                     const nsCStringComparator& = nsDefaultCStringComparator());
771
772
/**
773
* Finds the leftmost occurrence of |aChar|, if any in the range
774
* |aSearchStart|..|aSearchEnd|.
775
*
776
* Returns |true| if a match was found, and adjusts |aSearchStart| to
777
* point to the match.  If no match was found, returns |false| and
778
* makes |aSearchStart == aSearchEnd|.
779
*/
780
bool FindCharInReadable(char16_t aChar, nsAString::const_iterator& aSearchStart,
781
                        const nsAString::const_iterator& aSearchEnd);
782
bool FindCharInReadable(char aChar, nsACString::const_iterator& aSearchStart,
783
                        const nsACString::const_iterator& aSearchEnd);
784
785
bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring);
786
bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring,
787
                      const nsStringComparator& aComparator);
788
bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring);
789
bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring,
790
                      const nsCStringComparator& aComparator);
791
bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring);
792
bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring,
793
                    const nsStringComparator& aComparator);
794
bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring);
795
bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring,
796
                    const nsCStringComparator& aComparator);
797
798
const nsString& EmptyString();
799
const nsCString& EmptyCString();
800
801
const nsString& VoidString();
802
const nsCString& VoidCString();
803
804
/**
805
 * Compare a UTF-8 string to an UTF-16 string.
806
 *
807
 * Returns 0 if the strings are equal, -1 if aUTF8String is less
808
 * than aUTF16Count, and 1 in the reverse case. Errors are replaced
809
 * with U+FFFD and then the U+FFFD is compared as if it had occurred
810
 * in the input. If aErr is not nullptr, *aErr is set to true if
811
 * either string had malformed sequences.
812
 */
813
int32_t
814
CompareUTF8toUTF16(const nsACString& aUTF8String,
815
                   const nsAString& aUTF16String,
816
                   bool* aErr = nullptr);
817
818
// Declared here, defined elsewhere. Presumably appends the single UCS-4
// value |aSource| to |aDest| encoded as UTF-16 -- NOTE(review): confirm
// against the definition, including how out-of-range values are handled.
void
AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest);
819
820
#endif // !defined(nsReadableUtils_h___)