Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/unicharutil/util/nsUnicharUtils.cpp
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* This Source Code Form is subject to the terms of the Mozilla Public
3
 * License, v. 2.0. If a copy of the MPL was not distributed with this
4
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6
#include "nsUnicharUtils.h"
7
#include "nsUTF8Utils.h"
8
#include "nsUnicodeProperties.h"
9
#include "mozilla/Likely.h"
10
#include "mozilla/HashFunctions.h"
11
12
// We map x -> x, except for upper-case letters,
13
// which we map to their lower-case equivalents.
14
// Lookup table indexed by an ASCII code point (0x00-0x7f). Entries for
// 'A'-'Z' (0x41-0x5a) hold the matching lower-case letter (0x61-0x7a); every
// other entry holds its own index.
static const uint8_t gASCIIToLower [128] = {
    // 0x00 - 0x1f
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 0x20 - 0x3f
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    // 0x40 - 0x5f: '@', then 'A'-'Z' folded to 'a'-'z', then '[' .. '_'
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    // 0x60 - 0x7f
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
};
24
25
// We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
26
// when they're called from within the case-insensitive comparators, so we
27
// define inlined versions.
28
// Lower-cases one code point. ASCII values are resolved with a lookup in
// gASCIIToLower; everything else is delegated to
// mozilla::unicode::GetLowercase.
static MOZ_ALWAYS_INLINE uint32_t
ToLowerCase_inline(uint32_t aChar)
{
  if (!IS_ASCII(aChar)) {
    return mozilla::unicode::GetLowercase(aChar);
  }

  // ASCII fast path: a single table lookup.
  return gASCIIToLower[aChar];
}
37
38
// Lower-cases aChar only when it is ASCII (via gASCIIToLower); any other
// value is returned unchanged.
static MOZ_ALWAYS_INLINE uint32_t
ToLowerCaseASCII_inline(const uint32_t aChar)
{
  return IS_ASCII(aChar) ? gASCIIToLower[aChar] : aChar;
}
47
48
void
49
ToLowerCase(nsAString& aString)
50
233k
{
51
233k
  char16_t *buf = aString.BeginWriting();
52
233k
  ToLowerCase(buf, buf, aString.Length());
53
233k
}
54
55
void
56
ToLowerCaseASCII(nsAString& aString)
57
0
{
58
0
  char16_t *buf = aString.BeginWriting();
59
0
  ToLowerCaseASCII(buf, buf, aString.Length());
60
0
}
61
62
// Returns the lower-case form of aChar when it is in 'A'..'Z'; any other
// value is returned unchanged.
char
ToLowerCaseASCII(char aChar)
{
  const bool isUpper = 'A' <= aChar && aChar <= 'Z';
  return isUpper ? aChar + 0x20 : aChar;
}
70
71
// Returns the lower-case form of aChar when it is in 'A'..'Z'; any other
// UTF-16 code unit is returned unchanged.
char16_t
ToLowerCaseASCII(char16_t aChar)
{
  if (aChar < 'A' || aChar > 'Z') {
    return aChar;
  }
  return aChar + 0x20;
}
79
80
// Returns the lower-case form of aChar when it is in 'A'..'Z'; any other
// value is returned unchanged.
char32_t
ToLowerCaseASCII(char32_t aChar)
{
  const bool isUpper = aChar >= 'A' && aChar <= 'Z';
  return isUpper ? aChar + 0x20 : aChar;
}
88
89
// Returns the upper-case form of aChar when it is in 'a'..'z'; any other
// value is returned unchanged.
char
ToUpperCaseASCII(char aChar)
{
  if (aChar < 'a' || aChar > 'z') {
    return aChar;
  }
  return aChar - 0x20;
}
97
98
// Returns the upper-case form of aChar when it is in 'a'..'z'; any other
// UTF-16 code unit is returned unchanged.
char16_t
ToUpperCaseASCII(char16_t aChar)
{
  const bool isLower = 'a' <= aChar && aChar <= 'z';
  return isLower ? aChar - 0x20 : aChar;
}
106
107
// Returns the upper-case form of aChar when it is in 'a'..'z'; any other
// value is returned unchanged.
char32_t
ToUpperCaseASCII(char32_t aChar)
{
  if (aChar < 'a' || aChar > 'z') {
    return aChar;
  }
  return aChar - 0x20;
}
115
116
void
117
ToLowerCase(const nsAString& aSource,
118
            nsAString& aDest)
119
176k
{
120
176k
  const char16_t *in = aSource.BeginReading();
121
176k
  uint32_t len = aSource.Length();
122
176k
123
176k
  aDest.SetLength(len);
124
176k
  char16_t *out = aDest.BeginWriting();
125
176k
126
176k
  ToLowerCase(in, out, len);
127
176k
}
128
129
void
130
ToLowerCaseASCII(const nsAString& aSource,
131
                 nsAString& aDest)
132
0
{
133
0
  const char16_t *in = aSource.BeginReading();
134
0
  uint32_t len = aSource.Length();
135
0
136
0
  aDest.SetLength(len);
137
0
  char16_t *out = aDest.BeginWriting();
138
0
139
0
  ToLowerCaseASCII(in, out, len);
140
0
}
141
142
uint32_t
143
ToLowerCaseASCII(const uint32_t aChar)
144
0
{
145
0
  return ToLowerCaseASCII_inline(aChar);
146
0
}
147
148
void
149
ToUpperCase(nsAString& aString)
150
0
{
151
0
  char16_t *buf = aString.BeginWriting();
152
0
  ToUpperCase(buf, buf, aString.Length());
153
0
}
154
155
void
156
ToUpperCase(const nsAString& aSource,
157
            nsAString& aDest)
158
0
{
159
0
  const char16_t *in = aSource.BeginReading();
160
0
  uint32_t len = aSource.Length();
161
0
162
0
  aDest.SetLength(len);
163
0
  char16_t *out = aDest.BeginWriting();
164
0
165
0
  ToUpperCase(in, out, len);
166
0
}
167
168
#ifdef MOZILLA_INTERNAL_API
169
170
int32_t
171
nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
172
                                              const char16_t* rhs,
173
                                              uint32_t lLength,
174
                                              uint32_t rLength) const
175
0
{
176
0
  return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
177
0
         (lLength > rLength) ? 1 : -1;
178
0
}
179
180
int32_t
181
nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
182
                                                  const char* rhs,
183
                                                  uint32_t lLength,
184
                                                  uint32_t rLength) const
185
0
{
186
0
  return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
187
0
}
188
189
int32_t
190
nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
191
                                                   const char16_t* rhs,
192
                                                   uint32_t lLength,
193
                                                   uint32_t rLength) const
194
22.9k
{
195
22.9k
  if (lLength != rLength) {
196
0
    if (lLength > rLength)
197
0
      return 1;
198
0
    return -1;
199
0
  }
200
22.9k
201
35.7k
  while (rLength) {
202
34.4k
    // we don't care about surrogates here, because we're only
203
34.4k
    // lowercasing the ASCII range
204
34.4k
    char16_t l = *lhs++;
205
34.4k
    char16_t r = *rhs++;
206
34.4k
    if (l != r) {
207
21.6k
      l = ToLowerCaseASCII_inline(l);
208
21.6k
      r = ToLowerCaseASCII_inline(r);
209
21.6k
210
21.6k
      if (l > r)
211
20.8k
        return 1;
212
746
      else if (r > l)
213
745
        return -1;
214
12.8k
    }
215
12.8k
    rLength--;
216
12.8k
  }
217
22.9k
218
22.9k
  return 0;
219
22.9k
}
220
221
#endif // MOZILLA_INTERNAL_API
222
223
uint32_t
224
ToLowerCase(uint32_t aChar)
225
8.96M
{
226
8.96M
  return ToLowerCase_inline(aChar);
227
8.96M
}
228
229
void
230
ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
231
410k
{
232
1.17M
  for (uint32_t i = 0; i < aLen; i++) {
233
763k
    uint32_t ch = aIn[i];
234
763k
    if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
235
763k
        NS_IS_LOW_SURROGATE(aIn[i + 1])) {
236
0
      ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
237
0
      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
238
0
      aOut[i++] = H_SURROGATE(ch);
239
0
      aOut[i] = L_SURROGATE(ch);
240
0
      continue;
241
0
    }
242
763k
    aOut[i] = ToLowerCase(ch);
243
763k
  }
244
410k
}
245
246
void
247
ToLowerCaseASCII(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
248
0
{
249
0
  for (uint32_t i = 0; i < aLen; i++) {
250
0
    char16_t ch = aIn[i];
251
0
    aOut[i] = IS_ASCII_UPPER(ch) ? (ch + 0x20) : ch;
252
0
  }
253
0
}
254
255
uint32_t
256
ToUpperCase(uint32_t aChar)
257
8.20M
{
258
8.20M
  if (IS_ASCII(aChar)) {
259
8.18M
    if (IS_ASCII_LOWER(aChar)) {
260
63.5k
      return aChar - 0x20;
261
63.5k
    }
262
8.12M
    return aChar;
263
8.12M
  }
264
18.2k
265
18.2k
  return mozilla::unicode::GetUppercase(aChar);
266
18.2k
}
267
268
void
269
ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
270
0
{
271
0
  for (uint32_t i = 0; i < aLen; i++) {
272
0
    uint32_t ch = aIn[i];
273
0
    if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
274
0
        NS_IS_LOW_SURROGATE(aIn[i + 1])) {
275
0
      ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
276
0
      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
277
0
      aOut[i++] = H_SURROGATE(ch);
278
0
      aOut[i] = L_SURROGATE(ch);
279
0
      continue;
280
0
    }
281
0
    aOut[i] = ToUpperCase(ch);
282
0
  }
283
0
}
284
285
uint32_t
286
ToTitleCase(uint32_t aChar)
287
0
{
288
0
  if (IS_ASCII(aChar)) {
289
0
    return ToUpperCase(aChar);
290
0
  }
291
0
292
0
  return mozilla::unicode::GetTitlecaseForLower(aChar);
293
0
}
294
295
int32_t
296
CaseInsensitiveCompare(const char16_t *a,
297
                       const char16_t *b,
298
                       uint32_t len)
299
0
{
300
0
  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
301
0
302
0
  if (len) {
303
0
    do {
304
0
      uint32_t c1 = *a++;
305
0
      uint32_t c2 = *b++;
306
0
307
0
      // Unfortunately, we need to check for surrogates BEFORE we check
308
0
      // for equality, because we could have identical high surrogates
309
0
      // but non-identical characters, so we can't just skip them
310
0
311
0
      // If c1 isn't a surrogate, we don't bother to check c2;
312
0
      // in the case where it _is_ a surrogate, we're definitely going to get
313
0
      // a mismatch, and don't need to interpret and lowercase it
314
0
315
0
      if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
316
0
        c1 = SURROGATE_TO_UCS4(c1, *a++);
317
0
        if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
318
0
          c2 = SURROGATE_TO_UCS4(c2, *b++);
319
0
        }
320
0
        // If c2 wasn't a surrogate, decrementing len means we'd stop
321
0
        // short of the end of string b, but that doesn't actually matter
322
0
        // because we're going to find a mismatch and return early
323
0
        --len;
324
0
      }
325
0
326
0
      if (c1 != c2) {
327
0
        c1 = ToLowerCase_inline(c1);
328
0
        c2 = ToLowerCase_inline(c2);
329
0
        if (c1 != c2) {
330
0
          if (c1 < c2) {
331
0
            return -1;
332
0
          }
333
0
          return 1;
334
0
        }
335
0
      }
336
0
    } while (--len != 0);
337
0
  }
338
0
  return 0;
339
0
}
340
341
// Inlined definition of GetLowerUTF8Codepoint, which we use because we want
342
// to be fast when called from the case-insensitive comparators.
343
// Decodes the UTF-8 sequence starting at aStr, lower-cases the resulting
// code point and returns it, storing the address just past the sequence in
// *aNext. The lead byte selects the sequence length; multi-byte sequences
// are accepted only when all of their bytes lie before aEnd. Returns
// uint32_t(-1), leaving *aNext untouched, when no branch matches.
static MOZ_ALWAYS_INLINE uint32_t
GetLowerUTF8Codepoint_inline(const char* aStr,
                             const char* aEnd,
                             const char **aNext)
{
  // View the input as unsigned bytes so that widening never sign-extends.
  const unsigned char* const bytes =
    reinterpret_cast<const unsigned char*>(aStr);
  const unsigned char lead = bytes[0];

  if (UTF8traits::isASCII(lead)) {
    // Single byte: lower-case through the ASCII table.
    *aNext = aStr + 1;
    return gASCIIToLower[lead];
  }

  if (UTF8traits::is2byte(lead) && MOZ_LIKELY(aStr + 1 < aEnd)) {
    // 110XXXXX 10XXXXXX: always inside the BMP, so 16 bits suffice.
    const uint16_t decoded = ((lead & 0x1F) << 6) + (bytes[1] & 0x3F);

    // Known to be non-ASCII, so skip ToLowerCase's ASCII fast path.
    const uint16_t lowered = mozilla::unicode::GetLowercase(decoded);

    *aNext = aStr + 2;
    return lowered;
  }

  if (UTF8traits::is3byte(lead) && MOZ_LIKELY(aStr + 2 < aEnd)) {
    // 1110XXXX 10XXXXXX 10XXXXXX: exactly fills 16 bits.
    const uint16_t decoded = ((lead & 0x0F) << 12) +
                             ((bytes[1] & 0x3F) << 6) +
                             (bytes[2] & 0x3F);

    const uint16_t lowered = mozilla::unicode::GetLowercase(decoded);

    *aNext = aStr + 3;
    return lowered;
  }

  if (UTF8traits::is4byte(lead) && MOZ_LIKELY(aStr + 3 < aEnd)) {
    // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX: needs the full 32-bit type.
    const uint32_t decoded = ((lead & 0x07) << 18) +
                             ((bytes[1] & 0x3F) << 12) +
                             ((bytes[2] & 0x3F) << 6) +
                             (bytes[3] & 0x3F);

    const uint32_t lowered = mozilla::unicode::GetLowercase(decoded);

    *aNext = aStr + 4;
    return lowered;
  }

  // Unrecognised lead byte, or the sequence does not fit before aEnd.
  return uint32_t(-1);
}
408
409
uint32_t
410
0
GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext) {
411
0
  return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
412
0
}
413
414
int32_t CaseInsensitiveCompare(const char *aLeft,
415
                               const char *aRight,
416
                               uint32_t aLeftBytes,
417
                               uint32_t aRightBytes)
418
0
{
419
0
  const char *leftEnd = aLeft + aLeftBytes;
420
0
  const char *rightEnd = aRight + aRightBytes;
421
0
422
0
  while (aLeft < leftEnd && aRight < rightEnd) {
423
0
    uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft);
424
0
    if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
425
0
      return -1;
426
0
427
0
    uint32_t rightChar = GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight);
428
0
    if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
429
0
      return -1;
430
0
431
0
    // Now leftChar and rightChar are lower-case, so we can compare them.
432
0
    if (leftChar != rightChar) {
433
0
      if (leftChar > rightChar)
434
0
        return 1;
435
0
      return -1;
436
0
    }
437
0
  }
438
0
439
0
  // Make sure that if one string is longer than the other we return the
440
0
  // correct result.
441
0
  if (aLeft < leftEnd)
442
0
    return 1;
443
0
  if (aRight < rightEnd)
444
0
    return -1;
445
0
446
0
  return 0;
447
0
}
448
449
bool
450
CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
451
                              const char* aLeftEnd, const char* aRightEnd,
452
                              const char** aLeftNext, const char** aRightNext,
453
                              bool* aErr)
454
0
{
455
0
  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
456
0
  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
457
0
  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
458
0
  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
459
0
  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
460
0
461
0
  uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext);
462
0
  if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
463
0
    *aErr = true;
464
0
    return false;
465
0
  }
466
0
467
0
  uint32_t rightChar = GetLowerUTF8Codepoint_inline(aRight, aRightEnd, aRightNext);
468
0
  if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
469
0
    *aErr = true;
470
0
    return false;
471
0
  }
472
0
473
0
  // Can't have an error past this point.
474
0
  *aErr = false;
475
0
476
0
  return leftChar == rightChar;
477
0
}
478
479
namespace mozilla {
480
481
uint32_t
482
HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
483
11.7k
{
484
11.7k
  uint32_t hash = 0;
485
11.7k
  const char* s = aUTF8;
486
11.7k
  const char* end = aUTF8 + aLength;
487
11.7k
488
11.7k
  *aErr = false;
489
11.7k
490
117k
  while (s < end)
491
105k
  {
492
105k
    uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
493
105k
    if (*aErr) {
494
0
      return 0;
495
0
    }
496
105k
497
105k
    if (ucs4 < PLANE1_BASE) {
498
105k
      hash = AddToHash(hash, ucs4);
499
105k
    }
500
0
    else {
501
0
      hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
502
0
    }
503
105k
  }
504
11.7k
505
11.7k
  return hash;
506
11.7k
}
507
508
bool
509
IsSegmentBreakSkipChar(uint32_t u)
510
0
{
511
0
  return unicode::IsEastAsianWidthFWH(u) &&
512
0
         unicode::GetScriptCode(u) != unicode::Script::HANGUL;
513
0
}
514
515
} // namespace mozilla