/src/mozilla-central/intl/unicharutil/util/nsUnicharUtils.cpp

Source (jump to first uncovered line)
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnicharUtils.h"
#include "nsUTF8Utils.h"
#include "nsUnicodeProperties.h"
#include "mozilla/Likely.h"
#include "mozilla/HashFunctions.h"

// Lower-casing lookup table for the ASCII range, indexed by code point.
// We map x -> x, except for upper-case letters,
// which we map to their lower-case equivalents.
// Each row below covers 16 consecutive code points; the only non-identity
// entries are indices 0x41-0x5a ('A'-'Z'), which hold 0x61-0x7a ('a'-'z').
// The table has exactly 128 entries, so it must only be indexed with values
// below 0x80.
static const uint8_t gASCIIToLower [128] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
};

// We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
// when they're called from within the case-insensitive comparators, so we
// define inlined versions.
//
// Lower-cases a single code point: ASCII values are answered straight from
// gASCIIToLower, everything else is delegated to
// mozilla::unicode::GetLowercase.
static MOZ_ALWAYS_INLINE uint32_t
ToLowerCase_inline(uint32_t aChar)
{
  return IS_ASCII(aChar) ? gASCIIToLower[aChar]
                         : mozilla::unicode::GetLowercase(aChar);
}

// Inlined ASCII-only lower-casing of a single code point.  Values outside
// the ASCII range are returned unchanged; ASCII values are looked up in
// gASCIIToLower.
static MOZ_ALWAYS_INLINE uint32_t
ToLowerCaseASCII_inline(const uint32_t aChar)
{
  if (!(IS_ASCII(aChar))) {
    // Not ASCII: this variant never touches such values.
    return aChar;
  }

  return gASCIIToLower[aChar];
}

// Lower-cases aString in place: the string's own writable buffer is passed
// as both source and destination of the buffer-based ToLowerCase overload.
void
ToLowerCase(nsAString& aString)
{
  char16_t* const chars = aString.BeginWriting();
  const uint32_t count = aString.Length();
  ToLowerCase(chars, chars, count);
}

// ASCII-only in-place lower-casing of aString: the string's writable buffer
// is used as both input and output of the buffer-based ToLowerCaseASCII
// overload.
void
ToLowerCaseASCII(nsAString& aString)
{
  char16_t* const chars = aString.BeginWriting();
  const uint32_t count = aString.Length();
  ToLowerCaseASCII(chars, chars, count);
}

// Returns the lower-case form of aChar when it is one of 'A'..'Z'; every
// other value is returned unchanged.
char
ToLowerCaseASCII(char aChar)
{
  const bool isUpper = ('A' <= aChar) && (aChar <= 'Z');
  // 0x20 is the distance between an upper-case ASCII letter and its
  // lower-case counterpart.
  return isUpper ? static_cast<char>(aChar + 0x20) : aChar;
}

// Returns the lower-case form of a UTF-16 code unit when it is one of
// 'A'..'Z'; every other code unit is returned unchanged.
char16_t
ToLowerCaseASCII(char16_t aChar)
{
  if (aChar < 'A' || aChar > 'Z') {
    // Not an upper-case ASCII letter: nothing to do.
    return aChar;
  }
  return aChar + 0x20;
}

// Returns the lower-case form of a UTF-32 value when it is one of 'A'..'Z';
// every other value is returned unchanged.
char32_t
ToLowerCaseASCII(char32_t aChar)
{
  const bool isUpper = (aChar >= 'A') && (aChar <= 'Z');
  // Upper-case ASCII letters sit 0x20 below their lower-case forms.
  const char32_t shift = isUpper ? 0x20 : 0;
  return aChar + shift;
}

// Returns the upper-case form of aChar when it is one of 'a'..'z'; every
// other value is returned unchanged.
char
ToUpperCaseASCII(char aChar)
{
  if (aChar < 'a' || aChar > 'z') {
    // Not a lower-case ASCII letter: nothing to do.
    return aChar;
  }
  return aChar - 0x20;
}

// Returns the upper-case form of a UTF-16 code unit when it is one of
// 'a'..'z'; every other code unit is returned unchanged.
char16_t
ToUpperCaseASCII(char16_t aChar)
{
  const bool isLower = ('a' <= aChar) && (aChar <= 'z');
  // Lower-case ASCII letters sit 0x20 above their upper-case forms.
  return isLower ? static_cast<char16_t>(aChar - 0x20) : aChar;
}

// Returns the upper-case form of a UTF-32 value when it is one of 'a'..'z';
// every other value is returned unchanged.
char32_t
ToUpperCaseASCII(char32_t aChar)
{
  const bool isLower = (aChar >= 'a') && (aChar <= 'z');
  // Lower-case ASCII letters sit 0x20 above their upper-case forms.
  const char32_t shift = isLower ? 0x20 : 0;
  return aChar - shift;
}

// Writes a lower-cased copy of aSource into aDest.  aDest is resized to
// aSource's length and then filled by the buffer-based ToLowerCase overload.
void
ToLowerCase(const nsAString& aSource,
            nsAString& aDest)
{
  const uint32_t count = aSource.Length();
  const char16_t* const src = aSource.BeginReading();

  // The destination has to be sized before its writable buffer is requested.
  aDest.SetLength(count);
  char16_t* const dst = aDest.BeginWriting();

  ToLowerCase(src, dst, count);
}

// Writes a copy of aSource into aDest with ASCII upper-case letters
// lower-cased.  aDest is resized to aSource's length and then filled by the
// buffer-based ToLowerCaseASCII overload.
void
ToLowerCaseASCII(const nsAString& aSource,
                 nsAString& aDest)
{
  const uint32_t count = aSource.Length();
  const char16_t* const src = aSource.BeginReading();

  // The destination has to be sized before its writable buffer is requested.
  aDest.SetLength(count);
  char16_t* const dst = aDest.BeginWriting();

  ToLowerCaseASCII(src, dst, count);
}

// Out-of-line entry point for ASCII-only lower-casing of one code point;
// all of the work is done by ToLowerCaseASCII_inline.
uint32_t
ToLowerCaseASCII(const uint32_t aChar)
{
  const uint32_t lowered = ToLowerCaseASCII_inline(aChar);
  return lowered;
}

// Upper-cases aString in place: the string's own writable buffer is passed
// as both source and destination of the buffer-based ToUpperCase overload.
void
ToUpperCase(nsAString& aString)
{
  char16_t* const chars = aString.BeginWriting();
  const uint32_t count = aString.Length();
  ToUpperCase(chars, chars, count);
}

// Writes an upper-cased copy of aSource into aDest.  aDest is resized to
// aSource's length and then filled by the buffer-based ToUpperCase overload.
void
ToUpperCase(const nsAString& aSource,
            nsAString& aDest)
{
  const uint32_t count = aSource.Length();
  const char16_t* const src = aSource.BeginReading();

  // The destination has to be sized before its writable buffer is requested.
  aDest.SetLength(count);
  char16_t* const dst = aDest.BeginWriting();

  ToUpperCase(src, dst, count);
}

#ifdef MOZILLA_INTERNAL_API

// Case-insensitive three-way comparison of two UTF-16 buffers.  Buffers of
// different length are ordered purely by length (the longer one compares
// greater) without looking at their contents; equal-length buffers are handed
// to CaseInsensitiveCompare.
int32_t
nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
                                              const char16_t* rhs,
                                              uint32_t lLength,
                                              uint32_t rLength) const
{
  if (lLength != rLength) {
    return (lLength > rLength) ? 1 : -1;
  }

  return CaseInsensitiveCompare(lhs, rhs, lLength);
}

// Case-insensitive three-way comparison of two UTF-8 byte ranges; simply
// forwards both pointers and both byte lengths to the UTF-8 overload of
// CaseInsensitiveCompare.
int32_t
nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
                                                  const char* rhs,
                                                  uint32_t lLength,
                                                  uint32_t rLength) const
{
  const int32_t order = CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
  return order;
}

// Three-way comparison of two UTF-16 buffers that ignores case for ASCII
// letters only.  Buffers of different length are ordered by length alone;
// otherwise the first pair of code units that still differs after ASCII
// lower-casing decides the result.  Returns 1, -1 or 0.
int32_t
nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
                                                   const char16_t* rhs,
                                                   uint32_t lLength,
                                                   uint32_t rLength) const
{
  if (lLength > rLength) {
    return 1;
  }
  if (lLength < rLength) {
    return -1;
  }

  // Surrogates need no special treatment here: only the ASCII range is ever
  // lower-cased, so comparing code unit by code unit is sufficient.
  for (uint32_t remaining = rLength; remaining != 0; --remaining) {
    char16_t left = *lhs++;
    char16_t right = *rhs++;
    if (left == right) {
      continue;
    }

    left = ToLowerCaseASCII_inline(left);
    right = ToLowerCaseASCII_inline(right);
    if (left != right) {
      return (left > right) ? 1 : -1;
    }
  }

  return 0;
}

#endif // MOZILLA_INTERNAL_API

// Out-of-line entry point for lower-casing one code point; all of the work
// is done by ToLowerCase_inline.
uint32_t
ToLowerCase(uint32_t aChar)
{
  const uint32_t lowered = ToLowerCase_inline(aChar);
  return lowered;
}

// Lower-cases aLen UTF-16 code units from aIn into aOut.  The in-place
// callers in this file pass the same buffer for both.  A high surrogate that
// is immediately followed by a low surrogate is combined into one code point,
// lower-cased via mozilla::unicode::GetLowercase and written back as a
// surrogate pair; every other code unit goes through ToLowerCase(uint32_t)
// on its own.
void
ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
{
  uint32_t pos = 0;
  while (pos < aLen) {
    const uint32_t unit = aIn[pos];
    const bool isPair = NS_IS_HIGH_SURROGATE(unit) && pos < aLen - 1 &&
                        NS_IS_LOW_SURROGATE(aIn[pos + 1]);
    if (!isPair) {
      aOut[pos] = ToLowerCase(unit);
      ++pos;
      continue;
    }

    // Both input units are read before either output unit is written, so
    // the conversion stays correct when aIn and aOut are the same buffer.
    const uint32_t lowered =
      mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(unit, aIn[pos + 1]));
    NS_ASSERTION(!IS_IN_BMP(lowered), "case mapping crossed BMP/SMP boundary!");
    aOut[pos] = H_SURROGATE(lowered);
    aOut[pos + 1] = L_SURROGATE(lowered);
    pos += 2;
  }
}

// Copies aLen UTF-16 code units from aIn to aOut, adding 0x20 to those that
// IS_ASCII_UPPER accepts and leaving all others unchanged.  Units are
// processed front to back, one read followed by one write.
void
ToLowerCaseASCII(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
{
  const char16_t* const stop = aIn + aLen;
  for (const char16_t* src = aIn; src != stop; ++src, ++aOut) {
    const char16_t unit = *src;
    if (IS_ASCII_UPPER(unit)) {
      *aOut = unit + 0x20;
    } else {
      *aOut = unit;
    }
  }
}

// Upper-cases a single code point.  ASCII lower-case letters are shifted
// down by 0x20, other ASCII values are returned as they are, and everything
// outside ASCII is delegated to mozilla::unicode::GetUppercase.
uint32_t
ToUpperCase(uint32_t aChar)
{
  if (!(IS_ASCII(aChar))) {
    return mozilla::unicode::GetUppercase(aChar);
  }

  return IS_ASCII_LOWER(aChar) ? aChar - 0x20 : aChar;
}

// Upper-cases aLen UTF-16 code units from aIn into aOut.  The in-place
// callers in this file pass the same buffer for both.  A high surrogate that
// is immediately followed by a low surrogate is combined into one code point,
// upper-cased via mozilla::unicode::GetUppercase and written back as a
// surrogate pair; every other code unit goes through ToUpperCase(uint32_t)
// on its own.
void
ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
{
  uint32_t pos = 0;
  while (pos < aLen) {
    const uint32_t unit = aIn[pos];
    const bool isPair = NS_IS_HIGH_SURROGATE(unit) && pos < aLen - 1 &&
                        NS_IS_LOW_SURROGATE(aIn[pos + 1]);
    if (!isPair) {
      aOut[pos] = ToUpperCase(unit);
      ++pos;
      continue;
    }

    // Both input units are read before either output unit is written, so
    // the conversion stays correct when aIn and aOut are the same buffer.
    const uint32_t raised =
      mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(unit, aIn[pos + 1]));
    NS_ASSERTION(!IS_IN_BMP(raised), "case mapping crossed BMP/SMP boundary!");
    aOut[pos] = H_SURROGATE(raised);
    aOut[pos + 1] = L_SURROGATE(raised);
    pos += 2;
  }
}

// Title-cases a single code point.  ASCII values are simply upper-cased via
// ToUpperCase; everything else is delegated to
// mozilla::unicode::GetTitlecaseForLower.
uint32_t
ToTitleCase(uint32_t aChar)
{
  return IS_ASCII(aChar) ? ToUpperCase(aChar)
                         : mozilla::unicode::GetTitlecaseForLower(aChar);
}

// Case-insensitive three-way comparison of two UTF-16 buffers that both hold
// len code units.  Returns -1, 0 or 1.  Characters are compared after
// lower-casing with ToLowerCase_inline; a valid surrogate pair in a is
// combined into a single code point before it is compared.
int32_t
CaseInsensitiveCompare(const char16_t *a,
                       const char16_t *b,
                       uint32_t len)
{
  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");

  while (len != 0) {
    uint32_t left = *a++;
    uint32_t right = *b++;

    // Surrogates have to be examined BEFORE testing for equality: two
    // strings can share a high surrogate and still hold different
    // characters, so equal high surrogates must not simply be skipped.
    //
    // The right-hand side is only inspected when the left-hand side turned
    // out to be a surrogate pair; if the left is not one but the right is,
    // a mismatch is guaranteed anyway and there is no need to decode and
    // lower-case it.
    if (NS_IS_HIGH_SURROGATE(left) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
      left = SURROGATE_TO_UCS4(left, *a++);
      if (NS_IS_HIGH_SURROGATE(right) && NS_IS_LOW_SURROGATE(*b)) {
        right = SURROGATE_TO_UCS4(right, *b++);
      }
      // Account for the extra unit consumed from a.  If b did not hold a
      // pair we would stop short of its end, but that is harmless: the
      // comparison below finds a mismatch and returns early.
      --len;
    }

    if (left != right) {
      const uint32_t lowerLeft = ToLowerCase_inline(left);
      const uint32_t lowerRight = ToLowerCase_inline(right);
      if (lowerLeft != lowerRight) {
        return (lowerLeft < lowerRight) ? -1 : 1;
      }
    }

    --len;
  }

  return 0;
}

// Inlined definition of GetLowerUTF8Codepoint, which we use because we want
// to be fast when called from the case-insensitive comparators.
//
// Decodes the UTF-8 sequence starting at aStr (aEnd is one past the last
// readable byte), lower-cases the resulting code point and returns it.  On
// success *aNext is set to the byte following the sequence.  When the lead
// byte is not recognised, or the sequence would run past aEnd, uint32_t(-1)
// is returned and *aNext is left untouched.  The first byte is read
// unconditionally, so aStr must be below aEnd; continuation bytes are only
// masked, not validated.
static MOZ_ALWAYS_INLINE uint32_t
GetLowerUTF8Codepoint_inline(const char* aStr,
                             const char* aEnd,
                             const char **aNext)
{
  // View the bytes as unsigned so that widening them never sign-extends.
  const unsigned char* const str =
    reinterpret_cast<const unsigned char*>(aStr);
  const unsigned char lead = str[0];

  if (UTF8traits::isASCII(lead)) {
    // Single ASCII byte: the table lookup is all that is needed.
    *aNext = aStr + 1;
    return gASCIIToLower[lead];
  }

  if (UTF8traits::is2byte(lead) && MOZ_LIKELY(aStr + 1 < aEnd)) {
    // Two-byte form 110XXXXX 10XXXXXX: always inside the BMP, so 16 bits are
    // enough.  The ASCII fast path of ToLowerCase would be wasted here, so
    // GetLowercase is called directly.
    const uint16_t decoded = ((lead & 0x1F) << 6) |
                             (str[1] & 0x3F);
    const uint16_t lowered = mozilla::unicode::GetLowercase(decoded);

    *aNext = aStr + 2;
    return lowered;
  }

  if (UTF8traits::is3byte(lead) && MOZ_LIKELY(aStr + 2 < aEnd)) {
    // Three-byte form 1110XXXX 10XXXXXX 10XXXXXX: just barely fits into
    // 16 bits.
    const uint16_t decoded = ((lead & 0x0F) << 12) |
                             ((str[1] & 0x3F) << 6) |
                             (str[2] & 0x3F);
    const uint16_t lowered = mozilla::unicode::GetLowercase(decoded);

    *aNext = aStr + 3;
    return lowered;
  }

  if (UTF8traits::is4byte(lead) && MOZ_LIKELY(aStr + 3 < aEnd)) {
    // Four-byte form 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
    const uint32_t decoded = ((lead & 0x07) << 18) |
                             ((str[1] & 0x3F) << 12) |
                             ((str[2] & 0x3F) << 6) |
                             (str[3] & 0x3F);
    const uint32_t lowered = mozilla::unicode::GetLowercase(decoded);

    *aNext = aStr + 4;
    return lowered;
  }

  // Unrecognised lead byte or truncated sequence.
  return uint32_t(-1);
}

// Out-of-line entry point for decoding and lower-casing one UTF-8 sequence;
// all of the work is done by GetLowerUTF8Codepoint_inline, whose result is
// returned unchanged.
uint32_t
GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
{
  const uint32_t codepoint = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
  return codepoint;
}

// Case-insensitive three-way comparison of two UTF-8 byte ranges.  Both
// ranges are decoded one character at a time with
// GetLowerUTF8Codepoint_inline and the lower-cased code points are compared.
// Returns 1, -1 or 0; note that -1 is also what is returned as soon as either
// side fails to decode.
int32_t CaseInsensitiveCompare(const char *aLeft,
                               const char *aRight,
                               uint32_t aLeftBytes,
                               uint32_t aRightBytes)
{
  const char* const leftEnd = aLeft + aLeftBytes;
  const char* const rightEnd = aRight + aRightBytes;

  while (aLeft < leftEnd && aRight < rightEnd) {
    // The left side is decoded and checked before the right side is touched.
    const uint32_t lowerLeft =
      GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft);
    if (MOZ_UNLIKELY(lowerLeft == uint32_t(-1))) {
      return -1;
    }

    const uint32_t lowerRight =
      GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight);
    if (MOZ_UNLIKELY(lowerRight == uint32_t(-1))) {
      return -1;
    }

    // Both code points are lower-case at this point, so they can be compared
    // numerically.
    if (lowerLeft != lowerRight) {
      return (lowerLeft > lowerRight) ? 1 : -1;
    }
  }

  // All characters compared so far were equal; whichever side still has
  // bytes left is the greater one.
  if (aLeft < leftEnd) {
    return 1;
  }
  return (aRight < rightEnd) ? -1 : 0;
}

// Decodes one UTF-8 character from each side and reports whether the two are
// equal after lower-casing.  On success *aErr is set to false and
// *aLeftNext / *aRightNext are set by GetLowerUTF8Codepoint_inline to the
// byte following each decoded character.  If either side fails to decode,
// *aErr is set to true and false is returned; the right side is not decoded
// at all when the left side already failed.
bool
CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
                              const char* aLeftEnd, const char* aRightEnd,
                              const char** aLeftNext, const char** aRightNext,
                              bool* aErr)
{
  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");

  // Value GetLowerUTF8Codepoint_inline returns for undecodable input.
  const uint32_t kInvalid = uint32_t(-1);

  const uint32_t lowerLeft =
    GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext);
  if (MOZ_UNLIKELY(lowerLeft == kInvalid)) {
    *aErr = true;
    return false;
  }

  const uint32_t lowerRight =
    GetLowerUTF8Codepoint_inline(aRight, aRightEnd, aRightNext);
  if (MOZ_UNLIKELY(lowerRight == kInvalid)) {
    *aErr = true;
    return false;
  }

  // Both sides decoded, so nothing can fail from here on.
  *aErr = false;

  return lowerLeft == lowerRight;
}

namespace mozilla {

// Hashes aLength bytes of UTF-8 text as if it had been converted to UTF-16
// first: each decoded code point below PLANE1_BASE is fed to AddToHash as is,
// anything at or above it is fed as its high and low surrogate.  *aErr is
// cleared up front; if UTF8CharEnumerator::NextChar sets it, hashing stops
// and 0 is returned.
uint32_t
HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
{
  *aErr = false;

  uint32_t hash = 0;
  const char* cursor = aUTF8;
  const char* const end = aUTF8 + aLength;

  while (cursor < end) {
    const uint32_t ucs4 = UTF8CharEnumerator::NextChar(&cursor, end, aErr);
    if (*aErr) {
      return 0;
    }

    hash = (ucs4 < PLANE1_BASE)
             ? AddToHash(hash, ucs4)
             : AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
  }

  return hash;
}

// Returns true when unicode::IsEastAsianWidthFWH accepts u and its script
// code is anything other than HANGUL.  The script lookup is skipped when the
// width test already fails.
bool
IsSegmentBreakSkipChar(uint32_t u)
{
  if (!unicode::IsEastAsianWidthFWH(u)) {
    return false;
  }

  return unicode::GetScriptCode(u) != unicode::Script::HANGUL;
}

} // namespace mozilla

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		/* -- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */
2		/* This Source Code Form is subject to the terms of the Mozilla Public
3		* License, v. 2.0. If a copy of the MPL was not distributed with this
4		* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6		#include "nsUnicharUtils.h"
7		#include "nsUTF8Utils.h"
8		#include "nsUnicodeProperties.h"
9		#include "mozilla/Likely.h"
10		#include "mozilla/HashFunctions.h"
11
12		// We map x -> x, except for upper-case letters,
13		// which we map to their lower-case equivalents.
14		static const uint8_t gASCIIToLower [128] = {
15		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
16		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
17		0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
18		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
19		0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
20		0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
21		0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
22		0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
23		};
24
25		// We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
26		// when they're called from within the case-insensitive comparators, so we
27		// define inlined versions.
28		static MOZ_ALWAYS_INLINE uint32_t
29		ToLowerCase_inline(uint32_t aChar)
30	8.96M	{
31	8.96M	if (IS_ASCII(aChar)) {
32	8.94M	return gASCIIToLower[aChar];
33	8.94M	}
34	18.2k
35	18.2k	return mozilla::unicode::GetLowercase(aChar);
36	18.2k	}
37
38		static MOZ_ALWAYS_INLINE uint32_t
39		ToLowerCaseASCII_inline(const uint32_t aChar)
40	43.2k	{
41	43.2k	if (IS_ASCII(aChar)) {
42	42.6k	return gASCIIToLower[aChar];
43	42.6k	}
44	562
45	562	return aChar;
46	562	}
47
48		void
49		ToLowerCase(nsAString& aString)
50	233k	{
51	233k	char16_t *buf = aString.BeginWriting();
52	233k	ToLowerCase(buf, buf, aString.Length());
53	233k	}
54
55		void
56		ToLowerCaseASCII(nsAString& aString)
57	0	{
58	0	char16_t *buf = aString.BeginWriting();
59	0	ToLowerCaseASCII(buf, buf, aString.Length());
60	0	}
61
62		char
63		ToLowerCaseASCII(char aChar)
64	8.48k	{
65	8.48k	if (aChar >= 'A' && aChar <= 'Z') {
66	489	return aChar + 0x20;
67	489	}
68	7.99k	return aChar;
69	7.99k	}
70
71		char16_t
72		ToLowerCaseASCII(char16_t aChar)
73	0	{
74	0	if (aChar >= 'A' && aChar <= 'Z') {
75	0	return aChar + 0x20;
76	0	}
77	0	return aChar;
78	0	}
79
80		char32_t
81		ToLowerCaseASCII(char32_t aChar)
82	0	{
83	0	if (aChar >= 'A' && aChar <= 'Z') {
84	0	return aChar + 0x20;
85	0	}
86	0	return aChar;
87	0	}
88
89		char
90		ToUpperCaseASCII(char aChar)
91	0	{
92	0	if (aChar >= 'a' && aChar <= 'z') {
93	0	return aChar - 0x20;
94	0	}
95	0	return aChar;
96	0	}
97
98		char16_t
99		ToUpperCaseASCII(char16_t aChar)
100	0	{
101	0	if (aChar >= 'a' && aChar <= 'z') {
102	0	return aChar - 0x20;
103	0	}
104	0	return aChar;
105	0	}
106
107		char32_t
108		ToUpperCaseASCII(char32_t aChar)
109	0	{
110	0	if (aChar >= 'a' && aChar <= 'z') {
111	0	return aChar - 0x20;
112	0	}
113	0	return aChar;
114	0	}
115
116		void
117		ToLowerCase(const nsAString& aSource,
118		nsAString& aDest)
119	176k	{
120	176k	const char16_t *in = aSource.BeginReading();
121	176k	uint32_t len = aSource.Length();
122	176k
123	176k	aDest.SetLength(len);
124	176k	char16_t *out = aDest.BeginWriting();
125	176k
126	176k	ToLowerCase(in, out, len);
127	176k	}
128
129		void
130		ToLowerCaseASCII(const nsAString& aSource,
131		nsAString& aDest)
132	0	{
133	0	const char16_t *in = aSource.BeginReading();
134	0	uint32_t len = aSource.Length();
135	0
136	0	aDest.SetLength(len);
137	0	char16_t *out = aDest.BeginWriting();
138	0
139	0	ToLowerCaseASCII(in, out, len);
140	0	}
141
142		uint32_t
143		ToLowerCaseASCII(const uint32_t aChar)
144	0	{
145	0	return ToLowerCaseASCII_inline(aChar);
146	0	}
147
148		void
149		ToUpperCase(nsAString& aString)
150	0	{
151	0	char16_t *buf = aString.BeginWriting();
152	0	ToUpperCase(buf, buf, aString.Length());
153	0	}
154
155		void
156		ToUpperCase(const nsAString& aSource,
157		nsAString& aDest)
158	0	{
159	0	const char16_t *in = aSource.BeginReading();
160	0	uint32_t len = aSource.Length();
161	0
162	0	aDest.SetLength(len);
163	0	char16_t *out = aDest.BeginWriting();
164	0
165	0	ToUpperCase(in, out, len);
166	0	}
167
168		#ifdef MOZILLA_INTERNAL_API
169
170		int32_t
171		nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
172		const char16_t* rhs,
173		uint32_t lLength,
174		uint32_t rLength) const
175	0	{
176	0	return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
177	0	(lLength > rLength) ? 1 : -1;
178	0	}
179
180		int32_t
181		nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
182		const char* rhs,
183		uint32_t lLength,
184		uint32_t rLength) const
185	0	{
186	0	return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
187	0	}
188
189		int32_t
190		nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
191		const char16_t* rhs,
192		uint32_t lLength,
193		uint32_t rLength) const
194	22.9k	{
195	22.9k	if (lLength != rLength) {
196	0	if (lLength > rLength)
197	0	return 1;
198	0	return -1;
199	0	}
200	22.9k
201	35.7k	while (rLength) {
202	34.4k	// we don't care about surrogates here, because we're only
203	34.4k	// lowercasing the ASCII range
204	34.4k	char16_t l = *lhs++;
205	34.4k	char16_t r = *rhs++;
206	34.4k	if (l != r) {
207	21.6k	l = ToLowerCaseASCII_inline(l);
208	21.6k	r = ToLowerCaseASCII_inline(r);
209	21.6k
210	21.6k	if (l > r)
211	20.8k	return 1;
212	746	else if (r > l)
213	745	return -1;
214	12.8k	}
215	12.8k	rLength--;
216	12.8k	}
217	22.9k
218	22.9k	return 0;
219	22.9k	}
220
221		#endif // MOZILLA_INTERNAL_API
222
223		uint32_t
224		ToLowerCase(uint32_t aChar)
225	8.96M	{
226	8.96M	return ToLowerCase_inline(aChar);
227	8.96M	}
228
229		void
230		ToLowerCase(const char16_t aIn, char16_t aOut, uint32_t aLen)
231	410k	{
232	1.17M	for (uint32_t i = 0; i < aLen; i++) {
233	763k	uint32_t ch = aIn[i];
234	763k	if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
235	763k	NS_IS_LOW_SURROGATE(aIn[i + 1])) {
236	0	ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
237	0	NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
238	0	aOut[i++] = H_SURROGATE(ch);
239	0	aOut[i] = L_SURROGATE(ch);
240	0	continue;
241	0	}
242	763k	aOut[i] = ToLowerCase(ch);
243	763k	}
244	410k	}
245
246		void
247		ToLowerCaseASCII(const char16_t aIn, char16_t aOut, uint32_t aLen)
248	0	{
249	0	for (uint32_t i = 0; i < aLen; i++) {
250	0	char16_t ch = aIn[i];
251	0	aOut[i] = IS_ASCII_UPPER(ch) ? (ch + 0x20) : ch;
252	0	}
253	0	}
254
255		uint32_t
256		ToUpperCase(uint32_t aChar)
257	8.20M	{
258	8.20M	if (IS_ASCII(aChar)) {
259	8.18M	if (IS_ASCII_LOWER(aChar)) {
260	63.5k	return aChar - 0x20;
261	63.5k	}
262	8.12M	return aChar;
263	8.12M	}
264	18.2k
265	18.2k	return mozilla::unicode::GetUppercase(aChar);
266	18.2k	}
267
268		void
269		ToUpperCase(const char16_t aIn, char16_t aOut, uint32_t aLen)
270	0	{
271	0	for (uint32_t i = 0; i < aLen; i++) {
272	0	uint32_t ch = aIn[i];
273	0	if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
274	0	NS_IS_LOW_SURROGATE(aIn[i + 1])) {
275	0	ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
276	0	NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
277	0	aOut[i++] = H_SURROGATE(ch);
278	0	aOut[i] = L_SURROGATE(ch);
279	0	continue;
280	0	}
281	0	aOut[i] = ToUpperCase(ch);
282	0	}
283	0	}
284
285		uint32_t
286		ToTitleCase(uint32_t aChar)
287	0	{
288	0	if (IS_ASCII(aChar)) {
289	0	return ToUpperCase(aChar);
290	0	}
291	0
292	0	return mozilla::unicode::GetTitlecaseForLower(aChar);
293	0	}
294
295		int32_t
296		CaseInsensitiveCompare(const char16_t *a,
297		const char16_t *b,
298		uint32_t len)
299	0	{
300	0	NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
301	0
302	0	if (len) {
303	0	do {
304	0	uint32_t c1 = *a++;
305	0	uint32_t c2 = *b++;
306	0
307	0	// Unfortunately, we need to check for surrogates BEFORE we check
308	0	// for equality, because we could have identical high surrogates
309	0	// but non-identical characters, so we can't just skip them
310	0
311	0	// If c1 isn't a surrogate, we don't bother to check c2;
312	0	// in the case where it _is_ a surrogate, we're definitely going to get
313	0	// a mismatch, and don't need to interpret and lowercase it
314	0
315	0	if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
316	0	c1 = SURROGATE_TO_UCS4(c1, *a++);
317	0	if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
318	0	c2 = SURROGATE_TO_UCS4(c2, *b++);
319	0	}
320	0	// If c2 wasn't a surrogate, decrementing len means we'd stop
321	0	// short of the end of string b, but that doesn't actually matter
322	0	// because we're going to find a mismatch and return early
323	0	--len;
324	0	}
325	0
326	0	if (c1 != c2) {
327	0	c1 = ToLowerCase_inline(c1);
328	0	c2 = ToLowerCase_inline(c2);
329	0	if (c1 != c2) {
330	0	if (c1 < c2) {
331	0	return -1;
332	0	}
333	0	return 1;
334	0	}
335	0	}
336	0	} while (--len != 0);
337	0	}
338	0	return 0;
339	0	}
340
341		// Inlined definition of GetLowerUTF8Codepoint, which we use because we want
342		// to be fast when called from the case-insensitive comparators.
343		static MOZ_ALWAYS_INLINE uint32_t
344		GetLowerUTF8Codepoint_inline(const char* aStr,
345		const char* aEnd,
346		const char **aNext)
347	0	{
348	0	// Convert to unsigned char so that stuffing chars into PRUint32s doesn't
349	0	// sign extend.
350	0	const unsigned char str = (unsigned char)aStr;
351	0
352	0	if (UTF8traits::isASCII(str[0])) {
353	0	// It's ASCII; just convert to lower-case and return it.
354	0	*aNext = aStr + 1;
355	0	return gASCIIToLower[*str];
356	0	}
357	0	if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
358	0	// It's a two-byte sequence, so it looks like
359	0	// 110XXXXX 10XXXXXX.
360	0	// This is definitely in the BMP, so we can store straightaway into a
361	0	// uint16_t.
362	0
363	0	uint16_t c;
364	0	c = (str[0] & 0x1F) << 6;
365	0	c += (str[1] & 0x3F);
366	0
367	0	// we don't go through ToLowerCase here, because we know this isn't
368	0	// an ASCII character so the ASCII fast-path there is useless
369	0	c = mozilla::unicode::GetLowercase(c);
370	0
371	0	*aNext = aStr + 2;
372	0	return c;
373	0	}
374	0	if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
375	0	// It's a three-byte sequence, so it looks like
376	0	// 1110XXXX 10XXXXXX 10XXXXXX.
377	0	// This will just barely fit into 16-bits, so store into a uint16_t.
378	0
379	0	uint16_t c;
380	0	c = (str[0] & 0x0F) << 12;
381	0	c += (str[1] & 0x3F) << 6;
382	0	c += (str[2] & 0x3F);
383	0
384	0	c = mozilla::unicode::GetLowercase(c);
385	0
386	0	*aNext = aStr + 3;
387	0	return c;
388	0	}
389	0	if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
390	0	// It's a four-byte sequence, so it looks like
391	0	// 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
392	0
393	0	uint32_t c;
394	0	c = (str[0] & 0x07) << 18;
395	0	c += (str[1] & 0x3F) << 12;
396	0	c += (str[2] & 0x3F) << 6;
397	0	c += (str[3] & 0x3F);
398	0
399	0	c = mozilla::unicode::GetLowercase(c);
400	0
401	0	*aNext = aStr + 4;
402	0	return c;
403	0	}
404	0
405	0	// Hm, we don't understand this sequence.
406	0	return -1;
407	0	}
408
409		uint32_t
410	0	GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext) {
411	0	return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
412	0	}
413
414		int32_t CaseInsensitiveCompare(const char *aLeft,
415		const char *aRight,
416		uint32_t aLeftBytes,
417		uint32_t aRightBytes)
418	0	{
419	0	const char *leftEnd = aLeft + aLeftBytes;
420	0	const char *rightEnd = aRight + aRightBytes;
421	0
422	0	while (aLeft < leftEnd && aRight < rightEnd) {
423	0	uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft);
424	0	if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
425	0	return -1;
426	0
427	0	uint32_t rightChar = GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight);
428	0	if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
429	0	return -1;
430	0
431	0	// Now leftChar and rightChar are lower-case, so we can compare them.
432	0	if (leftChar != rightChar) {
433	0	if (leftChar > rightChar)
434	0	return 1;
435	0	return -1;
436	0	}
437	0	}
438	0
439	0	// Make sure that if one string is longer than the other we return the
440	0	// correct result.
441	0	if (aLeft < leftEnd)
442	0	return 1;
443	0	if (aRight < rightEnd)
444	0	return -1;
445	0
446	0	return 0;
447	0	}
448
449		bool
450		CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
451		const char* aLeftEnd, const char* aRightEnd,
452		const char aLeftNext, const char aRightNext,
453		bool* aErr)
454	0	{
455	0	NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
456	0	NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
457	0	NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
458	0	NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
459	0	NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
460	0
461	0	uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext);
462	0	if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
463	0	*aErr = true;
464	0	return false;
465	0	}
466	0
467	0	uint32_t rightChar = GetLowerUTF8Codepoint_inline(aRight, aRightEnd, aRightNext);
468	0	if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
469	0	*aErr = true;
470	0	return false;
471	0	}
472	0
473	0	// Can't have an error past this point.
474	0	*aErr = false;
475	0
476	0	return leftChar == rightChar;
477	0	}
478
479		namespace mozilla {
480
481		uint32_t
482		HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
483	11.7k	{
484	11.7k	uint32_t hash = 0;
485	11.7k	const char* s = aUTF8;
486	11.7k	const char* end = aUTF8 + aLength;
487	11.7k
488	11.7k	*aErr = false;
489	11.7k
490	117k	while (s < end)
491	105k	{
492	105k	uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
493	105k	if (*aErr) {
494	0	return 0;
495	0	}
496	105k
497	105k	if (ucs4 < PLANE1_BASE) {
498	105k	hash = AddToHash(hash, ucs4);
499	105k	}
500	0	else {
501	0	hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
502	0	}
503	105k	}
504	11.7k
505	11.7k	return hash;
506	11.7k	}
507
508		bool
509		IsSegmentBreakSkipChar(uint32_t u)
510	0	{
511	0	return unicode::IsEastAsianWidthFWH(u) &&
512	0	unicode::GetScriptCode(u) != unicode::Script::HANGUL;
513	0	}
514
515		} // namespace mozilla