/src/mozilla-central/xpcom/string/nsUTF8Utils.h

Source (jump to first uncovered line)
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUTF8Utils_h_
#define nsUTF8Utils_h_

// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
// file will provide signatures for the Mozilla abstract string types and will
// use XPCOM assertion/debugging macros, etc. Otherwise, the XPCOM warning
// macro is unavailable and UTF8UTILS_WARNING expands to nothing.

#include "nscore.h"
#include "mozilla/Assertions.h"
#include "mozilla/EndianUtils.h"
#include "mozilla/TypeTraits.h"

#include "nsCharTraits.h"

#ifdef MOZILLA_INTERNAL_API
#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
#else
#define UTF8UTILS_WARNING(msg)
#endif

/**
 * Classification helpers for single UTF-8 code units.
 *
 * Every predicate looks only at the high-order bit pattern of the byte it is
 * given; none of them checks that the byte is part of a well-formed sequence.
 */
class UTF8traits
{
public:
  // 0xxxxxxx: a one-byte (ASCII) sequence.
  static bool isASCII(char aChar)
  {
    return hasPrefix(aChar, 0x80, 0x00);
  }

  // 10xxxxxx: a continuation byte inside a multi-byte sequence.
  static bool isInSeq(char aChar)
  {
    return hasPrefix(aChar, 0xC0, 0x80);
  }

  // 110xxxxx: lead byte of a two-byte sequence.
  static bool is2byte(char aChar)
  {
    return hasPrefix(aChar, 0xE0, 0xC0);
  }

  // 1110xxxx: lead byte of a three-byte sequence.
  static bool is3byte(char aChar)
  {
    return hasPrefix(aChar, 0xF0, 0xE0);
  }

  // 11110xxx: lead byte of a four-byte sequence.
  static bool is4byte(char aChar)
  {
    return hasPrefix(aChar, 0xF8, 0xF0);
  }

  // 111110xx: lead byte pattern of a five-byte sequence.
  static bool is5byte(char aChar)
  {
    return hasPrefix(aChar, 0xFC, 0xF8);
  }

  // 1111110x: lead byte pattern of a six-byte sequence.
  static bool is6byte(char aChar)
  {
    return hasPrefix(aChar, 0xFE, 0xFC);
  }

  /**
   * Return the number of bytes in a sequence beginning with aChar.
   *
   * Only one- to four-byte lead patterns are recognized. Any other byte
   * (for instance a continuation byte) trips the assertion below and, where
   * the assertion does not stop execution, is reported as length 1.
   */
  static int bytes(char aChar)
  {
    const int length = isASCII(aChar) ? 1
                     : is2byte(aChar) ? 2
                     : is3byte(aChar) ? 3
                     : is4byte(aChar) ? 4
                     : 0;
    if (length != 0) {
      return length;
    }
    MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
    return 1;
  }

private:
  // True when the bits of aChar selected by aMask equal aPattern. aChar is
  // promoted to int before masking, so the result does not depend on whether
  // plain char is signed.
  static bool hasPrefix(char aChar, int aMask, int aPattern)
  {
    return (aChar & aMask) == aPattern;
  }
};

/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
 * over the maximal valid prefix and *aErr is set to true (if aErr is not
 * null).
 *
 * No byte at or beyond aEnd is ever read, and *aBuffer is never advanced
 * past aEnd: a multi-byte sequence that is cut short by aEnd is reported as
 * an error with *aBuffer left equal to aEnd.
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF8CharEnumerator
{
public:
  static inline char32_t NextChar(const char** aBuffer,
                                  const char* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < end, "Bogus range");

    unsigned char first = *p++;

    // Fast path: ASCII.
    if (MOZ_LIKELY(first < 0x80U)) {
      *aBuffer = reinterpret_cast<const char*>(p);
      return first;
    }

    // A non-ASCII byte is an error on its own when it is the last byte of
    // the input or when it is not a valid lead byte (valid leads are
    // 0xC2..0xF4). The range test is a single comparison because
    // unsigned underflow is defined behavior.
    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    // p != end was established above, so this read is in range.
    unsigned char second = *p;

    if (first < 0xE0U) {
      // Two-byte
      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
        *aBuffer = reinterpret_cast<const char*>(++p);
        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
      }
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    if (MOZ_LIKELY(first < 0xF0U)) {
      // Three-byte
      // The permitted range of the second byte is narrowed for two leads:
      // 0xE0 (to exclude overlong encodings) and 0xED (to exclude
      // surrogates).
      unsigned char lower = 0x80U;
      unsigned char upper = 0xBFU;
      if (first == 0xE0U) {
        lower = 0xA0U;
      } else if (first == 0xEDU) {
        upper = 0x9FU;
      }
      if (MOZ_LIKELY(second >= lower && second <= upper)) {
        // Step over the accepted second byte *before* comparing against
        // end; testing while still pointing at the second byte would always
        // succeed and allow a read of *end on truncated input.
        ++p;
        if (MOZ_LIKELY(p != end)) {
          unsigned char third = *p;
          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
            *aBuffer = reinterpret_cast<const char*>(++p);
            return ((uint32_t(first) & 0xFU) << 12) |
                   ((uint32_t(second) & 0x3FU) << 6) |
                   (uint32_t(third) & 0x3FU);
          }
        }
      }
      // p points at the first byte that could not be consumed (or at end).
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    // Four-byte
    // The permitted range of the second byte is narrowed for two leads:
    // 0xF0 (to exclude overlong encodings) and 0xF4 (to exclude values above
    // U+10FFFF).
    unsigned char lower = 0x80U;
    unsigned char upper = 0xBFU;
    if (first == 0xF0U) {
      lower = 0x90U;
    } else if (first == 0xF4U) {
      upper = 0x8FU;
    }
    if (MOZ_LIKELY(second >= lower && second <= upper)) {
      // As in the three-byte case, advance first and bounds-check second so
      // that neither the third nor the fourth byte is read from beyond end.
      ++p;
      if (MOZ_LIKELY(p != end)) {
        unsigned char third = *p;
        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
          ++p;
          if (MOZ_LIKELY(p != end)) {
            unsigned char fourth = *p;
            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
              *aBuffer = reinterpret_cast<const char*>(++p);
              return ((uint32_t(first) & 0x7U) << 18) |
                     ((uint32_t(second) & 0x3FU) << 12) |
                     ((uint32_t(third) & 0x3FU) << 6) |
                     (uint32_t(fourth) & 0x3FU);
            }
          }
        }
      }
    }
    // p points at the first byte that could not be consumed (or at end).
    *aBuffer = reinterpret_cast<const char*>(p);
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};

/**
 * Decode one Unicode scalar value from a UTF-16 buffer.
 *
 * On return *aBuffer points at the code unit following whatever was
 * consumed: one unit for a non-surrogate, two units for a well-formed
 * surrogate pair. An unpaired surrogate consumes exactly one unit, yields
 * 0xFFFD and sets *aErr to true (if aErr is not null).
 *
 * Note: *aErr is only ever set to true, never cleared, so that a caller can
 * accumulate errors over several calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF16CharEnumerator
{
public:
  static inline char32_t NextChar(const char16_t** aBuffer,
                                  const char16_t* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const char16_t* cursor = *aBuffer;

    MOZ_ASSERT(cursor, "null buffer");
    MOZ_ASSERT(cursor < aEnd, "Bogus range");

    const char16_t lead = *cursor;
    ++cursor;

    // Distance of the unit from the start of the surrogate block, computed
    // in 16 bits. Units below 0xD800 wrap around to large values, so a single
    // unsigned comparison tells whether the unit lies inside a range.
    const char16_t offsetInSurrogates = lead - 0xD800U;

    const bool isSurrogate = offsetInSurrogates <= (0xDFFFU - 0xD800U);
    if (MOZ_LIKELY(!isSurrogate)) {
      *aBuffer = cursor;
      return lead;
    }

    const bool isHighSurrogate = offsetInSurrogates <= (0xDBFFU - 0xD800U);
    if (MOZ_LIKELY(isHighSurrogate) && MOZ_LIKELY(cursor != aEnd)) {
      const char16_t trail = *cursor;
      // Same wrap-around trick, this time for the low surrogate range.
      const bool isLowSurrogate = (trail - 0xDC00U) <= (0xDFFFU - 0xDC00U);
      if (MOZ_LIKELY(isLowSurrogate)) {
        // Amount to subtract so that the shifted high surrogate plus the low
        // surrogate lands on the supplementary-plane scalar value.
        const uint32_t pairBias = ((0xD800U << 10) - 0x10000U) + 0xDC00U;
        *aBuffer = cursor + 1;
        return (uint32_t(lead) << 10) + uint32_t(trail) - pairBias;
      }
    }

    // Unpaired surrogate: only the first unit has been consumed.
    *aBuffer = cursor;
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};

template<typename Char, typename UnsignedT>
inline UnsignedT
RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
{
  static_assert(mozilla::IsSame<Char, char>::value ||
                mozilla::IsSame<Char, unsigned char>::value ||
                mozilla::IsSame<Char, signed char>::value,
                "UTF-8 data must be in 8-bit units");
  static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
  while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80)
    --index;

  return index;
}

#undef UTF8UTILS_WARNING

#endif /* !defined(nsUTF8Utils_h_) */

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		/* -- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -- */
2		/* vim: set ts=8 sts=2 et sw=2 tw=80: */
3		/* This Source Code Form is subject to the terms of the Mozilla Public
4		* License, v. 2.0. If a copy of the MPL was not distributed with this
5		* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6		#ifndef nsUTF8Utils_h_
7		#define nsUTF8Utils_h_
8
9		// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
10		// file will provide signatures for the Mozilla abstract string types. It will
11		// use XPCOM assertion/debugging macros, etc.
12
13		#include "nscore.h"
14		#include "mozilla/Assertions.h"
15		#include "mozilla/EndianUtils.h"
16		#include "mozilla/TypeTraits.h"
17
18		#include "nsCharTraits.h"
19
20		#ifdef MOZILLA_INTERNAL_API
21		#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
22		#else
23		#define UTF8UTILS_WARNING(msg)
24		#endif
25
26		class UTF8traits
27		{
28		public:
29		static bool isASCII(char aChar)
30		{
31		return (aChar & 0x80) == 0x00;
32		}
33		static bool isInSeq(char aChar)
34		{
35		return (aChar & 0xC0) == 0x80;
36		}
37		static bool is2byte(char aChar)
38		{
39		return (aChar & 0xE0) == 0xC0;
40		}
41		static bool is3byte(char aChar)
42		{
43		return (aChar & 0xF0) == 0xE0;
44		}
45		static bool is4byte(char aChar)
46		{
47		return (aChar & 0xF8) == 0xF0;
48		}
49		static bool is5byte(char aChar)
50	0	{
51	0	return (aChar & 0xFC) == 0xF8;
52	0	}
53		static bool is6byte(char aChar)
54	0	{
55	0	return (aChar & 0xFE) == 0xFC;
56	0	}
57		// return the number of bytes in a sequence beginning with aChar
58		static int bytes(char aChar)
59		{
60		if (isASCII(aChar)) {
61		return 1;
62		}
63		if (is2byte(aChar)) {
64		return 2;
65		}
66		if (is3byte(aChar)) {
67		return 3;
68		}
69		if (is4byte(aChar)) {
70		return 4;
71		}
72		MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
73		return 1;
74		}
75		};
76
77		/**
78		* Extract the next Unicode scalar value from the buffer and return it. The
79		* pointer passed in is advanced to the start of the next character in the
80		* buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
81		* over the maximal valid prefix and *aErr is set to true (if aErr is not
82		* null).
83		*
84		* Note: This method never sets *aErr to false to allow error accumulation
85		* across multiple calls.
86		*
87		* Precondition: *aBuffer < aEnd
88		*/
89		class UTF8CharEnumerator
90		{
91		public:
92		static inline char32_t NextChar(const char** aBuffer,
93		const char* aEnd,
94		bool* aErr = nullptr)
95	210k	{
96	210k	MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
97	210k	MOZ_ASSERT(aEnd, "null end pointer");
98	210k
99	210k	const unsigned char* p = reinterpret_cast<const unsigned char>(aBuffer);
100	210k	const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
101	210k
102	210k	MOZ_ASSERT(p, "null buffer");
103	210k	MOZ_ASSERT(p < end, "Bogus range");
104	210k
105	210k	unsigned char first = *p++;
106	210k
107	210k	if (MOZ_LIKELY(first < 0x80U)) {
108	210k	aBuffer = reinterpret_cast<const char>(p);
109	210k	return first;
110	210k	}
111	0
112	0	// Unsigned underflow is defined behavior
113	0	if (MOZ_UNLIKELY((p == end) \|\| ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
114	0	aBuffer = reinterpret_cast<const char>(p);
115	0	if (aErr) {
116	0	*aErr = true;
117	0	}
118	0	return 0xFFFDU;
119	0	}
120	0
121	0	unsigned char second = *p;
122	0
123	0	if (first < 0xE0U) {
124	0	// Two-byte
125	0	if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
126	0	aBuffer = reinterpret_cast<const char>(++p);
127	0	return ((uint32_t(first) & 0x1FU) << 6) \| (uint32_t(second) & 0x3FU);
128	0	}
129	0	aBuffer = reinterpret_cast<const char>(p);
130	0	if (aErr) {
131	0	*aErr = true;
132	0	}
133	0	return 0xFFFDU;
134	0	}
135	0
136	0	if (MOZ_LIKELY(first < 0xF0U)) {
137	0	// Three-byte
138	0	unsigned char lower = 0x80U;
139	0	unsigned char upper = 0xBFU;
140	0	if (first == 0xE0U) {
141	0	lower = 0xA0U;
142	0	} else if (first == 0xEDU) {
143	0	upper = 0x9FU;
144	0	}
145	0	if (MOZ_LIKELY(second >= lower && second <= upper)) {
146	0	if (MOZ_LIKELY(p != end)) {
147	0	unsigned char third = *++p;
148	0	if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
149	0	aBuffer = reinterpret_cast<const char>(++p);
150	0	return ((uint32_t(first) & 0xFU) << 12) \|
151	0	((uint32_t(second) & 0x3FU) << 6) \|
152	0	(uint32_t(third) & 0x3FU);
153	0	}
154	0	}
155	0	}
156	0	aBuffer = reinterpret_cast<const char>(p);
157	0	if (aErr) {
158	0	*aErr = true;
159	0	}
160	0	return 0xFFFDU;
161	0	}
162	0
163	0	// Four-byte
164	0	unsigned char lower = 0x80U;
165	0	unsigned char upper = 0xBFU;
166	0	if (first == 0xF0U) {
167	0	lower = 0x90U;
168	0	} else if (first == 0xF4U) {
169	0	upper = 0x8FU;
170	0	}
171	0	if (MOZ_LIKELY(second >= lower && second <= upper)) {
172	0	if (MOZ_LIKELY(p != end)) {
173	0	unsigned char third = *++p;
174	0	if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
175	0	if (MOZ_LIKELY(p != end)) {
176	0	unsigned char fourth = *++p;
177	0	if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
178	0	aBuffer = reinterpret_cast<const char>(++p);
179	0	return ((uint32_t(first) & 0x7U) << 18) \|
180	0	((uint32_t(second) & 0x3FU) << 12) \|
181	0	((uint32_t(third) & 0x3FU) << 6) \|
182	0	(uint32_t(fourth) & 0x3FU);
183	0	}
184	0	}
185	0	}
186	0	}
187	0	}
188	0	aBuffer = reinterpret_cast<const char>(p);
189	0	if (aErr) {
190	0	*aErr = true;
191	0	}
192	0	return 0xFFFDU;
193	0	}
194		};
195
196		/**
197		* Extract the next Unicode scalar value from the buffer and return it. The
198		* pointer passed in is advanced to the start of the next character in the
199		* buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
200		* the unpaired surrogate and *aErr is set to true (if aErr is not null).
201		*
202		* Note: This method never sets *aErr to false to allow error accumulation
203		* across multiple calls.
204		*
205		* Precondition: *aBuffer < aEnd
206		*/
207		class UTF16CharEnumerator
208		{
209		public:
210		static inline char32_t NextChar(const char16_t** aBuffer,
211		const char16_t* aEnd,
212		bool* aErr = nullptr)
213	105k	{
214	105k	MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
215	105k	MOZ_ASSERT(aEnd, "null end pointer");
216	105k
217	105k	const char16_t* p = *aBuffer;
218	105k
219	105k	MOZ_ASSERT(p, "null buffer");
220	105k	MOZ_ASSERT(p < aEnd, "Bogus range");
221	105k
222	105k	char16_t c = *p++;
223	105k
224	105k	// Let's use encoding_rs-style code golf here.
225	105k	// Unsigned underflow is defined behavior
226	105k	char16_t cMinusSurrogateStart = c - 0xD800U;
227	105k	if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
228	105k	*aBuffer = p;
229	105k	return c;
230	105k	}
231	0	if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
232	0	// High surrogate
233	0	if (MOZ_LIKELY(p != aEnd)) {
234	0	char16_t second = *p;
235	0	// Unsigned underflow is defined behavior
236	0	if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
237	0	*aBuffer = ++p;
238	0	return (uint32_t(c) << 10) + uint32_t(second) -
239	0	(((0xD800U << 10) - 0x10000U) + 0xDC00U);
240	0	}
241	0	}
242	0	}
243	0	// Unpaired surrogate
244	0	*aBuffer = p;
245	0	if (aErr) {
246	0	*aErr = true;
247	0	}
248	0	return 0xFFFDU;
249	0	}
250		};
251
252		template<typename Char, typename UnsignedT>
253		inline UnsignedT
254		RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
255		{
256		static_assert(mozilla::IsSame<Char, char>::value \|\|
257		mozilla::IsSame<Char, unsigned char>::value \|\|
258		mozilla::IsSame<Char, signed char>::value,
259		"UTF-8 data must be in 8-bit units");
260		static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
261		while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80)
262		--index;
263
264		return index;
265		}
266
267		#undef UTF8UTILS_WARNING
268
269		#endif /* !defined(nsUTF8Utils_h_) */