/work/obj-fuzz/dist/include/nsUTF8Utils.h

Source (jump to first uncovered line)
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUTF8Utils_h_
#define nsUTF8Utils_h_

// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
// file will provide signatures for the Mozilla abstract string types. It will
// use XPCOM assertion/debugging macros, etc.

#include "nscore.h"
#include "mozilla/Assertions.h"
#include "mozilla/EndianUtils.h"
#include "mozilla/TypeTraits.h"

#include "nsCharTraits.h"

#ifdef MOZILLA_INTERNAL_API
#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
#else
#define UTF8UTILS_WARNING(msg)
#endif

/**
 * Bit-pattern classifiers for individual UTF-8 code units.
 *
 * Each predicate masks off the high bits of a byte and compares them to the
 * marker pattern of one kind of code unit. The predicates look at a single
 * byte only; they do not validate the sequence that the byte belongs to.
 */
class UTF8traits
{
public:
  // 0xxxxxxx: a code unit that stands alone.
  static bool isASCII(char aChar) { return HasHighBits(aChar, 0x80, 0x00); }

  // 10xxxxxx: a continuation (non-leading) code unit.
  static bool isInSeq(char aChar) { return HasHighBits(aChar, 0xC0, 0x80); }

  // 110xxxxx: lead pattern of a two-byte sequence.
  static bool is2byte(char aChar) { return HasHighBits(aChar, 0xE0, 0xC0); }

  // 1110xxxx: lead pattern of a three-byte sequence.
  static bool is3byte(char aChar) { return HasHighBits(aChar, 0xF0, 0xE0); }

  // 11110xxx: lead pattern of a four-byte sequence.
  static bool is4byte(char aChar) { return HasHighBits(aChar, 0xF8, 0xF0); }

  // 111110xx: lead pattern of a five-byte form.
  static bool is5byte(char aChar) { return HasHighBits(aChar, 0xFC, 0xF8); }

  // 1111110x: lead pattern of a six-byte form.
  static bool is6byte(char aChar) { return HasHighBits(aChar, 0xFE, 0xFC); }

  /**
   * Return the number of bytes in a sequence beginning with aChar.
   *
   * Only the one- to four-byte lead patterns are recognised. Any other byte
   * (for example a continuation byte) hits MOZ_ASSERT_UNREACHABLE and the
   * function then falls back to a length of 1.
   */
  static int bytes(char aChar)
  {
    int length;
    if (isASCII(aChar)) {
      length = 1;
    } else if (is2byte(aChar)) {
      length = 2;
    } else if (is3byte(aChar)) {
      length = 3;
    } else if (is4byte(aChar)) {
      length = 4;
    } else {
      MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
      length = 1;
    }
    return length;
  }

private:
  // True when the bits of aChar selected by aMask equal aPattern. The mask
  // never reaches above bit 7, so the signedness of char does not matter.
  static bool HasHighBits(char aChar, int aMask, int aPattern)
  {
    return (aChar & aMask) == aPattern;
  }
};

/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
 * over the maximal valid prefix and *aErr is set to true (if aErr is not
 * null).
 *
 * No byte at or beyond aEnd is ever read and *aBuffer is never advanced
 * past aEnd: a sequence that is cut short by the end of the buffer is
 * reported as an error and *aBuffer is left equal to aEnd.
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF8CharEnumerator
{
public:
  /**
   * @param aBuffer in/out cursor; on return it points at the first byte that
   *                has not been consumed.
   * @param aEnd    one past the last readable byte.
   * @param aErr    optional error accumulator; only ever set to true.
   * @return the decoded scalar value, or 0xFFFD for a malformed sequence.
   */
  static inline char32_t NextChar(const char** aBuffer,
                                  const char* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < end, "Bogus range");

    unsigned char first = *p++;

    if (MOZ_LIKELY(first < 0x80U)) {
      // ASCII
      *aBuffer = reinterpret_cast<const char*>(p);
      return first;
    }

    // Only 0xC2..0xF4 can start a multi-byte sequence, and such a sequence
    // needs at least one more byte.
    // Unsigned underflow is defined behavior
    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
      return Malformed(aBuffer, p, aErr);
    }

    // From here on, p always points at the next byte that has not been
    // accepted yet, and it is compared against end before every read.
    unsigned char second = *p;

    if (first < 0xE0U) {
      // Two-byte
      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
        *aBuffer = reinterpret_cast<const char*>(++p);
        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
      }
      return Malformed(aBuffer, p, aErr);
    }

    if (MOZ_LIKELY(first < 0xF0U)) {
      // Three-byte
      // The permitted range of the second byte is narrowed after 0xE0
      // (excludes overlong forms) and after 0xED (excludes surrogates).
      unsigned char lower = 0x80U;
      unsigned char upper = 0xBFU;
      if (first == 0xE0U) {
        lower = 0xA0U;
      } else if (first == 0xEDU) {
        upper = 0x9FU;
      }
      if (MOZ_LIKELY(second >= lower && second <= upper)) {
        // Step over the accepted second byte *before* the bounds check so
        // that the third byte is only read when it lies inside the buffer.
        ++p;
        if (MOZ_LIKELY(p != end)) {
          unsigned char third = *p;
          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
            *aBuffer = reinterpret_cast<const char*>(++p);
            return ((uint32_t(first) & 0xFU) << 12) |
                   ((uint32_t(second) & 0x3FU) << 6) |
                   (uint32_t(third) & 0x3FU);
          }
        }
      }
      return Malformed(aBuffer, p, aErr);
    }

    // Four-byte
    // The permitted range of the second byte is narrowed after 0xF0
    // (excludes overlong forms) and after 0xF4 (excludes values above
    // U+10FFFF).
    unsigned char lower = 0x80U;
    unsigned char upper = 0xBFU;
    if (first == 0xF0U) {
      lower = 0x90U;
    } else if (first == 0xF4U) {
      upper = 0x8FU;
    }
    if (MOZ_LIKELY(second >= lower && second <= upper)) {
      // As in the three-byte case: advance first, then check, then read.
      ++p;
      if (MOZ_LIKELY(p != end)) {
        unsigned char third = *p;
        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
          ++p;
          if (MOZ_LIKELY(p != end)) {
            unsigned char fourth = *p;
            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
              *aBuffer = reinterpret_cast<const char*>(++p);
              return ((uint32_t(first) & 0x7U) << 18) |
                     ((uint32_t(second) & 0x3FU) << 12) |
                     ((uint32_t(third) & 0x3FU) << 6) |
                     (uint32_t(fourth) & 0x3FU);
            }
          }
        }
      }
    }
    return Malformed(aBuffer, p, aErr);
  }

private:
  // Shared exit for every malformed-sequence path: resume decoding at
  // aResume, record the error if the caller asked for it, and hand back
  // U+FFFD.
  static inline char32_t Malformed(const char** aBuffer,
                                   const unsigned char* aResume,
                                   bool* aErr)
  {
    *aBuffer = reinterpret_cast<const char*>(aResume);
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};

/**
 * Decode one Unicode scalar value from a UTF-16 buffer.
 *
 * On success the scalar value is returned and *aBuffer is moved to the code
 * unit that follows the decoded character (one unit for a non-surrogate,
 * two units for a surrogate pair). When an unpaired surrogate is met, the
 * return value is 0xFFFD, *aBuffer is moved past that single surrogate and
 * *aErr is set to true (if aErr is not null).
 *
 * Note: *aErr is never set to false here, so errors accumulate across
 * multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF16CharEnumerator
{
public:
  static inline char32_t NextChar(const char16_t** aBuffer,
                                  const char16_t* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const char16_t* cursor = *aBuffer;

    MOZ_ASSERT(cursor, "null buffer");
    MOZ_ASSERT(cursor < aEnd, "Bogus range");

    const char16_t unit = *cursor;
    ++cursor;

    // Distance of the unit from the start of the surrogate block. The
    // subtraction is unsigned, so it wraps (defined behavior) for units
    // below 0xD800 and a single comparison covers both sides of the block.
    const char16_t surrogateOffset = unit - 0xD800U;

    if (MOZ_LIKELY(surrogateOffset > (0xDFFFU - 0xD800U))) {
      // Not a surrogate: the unit is the scalar value.
      *aBuffer = cursor;
      return unit;
    }

    // A high (lead) surrogate must be followed, inside the buffer, by a low
    // (trail) surrogate.
    if (MOZ_LIKELY(surrogateOffset <= (0xDBFFU - 0xD800U)) &&
        MOZ_LIKELY(cursor != aEnd)) {
      const char16_t trail = *cursor;
      // Same wrapping range test as above, this time for 0xDC00..0xDFFF.
      if (MOZ_LIKELY((trail - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
        *aBuffer = cursor + 1;
        // Combine the pair; the subtracted constant removes both surrogate
        // bases and adds the 0x10000 offset in one step.
        return (uint32_t(unit) << 10) + uint32_t(trail) -
               (((0xD800U << 10) - 0x10000U) + 0xDC00U);
      }
    }

    // Unpaired surrogate: consume just the one unit.
    *aBuffer = cursor;
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};

/**
 * Walk backwards from |index| while the byte at the current position is a
 * UTF-8 continuation byte (10xxxxxx), and return the position reached.
 *
 * The result is the first position at or before |index| whose byte is not a
 * continuation byte, or 0 if position 0 is reached first (the byte at
 * position 0 is never inspected). The byte at |index| itself is read
 * whenever |index| is non-zero.
 */
template<typename Char, typename UnsignedT>
inline UnsignedT
RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
{
  static_assert(mozilla::IsSame<Char, char>::value ||
                mozilla::IsSame<Char, unsigned char>::value ||
                mozilla::IsSame<Char, signed char>::value,
                "UTF-8 data must be in 8-bit units");
  static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");

  UnsignedT pos = index;
  for (;;) {
    if (pos == 0) {
      break;
    }
    const bool isContinuation = (utf8Chars[pos] & 0xC0) == 0x80;
    if (!isContinuation) {
      break;
    }
    --pos;
  }

  return pos;
}

#undef UTF8UTILS_WARNING

#endif /* !defined(nsUTF8Utils_h_) */

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2		/* vim: set ts=8 sts=2 et sw=2 tw=80: */
3		/* This Source Code Form is subject to the terms of the Mozilla Public
4		* License, v. 2.0. If a copy of the MPL was not distributed with this
5		* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6		#ifndef nsUTF8Utils_h_
7		#define nsUTF8Utils_h_
8
9		// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
10		// file will provide signatures for the Mozilla abstract string types. It will
11		// use XPCOM assertion/debugging macros, etc.
12
13		#include "nscore.h"
14		#include "mozilla/Assertions.h"
15		#include "mozilla/EndianUtils.h"
16		#include "mozilla/TypeTraits.h"
17
18		#include "nsCharTraits.h"
19
20		#ifdef MOZILLA_INTERNAL_API
21		#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
22		#else
23		#define UTF8UTILS_WARNING(msg)
24		#endif
25
26		class UTF8traits
27		{
28		public:
29		static bool isASCII(char aChar)
30	0	{
31	0	return (aChar & 0x80) == 0x00;
32	0	}
33		static bool isInSeq(char aChar)
34	0	{
35	0	return (aChar & 0xC0) == 0x80;
36	0	}
37		static bool is2byte(char aChar)
38	0	{
39	0	return (aChar & 0xE0) == 0xC0;
40	0	}
41		static bool is3byte(char aChar)
42	0	{
43	0	return (aChar & 0xF0) == 0xE0;
44	0	}
45		static bool is4byte(char aChar)
46	0	{
47	0	return (aChar & 0xF8) == 0xF0;
48	0	}
49		static bool is5byte(char aChar)
50		{
51		return (aChar & 0xFC) == 0xF8;
52		}
53		static bool is6byte(char aChar)
54		{
55		return (aChar & 0xFE) == 0xFC;
56		}
57		// return the number of bytes in a sequence beginning with aChar
58		static int bytes(char aChar)
59	0	{
60	0	if (isASCII(aChar)) {
61	0	return 1;
62	0	}
63	0	if (is2byte(aChar)) {
64	0	return 2;
65	0	}
66	0	if (is3byte(aChar)) {
67	0	return 3;
68	0	}
69	0	if (is4byte(aChar)) {
70	0	return 4;
71	0	}
72	0	MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
73	0	return 1;
74	0	}
75		};
76
77		/**
78		* Extract the next Unicode scalar value from the buffer and return it. The
79		* pointer passed in is advanced to the start of the next character in the
80		* buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
81		* over the maximal valid prefix and *aErr is set to true (if aErr is not
82		* null).
83		*
84		* Note: This method never sets *aErr to false to allow error accumulation
85		* across multiple calls.
86		*
87		* Precondition: *aBuffer < aEnd
88		*/
89		class UTF8CharEnumerator
90		{
91		public:
92		static inline char32_t NextChar(const char** aBuffer,
93		const char* aEnd,
94		bool* aErr = nullptr)
95		{
96		MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
97		MOZ_ASSERT(aEnd, "null end pointer");
98
99		const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
100		const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
101
102		MOZ_ASSERT(p, "null buffer");
103		MOZ_ASSERT(p < end, "Bogus range");
104
105		unsigned char first = *p++;
106
107		if (MOZ_LIKELY(first < 0x80U)) {
108		*aBuffer = reinterpret_cast<const char*>(p);
109		return first;
110		}
111
112		// Unsigned underflow is defined behavior
113		if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
114		*aBuffer = reinterpret_cast<const char*>(p);
115		if (aErr) {
116		*aErr = true;
117		}
118		return 0xFFFDU;
119		}
120
121		unsigned char second = *p;
122
123		if (first < 0xE0U) {
124		// Two-byte
125		if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
126		*aBuffer = reinterpret_cast<const char*>(++p);
127		return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
128		}
129		*aBuffer = reinterpret_cast<const char*>(p);
130		if (aErr) {
131		*aErr = true;
132		}
133		return 0xFFFDU;
134		}
135
136		if (MOZ_LIKELY(first < 0xF0U)) {
137		// Three-byte
138		unsigned char lower = 0x80U;
139		unsigned char upper = 0xBFU;
140		if (first == 0xE0U) {
141		lower = 0xA0U;
142		} else if (first == 0xEDU) {
143		upper = 0x9FU;
144		}
145		if (MOZ_LIKELY(second >= lower && second <= upper)) {
146		if (MOZ_LIKELY(p != end)) {
147		unsigned char third = *++p;
148		if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
149		*aBuffer = reinterpret_cast<const char*>(++p);
150		return ((uint32_t(first) & 0xFU) << 12) |
151		((uint32_t(second) & 0x3FU) << 6) |
152		(uint32_t(third) & 0x3FU);
153		}
154		}
155		}
156		*aBuffer = reinterpret_cast<const char*>(p);
157		if (aErr) {
158		*aErr = true;
159		}
160		return 0xFFFDU;
161		}
162
163		// Four-byte
164		unsigned char lower = 0x80U;
165		unsigned char upper = 0xBFU;
166		if (first == 0xF0U) {
167		lower = 0x90U;
168		} else if (first == 0xF4U) {
169		upper = 0x8FU;
170		}
171		if (MOZ_LIKELY(second >= lower && second <= upper)) {
172		if (MOZ_LIKELY(p != end)) {
173		unsigned char third = *++p;
174		if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
175		if (MOZ_LIKELY(p != end)) {
176		unsigned char fourth = *++p;
177		if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
178		*aBuffer = reinterpret_cast<const char*>(++p);
179		return ((uint32_t(first) & 0x7U) << 18) |
180		((uint32_t(second) & 0x3FU) << 12) |
181		((uint32_t(third) & 0x3FU) << 6) |
182		(uint32_t(fourth) & 0x3FU);
183		}
184		}
185		}
186		}
187		}
188		*aBuffer = reinterpret_cast<const char*>(p);
189		if (aErr) {
190		*aErr = true;
191		}
192		return 0xFFFDU;
193		}
194		};
195
196		/**
197		* Extract the next Unicode scalar value from the buffer and return it. The
198		* pointer passed in is advanced to the start of the next character in the
199		* buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
200		* the unpaired surrogate and *aErr is set to true (if aErr is not null).
201		*
202		* Note: This method never sets *aErr to false to allow error accumulation
203		* across multiple calls.
204		*
205		* Precondition: *aBuffer < aEnd
206		*/
207		class UTF16CharEnumerator
208		{
209		public:
210		static inline char32_t NextChar(const char16_t** aBuffer,
211		const char16_t* aEnd,
212		bool* aErr = nullptr)
213		{
214		MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
215		MOZ_ASSERT(aEnd, "null end pointer");
216
217		const char16_t* p = *aBuffer;
218
219		MOZ_ASSERT(p, "null buffer");
220		MOZ_ASSERT(p < aEnd, "Bogus range");
221
222		char16_t c = *p++;
223
224		// Let's use encoding_rs-style code golf here.
225		// Unsigned underflow is defined behavior
226		char16_t cMinusSurrogateStart = c - 0xD800U;
227		if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
228		*aBuffer = p;
229		return c;
230		}
231		if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
232		// High surrogate
233		if (MOZ_LIKELY(p != aEnd)) {
234		char16_t second = *p;
235		// Unsigned underflow is defined behavior
236		if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
237		*aBuffer = ++p;
238		return (uint32_t(c) << 10) + uint32_t(second) -
239		(((0xD800U << 10) - 0x10000U) + 0xDC00U);
240		}
241		}
242		}
243		// Unpaired surrogate
244		*aBuffer = p;
245		if (aErr) {
246		*aErr = true;
247		}
248		return 0xFFFDU;
249		}
250		};
251
252		template<typename Char, typename UnsignedT>
253		inline UnsignedT
254		RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
255	0	{
256	0	static_assert(mozilla::IsSame<Char, char>::value ||
257	0	mozilla::IsSame<Char, unsigned char>::value ||
258	0	mozilla::IsSame<Char, signed char>::value,
259	0	"UTF-8 data must be in 8-bit units");
260	0	static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
261	0	while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80)
262	0	--index;
263	0
264	0	return index;
265	0	}
266
267		#undef UTF8UTILS_WARNING
268
269		#endif /* !defined(nsUTF8Utils_h_) */