Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/xpcom/string/nsUTF8Utils.h
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
3
/* This Source Code Form is subject to the terms of the Mozilla Public
4
 * License, v. 2.0. If a copy of the MPL was not distributed with this
5
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
#ifndef nsUTF8Utils_h_
7
#define nsUTF8Utils_h_
8
9
// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
10
// file will provide signatures for the Mozilla abstract string types. It will
11
// use XPCOM assertion/debugging macros, etc.
12
13
#include "nscore.h"
14
#include "mozilla/Assertions.h"
15
#include "mozilla/EndianUtils.h"
16
#include "mozilla/TypeTraits.h"
17
18
#include "nsCharTraits.h"
19
20
#ifdef MOZILLA_INTERNAL_API
21
#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
22
#else
23
#define UTF8UTILS_WARNING(msg)
24
#endif
25
26
/**
 * Predicates that classify a single UTF-8 code unit by its high-order bit
 * pattern, plus a helper that maps a lead byte to the sequence length it
 * announces.
 */
class UTF8traits
{
public:
  // 0xxxxxxx: a complete one-byte sequence.
  static bool isASCII(char aChar) { return hasPrefix(aChar, 0x80, 0x00); }

  // 10xxxxxx: a trail byte inside a multi-byte sequence.
  static bool isInSeq(char aChar) { return hasPrefix(aChar, 0xC0, 0x80); }

  // 110xxxxx: lead byte of a two-byte sequence.
  static bool is2byte(char aChar) { return hasPrefix(aChar, 0xE0, 0xC0); }

  // 1110xxxx: lead byte of a three-byte sequence.
  static bool is3byte(char aChar) { return hasPrefix(aChar, 0xF0, 0xE0); }

  // 11110xxx: lead byte of a four-byte sequence.
  static bool is4byte(char aChar) { return hasPrefix(aChar, 0xF8, 0xF0); }

  // 111110xx: lead byte of a five-byte form; bytes() does not recognize it.
  static bool is5byte(char aChar) { return hasPrefix(aChar, 0xFC, 0xF8); }

  // 1111110x: lead byte of a six-byte form; bytes() does not recognize it.
  static bool is6byte(char aChar) { return hasPrefix(aChar, 0xFE, 0xFC); }

  // Return the number of bytes in a sequence beginning with aChar. Any byte
  // that is not a one- to four-byte lead trips the assertion and yields 1.
  static int bytes(char aChar)
  {
    int length = 1;
    if (isASCII(aChar)) {
      length = 1;
    } else if (is2byte(aChar)) {
      length = 2;
    } else if (is3byte(aChar)) {
      length = 3;
    } else if (is4byte(aChar)) {
      length = 4;
    } else {
      MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
    }
    return length;
  }

private:
  // True when the bits of aChar selected by aMask equal aPattern. aChar is
  // promoted to int before masking, so only the masked low bits take part.
  static bool hasPrefix(char aChar, int aMask, int aPattern)
  {
    return (aChar & aMask) == aPattern;
  }
};
76
77
/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
 * over the maximal valid prefix and *aErr is set to true (if aErr is not
 * null).
 *
 * No byte at or past aEnd is read: when the valid prefix of a multi-byte
 * sequence runs up to aEnd, the sequence is reported as an error and
 * *aBuffer is left equal to aEnd.
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF8CharEnumerator
{
public:
  static inline char32_t NextChar(const char** aBuffer,
                                  const char* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < end, "Bogus range");

    unsigned char first = *p++;

    if (MOZ_LIKELY(first < 0x80U)) {
      // ASCII fast path.
      *aBuffer = reinterpret_cast<const char*>(p);
      return first;
    }

    // Valid lead bytes are 0xC2..0xF4. The subtraction wraps for anything
    // below 0xC2 (trail bytes and the overlong leads 0xC0/0xC1), so a single
    // comparison rejects both ends of the range.
    // Unsigned underflow is defined behavior
    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    // p != end was established above, so the second byte is in range.
    unsigned char second = *p;

    if (first < 0xE0U) {
      // Two-byte
      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
        *aBuffer = reinterpret_cast<const char*>(++p);
        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
      }
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    if (MOZ_LIKELY(first < 0xF0U)) {
      // Three-byte
      unsigned char lower = 0x80U;
      unsigned char upper = 0xBFU;
      if (first == 0xE0U) {
        // Exclude overlong encodings of code points below U+0800.
        lower = 0xA0U;
      } else if (first == 0xEDU) {
        // Exclude the surrogate range U+D800..U+DFFF.
        upper = 0x9FU;
      }
      if (MOZ_LIKELY(second >= lower && second <= upper)) {
        // Step past the second byte before testing against end so that the
        // third byte is only read when it lies inside the buffer.
        if (MOZ_LIKELY(++p != end)) {
          unsigned char third = *p;
          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
            *aBuffer = reinterpret_cast<const char*>(++p);
            return ((uint32_t(first) & 0xFU) << 12) |
                   ((uint32_t(second) & 0x3FU) << 6) |
                   (uint32_t(third) & 0x3FU);
          }
        }
      }
      // p rests on the first byte that did not continue the sequence (or on
      // end), i.e. just past the maximal valid prefix.
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    // Four-byte
    unsigned char lower = 0x80U;
    unsigned char upper = 0xBFU;
    if (first == 0xF0U) {
      // Exclude overlong encodings of code points below U+10000.
      lower = 0x90U;
    } else if (first == 0xF4U) {
      // Exclude values above U+10FFFF.
      upper = 0x8FU;
    }
    if (MOZ_LIKELY(second >= lower && second <= upper)) {
      // As in the three-byte case: advance first, then bounds-check, then
      // read, so neither the third nor the fourth byte is read at end.
      if (MOZ_LIKELY(++p != end)) {
        unsigned char third = *p;
        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
          if (MOZ_LIKELY(++p != end)) {
            unsigned char fourth = *p;
            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
              *aBuffer = reinterpret_cast<const char*>(++p);
              return ((uint32_t(first) & 0x7U) << 18) |
                     ((uint32_t(second) & 0x3FU) << 12) |
                     ((uint32_t(third) & 0x3FU) << 6) |
                     (uint32_t(fourth) & 0x3FU);
            }
          }
        }
      }
    }
    *aBuffer = reinterpret_cast<const char*>(p);
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
195
196
/**
 * Decode one Unicode scalar value from a UTF-16 buffer.
 *
 * On return *aBuffer points at the code unit following the decoded
 * character. A surrogate that is not part of a well-formed high/low pair is
 * an error: the return value is 0xFFFD, *aBuffer is advanced past that one
 * surrogate, and *aErr is set to true when aErr is non-null.
 *
 * Note: *aErr is only ever written with true, so a single flag can collect
 * errors over a sequence of calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF16CharEnumerator
{
public:
  static inline char32_t NextChar(const char16_t** aBuffer,
                                  const char16_t* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const char16_t* cursor = *aBuffer;

    MOZ_ASSERT(cursor, "null buffer");
    MOZ_ASSERT(cursor < aEnd, "Bogus range");

    const char16_t lead = *cursor;
    ++cursor;

    // Rebase so that the surrogate block starts at zero. The result is
    // truncated to 16 bits, so units below 0xD800 wrap to values above the
    // block and one unsigned comparison classifies the unit.
    const char16_t surrogateOffset = lead - 0xD800U;

    const bool isSurrogate = surrogateOffset <= (0xDFFFU - 0xD800U);
    if (MOZ_LIKELY(!isSurrogate)) {
      *aBuffer = cursor;
      return lead;
    }

    const bool isHighSurrogate = surrogateOffset <= (0xDBFFU - 0xD800U);
    if (MOZ_LIKELY(isHighSurrogate && cursor != aEnd)) {
      const char16_t trail = *cursor;
      // Same wrap-around trick for the low-surrogate range.
      if (MOZ_LIKELY((trail - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
        *aBuffer = cursor + 1;
        // Combine the pair and remove both surrogate biases in one
        // subtraction.
        return (uint32_t(lead) << 10) + uint32_t(trail) -
               (((0xD800U << 10) - 0x10000U) + 0xDC00U);
      }
    }

    // Unpaired surrogate: only the first unit is consumed.
    *aBuffer = cursor;
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
251
252
/**
 * Walk backwards from |index| while the byte at the current position is a
 * UTF-8 continuation byte (10xxxxxx) and return the position where the walk
 * stops. Position 0 is returned without its byte being examined. When
 * |index| is non-zero, utf8Chars[index] is read, so it must be a readable
 * position.
 */
template<typename Char, typename UnsignedT>
inline UnsignedT
RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
{
  static_assert(mozilla::IsSame<Char, char>::value ||
                mozilla::IsSame<Char, unsigned char>::value ||
                mozilla::IsSame<Char, signed char>::value,
                "UTF-8 data must be in 8-bit units");
  static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");

  UnsignedT pos = index;
  for (; pos > 0; --pos) {
    const bool isContinuation = (utf8Chars[pos] & 0xC0) == 0x80;
    if (!isContinuation) {
      break;
    }
  }
  return pos;
}
266
267
#undef UTF8UTILS_WARNING
268
269
#endif /* !defined(nsUTF8Utils_h_) */