Coverage Report

Created: 2018-09-25 14:53

/work/obj-fuzz/dist/include/js/CharacterEncoding.h
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2
 * vim: set ts=8 sts=4 et sw=4 tw=99:
3
 * This Source Code Form is subject to the terms of the Mozilla Public
4
 * License, v. 2.0. If a copy of the MPL was not distributed with this
5
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
#ifndef js_CharacterEncoding_h
8
#define js_CharacterEncoding_h
9
10
#include "mozilla/Range.h"
11
12
#include "js/TypeDecls.h"
13
#include "js/Utility.h"
14
15
class JSFlatString;
16
17
namespace JS {
18
19
/*
20
 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
21
 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
22
 * byte is widened to a 2-byte character, and there is no way to pass in a
23
 * string containing characters beyond U+00FF.
24
 */
25
class Latin1Chars : public mozilla::Range<Latin1Char>
26
{
27
    typedef mozilla::Range<Latin1Char> Base;
28
29
  public:
30
    using CharT = Latin1Char;
31
32
0
    Latin1Chars() : Base() {}
33
0
    Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
34
    Latin1Chars(const Latin1Char* aBytes, size_t aLength)
35
      : Base(const_cast<Latin1Char*>(aBytes), aLength)
36
0
    {}
37
    Latin1Chars(const char* aBytes, size_t aLength)
38
      : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
39
0
    {}
40
};
41
42
/*
43
 * A Latin1Chars, but with \0 termination for C compatibility.
44
 */
45
class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
46
{
47
    typedef mozilla::RangedPtr<Latin1Char> Base;
48
49
  public:
50
    using CharT = Latin1Char;
51
52
0
    Latin1CharsZ() : Base(nullptr, 0) {}
53
54
    Latin1CharsZ(char* aBytes, size_t aLength)
55
      : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
56
0
    {
57
0
        MOZ_ASSERT(aBytes[aLength] == '\0');
58
0
    }
59
60
    Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
61
      : Base(aBytes, aLength)
62
0
    {
63
0
        MOZ_ASSERT(aBytes[aLength] == '\0');
64
0
    }
65
66
    using Base::operator=;
67
68
0
    char* c_str() { return reinterpret_cast<char*>(get()); }
69
};
70
71
class UTF8Chars : public mozilla::Range<unsigned char>
72
{
73
    typedef mozilla::Range<unsigned char> Base;
74
75
  public:
76
    using CharT = unsigned char;
77
78
0
    UTF8Chars() : Base() {}
79
    UTF8Chars(char* aBytes, size_t aLength)
80
      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
81
0
    {}
82
    UTF8Chars(const char* aBytes, size_t aLength)
83
      : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
84
3
    {}
85
};
86
87
/*
88
 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
89
 */
90
class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
91
{
92
    typedef mozilla::RangedPtr<unsigned char> Base;
93
94
  public:
95
    using CharT = unsigned char;
96
97
0
    UTF8CharsZ() : Base(nullptr, 0) {}
98
99
    UTF8CharsZ(char* aBytes, size_t aLength)
100
      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
101
14
    {
102
14
        MOZ_ASSERT(aBytes[aLength] == '\0');
103
14
    }
104
105
    UTF8CharsZ(unsigned char* aBytes, size_t aLength)
106
      : Base(aBytes, aLength)
107
0
    {
108
0
        MOZ_ASSERT(aBytes[aLength] == '\0');
109
0
    }
110
111
    using Base::operator=;
112
113
14
    char* c_str() { return reinterpret_cast<char*>(get()); }
114
};
115
116
/*
117
 * A wrapper for a "const char*" that is encoded using UTF-8.
118
 * This class does not manage ownership of the data; that is left
119
 * to others.  This differs from UTF8CharsZ in that the chars are
120
 * const and it disallows assignment.
121
 */
122
class JS_PUBLIC_API(ConstUTF8CharsZ)
123
{
124
    const char* data_;
125
126
  public:
127
    using CharT = unsigned char;
128
129
    ConstUTF8CharsZ() : data_(nullptr)
130
0
    {}
131
132
    ConstUTF8CharsZ(const char* aBytes, size_t aLength)
133
      : data_(aBytes)
134
0
    {
135
0
        MOZ_ASSERT(aBytes[aLength] == '\0');
136
#ifdef DEBUG
137
        validate(aLength);
138
#endif
139
    }
140
141
0
    const void* get() const { return data_; }
142
143
0
    const char* c_str() const { return data_; }
144
145
0
    explicit operator bool() const { return data_ != nullptr; }
146
147
  private:
148
#ifdef DEBUG
149
    void validate(size_t aLength);
150
#endif
151
};
152
153
/*
154
 * SpiderMonkey uses a 2-byte character representation: it is a
155
 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
156
 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
157
 * sufficiently dedicated JavaScript program to be fully Unicode-aware by
158
 * manually interpreting UTF-16 extension characters embedded in the JS
159
 * string.
160
 */
161
class TwoByteChars : public mozilla::Range<char16_t>
162
{
163
    typedef mozilla::Range<char16_t> Base;
164
165
  public:
166
    using CharT = char16_t;
167
168
0
    TwoByteChars() : Base() {}
169
0
    TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
170
0
    TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
171
};
172
173
/*
174
 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
175
 */
176
class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
177
{
178
    typedef mozilla::RangedPtr<char16_t> Base;
179
180
  public:
181
    using CharT = char16_t;
182
183
0
    TwoByteCharsZ() : Base(nullptr, 0) {}
184
185
    TwoByteCharsZ(char16_t* chars, size_t length)
186
      : Base(chars, length)
187
3
    {
188
3
        MOZ_ASSERT(chars[length] == '\0');
189
3
    }
190
191
    using Base::operator=;
192
};
193
194
typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
195
196
/*
197
 * Like TwoByteChars, but the chars are const.
198
 */
199
class ConstTwoByteChars : public mozilla::Range<const char16_t>
200
{
201
    typedef mozilla::Range<const char16_t> Base;
202
203
  public:
204
    using CharT = char16_t;
205
206
0
    ConstTwoByteChars() : Base() {}
207
0
    ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
208
};
209
210
/*
211
 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
212
 * truncating each 2-byte code unit in the sequence to a single byte. If the source
213
 * contains any UTF-16 extension characters, then this may give invalid Latin1
214
 * output. The returned string is zero terminated. The returned string or the
215
 * returned string's |start()| must be freed with JS_free or js_free,
216
 * respectively. If allocation fails, an OOM error will be set and the method
217
 * will return a null Latin1CharsZ (which can be tested for with the ! operator).
218
 * This method cannot trigger GC.
219
 */
220
extern Latin1CharsZ
221
LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
222
                                   const mozilla::Range<const char16_t> tbchars);
223
224
inline Latin1CharsZ
225
LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, const char16_t* begin, size_t length)
226
0
{
227
0
    const mozilla::Range<const char16_t> tbchars(begin, length);
228
0
    return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
229
0
}
230
231
template <typename CharT>
232
extern UTF8CharsZ
233
CharsToNewUTF8CharsZ(JSContext* maybeCx, const mozilla::Range<CharT> chars);
234
235
JS_PUBLIC_API(uint32_t)
236
Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
237
238
/*
239
 * Inflate bytes in UTF-8 encoding to char16_t.
240
 * - On error, returns an empty TwoByteCharsZ.
241
 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
242
 *   its length; the length value excludes the trailing null.
243
 */
244
extern JS_PUBLIC_API(TwoByteCharsZ)
245
UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
246
247
/*
248
 * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
249
 */
250
extern JS_PUBLIC_API(TwoByteCharsZ)
251
UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
252
253
/*
254
 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
255
 * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
256
 * input.
257
 */
258
extern JS_PUBLIC_API(TwoByteCharsZ)
259
LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
260
261
extern JS_PUBLIC_API(TwoByteCharsZ)
262
LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
263
264
/*
265
 * Returns the length of the char buffer required to encode |s| as UTF8.
266
 * Does not include the null-terminator.
267
 */
268
JS_PUBLIC_API(size_t)
269
GetDeflatedUTF8StringLength(JSFlatString* s);
270
271
/*
272
 * Encode |src| as UTF8. The caller must either ensure |dst| has enough space
273
 * to encode the entire string or pass the length of the buffer as |dstlenp|,
274
 * in which case the function will encode characters from the string until
275
 * the buffer is exhausted. Does not write the null terminator.
276
 *
277
 * If |dstlenp| is provided, it will be updated to hold the number of bytes
278
 * written to the buffer. If |numcharsp| is provided, it will be updated to hold
279
 * the number of Unicode characters written to the buffer (which can be less
280
 * than the length of the string, if the buffer is exhausted before the string
281
 * is fully encoded).
282
 */
283
JS_PUBLIC_API(void)
284
DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
285
                          size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
286
287
/*
288
 * The smallest character encoding capable of fully representing a particular
289
 * string.
290
 */
291
enum class SmallestEncoding {
292
    ASCII,
293
    Latin1,
294
    UTF16
295
};
296
297
/*
298
 * Returns the smallest encoding possible for the given string: if all
299
 * codepoints are <128 then ASCII, otherwise if all codepoints are <256
300
 * Latin-1, else UTF16.
301
 */
302
JS_PUBLIC_API(SmallestEncoding)
303
FindSmallestEncoding(UTF8Chars utf8);
304
305
/*
306
 * Return a null-terminated Latin-1 string copied from the input string,
307
 * storing its length (excluding null terminator) in |*outlen|.  Fail and
308
 * report an error if the string contains non-Latin-1 codepoints.  Returns
309
 * Latin1CharsZ() on failure.
310
 */
311
extern JS_PUBLIC_API(Latin1CharsZ)
312
UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
313
314
/*
315
 * Return a null-terminated Latin-1 string copied from the input string,
316
 * storing its length (excluding null terminator) in |*outlen|.  Non-Latin-1
317
 * codepoints are replaced by '?'.  Returns Latin1CharsZ() on failure.
318
 */
319
extern JS_PUBLIC_API(Latin1CharsZ)
320
LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
321
322
/*
323
 * Returns true if all characters in the given null-terminated string are
324
 * ASCII, i.e. < 0x80, false otherwise.
325
 */
326
extern JS_PUBLIC_API(bool)
327
StringIsASCII(const char* s);
328
329
} // namespace JS
330
331
0
inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
332
0
inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
333
334
/**
335
 * DEPRECATED
336
 *
337
 * Allocate memory sufficient to contain the characters of |str| truncated to
338
 * Latin-1 and a trailing null terminator, fill the memory with the characters
339
 * interpreted in that manner plus the null terminator, and return a pointer to
340
 * the memory.
341
 *
342
 * This function *loses information* when it copies the characters of |str| if
343
 * |str| contains code units greater than 0xFF.  Additionally, users that
344
 * depend on null-termination will misinterpret the copied characters if |str|
345
 * contains any nulls.  Avoid using this function if possible, because it will
346
 * eventually be removed.
347
 */
348
extern JS_PUBLIC_API(JS::UniqueChars)
349
JS_EncodeStringToLatin1(JSContext* cx, JSString* str);
350
351
/**
352
 * DEPRECATED
353
 *
354
 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
355
 *
356
 * This function *loses information* when it copies the characters of |str| if
357
 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
358
 * instead.
359
 *
360
 * The returned string is also subject to misinterpretation if |str| contains
361
 * any nulls (which are faithfully transcribed into the returned string, but
362
 * which will implicitly truncate the string if it's passed to functions that
363
 * expect null-terminated strings).
364
 *
365
 * Avoid using this function if possible, because we'll remove it once we can
366
 * devise a better API for the task.
367
 */
368
extern JS_PUBLIC_API(JS::UniqueChars)
369
JS_EncodeStringToUTF8(JSContext* cx, JS::Handle<JSString*> str);
370
371
/**
372
 * DEPRECATED
373
 *
374
 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
375
 *
376
 * This function asserts in debug mode that the input string contains only
377
 * ASCII characters.
378
 *
379
 * The returned string is also subject to misinterpretation if |str| contains
380
 * any nulls (which are faithfully transcribed into the returned string, but
381
 * which will implicitly truncate the string if it's passed to functions that
382
 * expect null-terminated strings).
383
 *
384
 * Avoid using this function if possible, because we'll remove it once we can
385
 * devise a better API for the task.
386
 */
387
extern JS_PUBLIC_API(JS::UniqueChars)
388
JS_EncodeStringToASCII(JSContext* cx, JSString* str);
389
390
#endif /* js_CharacterEncoding_h */