/work/obj-fuzz/dist/include/js/CharacterEncoding.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
2 | | * vim: set ts=8 sts=4 et sw=4 tw=99: |
3 | | * This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | |
7 | | #ifndef js_CharacterEncoding_h |
8 | | #define js_CharacterEncoding_h |
9 | | |
10 | | #include "mozilla/Range.h" |
11 | | |
12 | | #include "js/TypeDecls.h" |
13 | | #include "js/Utility.h" |
14 | | |
15 | | class JSFlatString; |
16 | | |
17 | | namespace JS { |
18 | | |
19 | | /* |
20 | | * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI |
21 | | * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each |
22 | | * byte is treated as the 2-byte character with the same numeric value, so |
23 | | * there is no way to pass in a string containing characters beyond U+00FF. |
24 | | */ |
25 | | class Latin1Chars : public mozilla::Range<Latin1Char> |
26 | | { |
27 | | typedef mozilla::Range<Latin1Char> Base; |
28 | | |
29 | | public: |
30 | | using CharT = Latin1Char; |
31 | | |
32 | 0 | Latin1Chars() : Base() {} |
33 | 0 | Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {} |
34 | | Latin1Chars(const Latin1Char* aBytes, size_t aLength) |
35 | | : Base(const_cast<Latin1Char*>(aBytes), aLength) |
36 | 0 | {} |
37 | | Latin1Chars(const char* aBytes, size_t aLength) |
38 | | : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength) |
39 | 0 | {} |
40 | | }; |
41 | | |
42 | | /* |
43 | | * A Latin1Chars, but with \0 termination for C compatibility. |
44 | | */ |
45 | | class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> |
46 | | { |
47 | | typedef mozilla::RangedPtr<Latin1Char> Base; |
48 | | |
49 | | public: |
50 | | using CharT = Latin1Char; |
51 | | |
52 | 0 | Latin1CharsZ() : Base(nullptr, 0) {} |
53 | | |
54 | | Latin1CharsZ(char* aBytes, size_t aLength) |
55 | | : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) |
56 | 0 | { |
57 | 0 | MOZ_ASSERT(aBytes[aLength] == '\0'); |
58 | 0 | } |
59 | | |
60 | | Latin1CharsZ(Latin1Char* aBytes, size_t aLength) |
61 | | : Base(aBytes, aLength) |
62 | 0 | { |
63 | 0 | MOZ_ASSERT(aBytes[aLength] == '\0'); |
64 | 0 | } |
65 | | |
66 | | using Base::operator=; |
67 | | |
68 | 0 | char* c_str() { return reinterpret_cast<char*>(get()); } |
69 | | }; |
70 | | |
71 | | class UTF8Chars : public mozilla::Range<unsigned char> |
72 | | { |
73 | | typedef mozilla::Range<unsigned char> Base; |
74 | | |
75 | | public: |
76 | | using CharT = unsigned char; |
77 | | |
78 | 0 | UTF8Chars() : Base() {} |
79 | | UTF8Chars(char* aBytes, size_t aLength) |
80 | | : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) |
81 | 0 | {} |
82 | | UTF8Chars(const char* aBytes, size_t aLength) |
83 | | : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength) |
84 | 3 | {} |
85 | | }; |
86 | | |
87 | | /* |
88 | | * SpiderMonkey also deals directly with UTF-8 encoded text in some places. |
89 | | */ |
90 | | class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> |
91 | | { |
92 | | typedef mozilla::RangedPtr<unsigned char> Base; |
93 | | |
94 | | public: |
95 | | using CharT = unsigned char; |
96 | | |
97 | 0 | UTF8CharsZ() : Base(nullptr, 0) {} |
98 | | |
99 | | UTF8CharsZ(char* aBytes, size_t aLength) |
100 | | : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) |
101 | 14 | { |
102 | 14 | MOZ_ASSERT(aBytes[aLength] == '\0'); |
103 | 14 | } |
104 | | |
105 | | UTF8CharsZ(unsigned char* aBytes, size_t aLength) |
106 | | : Base(aBytes, aLength) |
107 | 0 | { |
108 | 0 | MOZ_ASSERT(aBytes[aLength] == '\0'); |
109 | 0 | } |
110 | | |
111 | | using Base::operator=; |
112 | | |
113 | 14 | char* c_str() { return reinterpret_cast<char*>(get()); } |
114 | | }; |
115 | | |
116 | | /* |
117 | | * A wrapper for a "const char*" that is encoded using UTF-8. |
118 | | * This class does not manage ownership of the data; that is left |
119 | | * to others. This differs from UTF8CharsZ in that the chars are |
120 | | * const and it disallows assignment. |
121 | | */ |
122 | | class JS_PUBLIC_API(ConstUTF8CharsZ) |
123 | | { |
124 | | const char* data_; |
125 | | |
126 | | public: |
127 | | using CharT = unsigned char; |
128 | | |
129 | | ConstUTF8CharsZ() : data_(nullptr) |
130 | 0 | {} |
131 | | |
132 | | ConstUTF8CharsZ(const char* aBytes, size_t aLength) |
133 | | : data_(aBytes) |
134 | 0 | { |
135 | 0 | MOZ_ASSERT(aBytes[aLength] == '\0'); |
136 | | #ifdef DEBUG |
137 | | validate(aLength); |
138 | | #endif |
139 | | } |
140 | | |
141 | 0 | const void* get() const { return data_; } |
142 | | |
143 | 0 | const char* c_str() const { return data_; } |
144 | | |
145 | 0 | explicit operator bool() const { return data_ != nullptr; } |
146 | | |
147 | | private: |
148 | | #ifdef DEBUG |
149 | | void validate(size_t aLength); |
150 | | #endif |
151 | | }; |
152 | | |
153 | | /* |
154 | | * SpiderMonkey uses a 2-byte character representation: it is a |
155 | | * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, |
156 | | * but unlike UCS-2, we do not strip UTF-16 extension (surrogate) code units. |
157 | | * This allows a sufficiently dedicated JavaScript program to be fully |
158 | | * Unicode-aware by manually interpreting the UTF-16 extension characters |
159 | | * embedded in the JS string. |
160 | | */ |
161 | | class TwoByteChars : public mozilla::Range<char16_t> |
162 | | { |
163 | | typedef mozilla::Range<char16_t> Base; |
164 | | |
165 | | public: |
166 | | using CharT = char16_t; |
167 | | |
168 | 0 | TwoByteChars() : Base() {} |
169 | 0 | TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} |
170 | 0 | TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {} |
171 | | }; |
172 | | |
173 | | /* |
174 | | * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. |
175 | | */ |
176 | | class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> |
177 | | { |
178 | | typedef mozilla::RangedPtr<char16_t> Base; |
179 | | |
180 | | public: |
181 | | using CharT = char16_t; |
182 | | |
183 | 0 | TwoByteCharsZ() : Base(nullptr, 0) {} |
184 | | |
185 | | TwoByteCharsZ(char16_t* chars, size_t length) |
186 | | : Base(chars, length) |
187 | 3 | { |
188 | 3 | MOZ_ASSERT(chars[length] == '\0'); |
189 | 3 | } |
190 | | |
191 | | using Base::operator=; |
192 | | }; |
193 | | |
194 | | typedef mozilla::RangedPtr<const char16_t> ConstCharPtr; |
195 | | |
196 | | /* |
197 | | * Like TwoByteChars, but the chars are const. |
198 | | */ |
199 | | class ConstTwoByteChars : public mozilla::Range<const char16_t> |
200 | | { |
201 | | typedef mozilla::Range<const char16_t> Base; |
202 | | |
203 | | public: |
204 | | using CharT = char16_t; |
205 | | |
206 | 0 | ConstTwoByteChars() : Base() {} |
207 | 0 | ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} |
208 | | }; |
209 | | |
210 | | /* |
211 | | * Convert a 2-byte character sequence to "ISO-Latin-1". This works by |
212 | | * truncating each 2-byte character in the sequence to a single byte. If the |
213 | | * source contains any UTF-16 extension characters, then this may give invalid |
214 | | * Latin1 output. The returned string is zero terminated. The returned string |
215 | | * or the returned string's |get()| must be freed with JS_free or js_free, |
216 | | * respectively. If allocation fails, an OOM error will be set and the function |
217 | | * will return a null Latin1CharsZ (which can be tested for with the ! operator). |
218 | | * This function cannot trigger GC. |
219 | | */ |
220 | | extern Latin1CharsZ |
221 | | LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, |
222 | | const mozilla::Range<const char16_t> tbchars); |
223 | | |
224 | | inline Latin1CharsZ |
225 | | LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, const char16_t* begin, size_t length) |
226 | 0 | { |
227 | 0 | const mozilla::Range<const char16_t> tbchars(begin, length); |
228 | 0 | return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars); |
229 | 0 | } |
230 | | |
231 | | template <typename CharT> |
232 | | extern UTF8CharsZ |
233 | | CharsToNewUTF8CharsZ(JSContext* maybeCx, const mozilla::Range<CharT> chars); |
234 | | |
235 | | JS_PUBLIC_API(uint32_t) |
236 | | Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length); |
237 | | |
238 | | /* |
239 | | * Inflate bytes in UTF-8 encoding to char16_t. |
240 | | * - On error, returns an empty TwoByteCharsZ. |
241 | | * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold |
242 | | * its length; the length value excludes the trailing null. |
243 | | */ |
244 | | extern JS_PUBLIC_API(TwoByteCharsZ) |
245 | | UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); |
246 | | |
247 | | /* |
248 | | * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ. |
249 | | */ |
250 | | extern JS_PUBLIC_API(TwoByteCharsZ) |
251 | | UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen); |
252 | | |
253 | | /* |
254 | | * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters |
255 | | * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8 |
256 | | * input. |
257 | | */ |
258 | | extern JS_PUBLIC_API(TwoByteCharsZ) |
259 | | LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); |
260 | | |
261 | | extern JS_PUBLIC_API(TwoByteCharsZ) |
262 | | LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen); |
263 | | |
264 | | /* |
265 | | * Returns the length of the char buffer required to encode |s| as UTF8. |
266 | | * Does not include the null-terminator. |
267 | | */ |
268 | | JS_PUBLIC_API(size_t) |
269 | | GetDeflatedUTF8StringLength(JSFlatString* s); |
270 | | |
271 | | /* |
272 | | * Encode |src| as UTF8. The caller must either ensure |dst| has enough space |
273 | | * to encode the entire string or pass the length of the buffer as |dstlenp|, |
274 | | * in which case the function will encode characters from the string until |
275 | | * the buffer is exhausted. Does not write the null terminator. |
276 | | * |
277 | | * If |dstlenp| is provided, it will be updated to hold the number of bytes |
278 | | * written to the buffer. If |numcharsp| is provided, it will be updated to hold |
279 | | * the number of Unicode characters written to the buffer (which can be less |
280 | | * than the length of the string, if the buffer is exhausted before the string |
281 | | * is fully encoded). |
282 | | */ |
283 | | JS_PUBLIC_API(void) |
284 | | DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst, |
285 | | size_t* dstlenp = nullptr, size_t* numcharsp = nullptr); |
286 | | |
287 | | /* |
288 | | * The smallest character encoding capable of fully representing a particular |
289 | | * string. |
290 | | */ |
291 | | enum class SmallestEncoding { |
292 | | ASCII, |
293 | | Latin1, |
294 | | UTF16 |
295 | | }; |
296 | | |
297 | | /* |
298 | | * Returns the smallest encoding possible for the given string: if all |
299 | | * codepoints are <128 then ASCII, otherwise if all codepoints are <256 |
300 | | * then Latin-1, else UTF16. |
301 | | */ |
302 | | JS_PUBLIC_API(SmallestEncoding) |
303 | | FindSmallestEncoding(UTF8Chars utf8); |
304 | | |
305 | | /* |
306 | | * Return a null-terminated Latin-1 string copied from the input string, |
307 | | * storing its length (excluding null terminator) in |*outlen|. Fail and |
308 | | * report an error if the string contains non-Latin-1 codepoints. Returns |
309 | | * Latin1CharsZ() on failure. |
310 | | */ |
311 | | extern JS_PUBLIC_API(Latin1CharsZ) |
312 | | UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); |
313 | | |
314 | | /* |
315 | | * Return a null-terminated Latin-1 string copied from the input string, |
316 | | * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1 |
317 | | * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure. |
318 | | */ |
319 | | extern JS_PUBLIC_API(Latin1CharsZ) |
320 | | LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); |
321 | | |
322 | | /* |
323 | | * Returns true if all characters in the given null-terminated string are |
324 | | * ASCII, i.e. < 0x80, false otherwise. |
325 | | */ |
326 | | extern JS_PUBLIC_API(bool) |
327 | | StringIsASCII(const char* s); |
328 | | |
329 | | } // namespace JS |
330 | | |
331 | 0 | inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); } |
332 | 0 | inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); } |
333 | | |
334 | | /** |
335 | | * DEPRECATED |
336 | | * |
337 | | * Allocate memory sufficient to contain the characters of |str| truncated to |
338 | | * Latin-1 and a trailing null terminator, fill the memory with the characters |
339 | | * interpreted in that manner plus the null terminator, and return a pointer to |
340 | | * the memory. |
341 | | * |
342 | | * This function *loses information* when it copies the characters of |str| if |
343 | | * |str| contains code units greater than 0xFF. Additionally, users that |
344 | | * depend on null-termination will misinterpret the copied characters if |str| |
345 | | * contains any nulls. Avoid using this function if possible, because it will |
346 | | * eventually be removed. |
347 | | */ |
348 | | extern JS_PUBLIC_API(JS::UniqueChars) |
349 | | JS_EncodeStringToLatin1(JSContext* cx, JSString* str); |
350 | | |
351 | | /** |
352 | | * DEPRECATED |
353 | | * |
354 | | * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string. |
355 | | * |
356 | | * This function *loses information* when it copies the characters of |str| if |
357 | | * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied |
358 | | * instead. |
359 | | * |
360 | | * The returned string is also subject to misinterpretation if |str| contains |
361 | | * any nulls (which are faithfully transcribed into the returned string, but |
362 | | * which will implicitly truncate the string if it's passed to functions that |
363 | | * expect null-terminated strings). |
364 | | * |
365 | | * Avoid using this function if possible, because we'll remove it once we can |
366 | | * devise a better API for the task. |
367 | | */ |
368 | | extern JS_PUBLIC_API(JS::UniqueChars) |
369 | | JS_EncodeStringToUTF8(JSContext* cx, JS::Handle<JSString*> str); |
370 | | |
371 | | /** |
372 | | * DEPRECATED |
373 | | * |
374 | | * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string. |
375 | | * |
376 | | * This function asserts in debug mode that the input string contains only |
377 | | * ASCII characters. |
378 | | * |
379 | | * The returned string is also subject to misinterpretation if |str| contains |
380 | | * any nulls (which are faithfully transcribed into the returned string, but |
381 | | * which will implicitly truncate the string if it's passed to functions that |
382 | | * expect null-terminated strings). |
383 | | * |
384 | | * Avoid using this function if possible, because we'll remove it once we can |
385 | | * devise a better API for the task. |
386 | | */ |
387 | | extern JS_PUBLIC_API(JS::UniqueChars) |
388 | | JS_EncodeStringToASCII(JSContext* cx, JSString* str); |
389 | | |
390 | | #endif /* js_CharacterEncoding_h */ |