/src/node/src/node_i18n.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright Joyent, Inc. and other Node contributors. |
2 | | // |
3 | | // Permission is hereby granted, free of charge, to any person obtaining a |
4 | | // copy of this software and associated documentation files (the |
5 | | // "Software"), to deal in the Software without restriction, including |
6 | | // without limitation the rights to use, copy, modify, merge, publish, |
7 | | // distribute, sublicense, and/or sell copies of the Software, and to permit |
8 | | // persons to whom the Software is furnished to do so, subject to the |
9 | | // following conditions: |
10 | | // |
11 | | // The above copyright notice and this permission notice shall be included |
12 | | // in all copies or substantial portions of the Software. |
13 | | // |
14 | | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
15 | | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
16 | | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN |
17 | | // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, |
18 | | // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
19 | | // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
20 | | // USE OR OTHER DEALINGS IN THE SOFTWARE. |
21 | | |
22 | | /* |
23 | | * notes: by srl295 |
24 | | * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data |
25 | | * ( stubdata/libicudata.a ) containing nothing, no data, and it's also |
26 | | * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT |
27 | | * macro names. That's the "english+root" data. |
28 | | * |
29 | | * If icu_data_path is non-null, the user has provided a path and we assume |
30 | | * it goes somewhere useful. We set that path in ICU, and exit. |
31 | | * If icu_data_path is null, they haven't set a path and we want the |
32 | | * "english+root" data. We call |
33 | | * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...) |
34 | | * to load up the english+root data. |
35 | | * |
36 | | * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full |
37 | | * data. All of the variables and command line options for changing data at |
38 | | * runtime are disabled, as they wouldn't fully override the internal data. |
39 | | * See: http://bugs.icu-project.org/trac/ticket/10924 |
40 | | */ |
41 | | |
42 | | |
43 | | #include "node_i18n.h" |
44 | | #include "node_external_reference.h" |
45 | | |
46 | | #if defined(NODE_HAVE_I18N_SUPPORT) |
47 | | |
48 | | #include "base_object-inl.h" |
49 | | #include "node.h" |
50 | | #include "node_buffer.h" |
51 | | #include "node_errors.h" |
52 | | #include "node_internals.h" |
53 | | #include "string_bytes.h" |
54 | | #include "util-inl.h" |
55 | | #include "v8.h" |
56 | | |
57 | | #include <unicode/putil.h> |
58 | | #include <unicode/timezone.h> |
59 | | #include <unicode/uchar.h> |
60 | | #include <unicode/uclean.h> |
61 | | #include <unicode/ucnv.h> |
62 | | #include <unicode/udata.h> |
63 | | #include <unicode/uidna.h> |
64 | | #include <unicode/ulocdata.h> |
65 | | #include <unicode/urename.h> |
66 | | #include <unicode/ustring.h> |
67 | | #include <unicode/utf16.h> |
68 | | #include <unicode/utf8.h> |
69 | | #include <unicode/utypes.h> |
70 | | #include <unicode/uvernum.h> |
71 | | #include <unicode/uversion.h> |
72 | | |
73 | | #ifdef NODE_HAVE_SMALL_ICU |
74 | | /* if this is defined, we have a 'secondary' entry point. |
75 | | compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */ |
76 | | #define SMALL_ICUDATA_ENTRY_POINT \ |
77 | | SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME) |
78 | | #define SMALL_DEF2(major, suff) SMALL_DEF(major, suff) |
79 | | #ifndef U_LIB_SUFFIX_C_NAME |
80 | | #define SMALL_DEF(major, suff) icusmdt##major##_dat |
81 | | #else |
82 | | #define SMALL_DEF(major, suff) icusmdt##suff##major##_dat |
83 | | #endif |
84 | | |
85 | | extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[]; |
86 | | #endif |
87 | | |
88 | | namespace node { |
89 | | |
90 | | using v8::Context; |
91 | | using v8::FunctionCallbackInfo; |
92 | | using v8::FunctionTemplate; |
93 | | using v8::Int32; |
94 | | using v8::Isolate; |
95 | | using v8::Local; |
96 | | using v8::MaybeLocal; |
97 | | using v8::NewStringType; |
98 | | using v8::Object; |
99 | | using v8::ObjectTemplate; |
100 | | using v8::String; |
101 | | using v8::Value; |
102 | | |
103 | | namespace i18n { |
104 | | namespace { |
105 | | |
106 | | template <typename T> |
107 | 4 | MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) { |
108 | 4 | MaybeLocal<Object> ret = Buffer::New(env, buf); |
109 | 4 | if (ret.IsEmpty()) |
110 | 0 | return ret; |
111 | | |
112 | 4 | static_assert(sizeof(T) == 1 || sizeof(T) == 2, |
113 | 4 | "Currently only one- or two-byte buffers are supported"); |
114 | 4 | if (sizeof(T) > 1 && IsBigEndian()) { |
115 | 0 | SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf); |
116 | 0 | SwapBytes16(retbuf_data, retbuf_length); |
117 | 0 | } |
118 | | |
119 | 4 | return ret; |
120 | 4 | } node_i18n.cc:v8::MaybeLocal<v8::Object> node::i18n::(anonymous namespace)::ToBufferEndian<char>(node::Environment*, node::MaybeStackBuffer<char, 1024ul>*) Line | Count | Source | 107 | 4 | MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) { | 108 | 4 | MaybeLocal<Object> ret = Buffer::New(env, buf); | 109 | 4 | if (ret.IsEmpty()) | 110 | 0 | return ret; | 111 | | | 112 | 4 | static_assert(sizeof(T) == 1 || sizeof(T) == 2, | 113 | 4 | "Currently only one- or two-byte buffers are supported"); | 114 | 4 | if (sizeof(T) > 1 && IsBigEndian()) { | 115 | 0 | SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf); | 116 | 0 | SwapBytes16(retbuf_data, retbuf_length); | 117 | 0 | } | 118 | | | 119 | 4 | return ret; | 120 | 4 | } |
Unexecuted instantiation: node_i18n.cc:v8::MaybeLocal<v8::Object> node::i18n::(anonymous namespace)::ToBufferEndian<char16_t>(node::Environment*, node::MaybeStackBuffer<char16_t, 1024ul>*) |
121 | | |
122 | | // One-Shot Converters |
123 | | |
124 | | void CopySourceBuffer(MaybeStackBuffer<UChar>* dest, |
125 | | const char* data, |
126 | | const size_t length, |
127 | 0 | const size_t length_in_chars) { |
128 | 0 | dest->AllocateSufficientStorage(length_in_chars); |
129 | 0 | char* dst = reinterpret_cast<char*>(**dest); |
130 | 0 | memcpy(dst, data, length); |
131 | 0 | if (IsBigEndian()) { |
132 | 0 | SwapBytes16(dst, length); |
133 | 0 | } |
134 | 0 | } |
135 | | |
136 | | typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env, |
137 | | const char* fromEncoding, |
138 | | const char* toEncoding, |
139 | | const char* source, |
140 | | const size_t source_length, |
141 | | UErrorCode* status); |
142 | | |
143 | | MaybeLocal<Object> Transcode(Environment* env, |
144 | | const char* fromEncoding, |
145 | | const char* toEncoding, |
146 | | const char* source, |
147 | | const size_t source_length, |
148 | 4 | UErrorCode* status) { |
149 | 4 | *status = U_ZERO_ERROR; |
150 | 4 | MaybeLocal<Object> ret; |
151 | 4 | MaybeStackBuffer<char> result; |
152 | 4 | Converter to(toEncoding); |
153 | 4 | Converter from(fromEncoding); |
154 | | |
155 | 4 | size_t sublen = ucnv_getMinCharSize(to.conv()); |
156 | 4 | std::string sub(sublen, '?'); |
157 | 4 | to.set_subst_chars(sub.c_str()); |
158 | | |
159 | 4 | const uint32_t limit = source_length * to.max_char_size(); |
160 | 4 | result.AllocateSufficientStorage(limit); |
161 | 4 | char* target = *result; |
162 | 4 | ucnv_convertEx(to.conv(), from.conv(), &target, target + limit, |
163 | 4 | &source, source + source_length, nullptr, nullptr, |
164 | 4 | nullptr, nullptr, true, true, status); |
165 | 4 | if (U_SUCCESS(*status)) { |
166 | 4 | result.SetLength(target - &result[0]); |
167 | 4 | ret = ToBufferEndian(env, &result); |
168 | 4 | } |
169 | 4 | return ret; |
170 | 4 | } |
171 | | |
172 | | MaybeLocal<Object> TranscodeToUcs2(Environment* env, |
173 | | const char* fromEncoding, |
174 | | const char* toEncoding, |
175 | | const char* source, |
176 | | const size_t source_length, |
177 | 0 | UErrorCode* status) { |
178 | 0 | *status = U_ZERO_ERROR; |
179 | 0 | MaybeLocal<Object> ret; |
180 | 0 | MaybeStackBuffer<UChar> destbuf(source_length); |
181 | 0 | Converter from(fromEncoding); |
182 | 0 | const size_t length_in_chars = source_length * sizeof(UChar); |
183 | 0 | ucnv_toUChars(from.conv(), *destbuf, length_in_chars, |
184 | 0 | source, source_length, status); |
185 | 0 | if (U_SUCCESS(*status)) |
186 | 0 | ret = ToBufferEndian(env, &destbuf); |
187 | 0 | return ret; |
188 | 0 | } |
189 | | |
190 | | MaybeLocal<Object> TranscodeFromUcs2(Environment* env, |
191 | | const char* fromEncoding, |
192 | | const char* toEncoding, |
193 | | const char* source, |
194 | | const size_t source_length, |
195 | 0 | UErrorCode* status) { |
196 | 0 | *status = U_ZERO_ERROR; |
197 | 0 | MaybeStackBuffer<UChar> sourcebuf; |
198 | 0 | MaybeLocal<Object> ret; |
199 | 0 | Converter to(toEncoding); |
200 | |
|
201 | 0 | size_t sublen = ucnv_getMinCharSize(to.conv()); |
202 | 0 | std::string sub(sublen, '?'); |
203 | 0 | to.set_subst_chars(sub.c_str()); |
204 | |
|
205 | 0 | const size_t length_in_chars = source_length / sizeof(UChar); |
206 | 0 | CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); |
207 | 0 | MaybeStackBuffer<char> destbuf(length_in_chars); |
208 | 0 | const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars, |
209 | 0 | *sourcebuf, length_in_chars, status); |
210 | 0 | if (U_SUCCESS(*status)) { |
211 | 0 | destbuf.SetLength(len); |
212 | 0 | ret = ToBufferEndian(env, &destbuf); |
213 | 0 | } |
214 | 0 | return ret; |
215 | 0 | } |
216 | | |
217 | | MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env, |
218 | | const char* fromEncoding, |
219 | | const char* toEncoding, |
220 | | const char* source, |
221 | | const size_t source_length, |
222 | 0 | UErrorCode* status) { |
223 | 0 | *status = U_ZERO_ERROR; |
224 | 0 | MaybeStackBuffer<UChar> destbuf; |
225 | 0 | int32_t result_length; |
226 | 0 | u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length, |
227 | 0 | source, source_length, status); |
228 | 0 | MaybeLocal<Object> ret; |
229 | 0 | if (U_SUCCESS(*status)) { |
230 | 0 | destbuf.SetLength(result_length); |
231 | 0 | ret = ToBufferEndian(env, &destbuf); |
232 | 0 | } else if (*status == U_BUFFER_OVERFLOW_ERROR) { |
233 | 0 | *status = U_ZERO_ERROR; |
234 | 0 | destbuf.AllocateSufficientStorage(result_length); |
235 | 0 | u_strFromUTF8(*destbuf, result_length, &result_length, |
236 | 0 | source, source_length, status); |
237 | 0 | if (U_SUCCESS(*status)) { |
238 | 0 | destbuf.SetLength(result_length); |
239 | 0 | ret = ToBufferEndian(env, &destbuf); |
240 | 0 | } |
241 | 0 | } |
242 | 0 | return ret; |
243 | 0 | } |
244 | | |
245 | | MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env, |
246 | | const char* fromEncoding, |
247 | | const char* toEncoding, |
248 | | const char* source, |
249 | | const size_t source_length, |
250 | 0 | UErrorCode* status) { |
251 | 0 | *status = U_ZERO_ERROR; |
252 | 0 | MaybeLocal<Object> ret; |
253 | 0 | const size_t length_in_chars = source_length / sizeof(UChar); |
254 | 0 | int32_t result_length; |
255 | 0 | MaybeStackBuffer<UChar> sourcebuf; |
256 | 0 | MaybeStackBuffer<char> destbuf; |
257 | 0 | CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); |
258 | 0 | u_strToUTF8(*destbuf, destbuf.capacity(), &result_length, |
259 | 0 | *sourcebuf, length_in_chars, status); |
260 | 0 | if (U_SUCCESS(*status)) { |
261 | 0 | destbuf.SetLength(result_length); |
262 | 0 | ret = ToBufferEndian(env, &destbuf); |
263 | 0 | } else if (*status == U_BUFFER_OVERFLOW_ERROR) { |
264 | 0 | *status = U_ZERO_ERROR; |
265 | 0 | destbuf.AllocateSufficientStorage(result_length); |
266 | 0 | u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf, |
267 | 0 | length_in_chars, status); |
268 | 0 | if (U_SUCCESS(*status)) { |
269 | 0 | destbuf.SetLength(result_length); |
270 | 0 | ret = ToBufferEndian(env, &destbuf); |
271 | 0 | } |
272 | 0 | } |
273 | 0 | return ret; |
274 | 0 | } |
275 | | |
276 | 8 | const char* EncodingName(const enum encoding encoding) { |
277 | 8 | switch (encoding) { |
278 | 4 | case ASCII: return "us-ascii"; |
279 | 0 | case LATIN1: return "iso8859-1"; |
280 | 0 | case UCS2: return "utf16le"; |
281 | 4 | case UTF8: return "utf-8"; |
282 | 0 | default: return nullptr; |
283 | 8 | } |
284 | 8 | } |
285 | | |
286 | 8 | bool SupportedEncoding(const enum encoding encoding) { |
287 | 8 | switch (encoding) { |
288 | 4 | case ASCII: |
289 | 4 | case LATIN1: |
290 | 4 | case UCS2: |
291 | 8 | case UTF8: return true; |
292 | 0 | default: return false; |
293 | 8 | } |
294 | 8 | } |
295 | | |
296 | 4 | void Transcode(const FunctionCallbackInfo<Value>&args) { |
297 | 4 | Environment* env = Environment::GetCurrent(args); |
298 | 4 | Isolate* isolate = env->isolate(); |
299 | 4 | UErrorCode status = U_ZERO_ERROR; |
300 | 4 | MaybeLocal<Object> result; |
301 | | |
302 | 4 | ArrayBufferViewContents<char> input(args[0]); |
303 | 4 | const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER); |
304 | 4 | const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER); |
305 | | |
306 | 4 | if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) { |
307 | 4 | TranscodeFunc tfn = &Transcode; |
308 | 4 | switch (fromEncoding) { |
309 | 0 | case ASCII: |
310 | 0 | case LATIN1: |
311 | 0 | if (toEncoding == UCS2) |
312 | 0 | tfn = &TranscodeToUcs2; |
313 | 0 | break; |
314 | 4 | case UTF8: |
315 | 4 | if (toEncoding == UCS2) |
316 | 0 | tfn = &TranscodeUcs2FromUtf8; |
317 | 4 | break; |
318 | 0 | case UCS2: |
319 | 0 | switch (toEncoding) { |
320 | 0 | case UCS2: |
321 | 0 | tfn = &Transcode; |
322 | 0 | break; |
323 | 0 | case UTF8: |
324 | 0 | tfn = &TranscodeUtf8FromUcs2; |
325 | 0 | break; |
326 | 0 | default: |
327 | 0 | tfn = &TranscodeFromUcs2; |
328 | 0 | } |
329 | 0 | break; |
330 | 0 | default: |
331 | | // This should not happen because of the SupportedEncoding checks |
332 | 0 | ABORT(); |
333 | 4 | } |
334 | | |
335 | 4 | result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding), |
336 | 4 | input.data(), input.length(), &status); |
337 | 4 | } else { |
338 | 0 | status = U_ILLEGAL_ARGUMENT_ERROR; |
339 | 0 | } |
340 | | |
341 | 4 | if (result.IsEmpty()) |
342 | 0 | return args.GetReturnValue().Set(status); |
343 | | |
344 | 4 | return args.GetReturnValue().Set(result.ToLocalChecked()); |
345 | 4 | } |
346 | | |
347 | 0 | void ICUErrorName(const FunctionCallbackInfo<Value>& args) { |
348 | 0 | Environment* env = Environment::GetCurrent(args); |
349 | 0 | CHECK(args[0]->IsInt32()); |
350 | 0 | UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value()); |
351 | 0 | args.GetReturnValue().Set( |
352 | 0 | String::NewFromUtf8(env->isolate(), |
353 | 0 | u_errorName(status)).ToLocalChecked()); |
354 | 0 | } |
355 | | |
356 | | } // anonymous namespace |
357 | | |
358 | 8 | Converter::Converter(const char* name, const char* sub) { |
359 | 8 | UErrorCode status = U_ZERO_ERROR; |
360 | 8 | UConverter* conv = ucnv_open(name, &status); |
361 | 8 | CHECK(U_SUCCESS(status)); |
362 | 8 | conv_.reset(conv); |
363 | 8 | set_subst_chars(sub); |
364 | 8 | } |
365 | | |
366 | | Converter::Converter(UConverter* converter, const char* sub) |
367 | 0 | : conv_(converter) { |
368 | 0 | set_subst_chars(sub); |
369 | 0 | } |
370 | | |
371 | 12 | void Converter::set_subst_chars(const char* sub) { |
372 | 12 | CHECK(conv_); |
373 | 12 | UErrorCode status = U_ZERO_ERROR; |
374 | 12 | if (sub != nullptr) { |
375 | 4 | ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status); |
376 | 4 | CHECK(U_SUCCESS(status)); |
377 | 4 | } |
378 | 12 | } |
379 | | |
380 | 0 | void Converter::reset() { |
381 | 0 | ucnv_reset(conv_.get()); |
382 | 0 | } |
383 | | |
384 | 0 | size_t Converter::min_char_size() const { |
385 | 0 | CHECK(conv_); |
386 | 0 | return ucnv_getMinCharSize(conv_.get()); |
387 | 0 | } |
388 | | |
389 | 4 | size_t Converter::max_char_size() const { |
390 | 4 | CHECK(conv_); |
391 | 4 | return ucnv_getMaxCharSize(conv_.get()); |
392 | 4 | } |
393 | | |
394 | 0 | void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) { |
395 | 0 | Environment* env = Environment::GetCurrent(args); |
396 | |
|
397 | 0 | CHECK_GE(args.Length(), 1); |
398 | 0 | Utf8Value label(env->isolate(), args[0]); |
399 | |
|
400 | 0 | UErrorCode status = U_ZERO_ERROR; |
401 | 0 | ConverterPointer conv(ucnv_open(*label, &status)); |
402 | 0 | args.GetReturnValue().Set(!!U_SUCCESS(status)); |
403 | 0 | } |
404 | | |
405 | 0 | void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) { |
406 | 0 | Environment* env = Environment::GetCurrent(args); |
407 | |
|
408 | 0 | Local<ObjectTemplate> t = env->i18n_converter_template(); |
409 | 0 | Local<Object> obj; |
410 | 0 | if (!t->NewInstance(env->context()).ToLocal(&obj)) return; |
411 | | |
412 | 0 | CHECK_GE(args.Length(), 2); |
413 | 0 | Utf8Value label(env->isolate(), args[0]); |
414 | 0 | int flags = args[1]->Uint32Value(env->context()).ToChecked(); |
415 | 0 | bool fatal = |
416 | 0 | (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL; |
417 | |
|
418 | 0 | UErrorCode status = U_ZERO_ERROR; |
419 | 0 | UConverter* conv = ucnv_open(*label, &status); |
420 | 0 | if (U_FAILURE(status)) |
421 | 0 | return; |
422 | | |
423 | 0 | if (fatal) { |
424 | 0 | status = U_ZERO_ERROR; |
425 | 0 | ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, |
426 | 0 | nullptr, nullptr, nullptr, &status); |
427 | 0 | } |
428 | |
|
429 | 0 | auto converter = new ConverterObject(env, obj, conv, flags); |
430 | 0 | size_t sublen = ucnv_getMinCharSize(conv); |
431 | 0 | std::string sub(sublen, '?'); |
432 | 0 | converter->set_subst_chars(sub.c_str()); |
433 | |
|
434 | 0 | args.GetReturnValue().Set(obj); |
435 | 0 | } |
436 | | |
437 | 0 | void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) { |
438 | 0 | Environment* env = Environment::GetCurrent(args); |
439 | |
|
440 | 0 | CHECK_GE(args.Length(), 4); // Converter, Buffer, Flags, Encoding |
441 | | |
442 | 0 | ConverterObject* converter; |
443 | 0 | ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>()); |
444 | | |
445 | 0 | if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() || |
446 | 0 | args[1]->IsArrayBufferView())) { |
447 | 0 | return node::THROW_ERR_INVALID_ARG_TYPE( |
448 | 0 | env->isolate(), |
449 | 0 | "The \"input\" argument must be an instance of SharedArrayBuffer, " |
450 | 0 | "ArrayBuffer or ArrayBufferView."); |
451 | 0 | } |
452 | | |
453 | 0 | ArrayBufferViewContents<char> input(args[1]); |
454 | 0 | int flags = args[2]->Uint32Value(env->context()).ToChecked(); |
455 | |
|
456 | 0 | CHECK(args[3]->IsString()); |
457 | 0 | Local<String> from_encoding = args[3].As<String>(); |
458 | |
|
459 | 0 | UErrorCode status = U_ZERO_ERROR; |
460 | 0 | MaybeStackBuffer<UChar> result; |
461 | |
|
462 | 0 | UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH; |
463 | | |
464 | | // When flushing the final chunk, the limit is the maximum |
465 | | // of either the input buffer length or the number of pending |
466 | | // characters times the min char size, multiplied by 2 as unicode may |
467 | | // take up to 2 UChars to encode a character |
468 | 0 | size_t limit = 2 * converter->min_char_size() * |
469 | 0 | (!flush ? |
470 | 0 | input.length() : |
471 | 0 | std::max( |
472 | 0 | input.length(), |
473 | 0 | static_cast<size_t>( |
474 | 0 | ucnv_toUCountPending(converter->conv(), &status)))); |
475 | 0 | status = U_ZERO_ERROR; |
476 | |
|
477 | 0 | if (limit > 0) |
478 | 0 | result.AllocateSufficientStorage(limit); |
479 | |
|
480 | 0 | auto cleanup = OnScopeLeave([&]() { |
481 | 0 | if (flush) { |
482 | | // Reset the converter state. |
483 | 0 | converter->set_bom_seen(false); |
484 | 0 | converter->reset(); |
485 | 0 | } |
486 | 0 | }); |
487 | |
|
488 | 0 | const char* source = input.data(); |
489 | 0 | size_t source_length = input.length(); |
490 | |
|
491 | 0 | UChar* target = *result; |
492 | 0 | ucnv_toUnicode(converter->conv(), |
493 | 0 | &target, |
494 | 0 | target + limit, |
495 | 0 | &source, |
496 | 0 | source + source_length, |
497 | 0 | nullptr, |
498 | 0 | flush, |
499 | 0 | &status); |
500 | |
|
501 | 0 | if (U_SUCCESS(status)) { |
502 | 0 | bool omit_initial_bom = false; |
503 | 0 | if (limit > 0) { |
504 | 0 | result.SetLength(target - &result[0]); |
505 | 0 | if (result.length() > 0 && |
506 | 0 | converter->unicode() && |
507 | 0 | !converter->ignore_bom() && |
508 | 0 | !converter->bom_seen()) { |
509 | | // If the very first result in the stream is a BOM, and we are not |
510 | | // explicitly told to ignore it, then we mark it for discarding. |
511 | 0 | if (result[0] == 0xFEFF) |
512 | 0 | omit_initial_bom = true; |
513 | 0 | converter->set_bom_seen(true); |
514 | 0 | } |
515 | 0 | } |
516 | |
|
517 | 0 | Local<Value> error; |
518 | 0 | UChar* output = result.out(); |
519 | 0 | size_t beginning = 0; |
520 | 0 | size_t length = result.length() * sizeof(UChar); |
521 | |
|
522 | 0 | if (omit_initial_bom) { |
523 | | // Perform `ret = ret.slice(2)`. |
524 | 0 | beginning += 2; |
525 | 0 | length -= 2; |
526 | 0 | } |
527 | |
|
528 | 0 | char* value = reinterpret_cast<char*>(output) + beginning; |
529 | |
|
530 | 0 | if (IsBigEndian()) { |
531 | 0 | SwapBytes16(value, length); |
532 | 0 | } |
533 | |
|
534 | 0 | MaybeLocal<Value> encoded = |
535 | 0 | StringBytes::Encode(env->isolate(), value, length, UCS2, &error); |
536 | |
|
537 | 0 | Local<Value> ret; |
538 | 0 | if (encoded.ToLocal(&ret)) { |
539 | 0 | args.GetReturnValue().Set(ret); |
540 | 0 | return; |
541 | 0 | } |
542 | 0 | } |
543 | | |
544 | 0 | node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA( |
545 | 0 | env->isolate(), |
546 | 0 | "The encoded data was not valid for encoding %s", |
547 | 0 | *node::Utf8Value(env->isolate(), from_encoding)); |
548 | 0 | } |
549 | | |
550 | | ConverterObject::ConverterObject( |
551 | | Environment* env, |
552 | | Local<Object> wrap, |
553 | | UConverter* converter, |
554 | | int flags, |
555 | | const char* sub) |
556 | 0 | : BaseObject(env, wrap), |
557 | 0 | Converter(converter, sub), |
558 | 0 | flags_(flags) { |
559 | 0 | MakeWeak(); |
560 | |
|
561 | 0 | switch (ucnv_getType(converter)) { |
562 | 0 | case UCNV_UTF8: |
563 | 0 | case UCNV_UTF16_BigEndian: |
564 | 0 | case UCNV_UTF16_LittleEndian: |
565 | 0 | flags_ |= CONVERTER_FLAGS_UNICODE; |
566 | 0 | break; |
567 | 0 | default: { |
568 | | // Fall through |
569 | 0 | } |
570 | 0 | } |
571 | 0 | } |
572 | | |
573 | 126k | bool InitializeICUDirectory(const std::string& path, std::string* error) { |
574 | 126k | UErrorCode status = U_ZERO_ERROR; |
575 | 126k | if (path.empty()) { |
576 | | #ifdef NODE_HAVE_SMALL_ICU |
577 | | // install the 'small' data. |
578 | | udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status); |
579 | | #else // !NODE_HAVE_SMALL_ICU |
580 | | // no small data, so nothing to do. |
581 | 126k | #endif // !NODE_HAVE_SMALL_ICU |
582 | 126k | } else { |
583 | 0 | u_setDataDirectory(path.c_str()); |
584 | 0 | u_init(&status); |
585 | 0 | } |
586 | 126k | if (status == U_ZERO_ERROR) { |
587 | 126k | return true; |
588 | 126k | } |
589 | | |
590 | 0 | *error = u_errorName(status); |
591 | 0 | return false; |
592 | 126k | } |
593 | | |
594 | 0 | void SetDefaultTimeZone(const char* tzid) { |
595 | 0 | size_t tzidlen = strlen(tzid) + 1; |
596 | 0 | UErrorCode status = U_ZERO_ERROR; |
597 | 0 | MaybeStackBuffer<UChar, 256> id(tzidlen); |
598 | 0 | u_charsToUChars(tzid, id.out(), tzidlen); |
599 | | // This is threadsafe: |
600 | 0 | ucal_setDefaultTimeZone(id.out(), &status); |
601 | 0 | CHECK(U_SUCCESS(status)); |
602 | 0 | } |
603 | | |
604 | | int32_t ToUnicode(MaybeStackBuffer<char>* buf, |
605 | | const char* input, |
606 | 0 | size_t length) { |
607 | 0 | UErrorCode status = U_ZERO_ERROR; |
608 | 0 | uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE; |
609 | 0 | UIDNA* uidna = uidna_openUTS46(options, &status); |
610 | 0 | if (U_FAILURE(status)) |
611 | 0 | return -1; |
612 | 0 | UIDNAInfo info = UIDNA_INFO_INITIALIZER; |
613 | |
|
614 | 0 | int32_t len = uidna_nameToUnicodeUTF8(uidna, |
615 | 0 | input, length, |
616 | 0 | **buf, buf->capacity(), |
617 | 0 | &info, |
618 | 0 | &status); |
619 | | |
620 | | // Do not check info.errors like we do with ToASCII since ToUnicode always |
621 | | // returns a string, despite any possible errors that may have occurred. |
622 | |
|
623 | 0 | if (status == U_BUFFER_OVERFLOW_ERROR) { |
624 | 0 | status = U_ZERO_ERROR; |
625 | 0 | buf->AllocateSufficientStorage(len); |
626 | 0 | len = uidna_nameToUnicodeUTF8(uidna, |
627 | 0 | input, length, |
628 | 0 | **buf, buf->capacity(), |
629 | 0 | &info, |
630 | 0 | &status); |
631 | 0 | } |
632 | | |
633 | | // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode |
634 | | // string, regardless of whether an error occurred. |
635 | |
|
636 | 0 | if (U_FAILURE(status)) { |
637 | 0 | len = -1; |
638 | 0 | buf->SetLength(0); |
639 | 0 | } else { |
640 | 0 | buf->SetLength(len); |
641 | 0 | } |
642 | |
|
643 | 0 | uidna_close(uidna); |
644 | 0 | return len; |
645 | 0 | } |
646 | | |
647 | | int32_t ToASCII(MaybeStackBuffer<char>* buf, |
648 | | const char* input, |
649 | | size_t length, |
650 | 0 | idna_mode mode) { |
651 | 0 | UErrorCode status = U_ZERO_ERROR; |
652 | 0 | uint32_t options = // CheckHyphens = false; handled later |
653 | 0 | UIDNA_CHECK_BIDI | // CheckBidi = true |
654 | 0 | UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true |
655 | 0 | UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing |
656 | 0 | if (mode == idna_mode::kStrict) { |
657 | 0 | options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict |
658 | | // VerifyDnsLength = beStrict; |
659 | | // handled later |
660 | 0 | } |
661 | |
|
662 | 0 | UIDNA* uidna = uidna_openUTS46(options, &status); |
663 | 0 | if (U_FAILURE(status)) |
664 | 0 | return -1; |
665 | 0 | UIDNAInfo info = UIDNA_INFO_INITIALIZER; |
666 | |
|
667 | 0 | int32_t len = uidna_nameToASCII_UTF8(uidna, |
668 | 0 | input, length, |
669 | 0 | **buf, buf->capacity(), |
670 | 0 | &info, |
671 | 0 | &status); |
672 | |
|
673 | 0 | if (status == U_BUFFER_OVERFLOW_ERROR) { |
674 | 0 | status = U_ZERO_ERROR; |
675 | 0 | buf->AllocateSufficientStorage(len); |
676 | 0 | len = uidna_nameToASCII_UTF8(uidna, |
677 | 0 | input, length, |
678 | 0 | **buf, buf->capacity(), |
679 | 0 | &info, |
680 | 0 | &status); |
681 | 0 | } |
682 | | |
683 | | // In UTS #46 which specifies ToASCII, certain error conditions are |
684 | | // configurable through options, and the WHATWG URL Standard promptly elects |
685 | | // to disable some of them to accommodate for real-world use cases. |
686 | | // Unfortunately, ICU4C's IDNA module does not support disabling some of |
687 | | // these options through `options` above, and thus continues throwing |
688 | | // unnecessary errors. To counter this situation, we just filter out the |
689 | | // errors that may have happened afterwards, before deciding whether to |
690 | | // return an error from this function. |
691 | | |
692 | | // CheckHyphens = false |
693 | | // (Specified in the current UTS #46 draft rev. 18.) |
694 | | // Refs: |
695 | | // - https://github.com/whatwg/url/issues/53 |
696 | | // - https://github.com/whatwg/url/pull/309 |
697 | | // - http://www.unicode.org/review/pri317/ |
698 | | // - http://www.unicode.org/reports/tr46/tr46-18.html |
699 | | // - https://www.icann.org/news/announcement-2000-01-07-en |
700 | 0 | info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; |
701 | 0 | info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; |
702 | 0 | info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; |
703 | |
|
704 | 0 | if (mode != idna_mode::kStrict) { |
705 | | // VerifyDnsLength = beStrict |
706 | 0 | info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; |
707 | 0 | info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; |
708 | 0 | info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
709 | 0 | } |
710 | |
|
711 | 0 | if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) { |
712 | 0 | len = -1; |
713 | 0 | buf->SetLength(0); |
714 | 0 | } else { |
715 | 0 | buf->SetLength(len); |
716 | 0 | } |
717 | |
|
718 | 0 | uidna_close(uidna); |
719 | 0 | return len; |
720 | 0 | } |
721 | | |
722 | 0 | static void ToUnicode(const FunctionCallbackInfo<Value>& args) { |
723 | 0 | Environment* env = Environment::GetCurrent(args); |
724 | 0 | CHECK_GE(args.Length(), 1); |
725 | 0 | CHECK(args[0]->IsString()); |
726 | 0 | Utf8Value val(env->isolate(), args[0]); |
727 | |
|
728 | 0 | MaybeStackBuffer<char> buf; |
729 | 0 | int32_t len = ToUnicode(&buf, *val, val.length()); |
730 | |
|
731 | 0 | if (len < 0) { |
732 | 0 | return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode"); |
733 | 0 | } |
734 | | |
735 | 0 | args.GetReturnValue().Set( |
736 | 0 | String::NewFromUtf8(env->isolate(), |
737 | 0 | *buf, |
738 | 0 | NewStringType::kNormal, |
739 | 0 | len).ToLocalChecked()); |
740 | 0 | } |
741 | | |
742 | 0 | static void ToASCII(const FunctionCallbackInfo<Value>& args) { |
743 | 0 | Environment* env = Environment::GetCurrent(args); |
744 | 0 | CHECK_GE(args.Length(), 1); |
745 | 0 | CHECK(args[0]->IsString()); |
746 | 0 | Utf8Value val(env->isolate(), args[0]); |
747 | | // optional arg |
748 | 0 | bool lenient = args[1]->BooleanValue(env->isolate()); |
749 | 0 | idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault; |
750 | |
|
751 | 0 | MaybeStackBuffer<char> buf; |
752 | 0 | int32_t len = ToASCII(&buf, *val, val.length(), mode); |
753 | |
|
754 | 0 | if (len < 0) { |
755 | 0 | return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII"); |
756 | 0 | } |
757 | | |
758 | 0 | args.GetReturnValue().Set( |
759 | 0 | String::NewFromUtf8(env->isolate(), |
760 | 0 | *buf, |
761 | 0 | NewStringType::kNormal, |
762 | 0 | len).ToLocalChecked()); |
763 | 0 | } |
764 | | |
765 | | // This is similar to wcwidth except that it takes the current unicode |
766 | | // character properties database into consideration, allowing it to |
767 | | // correctly calculate the column widths of things like emoji and |
768 | | // newer wide characters. wcwidth, on the other hand, uses a fixed |
769 | | // algorithm that does not take things like emoji into proper |
770 | | // consideration. |
771 | | // |
772 | | // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by |
773 | | // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs |
774 | | // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't |
775 | | // allow it to be input. Linux's PTY terminal prints control characters as |
776 | | // Narrow rhombi. |
777 | | // |
778 | | // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final |
779 | | // consonants are 0-width when combined with initial consonants; otherwise they |
780 | | // are technically Wide. But many terminals (including Konsole and |
781 | | // VTE/GLib-based) implement all medials and finals as 0-width. |
782 | | // |
783 | | // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width |
784 | | // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420 |
785 | | // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223 |
786 | | static int GetColumnWidth(UChar32 codepoint, |
787 | 0 | bool ambiguous_as_full_width = false) { |
788 | | // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a |
789 | | // codepoint as being full width, wide, ambiguous, neutral, narrow, |
790 | | // or halfwidth. |
791 | 0 | const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH); |
792 | 0 | switch (eaw) { |
793 | 0 | case U_EA_FULLWIDTH: |
794 | 0 | case U_EA_WIDE: |
795 | 0 | return 2; |
796 | 0 | case U_EA_AMBIGUOUS: |
797 | | // See: http://www.unicode.org/reports/tr11/#Ambiguous for details |
798 | 0 | if (ambiguous_as_full_width) { |
799 | 0 | return 2; |
800 | 0 | } |
801 | | // If ambiguous_as_full_width is false: |
802 | 0 | [[fallthrough]]; |
803 | 0 | case U_EA_NEUTRAL: |
804 | 0 | if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) { |
805 | 0 | return 2; |
806 | 0 | } |
807 | 0 | [[fallthrough]]; |
808 | 0 | case U_EA_HALFWIDTH: |
809 | 0 | case U_EA_NARROW: |
810 | 0 | default: |
811 | 0 | const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code |
812 | 0 | U_GC_CF_MASK | // Format control character |
813 | 0 | U_GC_ME_MASK | // Enclosing mark |
814 | 0 | U_GC_MN_MASK; // Nonspacing mark |
815 | 0 | if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width |
816 | 0 | ((U_MASK(u_charType(codepoint)) & zero_width_mask) || |
817 | 0 | u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) { |
818 | 0 | return 0; |
819 | 0 | } |
820 | 0 | return 1; |
821 | 0 | } |
822 | 0 | } |
823 | | |
824 | | // Returns the column width of the string in args[0], summed over its code points. args[1] === true counts East Asian Ambiguous characters as 2 columns; args[2], unless it is the boolean false, counts every code point of an emoji sequence separately. |
825 | 0 | static void GetStringWidth(const FunctionCallbackInfo<Value>& args) { |
826 | 0 | Environment* env = Environment::GetCurrent(args); |
827 | 0 | CHECK(args[0]->IsString()); |
828 | | |
829 | 0 | bool ambiguous_as_full_width = args[1]->IsTrue(); |
830 | 0 | bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue(); |
831 |

832 | 0 | TwoByteValue value(env->isolate(), args[0]); |
833 | | // reinterpret_cast is required for this to compile on Windows |
834 | 0 | UChar* str = reinterpret_cast<UChar*>(*value); |
835 | 0 | static_assert(sizeof(*str) == sizeof(**value), |
836 | 0 | "sizeof(*str) == sizeof(**value)"); |
837 | 0 | UChar32 c = 0; |
838 | 0 | UChar32 p; |
839 | 0 | size_t n = 0; |
840 | 0 | uint32_t width = 0; |
841 |

842 | 0 | while (n < value.length()) { |
843 | 0 | p = c; |
844 | 0 | U16_NEXT(str, n, value.length(), c); |
845 | | // When expand_emoji_sequence is false, don't count an emoji code point |
846 | | // that directly follows a ZWJ, i.e. one that continues an emoji ZWJ |
847 | | // sequence. This is not necessarily foolproof. Some environments |
848 | | // display emoji sequences in the appropriate condensed form (as a |
849 | | // single emoji glyph), other environments may not understand an emoji |
850 | | // sequence and will display each individual emoji separately. When |
851 | | // this happens, the width calculated will be off, and there's no |
852 | | // reliable way of knowing in advance if a particular sequence is going |
853 | | // to be supported. Setting expand_emoji_sequence lets the caller skip |
854 | | // this check and count each code point within a sequence separately. |
855 | | // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences |
856 | 0 | if (!expand_emoji_sequence && |
857 | 0 | n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner) |
858 | 0 | (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) || |
859 | 0 | u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) { |
860 | 0 | continue; |
861 | 0 | } |
862 | 0 | width += GetColumnWidth(c, ambiguous_as_full_width); |
863 | 0 | } |
864 | 0 | args.GetReturnValue().Set(width); |
865 | 0 | } |
866 | | |
867 | | static void CreatePerIsolateProperties(IsolateData* isolate_data, |
868 | 127k | Local<ObjectTemplate> target) { |
869 | 127k | Isolate* isolate = isolate_data->isolate(); |
870 | | |
871 | 127k | SetMethod(isolate, target, "toUnicode", ToUnicode); |
872 | 127k | SetMethod(isolate, target, "toASCII", ToASCII); |
873 | 127k | SetMethod(isolate, target, "getStringWidth", GetStringWidth); |
874 | | |
875 | | // One-shot converters: ICU error-name lookup and transcode |
876 | 127k | SetMethod(isolate, target, "icuErrName", ICUErrorName); |
877 | 127k | SetMethod(isolate, target, "transcode", Transcode); |
878 | | |
879 | | // ConverterObject: build the "Converter" template, reserve its internal fields and store its instance template on isolate_data |
880 | 127k | { |
881 | 127k | Local<FunctionTemplate> t = NewFunctionTemplate(isolate, nullptr); |
882 | 127k | t->InstanceTemplate()->SetInternalFieldCount( |
883 | 127k | ConverterObject::kInternalFieldCount); |
884 | 127k | Local<String> converter_string = |
885 | 127k | FIXED_ONE_BYTE_STRING(isolate, "Converter"); |
886 | 127k | t->SetClassName(converter_string); |
887 | 127k | isolate_data->set_i18n_converter_template(t->InstanceTemplate()); |
888 | 127k | } |
889 | | |
890 | 127k | SetMethod(isolate, target, "getConverter", ConverterObject::Create); |
891 | 127k | SetMethod(isolate, target, "decode", ConverterObject::Decode); |
892 | 127k | SetMethod(isolate, target, "hasConverter", ConverterObject::Has); |
893 | 127k | } |
894 | | |
895 | | void CreatePerContextProperties(Local<Object> target, |
896 | | Local<Value> unused, |
897 | | Local<Context> context, |
898 | 127k | void* priv) {} /* Empty body: no per-context properties are added here. NOTE(review): the binding's methods appear to be installed by CreatePerIsolateProperties instead - confirm. */ |
899 | | |
900 | 0 | void RegisterExternalReferences(ExternalReferenceRegistry* registry) { /* Registers the same eight callbacks that CreatePerIsolateProperties exposes via SetMethod; keep the two lists in sync. */ |
901 | 0 | registry->Register(ToUnicode); |
902 | 0 | registry->Register(ToASCII); |
903 | 0 | registry->Register(GetStringWidth); |
904 | 0 | registry->Register(ICUErrorName); |
905 | 0 | registry->Register(Transcode); |
906 | 0 | registry->Register(ConverterObject::Create); |
907 | 0 | registry->Register(ConverterObject::Decode); |
908 | 0 | registry->Register(ConverterObject::Has); |
909 | 0 | } |
910 | | |
911 | | } // namespace i18n |
912 | | } // namespace node |
913 | | |
914 | | NODE_BINDING_CONTEXT_AWARE_INTERNAL(icu, node::i18n::CreatePerContextProperties) |
915 | | NODE_BINDING_PER_ISOLATE_INIT(icu, node::i18n::CreatePerIsolateProperties) |
916 | | NODE_BINDING_EXTERNAL_REFERENCE(icu, node::i18n::RegisterExternalReferences) |
917 | | |
918 | | #endif // NODE_HAVE_I18N_SUPPORT |