Coverage Report

Created: 2025-10-31 09:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/node/src/node_i18n.cc
Line
Count
Source
1
// Copyright Joyent, Inc. and other Node contributors.
2
//
3
// Permission is hereby granted, free of charge, to any person obtaining a
4
// copy of this software and associated documentation files (the
5
// "Software"), to deal in the Software without restriction, including
6
// without limitation the rights to use, copy, modify, merge, publish,
7
// distribute, sublicense, and/or sell copies of the Software, and to permit
8
// persons to whom the Software is furnished to do so, subject to the
9
// following conditions:
10
//
11
// The above copyright notice and this permission notice shall be included
12
// in all copies or substantial portions of the Software.
13
//
14
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17
// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20
// USE OR OTHER DEALINGS IN THE SOFTWARE.
21
22
/*
23
 * notes: by srl295
24
 *  - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25
 *     ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26
 *    linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27
 *    macro names. That's the "english+root" data.
28
 *
29
 *    If icu_data_path is non-null, the user has provided a path and we assume
30
 *    it goes somewhere useful. We set that path in ICU, and exit.
31
 *    If icu_data_path is null, they haven't set a path and we want the
32
 *    "english+root" data.  We call
33
 *       udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34
 *    to load up the english+root data.
35
 *
36
 *  - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37
 *    data. All of the variables and command line options for changing data at
38
 *    runtime are disabled, as they wouldn't fully override the internal data.
39
 *    See:  http://bugs.icu-project.org/trac/ticket/10924
40
 */
41
42
43
#include "node_i18n.h"
44
#include "node_external_reference.h"
45
#include "simdutf.h"
46
47
#if defined(NODE_HAVE_I18N_SUPPORT)
48
49
#include "base_object-inl.h"
50
#include "node.h"
51
#include "node_buffer.h"
52
#include "node_errors.h"
53
#include "node_internals.h"
54
#include "string_bytes.h"
55
#include "util-inl.h"
56
#include "v8.h"
57
58
#include <unicode/putil.h>
59
#include <unicode/timezone.h>
60
#include <unicode/uchar.h>
61
#include <unicode/uclean.h>
62
#include <unicode/ucnv.h>
63
#include <unicode/ulocdata.h>
64
#include <unicode/urename.h>
65
#include <unicode/utf16.h>
66
#include <unicode/utypes.h>
67
#include <unicode/uvernum.h>
68
#include <unicode/uversion.h>
69
#include "nbytes.h"
70
71
#ifdef NODE_HAVE_SMALL_ICU
72
#include <unicode/udata.h>
73
74
/* if this is defined, we have a 'secondary' entry point.
75
   compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
76
#define SMALL_ICUDATA_ENTRY_POINT \
77
  SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)
78
#define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
79
#ifndef U_LIB_SUFFIX_C_NAME
80
#define SMALL_DEF(major, suff) icusmdt##major##_dat
81
#else
82
#define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
83
#endif
84
85
extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
86
#endif
87
88
namespace node {
89
90
using v8::Context;
91
using v8::FunctionCallbackInfo;
92
using v8::FunctionTemplate;
93
using v8::Int32;
94
using v8::Isolate;
95
using v8::Local;
96
using v8::MaybeLocal;
97
using v8::Object;
98
using v8::ObjectTemplate;
99
using v8::String;
100
using v8::Value;
101
102
namespace i18n {
103
namespace {
104
105
template <typename T>
106
0
MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
107
0
  Local<Object> ret;
108
0
  if (!Buffer::New(env, buf).ToLocal(&ret)) {
109
0
    return {};
110
0
  }
111
112
0
  static_assert(sizeof(T) == 1 || sizeof(T) == 2,
113
0
                "Currently only one- or two-byte buffers are supported");
114
  if constexpr (sizeof(T) > 1 && IsBigEndian()) {
115
    SPREAD_BUFFER_ARG(ret, retbuf);
116
    CHECK(nbytes::SwapBytes16(retbuf_data, retbuf_length));
117
  }
118
119
0
  return ret;
120
0
}
121
122
// One-Shot Converters
123
124
void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
125
                      const char* data,
126
                      const size_t length,
127
0
                      const size_t length_in_chars) {
128
0
  dest->AllocateSufficientStorage(length_in_chars);
129
0
  char* dst = reinterpret_cast<char*>(**dest);
130
0
  memcpy(dst, data, length);
131
  if constexpr (IsBigEndian()) {
132
    CHECK(nbytes::SwapBytes16(dst, length));
133
  }
134
0
}
135
136
// Common signature of the one-shot transcoding routines below, so the
// binding can select one through a function pointer.
using TranscodeFunc = MaybeLocal<Object> (*)(Environment* env,
                                             const char* fromEncoding,
                                             const char* toEncoding,
                                             const char* source,
                                             const size_t source_length,
                                             UErrorCode* status);
142
143
// Generic one-shot transcoder that pivots through ICU's ucnv_convertEx.
//
//   env           - current environment, used to create the result Buffer.
//   fromEncoding  - ICU converter name of the source encoding.
//   toEncoding    - ICU converter name of the target encoding.
//   source        - input bytes.
//   source_length - number of input bytes.
//   status        - receives the ICU status of the conversion.
//
// Returns the converted bytes wrapped by ToBufferEndian, or an empty
// MaybeLocal when `*status` reports a failure (or the Buffer could not be
// created).
MaybeLocal<Object> Transcode(Environment* env,
                             const char* fromEncoding,
                             const char* toEncoding,
                             const char* source,
                             const size_t source_length,
                             UErrorCode* status) {
  MaybeLocal<Object> ret;
  MaybeStackBuffer<char> result;
  Converter to(toEncoding);
  Converter from(fromEncoding);

  // Unmappable characters are replaced by '?', repeated to fill the target
  // encoding's minimum character size.
  std::string sub(to.min_char_size(), '?');
  to.set_subst_chars(sub.c_str());

  // Output budget: one maximum-size target character per source byte. Kept
  // in size_t so the product is not truncated to 32 bits for large inputs.
  const size_t limit = source_length * to.max_char_size();
  result.AllocateSufficientStorage(limit);
  char* target = *result;
  ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
                 &source, source + source_length, nullptr, nullptr,
                 nullptr, nullptr, true, true, status);
  if (U_SUCCESS(*status)) {
    // `target` now points one past the last byte written. Measure from the
    // buffer start directly rather than indexing element 0, which need not
    // exist when `limit` is 0.
    result.SetLength(target - *result);
    ret = ToBufferEndian(env, &result);
  }
  return ret;
}
170
171
// Latin-1 -> UTF-16LE fast path backed by simdutf. One UTF-16 code unit is
// reserved per input byte. A zero result from the conversion (which includes
// the zero-length input case) is reported as U_INVALID_CHAR_FOUND.
// `fromEncoding` and `toEncoding` are unused; they exist to match
// TranscodeFunc.
MaybeLocal<Object> TranscodeLatin1ToUcs2(Environment* env,
                                         const char* fromEncoding,
                                         const char* toEncoding,
                                         const char* source,
                                         const size_t source_length,
                                         UErrorCode* status) {
  MaybeStackBuffer<char16_t> destbuf(source_length);
  const auto written =
      simdutf::convert_latin1_to_utf16le(source, source_length, destbuf.out());

  if (written != 0) {
    return Buffer::New(env, &destbuf);
  }

  *status = U_INVALID_CHAR_FOUND;
  return {};
}
187
188
// UCS-2 -> `toEncoding` via ucnv_fromUChars. The source bytes are first
// copied (and byte-swapped on big-endian hosts) into a UChar buffer.
// The destination is sized at one byte per source code unit; the dispatcher
// in this file only routes ASCII and Latin-1 targets here.
// `fromEncoding` is unused; it exists to match TranscodeFunc.
MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
                                     const char* fromEncoding,
                                     const char* toEncoding,
                                     const char* source,
                                     const size_t source_length,
                                     UErrorCode* status) {
  Converter to(toEncoding);

  // Unmappable characters become '?'.
  const std::string substitute(to.min_char_size(), '?');
  to.set_subst_chars(substitute.c_str());

  const size_t length_in_chars = source_length / sizeof(UChar);
  MaybeStackBuffer<UChar> sourcebuf;
  CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);

  MaybeStackBuffer<char> destbuf(length_in_chars);
  const uint32_t written = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
                                           *sourcebuf, length_in_chars, status);
  if (U_FAILURE(*status)) {
    return {};
  }

  destbuf.SetLength(written);
  return ToBufferEndian(env, &destbuf);
}
212
213
// UTF-8 -> UTF-16LE fast path backed by simdutf. The destination is sized
// from simdutf::utf16_length_from_utf8. A zero result from the conversion is
// reported as U_INVALID_CHAR_FOUND.
// `fromEncoding` and `toEncoding` are unused; they exist to match
// TranscodeFunc.
MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
                                         const char* fromEncoding,
                                         const char* toEncoding,
                                         const char* source,
                                         const size_t source_length,
                                         UErrorCode* status) {
  const size_t utf16_units =
      simdutf::utf16_length_from_utf8(source, source_length);
  MaybeStackBuffer<char16_t> destbuf(utf16_units);

  const auto written =
      simdutf::convert_utf8_to_utf16le(source, source_length, destbuf.out());
  if (written != 0) {
    return Buffer::New(env, &destbuf);
  }

  *status = U_INVALID_CHAR_FOUND;
  return {};
}
232
233
// UTF-16LE -> UTF-8 fast path backed by simdutf. `source` is reinterpreted as
// char16_t code units; a trailing odd byte is ignored by the integer
// division. A zero result from the conversion is reported as
// U_INVALID_CHAR_FOUND.
// `fromEncoding` and `toEncoding` are unused; they exist to match
// TranscodeFunc.
MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
                                         const char* fromEncoding,
                                         const char* toEncoding,
                                         const char* source,
                                         const size_t source_length,
                                         UErrorCode* status) {
  const char16_t* utf16 = reinterpret_cast<const char16_t*>(source);
  const size_t length_in_chars = source_length / sizeof(UChar);

  const size_t utf8_bytes =
      simdutf::utf8_length_from_utf16le(utf16, length_in_chars);
  MaybeStackBuffer<char> destbuf(utf8_bytes);

  const auto written =
      simdutf::convert_utf16le_to_utf8(utf16, length_in_chars, destbuf.out());
  if (written != 0) {
    return Buffer::New(env, &destbuf);
  }

  *status = U_INVALID_CHAR_FOUND;
  return {};
}
256
257
0
constexpr const char* EncodingName(const enum encoding encoding) {
258
0
  switch (encoding) {
259
0
    case ASCII: return "us-ascii";
260
0
    case LATIN1: return "iso8859-1";
261
0
    case UCS2: return "utf16le";
262
0
    case UTF8: return "utf-8";
263
0
    default: return nullptr;
264
0
  }
265
0
}
266
267
0
constexpr bool SupportedEncoding(const enum encoding encoding) {
268
0
  switch (encoding) {
269
0
    case ASCII:
270
0
    case LATIN1:
271
0
    case UCS2:
272
0
    case UTF8: return true;
273
0
    default: return false;
274
0
  }
275
0
}
276
277
0
void Transcode(const FunctionCallbackInfo<Value>&args) {
278
0
  Environment* env = Environment::GetCurrent(args);
279
0
  Isolate* isolate = env->isolate();
280
0
  UErrorCode status = U_ZERO_ERROR;
281
0
  MaybeLocal<Object> result;
282
283
0
  ArrayBufferViewContents<char> input(args[0]);
284
0
  const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
285
0
  const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
286
287
0
  if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
288
0
    TranscodeFunc tfn = &Transcode;
289
0
    switch (fromEncoding) {
290
0
      case ASCII:
291
0
      case LATIN1:
292
0
        if (toEncoding == UCS2) tfn = &TranscodeLatin1ToUcs2;
293
0
        break;
294
0
      case UTF8:
295
0
        if (toEncoding == UCS2)
296
0
          tfn = &TranscodeUcs2FromUtf8;
297
0
        break;
298
0
      case UCS2:
299
0
        switch (toEncoding) {
300
0
          case UCS2:
301
0
            tfn = &Transcode;
302
0
            break;
303
0
          case UTF8:
304
0
            tfn = &TranscodeUtf8FromUcs2;
305
0
            break;
306
0
          default:
307
0
            tfn = &TranscodeFromUcs2;
308
0
        }
309
0
        break;
310
0
      default:
311
        // This should not happen because of the SupportedEncoding checks
312
0
        ABORT();
313
0
    }
314
315
0
    result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
316
0
                 input.data(), input.length(), &status);
317
0
  } else {
318
0
    status = U_ILLEGAL_ARGUMENT_ERROR;
319
0
  }
320
321
0
  Local<Object> res;
322
0
  if (result.ToLocal(&res)) {
323
0
    return args.GetReturnValue().Set(res);
324
0
  }
325
326
0
  return args.GetReturnValue().Set(status);
327
0
}
328
329
0
void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
330
0
  CHECK(args[0]->IsInt32());
331
0
  UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
332
0
  args.GetReturnValue().Set(
333
0
      OneByteString(args.GetIsolate(), u_errorName(status)));
334
0
}
335
336
}  // anonymous namespace
337
338
0
// Opens an ICU converter by name and applies the optional substitution
// characters. Aborts (CHECK) if ICU cannot open the converter.
Converter::Converter(const char* name, const char* sub) {
  UErrorCode status = U_ZERO_ERROR;
  conv_.reset(ucnv_open(name, &status));
  CHECK(U_SUCCESS(status));
  set_subst_chars(sub);
}
345
346
// Stores an already-opened ICU converter in conv_ and applies the optional
// substitution characters.
Converter::Converter(UConverter* converter, const char* sub) {
  conv_.reset(converter);
  set_subst_chars(sub);
}
350
351
0
void Converter::set_subst_chars(const char* sub) {
352
0
  CHECK(conv_);
353
0
  UErrorCode status = U_ZERO_ERROR;
354
0
  if (sub != nullptr) {
355
0
    ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
356
0
    CHECK(U_SUCCESS(status));
357
0
  }
358
0
}
359
360
0
void Converter::reset() {
361
0
  ucnv_reset(conv_.get());
362
0
}
363
364
0
size_t Converter::min_char_size() const {
365
0
  CHECK(conv_);
366
0
  return ucnv_getMinCharSize(conv_.get());
367
0
}
368
369
0
size_t Converter::max_char_size() const {
370
0
  CHECK(conv_);
371
0
  return ucnv_getMaxCharSize(conv_.get());
372
0
}
373
374
0
void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
375
0
  CHECK_GE(args.Length(), 1);
376
0
  Utf8Value label(args.GetIsolate(), args[0]);
377
378
0
  UErrorCode status = U_ZERO_ERROR;
379
0
  ConverterPointer conv(ucnv_open(*label, &status));
380
0
  args.GetReturnValue().Set(!!U_SUCCESS(status));
381
0
}
382
383
0
void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
384
0
  Environment* env = Environment::GetCurrent(args);
385
386
0
  Local<ObjectTemplate> t = env->i18n_converter_template();
387
0
  Local<Object> obj;
388
0
  if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
389
390
0
  CHECK_GE(args.Length(), 2);
391
0
  Utf8Value label(env->isolate(), args[0]);
392
0
  uint32_t flags;
393
0
  if (!args[1]->Uint32Value(env->context()).To(&flags)) {
394
0
    return;
395
0
  }
396
0
  bool fatal =
397
0
      (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
398
399
0
  UErrorCode status = U_ZERO_ERROR;
400
0
  UConverter* conv = ucnv_open(*label, &status);
401
0
  if (U_FAILURE(status))
402
0
    return;
403
404
0
  if (fatal) {
405
0
    status = U_ZERO_ERROR;
406
0
    ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
407
0
                        nullptr, nullptr, nullptr, &status);
408
0
  }
409
410
0
  auto converter = new ConverterObject(env, obj, conv, flags);
411
0
  size_t sublen = ucnv_getMinCharSize(conv);
412
0
  std::string sub(sublen, '?');
413
0
  converter->set_subst_chars(sub.c_str());
414
415
0
  args.GetReturnValue().Set(obj);
416
0
}
417
418
0
void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
419
0
  Environment* env = Environment::GetCurrent(args);
420
421
0
  CHECK_GE(args.Length(), 4);  // Converter, Buffer, Flags, Encoding
422
423
0
  ConverterObject* converter;
424
0
  ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
425
426
0
  if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() ||
427
0
        args[1]->IsArrayBufferView())) {
428
0
    return node::THROW_ERR_INVALID_ARG_TYPE(
429
0
        env->isolate(),
430
0
        "The \"input\" argument must be an instance of SharedArrayBuffer, "
431
0
        "ArrayBuffer or ArrayBufferView.");
432
0
  }
433
434
0
  ArrayBufferViewContents<char> input(args[1]);
435
0
  uint32_t flags;
436
0
  if (!args[2]->Uint32Value(env->context()).To(&flags)) {
437
0
    return;
438
0
  }
439
440
0
  CHECK(args[3]->IsString());
441
0
  Local<String> from_encoding = args[3].As<String>();
442
443
0
  UErrorCode status = U_ZERO_ERROR;
444
0
  MaybeStackBuffer<UChar> result;
445
446
0
  UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
447
448
  // When flushing the final chunk, the limit is the maximum
449
  // of either the input buffer length or the number of pending
450
  // characters times the min char size, multiplied by 2 as unicode may
451
  // take up to 2 UChars to encode a character
452
0
  size_t limit = 2 * converter->min_char_size() *
453
0
      (!flush ?
454
0
          input.length() :
455
0
          std::max(
456
0
              input.length(),
457
0
              static_cast<size_t>(
458
0
                  ucnv_toUCountPending(converter->conv(), &status))));
459
0
  status = U_ZERO_ERROR;
460
461
0
  if (limit > 0)
462
0
    result.AllocateSufficientStorage(limit);
463
464
0
  auto cleanup = OnScopeLeave([&]() {
465
0
    if (flush) {
466
      // Reset the converter state.
467
0
      converter->set_bom_seen(false);
468
0
      converter->reset();
469
0
    }
470
0
  });
471
472
0
  const char* source = input.data();
473
0
  size_t source_length = input.length();
474
475
0
  UChar* target = *result;
476
0
  ucnv_toUnicode(converter->conv(),
477
0
                 &target,
478
0
                 target + limit,
479
0
                 &source,
480
0
                 source + source_length,
481
0
                 nullptr,
482
0
                 flush,
483
0
                 &status);
484
485
0
  if (U_SUCCESS(status)) {
486
0
    bool omit_initial_bom = false;
487
0
    if (limit > 0) {
488
0
      result.SetLength(target - &result[0]);
489
0
      if (result.length() > 0 &&
490
0
          converter->unicode() &&
491
0
          !converter->ignore_bom() &&
492
0
          !converter->bom_seen()) {
493
        // If the very first result in the stream is a BOM, and we are not
494
        // explicitly told to ignore it, then we mark it for discarding.
495
0
        if (result[0] == 0xFEFF)
496
0
          omit_initial_bom = true;
497
0
        converter->set_bom_seen(true);
498
0
      }
499
0
    }
500
501
0
    UChar* output = result.out();
502
0
    size_t beginning = 0;
503
0
    size_t length = result.length() * sizeof(UChar);
504
505
0
    if (omit_initial_bom) {
506
      // Perform `ret = ret.slice(2)`.
507
0
      beginning += 2;
508
0
      length -= 2;
509
0
    }
510
511
0
    char* value = reinterpret_cast<char*>(output) + beginning;
512
513
    if constexpr (IsBigEndian()) {
514
      CHECK(nbytes::SwapBytes16(value, length));
515
    }
516
517
0
    Local<Value> ret;
518
0
    if (StringBytes::Encode(env->isolate(), value, length, UCS2)
519
0
            .ToLocal(&ret)) {
520
0
      args.GetReturnValue().Set(ret);
521
0
      return;
522
0
    }
523
0
  }
524
525
0
  node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
526
0
      env->isolate(),
527
0
      "The encoded data was not valid for encoding %s",
528
0
      *node::Utf8Value(env->isolate(), from_encoding));
529
0
}
530
531
// Binds an opened ICU converter to its JS wrapper object, makes the wrapper
// weak, and adds CONVERTER_FLAGS_UNICODE to the stored flags when the
// converter is UTF-8 or UTF-16 (either byte order).
ConverterObject::ConverterObject(
    Environment* env,
    Local<Object> wrap,
    UConverter* converter,
    int flags,
    const char* sub)
    : BaseObject(env, wrap),
      Converter(converter, sub),
      flags_(flags) {
  MakeWeak();

  const UConverterType type = ucnv_getType(converter);
  const bool is_unicode = type == UCNV_UTF8 ||
                          type == UCNV_UTF16_BigEndian ||
                          type == UCNV_UTF16_LittleEndian;
  if (is_unicode) {
    flags_ |= CONVERTER_FLAGS_UNICODE;
  }
}
553
554
35
bool InitializeICUDirectory(const std::string& path, std::string* error) {
555
35
  UErrorCode status = U_ZERO_ERROR;
556
35
  if (path.empty()) {
557
#ifdef NODE_HAVE_SMALL_ICU
558
    // install the 'small' data.
559
    udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
560
#else  // !NODE_HAVE_SMALL_ICU
561
    // no small data, so nothing to do.
562
35
#endif  // !NODE_HAVE_SMALL_ICU
563
35
  } else {
564
0
    u_setDataDirectory(path.c_str());
565
0
    u_init(&status);
566
0
  }
567
35
  if (status == U_ZERO_ERROR) {
568
35
    return true;
569
35
  }
570
571
0
  *error = u_errorName(status);
572
0
  return false;
573
35
}
574
575
0
void SetDefaultTimeZone(const char* tzid) {
576
0
  size_t tzidlen = strlen(tzid) + 1;
577
0
  UErrorCode status = U_ZERO_ERROR;
578
0
  MaybeStackBuffer<UChar, 256> id(tzidlen);
579
0
  u_charsToUChars(tzid, id.out(), tzidlen);
580
  // This is threadsafe:
581
0
  ucal_setDefaultTimeZone(id.out(), &status);
582
0
  CHECK(U_SUCCESS(status));
583
0
}
584
585
// This is similar to wcwidth except that it takes the current unicode
586
// character properties database into consideration, allowing it to
587
// correctly calculate the column widths of things like emoji and
588
// newer wide characters. wcwidth, on the other hand, uses a fixed
589
// algorithm that does not take things like emoji into proper
590
// consideration.
591
//
592
// TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
593
// GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
594
// below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
595
// allow it to be input. Linux's PTY terminal prints control characters as
596
// Narrow rhombi.
597
//
598
// TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
599
// consonants are 0-width when combined with initial consonants; otherwise they
600
// are technically Wide. But many terminals (including Konsole and
601
// VTE/GLib-based) implement all medials and finals as 0-width.
602
//
603
// Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
604
// Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
605
// Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
606
static int GetColumnWidth(UChar32 codepoint,
607
0
                          bool ambiguous_as_full_width = false) {
608
  // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
609
  // codepoint as being full width, wide, ambiguous, neutral, narrow,
610
  // or halfwidth.
611
0
  const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
612
0
  switch (eaw) {
613
0
    case U_EA_FULLWIDTH:
614
0
    case U_EA_WIDE:
615
0
      return 2;
616
0
    case U_EA_AMBIGUOUS:
617
      // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
618
0
      if (ambiguous_as_full_width) {
619
0
        return 2;
620
0
      }
621
      // If ambiguous_as_full_width is false:
622
0
      [[fallthrough]];
623
0
    case U_EA_NEUTRAL:
624
0
      if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
625
0
        return 2;
626
0
      }
627
0
      [[fallthrough]];
628
0
    case U_EA_HALFWIDTH:
629
0
    case U_EA_NARROW:
630
0
    default:
631
0
      const auto zero_width_mask = U_GC_CC_MASK |  // C0/C1 control code
632
0
                                  U_GC_CF_MASK |  // Format control character
633
0
                                  U_GC_ME_MASK |  // Enclosing mark
634
0
                                  U_GC_MN_MASK;   // Nonspacing mark
635
0
      if (codepoint != 0x00AD &&  // SOFT HYPHEN is Cf but not zero-width
636
0
          ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
637
0
          u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
638
0
        return 0;
639
0
      }
640
0
      return 1;
641
0
  }
642
0
}
643
644
// Returns the column width for the given String.
645
0
static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
646
0
  CHECK(args[0]->IsString());
647
648
0
  bool ambiguous_as_full_width = args[1]->IsTrue();
649
0
  bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
650
651
0
  TwoByteValue value(args.GetIsolate(), args[0]);
652
  // reinterpret_cast is required by windows to compile
653
0
  UChar* str = reinterpret_cast<UChar*>(*value);
654
0
  static_assert(sizeof(*str) == sizeof(**value),
655
0
                "sizeof(*str) == sizeof(**value)");
656
0
  UChar32 c = 0;
657
0
  UChar32 p;
658
0
  size_t n = 0;
659
0
  uint32_t width = 0;
660
661
0
  while (n < value.length()) {
662
0
    p = c;
663
0
    U16_NEXT(str, n, value.length(), c);
664
    // Don't count individual emoji codepoints that occur within an
665
    // emoji sequence. This is not necessarily foolproof. Some
666
    // environments display emoji sequences in the appropriate
667
    // condensed form (as a single emoji glyph), other environments
668
    // may not understand an emoji sequence and will display each
669
    // individual emoji separately. When this happens, the width
670
    // calculated will be off, and there's no reliable way of knowing
671
    // in advance if a particular sequence is going to be supported.
672
    // The expand_emoji_sequence option allows the caller to skip this
673
    // check and count each code within an emoji sequence separately.
674
    // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
675
0
    if (!expand_emoji_sequence &&
676
0
        n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
677
0
        (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
678
0
         u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
679
0
      continue;
680
0
    }
681
0
    width += GetColumnWidth(c, ambiguous_as_full_width);
682
0
  }
683
0
  args.GetReturnValue().Set(width);
684
0
}
685
686
static void CreatePerIsolateProperties(IsolateData* isolate_data,
687
35
                                       Local<ObjectTemplate> target) {
688
35
  Isolate* isolate = isolate_data->isolate();
689
690
35
  SetMethod(isolate, target, "getStringWidth", GetStringWidth);
691
692
  // One-shot converters
693
35
  SetMethod(isolate, target, "icuErrName", ICUErrorName);
694
35
  SetMethod(isolate, target, "transcode", Transcode);
695
696
  // ConverterObject
697
35
  {
698
35
    Local<FunctionTemplate> t = NewFunctionTemplate(isolate, nullptr);
699
35
    t->InstanceTemplate()->SetInternalFieldCount(
700
35
        ConverterObject::kInternalFieldCount);
701
35
    Local<String> converter_string =
702
35
        FIXED_ONE_BYTE_STRING(isolate, "Converter");
703
35
    t->SetClassName(converter_string);
704
35
    isolate_data->set_i18n_converter_template(t->InstanceTemplate());
705
35
  }
706
707
35
  SetMethod(isolate, target, "getConverter", ConverterObject::Create);
708
35
  SetMethod(isolate, target, "decode", ConverterObject::Decode);
709
35
  SetMethod(isolate, target, "hasConverter", ConverterObject::Has);
710
35
}
711
712
// Per-context initializer for the icu binding. Intentionally empty: the
// methods are installed by CreatePerIsolateProperties instead.
void CreatePerContextProperties(Local<Object> target,
                                Local<Value> unused,
                                Local<Context> context,
                                void* priv) {}
716
717
0
// Registers every native callback that CreatePerIsolateProperties exposes
// with the external reference registry.
void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
  registry->Register(GetStringWidth);
  registry->Register(ICUErrorName);
  registry->Register(Transcode);
  registry->Register(ConverterObject::Create);
  registry->Register(ConverterObject::Decode);
  registry->Register(ConverterObject::Has);
}
725
726
}  // namespace i18n
727
}  // namespace node
728
729
NODE_BINDING_CONTEXT_AWARE_INTERNAL(icu, node::i18n::CreatePerContextProperties)
730
NODE_BINDING_PER_ISOLATE_INIT(icu, node::i18n::CreatePerIsolateProperties)
731
NODE_BINDING_EXTERNAL_REFERENCE(icu, node::i18n::RegisterExternalReferences)
732
733
#endif  // NODE_HAVE_I18N_SUPPORT