Coverage Report

Created: 2026-03-31 07:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ruby/encoding.c
Line
Count
Source
1
/**********************************************************************
2
3
  encoding.c -
4
5
  $Author$
6
  created at: Thu May 24 17:23:27 JST 2007
7
8
  Copyright (C) 2007 Yukihiro Matsumoto
9
10
**********************************************************************/
11
12
#include "ruby/internal/config.h"
13
14
#include <ctype.h>
15
16
#include "encindex.h"
17
#include "internal.h"
18
#include "internal/enc.h"
19
#include "internal/encoding.h"
20
#include "internal/error.h"
21
#include "internal/inits.h"
22
#include "internal/load.h"
23
#include "internal/object.h"
24
#include "internal/string.h"
25
#include "internal/vm.h"
26
#include "regenc.h"
27
#include "ruby/atomic.h"
28
#include "ruby/encoding.h"
29
#include "ruby/util.h"
30
#include "ruby/ractor.h"
31
#include "ruby_assert.h"
32
#include "vm_sync.h"
33
#include "ruby_atomic.h"
34
35
/* ENC_DEBUG defaults to 0 unless the build defines it. */
#ifndef ENC_DEBUG
#define ENC_DEBUG 0
#endif
/* Forwards expr to RUBY_ASSERT_WHEN with ENC_DEBUG as the condition. */
#define ENC_ASSERT(expr) RUBY_ASSERT_WHEN(ENC_DEBUG, expr)
/* Yields str after an ENC_ASSERT that it is a T_STRING. */
#define MUST_STRING(str) (ENC_ASSERT(RB_TYPE_P(str, T_STRING)), str)
40
41
#undef rb_ascii8bit_encindex
42
#undef rb_utf8_encindex
43
#undef rb_usascii_encindex
44
45
typedef OnigEncodingType rb_raw_encoding;
46
47
#if defined __GNUC__ && __GNUC__ >= 4
48
#pragma GCC visibility push(default)
49
int rb_enc_register(const char *name, rb_encoding *encoding);
50
void rb_enc_set_base(const char *name, const char *orig);
51
int rb_enc_set_dummy(int index);
52
void rb_encdb_declare(const char *name);
53
int rb_encdb_replicate(const char *name, const char *orig);
54
int rb_encdb_dummy(const char *name);
55
int rb_encdb_alias(const char *alias, const char *orig);
56
#pragma GCC visibility pop
57
#endif
58
59
static ID id_encoding, id_i_name;
60
VALUE rb_cEncoding;
61
62
27
#define ENCODING_LIST_CAPA 256
63
static VALUE rb_encoding_list;
64
65
struct rb_encoding_entry {
66
    rb_atomic_t loaded;
67
    const char *name;
68
    rb_encoding *enc;
69
    rb_encoding *base;
70
};
71
72
static struct enc_table {
73
    struct rb_encoding_entry list[ENCODING_LIST_CAPA];
74
    int count;
75
    st_table *names;
76
} global_enc_table;
77
78
static int
79
enc_names_free_i(st_data_t name, st_data_t idx, st_data_t args)
80
0
{
81
0
    ruby_xfree((void *)name);
82
0
    return ST_DELETE;
83
0
}
84
85
void
86
rb_free_global_enc_table(void)
87
0
{
88
0
    for (size_t i = 0; i < ENCODING_LIST_CAPA; i++) {
89
0
        xfree((void *)global_enc_table.list[i].enc);
90
0
    }
91
92
0
    st_foreach(global_enc_table.names, enc_names_free_i, (st_data_t)0);
93
0
    st_free_table(global_enc_table.names);
94
0
}
95
96
static rb_encoding *global_enc_ascii,
97
                   *global_enc_utf_8,
98
                   *global_enc_us_ascii;
99
100
static int filesystem_encindex = ENCINDEX_ASCII_8BIT;
101
102
/*
 * Statement prefix: the block that follows runs as the body of
 * RB_VM_LOCKING() with `tbl` bound to &global_enc_table.  The for loop
 * only exists to declare `tbl`; `locking` is cleared after the first pass
 * so the body executes once.
 */
#define GLOBAL_ENC_TABLE_LOCKING(tbl) \
    for (struct enc_table *tbl = &global_enc_table, **locking = &tbl; \
         locking; \
         locking = NULL) \
        RB_VM_LOCKING()
107
108
109
10.2M
/* ruby_encoding_index layout: bit 24 is the dummy flag, bits 0-23 the index. */
#define ENC_DUMMY_FLAG (1<<24)
#define ENC_INDEX_MASK (~(~0U<<24))

/* Index part of enc->ruby_encoding_index, as an int. */
#define ENC_TO_ENCINDEX(enc) (int)((enc)->ruby_encoding_index & ENC_INDEX_MASK)
/* Non-zero when the dummy flag is set on enc. */
#define ENC_DUMMY_P(enc) ((enc)->ruby_encoding_index & ENC_DUMMY_FLAG)
/* Sets the dummy flag; enc must point at a writable (raw) encoding. */
#define ENC_SET_DUMMY(enc) ((enc)->ruby_encoding_index |= ENC_DUMMY_FLAG)
115
116
9
#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
#define UNSPECIFIED_ENCODING INT_MAX

/* Longest accepted encoding name, not counting the terminating NUL. */
#define ENCODING_NAMELEN_MAX 63
/* True for a non-NULL name of at most ENCODING_NAMELEN_MAX bytes. */
#define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
121
122
static const rb_data_type_t encoding_data_type = {
123
    "encoding",
124
    {0, 0, 0,},
125
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
126
};
127
128
0
/* obj must already be known to be typed data. */
#define is_encoding_type(obj) (RTYPEDDATA_TYPE(obj) == &encoding_data_type)
/* For objects already known to be T_DATA. */
#define is_data_encoding(obj) (rbimpl_rtypeddata_p(obj) && is_encoding_type(obj))
/* For arbitrary VALUEs. */
#define is_obj_encoding(obj) (rbimpl_obj_typeddata_p(obj) && is_encoding_type(obj))
131
132
int
133
rb_data_is_encoding(VALUE obj)
134
0
{
135
0
    return is_data_encoding(obj);
136
0
}
137
138
static VALUE
139
enc_new(rb_encoding *encoding)
140
108
{
141
108
    VALUE enc = TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, (void *)encoding);
142
108
    rb_ivar_set(enc, id_i_name, rb_fstring_cstr(encoding->name));
143
108
    RB_OBJ_SET_FROZEN_SHAREABLE(enc);
144
108
    return enc;
145
108
}
146
147
static void
148
enc_list_update(int index, rb_raw_encoding *encoding)
149
108
{
150
108
    RUBY_ASSERT(index < ENCODING_LIST_CAPA);
151
152
108
    VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
153
154
108
    if (list && NIL_P(rb_ary_entry(list, index))) {
155
0
        VALUE new_list = rb_ary_dup(list);
156
0
        RBASIC_CLEAR_CLASS(new_list);
157
        /* initialize encoding data */
158
0
        rb_ary_store(new_list, index, enc_new(encoding));
159
0
        rb_ary_freeze(new_list);
160
0
        FL_SET_RAW(new_list, RUBY_FL_SHAREABLE);
161
0
        RUBY_ATOMIC_VALUE_SET(rb_encoding_list, new_list);
162
0
    }
163
108
}
164
165
static VALUE
166
enc_list_lookup(int idx)
167
0
{
168
0
    VALUE list, enc = Qnil;
169
170
0
    if (idx < ENCODING_LIST_CAPA) {
171
0
        list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
172
0
        RUBY_ASSERT(list);
173
0
        enc = rb_ary_entry(list, idx);
174
0
    }
175
176
0
    if (NIL_P(enc)) {
177
0
        rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
178
0
    }
179
0
    else {
180
0
        return enc;
181
0
    }
182
0
}
183
184
static VALUE
185
rb_enc_from_encoding_index(int idx)
186
0
{
187
0
    return enc_list_lookup(idx);
188
0
}
189
190
VALUE
191
rb_enc_from_encoding(rb_encoding *encoding)
192
0
{
193
0
    int idx;
194
0
    if (!encoding) return Qnil;
195
0
    idx = ENC_TO_ENCINDEX(encoding);
196
0
    return rb_enc_from_encoding_index(idx);
197
0
}
198
199
int
200
rb_enc_to_index(rb_encoding *enc)
201
9.13M
{
202
9.13M
    return enc ? ENC_TO_ENCINDEX(enc) : 0;
203
9.13M
}
204
205
int
206
rb_enc_dummy_p(rb_encoding *enc)
207
10.2M
{
208
10.2M
    return ENC_DUMMY_P(enc) != 0;
209
10.2M
}
210
211
static int
212
check_encoding(rb_encoding *enc)
213
0
{
214
0
    int index = rb_enc_to_index(enc);
215
0
    if (rb_enc_from_index(index) != enc)
216
0
        return -1;
217
0
    if (rb_enc_autoload_p(enc)) {
218
0
        index = rb_enc_autoload(enc);
219
0
    }
220
0
    return index;
221
0
}
222
223
static int
224
enc_check_encoding(VALUE obj)
225
0
{
226
0
    if (!is_obj_encoding(obj)) {
227
0
        return -1;
228
0
    }
229
0
    return check_encoding(RTYPEDDATA_GET_DATA(obj));
230
0
}
231
232
NORETURN(static void not_encoding(VALUE enc));
/* Raises TypeError naming the class of the offending argument. */
static void
not_encoding(VALUE enc)
{
    VALUE klass = rb_obj_class(enc);

    rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Encoding)",
             klass);
}
239
240
static rb_encoding *
241
must_encoding(VALUE enc)
242
0
{
243
0
    int index = enc_check_encoding(enc);
244
0
    if (index < 0) {
245
0
        not_encoding(enc);
246
0
    }
247
0
    return RTYPEDDATA_GET_DATA(enc);
248
0
}
249
250
static rb_encoding *
251
must_encindex(int index)
252
2.61M
{
253
2.61M
    rb_encoding *enc = rb_enc_from_index(index);
254
2.61M
    if (!enc) {
255
0
        rb_raise(rb_eEncodingError, "encoding index out of bound: %d",
256
0
                 index);
257
0
    }
258
2.61M
    if (rb_enc_autoload_p(enc) && rb_enc_autoload(enc) == -1) {
259
0
        rb_loaderror("failed to load encoding (%s)",
260
0
                     rb_enc_name(enc));
261
0
    }
262
2.61M
    if (ENC_TO_ENCINDEX(enc) != (int)(index & ENC_INDEX_MASK)) {
263
0
        rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)",
264
0
                 index, rb_enc_name(enc), ENC_TO_ENCINDEX(enc));
265
0
    }
266
2.61M
    return enc;
267
2.61M
}
268
269
int
270
rb_to_encoding_index(VALUE enc)
271
0
{
272
0
    ASSERT_vm_unlocking(); // can load encoding, so must not hold VM lock
273
0
    int idx;
274
0
    const char *name;
275
276
0
    idx = enc_check_encoding(enc);
277
0
    if (idx >= 0) {
278
0
        return idx;
279
0
    }
280
0
    else if (NIL_P(enc = rb_check_string_type(enc))) {
281
0
        return -1;
282
0
    }
283
0
    if (!rb_enc_asciicompat(rb_enc_get(enc))) {
284
0
        return -1;
285
0
    }
286
0
    if (!(name = rb_str_to_cstr(enc))) {
287
0
        return -1;
288
0
    }
289
0
    return rb_enc_find_index(name);
290
0
}
291
292
static const char *
293
name_for_encoding(volatile VALUE *enc)
294
0
{
295
0
    VALUE name = StringValue(*enc);
296
0
    const char *n;
297
298
0
    if (!rb_enc_asciicompat(rb_enc_get(name))) {
299
0
        rb_raise(rb_eArgError, "invalid encoding name (non ASCII)");
300
0
    }
301
0
    if (!(n = rb_str_to_cstr(name))) {
302
0
        rb_raise(rb_eArgError, "invalid encoding name (NUL byte)");
303
0
    }
304
0
    return n;
305
0
}
306
307
/* Returns encoding index or UNSPECIFIED_ENCODING */
308
static int
309
str_find_encindex(VALUE enc)
310
0
{
311
0
    int idx = rb_enc_find_index(name_for_encoding(&enc));
312
0
    RB_GC_GUARD(enc);
313
0
    return idx;
314
0
}
315
316
static int
317
str_to_encindex(VALUE enc)
318
0
{
319
0
    int idx = str_find_encindex(enc);
320
0
    if (idx < 0) {
321
0
        rb_raise(rb_eArgError, "unknown encoding name - %"PRIsVALUE, enc);
322
0
    }
323
0
    return idx;
324
0
}
325
326
static rb_encoding *
327
str_to_encoding(VALUE enc)
328
0
{
329
0
    return rb_enc_from_index(str_to_encindex(enc));
330
0
}
331
332
rb_encoding *
333
rb_to_encoding(VALUE enc)
334
0
{
335
0
    if (enc_check_encoding(enc) >= 0) return RTYPEDDATA_GET_DATA(enc);
336
0
    return str_to_encoding(enc);
337
0
}
338
339
rb_encoding *
340
rb_find_encoding(VALUE enc)
341
0
{
342
0
    int idx;
343
0
    if (enc_check_encoding(enc) >= 0) return RTYPEDDATA_GET_DATA(enc);
344
0
    idx = str_find_encindex(enc);
345
0
    if (idx < 0) return NULL;
346
0
    return rb_enc_from_index(idx);
347
0
}
348
349
static int
350
enc_table_expand(struct enc_table *enc_table, int newsize)
351
9
{
352
9
    if (newsize > ENCODING_LIST_CAPA) {
353
0
        rb_raise(rb_eEncodingError, "too many encoding (> %d)", ENCODING_LIST_CAPA);
354
0
    }
355
9
    return newsize;
356
9
}
357
358
/* Load an encoding using the values from base_encoding */
359
static void
360
enc_load_from_base(struct enc_table *enc_table, int index, rb_encoding *base_encoding)
361
27
{
362
27
    ASSERT_vm_locking();
363
364
27
    struct rb_encoding_entry *ent = &enc_table->list[index];
365
366
27
    if (ent->loaded) {
367
0
        return;
368
0
    }
369
370
27
    rb_raw_encoding *encoding = (rb_raw_encoding *)ent->enc;
371
27
    RUBY_ASSERT(encoding);
372
373
    // FIXME: Before the base is loaded, the encoding may be accessed
374
    // concurrently by other Ractors.
375
    // We're copying all fields from base_encoding except name and
376
    // ruby_encoding_index which we preserve from the original. Since these are
377
    // the only fields other threads should read it is likely safe despite
378
    // technically being a data race.
379
27
    rb_raw_encoding tmp_encoding = *base_encoding;
380
27
    tmp_encoding.name = encoding->name;
381
27
    tmp_encoding.ruby_encoding_index = encoding->ruby_encoding_index;
382
27
    *encoding = tmp_encoding;
383
384
27
    RUBY_ATOMIC_SET(ent->loaded, encoding->max_enc_len);
385
27
}
386
387
static int
388
enc_register_at(struct enc_table *enc_table, int index, const char *name, rb_encoding *base_encoding)
389
108
{
390
108
    ASSERT_vm_locking();
391
392
108
    struct rb_encoding_entry *ent = &enc_table->list[index];
393
108
    rb_raw_encoding *encoding;
394
395
108
    RUBY_ASSERT(!ent->loaded);
396
108
    RUBY_ASSERT(!ent->name);
397
108
    RUBY_ASSERT(!ent->enc);
398
108
    RUBY_ASSERT(!ent->base);
399
400
108
    RUBY_ASSERT(valid_encoding_name_p(name));
401
402
108
    ent->name = name = strdup(name);
403
404
108
    encoding = ZALLOC(rb_raw_encoding);
405
108
    encoding->name = name;
406
108
    encoding->ruby_encoding_index = index;
407
108
    ent->enc = encoding;
408
409
108
    if (st_insert(enc_table->names, (st_data_t)name, (st_data_t)index)) {
410
0
        rb_bug("encoding name was somehow registered twice");
411
0
    }
412
413
108
    enc_list_update(index, encoding);
414
415
108
    if (base_encoding) {
416
27
        enc_load_from_base(enc_table, index, base_encoding);
417
27
    }
418
81
    else {
419
        /* it should not be loaded yet */
420
81
        RUBY_ASSERT(!encoding->max_enc_len);
421
81
    }
422
423
108
    return index;
424
108
}
425
426
static int
427
enc_register(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
428
0
{
429
0
    ASSERT_vm_locking();
430
431
0
    if (!valid_encoding_name_p(name)) return -1;
432
433
0
    int index = enc_table->count;
434
435
0
    enc_table->count = enc_table_expand(enc_table, index + 1);
436
0
    return enc_register_at(enc_table, index, name, encoding);
437
0
}
438
439
static void set_encoding_const(const char *, rb_encoding *);
440
static int enc_registered(struct enc_table *enc_table, const char *name);
441
442
static rb_encoding *
443
enc_from_index(struct enc_table *enc_table, int index)
444
17.1M
{
445
17.1M
    if (UNLIKELY(index < 0 || enc_table->count <= (index &= ENC_INDEX_MASK))) {
446
0
        return 0;
447
0
    }
448
17.1M
    rb_encoding *enc = enc_table->list[index].enc;
449
17.1M
    RUBY_ASSERT(ENC_TO_ENCINDEX(enc) == index);
450
17.1M
    return enc;
451
17.1M
}
452
453
rb_encoding *
454
rb_enc_from_index(int index)
455
17.1M
{
456
17.1M
    return enc_from_index(&global_enc_table, index);
457
17.1M
}
458
459
int
460
rb_enc_register(const char *name, rb_encoding *encoding)
461
0
{
462
0
    int index;
463
464
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
465
0
        index = enc_registered(enc_table, name);
466
467
0
        if (index >= 0) {
468
0
            rb_encoding *oldenc = enc_from_index(enc_table, index);
469
0
            if (STRCASECMP(name, rb_enc_name(oldenc))) {
470
0
                index = enc_register(enc_table, name, encoding);
471
0
            }
472
0
            else if (rb_enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
473
0
                enc_load_from_base(enc_table, index, encoding);
474
0
            }
475
0
            else {
476
0
                rb_raise(rb_eArgError, "encoding %s is already registered", name);
477
0
            }
478
0
        }
479
0
        else {
480
0
            index = enc_register(enc_table, name, encoding);
481
0
            set_encoding_const(name, rb_enc_from_index(index));
482
0
        }
483
0
    }
484
0
    return index;
485
0
}
486
487
int
488
enc_registered(struct enc_table *enc_table, const char *name)
489
29.5k
{
490
29.5k
    ASSERT_vm_locking();
491
29.5k
    st_data_t idx = 0;
492
493
29.5k
    if (!name) return -1;
494
29.5k
    if (!enc_table->names) return -1;
495
29.5k
    if (st_lookup(enc_table->names, (st_data_t)name, &idx)) {
496
29.3k
        return (int)idx;
497
29.3k
    }
498
190
    return -1;
499
29.5k
}
500
501
int
502
rb_enc_registered(const char *name)
503
0
{
504
0
    int idx;
505
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
506
0
        idx = enc_registered(enc_table, name);
507
0
    }
508
0
    return idx;
509
0
}
510
511
void
512
rb_encdb_declare(const char *name)
513
0
{
514
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
515
0
        int idx = enc_registered(enc_table, name);
516
0
        if (idx < 0) {
517
0
            idx = enc_register(enc_table, name, 0);
518
0
        }
519
0
        set_encoding_const(name, rb_enc_from_index(idx));
520
0
    }
521
0
}
522
523
static void
524
enc_check_addable(struct enc_table *enc_table, const char *name)
525
0
{
526
0
    if (enc_registered(enc_table, name) >= 0) {
527
0
        rb_raise(rb_eArgError, "encoding %s is already registered", name);
528
0
    }
529
0
    else if (!valid_encoding_name_p(name)) {
530
0
        rb_raise(rb_eArgError, "invalid encoding name: %s", name);
531
0
    }
532
0
}
533
534
/*
 * Records `base` as the base of the encoding in slot `index` and copies
 * the dummy flag from base.  Returns the slot's encoding.
 */
static rb_encoding*
set_base_encoding(struct enc_table *enc_table, int index, rb_encoding *base)
{
    struct rb_encoding_entry *slot = &enc_table->list[index];
    rb_encoding *enc = slot->enc;

    ASSUME(enc);
    slot->base = base;
    if (ENC_DUMMY_P(base)) {
        ENC_SET_DUMMY((rb_raw_encoding *)enc);
    }
    return enc;
}
544
545
/* for encdb.h
546
 * Set base encoding for encodings which are not replicas
547
 * but not in their own files.
548
 */
549
void
550
rb_enc_set_base(const char *name, const char *orig)
551
0
{
552
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
553
0
        int idx = enc_registered(enc_table, name);
554
0
        int origidx = enc_registered(enc_table, orig);
555
0
        set_base_encoding(enc_table, idx, rb_enc_from_index(origidx));
556
0
    }
557
0
}
558
559
/* for encdb.h
560
 * Set encoding dummy.
561
 */
562
int
563
rb_enc_set_dummy(int index)
564
0
{
565
0
    rb_encoding *enc = global_enc_table.list[index].enc;
566
0
    ENC_SET_DUMMY((rb_raw_encoding *)enc);
567
0
    return index;
568
0
}
569
570
static int
571
enc_replicate(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
572
0
{
573
0
    int idx;
574
575
0
    enc_check_addable(enc_table, name);
576
0
    idx = enc_register(enc_table, name, encoding);
577
0
    if (idx < 0) rb_raise(rb_eArgError, "invalid encoding name: %s", name);
578
0
    set_base_encoding(enc_table, idx, encoding);
579
0
    set_encoding_const(name, rb_enc_from_index(idx));
580
0
    return idx;
581
0
}
582
583
static int
584
enc_replicate_with_index(struct enc_table *enc_table, const char *name, rb_encoding *origenc, int idx)
585
0
{
586
0
    if (idx < 0) {
587
0
        idx = enc_register(enc_table, name, origenc);
588
0
    }
589
0
    else {
590
0
        enc_load_from_base(enc_table, idx, origenc);
591
0
    }
592
0
    if (idx >= 0) {
593
0
        set_base_encoding(enc_table, idx, origenc);
594
0
        set_encoding_const(name, rb_enc_from_index(idx));
595
0
    }
596
0
    else {
597
0
        rb_raise(rb_eArgError, "failed to replicate encoding");
598
0
    }
599
0
    return idx;
600
0
}
601
602
int
603
rb_encdb_replicate(const char *name, const char *orig)
604
0
{
605
0
    int r;
606
607
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
608
0
        int origidx = enc_registered(enc_table, orig);
609
0
        int idx = enc_registered(enc_table, name);
610
611
0
        if (origidx < 0) {
612
0
            origidx = enc_register(enc_table, orig, 0);
613
0
        }
614
0
        r = enc_replicate_with_index(enc_table, name, rb_enc_from_index(origidx), idx);
615
0
    }
616
617
0
    return r;
618
0
}
619
620
int
621
rb_define_dummy_encoding(const char *name)
622
0
{
623
0
    int index;
624
625
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
626
0
        index = enc_replicate(enc_table, name, rb_ascii8bit_encoding());
627
0
        rb_encoding *enc = enc_table->list[index].enc;
628
0
        ENC_SET_DUMMY((rb_raw_encoding *)enc);
629
0
    }
630
631
0
    return index;
632
0
}
633
634
int
635
rb_encdb_dummy(const char *name)
636
0
{
637
0
    int index;
638
639
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
640
0
        index = enc_replicate_with_index(enc_table, name,
641
0
                                         rb_ascii8bit_encoding(),
642
0
                                         enc_registered(enc_table, name));
643
0
        rb_encoding *enc = enc_table->list[index].enc;
644
0
        ENC_SET_DUMMY((rb_raw_encoding *)enc);
645
0
    }
646
647
0
    return index;
648
0
}
649
650
/*
651
 * call-seq:
652
 *   enc.dummy? -> true or false
653
 *
654
 * Returns true for dummy encodings.
655
 * A dummy encoding is an encoding for which character handling is not properly
656
 * implemented.
657
 * It is used for stateful encodings.
658
 *
659
 *   Encoding::ISO_2022_JP.dummy?       #=> true
660
 *   Encoding::UTF_8.dummy?             #=> false
661
 *
662
 */
663
static VALUE
664
enc_dummy_p(VALUE enc)
665
0
{
666
0
    return RBOOL(ENC_DUMMY_P(must_encoding(enc)));
667
0
}
668
669
/*
670
 * call-seq:
671
 *   enc.ascii_compatible? -> true or false
672
 *
673
 * Returns whether ASCII-compatible or not.
674
 *
675
 *   Encoding::UTF_8.ascii_compatible?     #=> true
676
 *   Encoding::UTF_16BE.ascii_compatible?  #=> false
677
 *
678
 */
679
static VALUE
680
enc_ascii_compatible_p(VALUE enc)
681
0
{
682
0
    return RBOOL(rb_enc_asciicompat(must_encoding(enc)));
683
0
}
684
685
/*
686
 * Returns non-zero when the encoding is Unicode series other than UTF-7 else 0.
687
 */
688
int
689
rb_enc_unicode_p(rb_encoding *enc)
690
66.5k
{
691
66.5k
    return ONIGENC_IS_UNICODE(enc);
692
66.5k
}
693
694
static st_data_t
695
enc_dup_name(st_data_t name)
696
0
{
697
0
    return (st_data_t)strdup((const char *)name);
698
0
}
699
700
/*
701
 * Returns copied alias name when the key is added for st_table,
702
 * else returns NULL.
703
 */
704
static int
705
enc_alias_internal(struct enc_table *enc_table, const char *alias, int idx)
706
0
{
707
0
    ASSERT_vm_locking();
708
0
    return st_insert2(enc_table->names, (st_data_t)alias, (st_data_t)idx,
709
0
                      enc_dup_name);
710
0
}
711
712
/*
 * Adds `alias` for the encoding at `idx`; the constant is defined only
 * when the alias was not in the table before.  Returns idx, or -1 for an
 * invalid alias name.
 */
static int
enc_alias(struct enc_table *enc_table, const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) {
        return -1;
    }

    const int already_present = enc_alias_internal(enc_table, alias, idx);
    if (!already_present) {
        set_encoding_const(alias, enc_from_index(enc_table, idx));
    }
    return idx;
}
720
721
int
722
rb_enc_alias(const char *alias, const char *orig)
723
0
{
724
0
    int idx, r;
725
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
726
0
        enc_check_addable(enc_table, alias); // can raise
727
0
    }
728
729
0
    idx = rb_enc_find_index(orig);
730
0
    if (idx < 0) return -1;
731
732
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
733
0
        r = enc_alias(enc_table, alias, idx);
734
0
    }
735
736
0
    return r;
737
0
}
738
739
int
740
rb_encdb_alias(const char *alias, const char *orig)
741
0
{
742
0
    int r;
743
744
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
745
0
        int idx = enc_registered(enc_table, orig);
746
747
0
        if (idx < 0) {
748
0
            idx = enc_register(enc_table, orig, 0);
749
0
        }
750
0
        r = enc_alias(enc_table, alias, idx);
751
0
    }
752
753
0
    return r;
754
0
}
755
756
static void
757
rb_enc_init(struct enc_table *enc_table)
758
9
{
759
9
    ASSERT_vm_locking();
760
9
    enc_table_expand(enc_table, ENCODING_COUNT + 1);
761
9
    if (!enc_table->names) {
762
9
        enc_table->names = st_init_strcasetable_with_size(ENCODING_LIST_CAPA);
763
9
    }
764
18
#define OnigEncodingASCII_8BIT OnigEncodingASCII
765
27
#define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
766
9
    ENC_REGISTER(ASCII_8BIT);
767
9
    ENC_REGISTER(UTF_8);
768
9
    ENC_REGISTER(US_ASCII);
769
9
    global_enc_ascii = enc_table->list[ENCINDEX_ASCII_8BIT].enc;
770
9
    global_enc_utf_8 = enc_table->list[ENCINDEX_UTF_8].enc;
771
9
    global_enc_us_ascii = enc_table->list[ENCINDEX_US_ASCII].enc;
772
9
#undef ENC_REGISTER
773
9
#undef OnigEncodingASCII_8BIT
774
81
#define ENCDB_REGISTER(name, enc) enc_register_at(enc_table, ENCINDEX_##enc, name, NULL)
775
9
    ENCDB_REGISTER("UTF-16BE", UTF_16BE);
776
9
    ENCDB_REGISTER("UTF-16LE", UTF_16LE);
777
9
    ENCDB_REGISTER("UTF-32BE", UTF_32BE);
778
9
    ENCDB_REGISTER("UTF-32LE", UTF_32LE);
779
9
    ENCDB_REGISTER("UTF-16", UTF_16);
780
9
    ENCDB_REGISTER("UTF-32", UTF_32);
781
9
    ENCDB_REGISTER("UTF8-MAC", UTF8_MAC);
782
783
9
    ENCDB_REGISTER("EUC-JP", EUC_JP);
784
9
    ENCDB_REGISTER("Windows-31J", Windows_31J);
785
9
#undef ENCDB_REGISTER
786
9
    enc_table->count = ENCINDEX_BUILTIN_MAX;
787
9
}
788
789
/* Public wrapper over must_encindex(): raises instead of returning NULL. */
rb_encoding *
rb_enc_get_from_index(int index)
{
    rb_encoding *enc = must_encindex(index);

    return enc;
}
794
795
int rb_require_internal_silent(VALUE fname);
796
797
static int
798
load_encoding(const char *name)
799
28.9k
{
800
28.9k
    ASSERT_vm_unlocking();
801
28.9k
    VALUE enclib = rb_sprintf("enc/%s.so", name);
802
28.9k
    VALUE debug = ruby_debug;
803
28.9k
    VALUE errinfo;
804
28.9k
    char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
805
28.9k
    int loaded;
806
28.9k
    int idx;
807
808
597k
    while (s < e) {
809
568k
        if (!ISALNUM(*s)) *s = '_';
810
414k
        else if (ISUPPER(*s)) *s = (char)TOLOWER(*s);
811
568k
        ++s;
812
568k
    }
813
28.9k
    enclib = rb_fstring(enclib);
814
28.9k
    ruby_debug = Qfalse;
815
28.9k
    errinfo = rb_errinfo();
816
28.9k
    loaded = rb_require_internal_silent(enclib); // must run without VM_LOCK
817
28.9k
    ruby_debug = debug;
818
28.9k
    rb_set_errinfo(errinfo);
819
820
28.9k
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
821
28.9k
        if (loaded < 0 || 1 < loaded) {
822
28.9k
            idx = -1;
823
28.9k
        }
824
0
        else if ((idx = enc_registered(enc_table, name)) < 0) {
825
0
            idx = -1;
826
0
        }
827
0
        else if (rb_enc_autoload_p(enc_table->list[idx].enc)) {
828
0
            idx = -1;
829
0
        }
830
28.9k
    }
831
832
28.9k
    return idx;
833
28.9k
}
834
835
static int
836
enc_autoload_body(rb_encoding *enc)
837
28.7k
{
838
28.7k
    rb_encoding *base;
839
28.7k
    int i = 0;
840
28.7k
    ASSERT_vm_unlocking();
841
842
28.7k
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
843
28.7k
        base = enc_table->list[ENC_TO_ENCINDEX(enc)].base;
844
28.7k
    }
845
846
28.7k
    if (base) {
847
0
        bool do_register = true;
848
0
        if (rb_enc_autoload_p(base)) {
849
0
            if (rb_enc_autoload(base) < 0) {
850
0
                do_register = false;
851
0
                i = -1;
852
0
            }
853
0
        }
854
855
0
        if (do_register) {
856
0
            GLOBAL_ENC_TABLE_LOCKING(enc_table) {
857
0
                i = ENC_TO_ENCINDEX(enc);
858
0
                enc_load_from_base(enc_table, i, base);
859
0
                RUBY_ASSERT(((rb_raw_encoding *)enc)->ruby_encoding_index == i);
860
0
            }
861
0
        }
862
0
    }
863
28.7k
    else {
864
28.7k
        i = -2;
865
28.7k
    }
866
867
28.7k
    return i;
868
28.7k
}
869
870
int
871
rb_enc_autoload(rb_encoding *enc)
872
28.7k
{
873
28.7k
    ASSERT_vm_unlocking();
874
28.7k
    int i = enc_autoload_body(enc);
875
28.7k
    if (i == -2) {
876
28.7k
        i = load_encoding(rb_enc_name(enc));
877
28.7k
    }
878
28.7k
    return i;
879
28.7k
}
880
881
bool
882
rb_enc_autoload_p(rb_encoding *enc)
883
2.80M
{
884
2.80M
    int idx = ENC_TO_ENCINDEX(enc);
885
2.80M
    RUBY_ASSERT(rb_enc_from_index(idx) == enc);
886
2.80M
    return !RUBY_ATOMIC_LOAD(global_enc_table.list[idx].loaded);
887
2.80M
}
888
889
/* Return encoding index or UNSPECIFIED_ENCODING from encoding name */
890
int
891
rb_enc_find_index(const char *name)
892
29.5k
{
893
29.5k
    int i;
894
29.5k
    ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary
895
29.5k
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
896
29.5k
        i = enc_registered(enc_table, name);
897
29.5k
    }
898
29.5k
    rb_encoding *enc;
899
900
29.5k
    if (i < 0) {
901
190
        i = load_encoding(name);
902
190
    }
903
29.3k
    else if (!(enc = rb_enc_from_index(i))) {
904
0
        if (i != UNSPECIFIED_ENCODING) {
905
0
            rb_raise(rb_eArgError, "encoding %s is not registered", name);
906
0
        }
907
0
    }
908
29.3k
    else if (rb_enc_autoload_p(enc)) {
909
28.7k
        if (rb_enc_autoload(enc) < 0) {
910
28.7k
            rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
911
28.7k
                    name);
912
28.7k
            return 0;
913
28.7k
        }
914
28.7k
    }
915
763
    return i;
916
29.5k
}
917
918
int
919
rb_enc_find_index2(const char *name, long len)
920
0
{
921
0
    char buf[ENCODING_NAMELEN_MAX+1];
922
923
0
    if (len > ENCODING_NAMELEN_MAX) return -1;
924
0
    memcpy(buf, name, len);
925
0
    buf[len] = '\0';
926
0
    return rb_enc_find_index(buf);
927
0
}
928
929
/* Encoding for `name`; a negative lookup result is replaced by index 0. */
rb_encoding *
rb_enc_find(const char *name)
{
    const int found = rb_enc_find_index(name);

    return rb_enc_from_index(found < 0 ? 0 : found);
}
936
937
static inline int
938
enc_capable(VALUE obj)
939
2.61M
{
940
2.61M
    if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj);
941
2.61M
    switch (BUILTIN_TYPE(obj)) {
942
2.54M
      case T_STRING:
943
2.61M
      case T_REGEXP:
944
2.61M
      case T_FILE:
945
2.61M
      case T_SYMBOL:
946
2.61M
        return TRUE;
947
0
      case T_DATA:
948
0
        if (is_data_encoding(obj)) return TRUE;
949
0
      default:
950
0
        return FALSE;
951
2.61M
    }
952
2.61M
}
953
954
int
955
rb_enc_capable(VALUE obj)
956
0
{
957
0
    return enc_capable(obj);
958
0
}
959
960
ID
961
rb_id_encoding(void)
962
9
{
963
9
    CONST_ID(id_encoding, "encoding");
964
9
    return id_encoding;
965
9
}
966
967
static int
968
enc_get_index_str(VALUE str)
969
12.5M
{
970
12.5M
    int i = ENCODING_GET_INLINED(str);
971
12.5M
    if (i == ENCODING_INLINE_MAX) {
972
0
        VALUE iv;
973
974
#if 0
975
        iv = rb_ivar_get(str, rb_id_encoding());
976
        i = NUM2INT(iv);
977
#else
978
        /*
979
         * Tentatively, assume ASCII-8BIT, if encoding index instance
980
         * variable is not found.  This can happen when freeing after
981
         * all instance variables are removed in `obj_free`.
982
         */
983
0
        iv = rb_attr_get(str, rb_id_encoding());
984
0
        i = NIL_P(iv) ? ENCINDEX_ASCII_8BIT : NUM2INT(iv);
985
0
#endif
986
0
    }
987
12.5M
    return i;
988
12.5M
}
989
990
int
991
rb_enc_get_index(VALUE obj)
992
9.75M
{
993
9.75M
    int i = -1;
994
9.75M
    VALUE tmp;
995
996
9.75M
    if (SPECIAL_CONST_P(obj)) {
997
0
        if (!SYMBOL_P(obj)) return -1;
998
0
        obj = rb_sym2str(obj);
999
0
    }
1000
9.75M
    switch (BUILTIN_TYPE(obj)) {
1001
9.56M
      case T_STRING:
1002
9.56M
      case T_SYMBOL:
1003
9.75M
      case T_REGEXP:
1004
9.75M
        i = enc_get_index_str(obj);
1005
9.75M
        break;
1006
0
      case T_FILE:
1007
0
        tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
1008
0
        if (NIL_P(tmp)) {
1009
0
            tmp = rb_funcallv(obj, rb_intern("external_encoding"), 0, 0);
1010
0
        }
1011
0
        if (is_obj_encoding(tmp)) {
1012
0
            i = enc_check_encoding(tmp);
1013
0
        }
1014
0
        break;
1015
0
      case T_DATA:
1016
0
        if (is_data_encoding(obj)) {
1017
0
            i = enc_check_encoding(obj);
1018
0
        }
1019
0
        break;
1020
0
      default:
1021
0
        break;
1022
9.75M
    }
1023
9.75M
    return i;
1024
9.75M
}
1025
1026
static void
1027
enc_set_index(VALUE obj, int idx)
1028
2.61M
{
1029
2.61M
    if (!enc_capable(obj)) {
1030
0
        rb_raise(rb_eArgError, "cannot set encoding on non-encoding capable object");
1031
0
    }
1032
1033
2.61M
    if (idx < ENCODING_INLINE_MAX) {
1034
2.61M
        ENCODING_SET_INLINED(obj, idx);
1035
2.61M
        return;
1036
2.61M
    }
1037
2.61M
    ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
1038
0
    rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
1039
0
}
1040
1041
void
1042
rb_enc_raw_set(VALUE obj, rb_encoding *enc)
1043
2.84M
{
1044
2.84M
    RUBY_ASSERT(enc_capable(obj));
1045
1046
2.84M
    int idx = enc ? ENC_TO_ENCINDEX(enc) : 0;
1047
1048
2.84M
    if (idx < ENCODING_INLINE_MAX) {
1049
2.84M
        ENCODING_SET_INLINED(obj, idx);
1050
2.84M
        return;
1051
2.84M
    }
1052
2.84M
    ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
1053
0
    rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
1054
0
}
1055
1056
void
1057
rb_enc_set_index(VALUE obj, int idx)
1058
2.33M
{
1059
2.33M
    rb_check_frozen(obj);
1060
2.33M
    must_encindex(idx);
1061
2.33M
    enc_set_index(obj, idx);
1062
2.33M
}
1063
1064
VALUE
1065
rb_enc_associate_index(VALUE obj, int idx)
1066
1.95M
{
1067
1.95M
    rb_encoding *enc;
1068
1.95M
    int oldidx, oldtermlen, termlen;
1069
1070
1.95M
    rb_check_frozen(obj);
1071
1.95M
    oldidx = rb_enc_get_index(obj);
1072
1.95M
    if (oldidx == idx)
1073
1.66M
        return obj;
1074
289k
    if (SPECIAL_CONST_P(obj)) {
1075
0
        rb_raise(rb_eArgError, "cannot set encoding");
1076
0
    }
1077
289k
    enc = must_encindex(idx);
1078
289k
    if (!ENC_CODERANGE_ASCIIONLY(obj) ||
1079
178k
        !rb_enc_asciicompat(enc)) {
1080
178k
        ENC_CODERANGE_CLEAR(obj);
1081
178k
    }
1082
289k
    termlen = rb_enc_mbminlen(enc);
1083
289k
    oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx));
1084
289k
    if (oldtermlen != termlen && RB_TYPE_P(obj, T_STRING)) {
1085
0
        rb_str_change_terminator_length(obj, oldtermlen, termlen);
1086
0
    }
1087
289k
    enc_set_index(obj, idx);
1088
289k
    return obj;
1089
289k
}
1090
1091
VALUE
1092
rb_enc_associate(VALUE obj, rb_encoding *enc)
1093
1.79M
{
1094
1.79M
    return rb_enc_associate_index(obj, rb_enc_to_index(enc));
1095
1.79M
}
1096
1097
rb_encoding*
1098
rb_enc_get(VALUE obj)
1099
4.44M
{
1100
4.44M
    return rb_enc_from_index(rb_enc_get_index(obj));
1101
4.44M
}
1102
1103
const char *
1104
rb_enc_inspect_name(rb_encoding *enc)
1105
190
{
1106
190
    if (enc == global_enc_ascii) {
1107
95
        return "BINARY (ASCII-8BIT)";
1108
95
    }
1109
95
    return enc->name;
1110
190
}
1111
1112
static rb_encoding*
1113
rb_encoding_check(rb_encoding* enc, VALUE str1, VALUE str2)
1114
2.77M
{
1115
2.77M
    if (!enc)
1116
0
        rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1117
0
                 rb_enc_inspect_name(rb_enc_get(str1)),
1118
0
                 rb_enc_inspect_name(rb_enc_get(str2)));
1119
2.77M
    return enc;
1120
2.77M
}
1121
1122
static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
1123
1124
rb_encoding*
1125
rb_enc_check_str(VALUE str1, VALUE str2)
1126
1.38M
{
1127
1.38M
    rb_encoding *enc = enc_compatible_str(MUST_STRING(str1), MUST_STRING(str2));
1128
0
    return rb_encoding_check(enc, str1, str2);
1129
1.38M
}
1130
1131
rb_encoding*
1132
rb_enc_check(VALUE str1, VALUE str2)
1133
1.38M
{
1134
1.38M
    rb_encoding *enc = rb_enc_compatible(str1, str2);
1135
1.38M
    return rb_encoding_check(enc, str1, str2);
1136
1.38M
}
1137
1138
static rb_encoding*
1139
enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
1140
2.86M
{
1141
2.86M
    if (idx1 < 0 || idx2 < 0)
1142
0
        return 0;
1143
1144
2.86M
    if (idx1 == idx2) {
1145
2.80M
        return rb_enc_from_index(idx1);
1146
2.80M
    }
1147
1148
69.1k
    int isstr1, isstr2;
1149
69.1k
    rb_encoding *enc1 = rb_enc_from_index(idx1);
1150
69.1k
    rb_encoding *enc2 = rb_enc_from_index(idx2);
1151
1152
69.1k
    isstr2 = RB_TYPE_P(str2, T_STRING);
1153
69.1k
    if (isstr2 && RSTRING_LEN(str2) == 0)
1154
4
        return enc1;
1155
69.1k
    isstr1 = RB_TYPE_P(str1, T_STRING);
1156
69.1k
    if (isstr1 && isstr2 && RSTRING_LEN(str1) == 0)
1157
0
        return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
1158
69.1k
    if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
1159
0
        return 0;
1160
0
    }
1161
1162
    /* objects whose encoding is the same of contents */
1163
69.1k
    if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
1164
0
        return enc1;
1165
69.1k
    if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
1166
0
        return enc2;
1167
1168
69.1k
    if (!isstr1) {
1169
0
        VALUE tmp = str1;
1170
0
        int idx0 = idx1;
1171
0
        str1 = str2;
1172
0
        str2 = tmp;
1173
0
        idx1 = idx2;
1174
0
        idx2 = idx0;
1175
0
        idx0 = isstr1;
1176
0
        isstr1 = isstr2;
1177
0
        isstr2 = idx0;
1178
0
    }
1179
69.1k
    if (isstr1) {
1180
69.1k
        int cr1, cr2;
1181
1182
69.1k
        cr1 = rb_enc_str_coderange(str1);
1183
69.1k
        if (isstr2) {
1184
69.1k
            cr2 = rb_enc_str_coderange(str2);
1185
69.1k
            if (cr1 != cr2) {
1186
                /* may need to handle ENC_CODERANGE_BROKEN */
1187
383
                if (cr1 == ENC_CODERANGE_7BIT) return enc2;
1188
0
                if (cr2 == ENC_CODERANGE_7BIT) return enc1;
1189
0
            }
1190
68.7k
            if (cr2 == ENC_CODERANGE_7BIT) {
1191
68.7k
                return enc1;
1192
68.7k
            }
1193
68.7k
        }
1194
0
        if (cr1 == ENC_CODERANGE_7BIT)
1195
0
            return enc2;
1196
0
    }
1197
0
    return 0;
1198
69.1k
}
1199
1200
static rb_encoding*
1201
enc_compatible_str(VALUE str1, VALUE str2)
1202
1.38M
{
1203
1.38M
    int idx1 = enc_get_index_str(str1);
1204
1.38M
    int idx2 = enc_get_index_str(str2);
1205
1206
1.38M
    return enc_compatible_latter(str1, str2, idx1, idx2);
1207
1.38M
}
1208
1209
rb_encoding*
1210
rb_enc_compatible(VALUE str1, VALUE str2)
1211
1.48M
{
1212
1.48M
    int idx1 = rb_enc_get_index(str1);
1213
1.48M
    int idx2 = rb_enc_get_index(str2);
1214
1215
1.48M
    return enc_compatible_latter(str1, str2, idx1, idx2);
1216
1.48M
}
1217
1218
void
1219
rb_enc_copy(VALUE obj1, VALUE obj2)
1220
49.3k
{
1221
49.3k
    rb_enc_associate_index(obj1, rb_enc_get_index(obj2));
1222
49.3k
}
1223
1224
1225
/*
1226
 *  call-seq:
1227
 *     encoding -> encoding
1228
 *
1229
 *  Returns an Encoding object that represents the encoding of +self+;
1230
 *  see {Encodings}[rdoc-ref:encodings.rdoc].
1231
 *
1232
 *  Related: see {Querying}[rdoc-ref:String@Querying].
1233
 */
1234
1235
VALUE
1236
rb_obj_encoding(VALUE obj)
1237
0
{
1238
0
    int idx = rb_enc_get_index(obj);
1239
0
    if (idx < 0) {
1240
0
        rb_raise(rb_eTypeError, "unknown encoding");
1241
0
    }
1242
0
    return rb_enc_from_encoding_index(idx & ENC_INDEX_MASK);
1243
0
}
1244
1245
int
1246
rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
1247
843k
{
1248
843k
    return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1249
843k
}
1250
1251
int
1252
rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
1253
74.1M
{
1254
74.1M
    int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1255
74.1M
    if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
1256
74.1M
        return MBCLEN_CHARFOUND_LEN(n);
1257
729
    else {
1258
729
        int min = rb_enc_mbminlen(enc);
1259
729
        return min <= e-p ? min : (int)(e-p);
1260
729
    }
1261
74.1M
}
1262
1263
int
1264
rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
1265
64.6M
{
1266
64.6M
    int n;
1267
64.6M
    if (e <= p)
1268
5
        return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
1269
64.6M
    n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1270
64.6M
    if (e-p < n)
1271
0
        return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
1272
64.6M
    return n;
1273
64.6M
}
1274
1275
int
1276
rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
1277
6.00M
{
1278
6.00M
    unsigned int c;
1279
6.00M
    int l;
1280
6.00M
    if (e <= p)
1281
0
        return -1;
1282
6.00M
    if (rb_enc_asciicompat(enc)) {
1283
6.00M
        c = (unsigned char)*p;
1284
6.00M
        if (!ISASCII(c))
1285
928k
            return -1;
1286
5.07M
        if (len) *len = 1;
1287
5.07M
        return c;
1288
6.00M
    }
1289
0
    l = rb_enc_precise_mbclen(p, e, enc);
1290
0
    if (!MBCLEN_CHARFOUND_P(l))
1291
0
        return -1;
1292
0
    c = rb_enc_mbc_to_codepoint(p, e, enc);
1293
0
    if (!rb_enc_isascii(c, enc))
1294
0
        return -1;
1295
0
    if (len) *len = l;
1296
0
    return c;
1297
0
}
1298
1299
unsigned int
1300
rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
1301
53.3k
{
1302
53.3k
    int r;
1303
53.3k
    if (e <= p)
1304
0
        rb_raise(rb_eArgError, "empty string");
1305
53.3k
    r = rb_enc_precise_mbclen(p, e, enc);
1306
53.3k
    if (!MBCLEN_CHARFOUND_P(r)) {
1307
0
        rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
1308
0
    }
1309
53.3k
    if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
1310
53.3k
    return rb_enc_mbc_to_codepoint(p, e, enc);
1311
53.3k
}
1312
1313
int
1314
rb_enc_codelen(int c, rb_encoding *enc)
1315
30.6k
{
1316
30.6k
    int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
1317
30.6k
    if (n == 0) {
1318
0
        rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
1319
0
    }
1320
30.6k
    return n;
1321
30.6k
}
1322
1323
int
1324
rb_enc_toupper(int c, rb_encoding *enc)
1325
0
{
1326
0
    return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
1327
0
}
1328
1329
int
1330
rb_enc_tolower(int c, rb_encoding *enc)
1331
0
{
1332
0
    return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
1333
0
}
1334
1335
/*
1336
 * call-seq:
1337
 *   enc.inspect -> string
1338
 *
1339
 * Returns a string which represents the encoding for programmers.
1340
 *
1341
 *   Encoding::UTF_8.inspect       #=> "#<Encoding:UTF-8>"
1342
 *   Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
1343
 */
1344
static VALUE
1345
enc_inspect(VALUE self)
1346
0
{
1347
0
    rb_encoding *enc;
1348
1349
0
    if (!is_obj_encoding(self)) { /* do not resolve autoload */
1350
0
        not_encoding(self);
1351
0
    }
1352
0
    if (!(enc = RTYPEDDATA_GET_DATA(self)) || rb_enc_from_index(rb_enc_to_index(enc)) != enc) {
1353
0
        rb_raise(rb_eTypeError, "broken Encoding");
1354
0
    }
1355
1356
0
    return rb_enc_sprintf(rb_usascii_encoding(),
1357
0
                          "#<%"PRIsVALUE":%s%s%s>", rb_obj_class(self),
1358
0
                          rb_enc_inspect_name(enc),
1359
0
                          (ENC_DUMMY_P(enc) ? " (dummy)" : ""),
1360
0
                          rb_enc_autoload_p(enc) ? " (autoload)" : "");
1361
0
}
1362
1363
1364
static int
1365
enc_names_i(st_data_t name, st_data_t idx, st_data_t args)
1366
0
{
1367
0
    VALUE *arg = (VALUE *)args;
1368
1369
0
    if ((int)idx == (int)arg[0]) {
1370
0
        VALUE str = rb_interned_str_cstr((char *)name);
1371
0
        rb_ary_push(arg[1], str);
1372
0
    }
1373
0
    return ST_CONTINUE;
1374
0
}
1375
1376
/*
1377
 * call-seq:
1378
 *   enc.names -> array
1379
 *
1380
 * Returns the list of name and aliases of the encoding.
1381
 *
1382
 *   Encoding::WINDOWS_31J.names  #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"]
1383
 */
1384
static VALUE
1385
enc_names(VALUE self)
1386
0
{
1387
0
    VALUE args[2];
1388
1389
0
    args[0] = (VALUE)rb_to_encoding_index(self);
1390
0
    args[1] = rb_ary_new2(0);
1391
1392
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
1393
0
        st_foreach(enc_table->names, enc_names_i, (st_data_t)args);
1394
0
    }
1395
0
    return args[1];
1396
0
}
1397
1398
/*
1399
 * call-seq:
1400
 *   Encoding.list -> [enc1, enc2, ...]
1401
 *
1402
 * Returns the list of loaded encodings.
1403
 *
1404
 *   Encoding.list
1405
 *   #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1406
 *         #<Encoding:ISO-2022-JP (dummy)>]
1407
 *
1408
 *   Encoding.find("US-ASCII")
1409
 *   #=> #<Encoding:US-ASCII>
1410
 *
1411
 *   Encoding.list
1412
 *   #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1413
 *         #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
1414
 *
1415
 */
1416
static VALUE
1417
enc_list(VALUE klass)
1418
0
{
1419
0
    VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
1420
0
    return rb_ary_dup(list);
1421
0
}
1422
1423
/*
1424
 * call-seq:
1425
 *   Encoding.find(string) -> enc
1426
 *
1427
 * Searches for the encoding with the specified <i>name</i>.
1428
 * <i>name</i> should be a string.
1429
 *
1430
 *   Encoding.find("US-ASCII")  #=> #<Encoding:US-ASCII>
1431
 *
1432
 * Names which this method accepts are encoding names and aliases
1433
 * including the following special aliases:
1434
 *
1435
 * "external"::   default external encoding
1436
 * "internal"::   default internal encoding
1437
 * "locale"::     locale encoding
1438
 * "filesystem":: filesystem encoding
1439
 *
1440
 * An ArgumentError is raised when there is no encoding with <i>name</i>.
1441
 * Only <code>Encoding.find("internal")</code> however returns nil
1442
 * when no encoding named "internal", in other words, when Ruby has no
1443
 * default internal encoding.
1444
 */
1445
static VALUE
1446
enc_find(VALUE klass, VALUE enc)
1447
0
{
1448
0
    int idx;
1449
0
    if (is_obj_encoding(enc))
1450
0
        return enc;
1451
0
    idx = str_to_encindex(enc);
1452
0
    if (idx == UNSPECIFIED_ENCODING) return Qnil;
1453
0
    return rb_enc_from_encoding_index(idx);
1454
0
}
1455
1456
/*
1457
 * call-seq:
1458
 *   Encoding.compatible?(obj1, obj2) -> enc or nil
1459
 *
1460
 * Checks the compatibility of two objects.
1461
 *
1462
 * If the objects are both strings they are compatible when they are
1463
 * concatenatable.  The encoding of the concatenated string will be returned
1464
 * if they are compatible, nil if they are not.
1465
 *
1466
 *   Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
1467
 *   #=> #<Encoding:ISO-8859-1>
1468
 *
1469
 *   Encoding.compatible?(
1470
 *     "\xa1".force_encoding("iso-8859-1"),
1471
 *     "\xa1\xa1".force_encoding("euc-jp"))
1472
 *   #=> nil
1473
 *
1474
 * If the objects are non-strings their encodings are compatible when they
1475
 * have an encoding and:
1476
 * * Either encoding is US-ASCII compatible
1477
 * * One of the encodings is a 7-bit encoding
1478
 *
1479
 */
1480
static VALUE
1481
enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
1482
0
{
1483
0
    rb_encoding *enc;
1484
1485
0
    if (!enc_capable(str1)) return Qnil;
1486
0
    if (!enc_capable(str2)) return Qnil;
1487
0
    enc = rb_enc_compatible(str1, str2);
1488
0
    if (!enc) return Qnil;
1489
0
    return rb_enc_from_encoding(enc);
1490
0
}
1491
1492
NORETURN(static VALUE enc_s_alloc(VALUE klass));
1493
/* :nodoc: */
1494
static VALUE
1495
enc_s_alloc(VALUE klass)
1496
0
{
1497
0
    rb_undefined_alloc(klass);
1498
0
    UNREACHABLE_RETURN(Qnil);
1499
0
}
1500
1501
/* :nodoc: */
1502
static VALUE
1503
enc_dump(int argc, VALUE *argv, VALUE self)
1504
0
{
1505
0
    rb_check_arity(argc, 0, 1);
1506
0
    return rb_attr_get(self, id_i_name);
1507
0
}
1508
1509
/* :nodoc: */
1510
static VALUE
1511
enc_load(VALUE klass, VALUE str)
1512
0
{
1513
0
    return str;
1514
0
}
1515
1516
/* :nodoc: */
1517
static VALUE
1518
enc_m_loader(VALUE klass, VALUE str)
1519
0
{
1520
0
    return enc_find(klass, str);
1521
0
}
1522
1523
rb_encoding *
1524
rb_ascii8bit_encoding(void)
1525
710k
{
1526
710k
    return global_enc_ascii;
1527
710k
}
1528
1529
int
1530
rb_ascii8bit_encindex(void)
1531
0
{
1532
0
    return ENCINDEX_ASCII_8BIT;
1533
0
}
1534
1535
rb_encoding *
1536
rb_utf8_encoding(void)
1537
237k
{
1538
237k
    return global_enc_utf_8;
1539
237k
}
1540
1541
int
1542
rb_utf8_encindex(void)
1543
0
{
1544
0
    return ENCINDEX_UTF_8;
1545
0
}
1546
1547
rb_encoding *
1548
rb_usascii_encoding(void)
1549
432k
{
1550
432k
    return global_enc_us_ascii;
1551
432k
}
1552
1553
int
1554
rb_usascii_encindex(void)
1555
0
{
1556
0
    return ENCINDEX_US_ASCII;
1557
0
}
1558
1559
int rb_locale_charmap_index(void);
1560
1561
int
1562
rb_locale_encindex(void)
1563
0
{
1564
    // `rb_locale_charmap_index` can call `enc_find_index`, which can
1565
    // load an encoding. This needs to be done without VM lock held.
1566
0
    ASSERT_vm_unlocking();
1567
0
    int idx = rb_locale_charmap_index();
1568
1569
0
    if (idx < 0) idx = ENCINDEX_UTF_8;
1570
1571
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
1572
0
        if (enc_registered(enc_table, "locale") < 0) {
1573
# if defined _WIN32
1574
            void Init_w32_codepage(void);
1575
            Init_w32_codepage();
1576
# endif
1577
0
            GLOBAL_ENC_TABLE_LOCKING(enc_table) {
1578
0
                enc_alias_internal(enc_table, "locale", idx);
1579
0
            }
1580
0
        }
1581
0
    }
1582
1583
0
    return idx;
1584
0
}
1585
1586
/* Returns the locale encoding, resolved from rb_locale_encindex(). */
rb_encoding *
rb_locale_encoding(void)
{
    const int idx = rb_locale_encindex();

    return rb_enc_from_index(idx);
}
1591
1592
int
1593
rb_filesystem_encindex(void)
1594
1.40M
{
1595
1.40M
    return filesystem_encindex;
1596
1.40M
}
1597
1598
/* Returns the filesystem encoding, resolved from its index. */
rb_encoding *
rb_filesystem_encoding(void)
{
    const int idx = rb_filesystem_encindex();

    return rb_enc_from_index(idx);
}
1603
1604
struct default_encoding {
1605
    int index;      /* -2 => not yet set, -1 => nil */
1606
    rb_encoding *enc;
1607
};
1608
1609
static struct default_encoding default_external = {0};
1610
1611
static int
1612
enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name)
1613
0
{
1614
0
    int overridden = FALSE;
1615
1616
0
    if (def->index != -2)
1617
        /* Already set */
1618
0
        overridden = TRUE;
1619
1620
0
    int index = 0;
1621
0
    if (!NIL_P(encoding)) {
1622
0
        enc_check_encoding(encoding); // loads it if necessary. Needs to be done outside of VM lock.
1623
0
        index = rb_enc_to_index(rb_to_encoding(encoding));
1624
0
    }
1625
1626
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
1627
0
        if (NIL_P(encoding)) {
1628
0
            def->index = -1;
1629
0
            def->enc = 0;
1630
0
            char *name_dup = strdup(name);
1631
1632
0
            st_data_t existing_name = (st_data_t)name_dup;
1633
0
            if (st_delete(enc_table->names, &existing_name, NULL)) {
1634
0
                xfree((void *)existing_name);
1635
0
            }
1636
1637
0
            st_insert(enc_table->names, (st_data_t)name_dup,
1638
0
                      (st_data_t)UNSPECIFIED_ENCODING);
1639
0
        }
1640
0
        else {
1641
0
            def->index = index;
1642
0
            def->enc = 0;
1643
0
            enc_alias_internal(enc_table, name, def->index);
1644
0
        }
1645
1646
0
        if (def == &default_external) {
1647
0
            int fs_idx = Init_enc_set_filesystem_encoding();
1648
0
            enc_alias_internal(enc_table, "filesystem", fs_idx);
1649
0
            filesystem_encindex = fs_idx;
1650
0
        }
1651
0
    }
1652
1653
0
    return overridden;
1654
0
}
1655
1656
rb_encoding *
1657
rb_default_external_encoding(void)
1658
178k
{
1659
178k
    if (default_external.enc) return default_external.enc;
1660
1661
9
    if (default_external.index >= 0) {
1662
9
        default_external.enc = rb_enc_from_index(default_external.index);
1663
9
        return default_external.enc;
1664
9
    }
1665
0
    else {
1666
0
        return rb_locale_encoding();
1667
0
    }
1668
9
}
1669
1670
VALUE
1671
rb_enc_default_external(void)
1672
0
{
1673
0
    return rb_enc_from_encoding(rb_default_external_encoding());
1674
0
}
1675
1676
/*
1677
 * call-seq:
1678
 *   Encoding.default_external -> enc
1679
 *
1680
 * Returns default external encoding.
1681
 *
1682
 * The default external encoding is used by default for strings created from
1683
 * the following locations:
1684
 *
1685
 * * CSV
1686
 * * File data read from disk
1687
 * * SDBM
1688
 * * StringIO
1689
 * * Zlib::GzipReader
1690
 * * Zlib::GzipWriter
1691
 * * String#inspect
1692
 * * Regexp#inspect
1693
 *
1694
 * While strings created from these locations will have this encoding, the
1695
 * encoding may not be valid.  Be sure to check String#valid_encoding?.
1696
 *
1697
 * File data written to disk will be transcoded to the default external
1698
 * encoding when written, if default_internal is not nil.
1699
 *
1700
 * The default external encoding is initialized by the -E option.
1701
 * If -E isn't set, it is initialized to UTF-8 on Windows and the locale on
1702
 * other operating systems.
1703
 */
1704
static VALUE
1705
get_default_external(VALUE klass)
1706
0
{
1707
0
    return rb_enc_default_external();
1708
0
}
1709
1710
void
1711
rb_enc_set_default_external(VALUE encoding)
1712
0
{
1713
0
    if (NIL_P(encoding)) {
1714
0
        rb_raise(rb_eArgError, "default external can not be nil");
1715
0
    }
1716
0
    enc_set_default_encoding(&default_external, encoding,
1717
0
                            "external");
1718
0
}
1719
1720
/*
1721
 * call-seq:
1722
 *   Encoding.default_external = enc
1723
 *
1724
 * Sets default external encoding.  You should not set
1725
 * Encoding::default_external in ruby code as strings created before changing
1726
 * the value may have a different encoding from strings created after the value
1727
 * was changed.  Instead, you should use <tt>ruby -E</tt> to invoke ruby with
1728
 * the correct default_external.
1729
 *
1730
 * See Encoding::default_external for information on how the default external
1731
 * encoding is used.
1732
 */
1733
static VALUE
1734
set_default_external(VALUE klass, VALUE encoding)
1735
0
{
1736
0
    rb_warning("setting Encoding.default_external");
1737
0
    rb_enc_set_default_external(encoding);
1738
0
    return encoding;
1739
0
}
1740
1741
static struct default_encoding default_internal = {-2};
1742
1743
rb_encoding *
1744
rb_default_internal_encoding(void)
1745
178k
{
1746
178k
    if (!default_internal.enc && default_internal.index >= 0) {
1747
0
        default_internal.enc = rb_enc_from_index(default_internal.index);
1748
0
    }
1749
178k
    return default_internal.enc; /* can be NULL */
1750
178k
}
1751
1752
VALUE
1753
rb_enc_default_internal(void)
1754
0
{
1755
    /* Note: These functions cope with default_internal not being set */
1756
0
    return rb_enc_from_encoding(rb_default_internal_encoding());
1757
0
}
1758
1759
/*
1760
 * call-seq:
1761
 *   Encoding.default_internal -> enc
1762
 *
1763
 * Returns default internal encoding.  Strings will be transcoded to the
1764
 * default internal encoding in the following places if the default internal
1765
 * encoding is not nil:
1766
 *
1767
 * * CSV
1768
 * * Etc.sysconfdir and Etc.systmpdir
1769
 * * File data read from disk
1770
 * * File names from Dir
1771
 * * Integer#chr
1772
 * * String#inspect and Regexp#inspect
1773
 * * Strings returned from Readline
1774
 * * Strings returned from SDBM
1775
 * * Time#zone
1776
 * * Values from ENV
1777
 * * Values in ARGV including $PROGRAM_NAME
1778
 *
1779
 * Additionally String#encode and String#encode! use the default internal
1780
 * encoding if no encoding is given.
1781
 *
1782
 * The script encoding (__ENCODING__), not default_internal, is used as the
1783
 * encoding of created strings.
1784
 *
1785
 * Encoding::default_internal is initialized with -E option or nil otherwise.
1786
 */
1787
static VALUE
1788
get_default_internal(VALUE klass)
1789
0
{
1790
0
    return rb_enc_default_internal();
1791
0
}
1792
1793
void
1794
rb_enc_set_default_internal(VALUE encoding)
1795
0
{
1796
0
    enc_set_default_encoding(&default_internal, encoding,
1797
0
                            "internal");
1798
0
}
1799
1800
/*
1801
 * call-seq:
1802
 *   Encoding.default_internal = enc or nil
1803
 *
1804
 * Sets default internal encoding or removes default internal encoding when
1805
 * passed nil.  You should not set Encoding::default_internal in ruby code as
1806
 * strings created before changing the value may have a different encoding
1807
 * from strings created after the change.  Instead you should use
1808
 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal.
1809
 *
1810
 * See Encoding::default_internal for information on how the default internal
1811
 * encoding is used.
1812
 */
1813
static VALUE
1814
set_default_internal(VALUE klass, VALUE encoding)
1815
0
{
1816
0
    rb_warning("setting Encoding.default_internal");
1817
0
    rb_enc_set_default_internal(encoding);
1818
0
    return encoding;
1819
0
}
1820
1821
static void
1822
set_encoding_const(const char *name, rb_encoding *enc)
1823
0
{
1824
0
    VALUE encoding = rb_enc_from_encoding(enc);
1825
0
    char *s = (char *)name;
1826
0
    int haslower = 0, hasupper = 0, valid = 0;
1827
1828
0
    if (ISDIGIT(*s)) return;
1829
0
    if (ISUPPER(*s)) {
1830
0
        hasupper = 1;
1831
0
        while (*++s && (ISALNUM(*s) || *s == '_')) {
1832
0
            if (ISLOWER(*s)) haslower = 1;
1833
0
        }
1834
0
    }
1835
0
    if (!*s) {
1836
0
        if (s - name > ENCODING_NAMELEN_MAX) return;
1837
0
        valid = 1;
1838
0
        rb_define_const(rb_cEncoding, name, encoding);
1839
0
    }
1840
0
    if (!valid || haslower) {
1841
0
        size_t len = s - name;
1842
0
        if (len > ENCODING_NAMELEN_MAX) return;
1843
0
        if (!haslower || !hasupper) {
1844
0
            do {
1845
0
                if (ISLOWER(*s)) haslower = 1;
1846
0
                if (ISUPPER(*s)) hasupper = 1;
1847
0
            } while (*++s && (!haslower || !hasupper));
1848
0
            len = s - name;
1849
0
        }
1850
0
        len += strlen(s);
1851
0
        if (len++ > ENCODING_NAMELEN_MAX) return;
1852
0
        MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1853
0
        name = s;
1854
0
        if (!valid) {
1855
0
            if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1856
0
            for (; *s; ++s) {
1857
0
                if (!ISALNUM(*s)) *s = '_';
1858
0
            }
1859
0
            if (hasupper) {
1860
0
                rb_define_const(rb_cEncoding, name, encoding);
1861
0
            }
1862
0
        }
1863
0
        if (haslower) {
1864
0
            for (s = (char *)name; *s; ++s) {
1865
0
                if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1866
0
            }
1867
0
            rb_define_const(rb_cEncoding, name, encoding);
1868
0
        }
1869
0
    }
1870
0
}
1871
1872
static int
1873
rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1874
0
{
1875
0
    VALUE ary = (VALUE)arg;
1876
0
    VALUE str = rb_interned_str_cstr((char *)name);
1877
0
    rb_ary_push(ary, str);
1878
0
    return ST_CONTINUE;
1879
0
}
1880
1881
/*
1882
 * call-seq:
1883
 *   Encoding.name_list -> ["enc1", "enc2", ...]
1884
 *
1885
 * Returns the list of available encoding names.
1886
 *
1887
 *   Encoding.name_list
1888
 *   #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1889
 *         "ISO-8859-1", "Shift_JIS", "EUC-JP",
1890
 *         "Windows-31J",
1891
 *         "BINARY", "CP932", "eucJP"]
1892
 *
1893
 */
1894
1895
static VALUE
1896
rb_enc_name_list(VALUE klass)
1897
0
{
1898
0
    VALUE ary;
1899
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
1900
0
        ary = rb_ary_new2(enc_table->names->num_entries);
1901
0
        st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary);
1902
0
    }
1903
0
    return ary;
1904
0
}
1905
1906
static int
1907
rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1908
0
{
1909
0
    VALUE *p = (VALUE *)arg;
1910
0
    VALUE aliases = p[0], ary = p[1];
1911
0
    int idx = (int)orig;
1912
0
    VALUE key, str = rb_ary_entry(ary, idx);
1913
1914
0
    if (NIL_P(str)) {
1915
0
        rb_encoding *enc = rb_enc_from_index(idx);
1916
1917
0
        if (!enc) return ST_CONTINUE;
1918
0
        if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1919
0
            return ST_CONTINUE;
1920
0
        }
1921
0
        str = rb_fstring_cstr(rb_enc_name(enc));
1922
0
        rb_ary_store(ary, idx, str);
1923
0
    }
1924
0
    key = rb_interned_str_cstr((char *)name);
1925
0
    rb_hash_aset(aliases, key, str);
1926
0
    return ST_CONTINUE;
1927
0
}
1928
1929
/*
1930
 * call-seq:
1931
 *   Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1932
 *
1933
 * Returns the hash of available encoding alias and original encoding name.
1934
 *
1935
 *   Encoding.aliases
1936
 *   #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
1937
 *         "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1938
 *
1939
 */
1940
1941
static VALUE
1942
rb_enc_aliases(VALUE klass)
1943
0
{
1944
0
    VALUE aliases[2];
1945
0
    aliases[0] = rb_hash_new();
1946
0
    aliases[1] = rb_ary_new();
1947
1948
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
1949
0
        st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases);
1950
0
    }
1951
1952
0
    return aliases[0];
1953
0
}
1954
1955
/*
1956
 * An \Encoding instance represents a character encoding usable in Ruby.
1957
 * It is defined as a constant under the \Encoding namespace.
1958
 * It has a name and, optionally, aliases:
1959
 *
1960
 *   Encoding::US_ASCII.name  # => "US-ASCII"
1961
 *   Encoding::US_ASCII.names # => ["US-ASCII", "ASCII", "ANSI_X3.4-1968", "646"]
1962
 *
1963
 * A Ruby method that accepts an encoding as an argument will accept:
1964
 *
1965
 * - An \Encoding object.
1966
 * - The name of an encoding.
1967
 * - An alias for an encoding name.
1968
 *
1969
 * These are equivalent:
1970
 *
1971
 *   'foo'.encode(Encoding::US_ASCII) # Encoding object.
1972
 *   'foo'.encode('US-ASCII')         # Encoding name.
1973
 *   'foo'.encode('ASCII')            # Encoding alias.
1974
 *
1975
 * For a full discussion of encodings and their uses,
1976
 * see {the Encodings document}[rdoc-ref:encodings.rdoc].
1977
 *
1978
 * Encoding::ASCII_8BIT is a special-purpose encoding that is usually used for
1979
 * a string of bytes, not a string of characters.
1980
 * But as the name indicates, its characters in the ASCII range
1981
 * are considered as ASCII characters.
1982
 * This is useful when you use other ASCII-compatible encodings.
1983
 *
1984
 */
1985
1986
void
1987
Init_Encoding(void)
1988
9
{
1989
9
    VALUE list;
1990
9
    int i;
1991
1992
9
    id_i_name = rb_intern_const("@name");
1993
9
    rb_cEncoding = rb_define_class("Encoding", rb_cObject);
1994
9
    rb_define_alloc_func(rb_cEncoding, enc_s_alloc);
1995
9
    rb_undef_method(CLASS_OF(rb_cEncoding), "new");
1996
1997
    /* The name of the encoding.
1998
     *
1999
     *   Encoding::UTF_8.name      #=> "UTF-8"
2000
     */
2001
9
    rb_attr(rb_cEncoding, rb_intern("name"), TRUE, FALSE, Qfalse);
2002
9
    rb_define_alias(rb_cEncoding, "to_s", "name");
2003
2004
9
    rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
2005
9
    rb_define_method(rb_cEncoding, "names", enc_names, 0);
2006
9
    rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
2007
9
    rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
2008
9
    rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
2009
9
    rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
2010
9
    rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
2011
9
    rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
2012
9
    rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
2013
2014
9
    rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
2015
9
    rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
2016
2017
9
    rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
2018
9
    rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
2019
9
    rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0);
2020
9
    rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
2021
9
    rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); /* in localeinit.c */
2022
2023
9
    struct enc_table *enc_table = &global_enc_table;
2024
2025
9
    rb_gc_register_address(&rb_encoding_list);
2026
9
    list = rb_encoding_list = rb_ary_new2(ENCODING_LIST_CAPA);
2027
9
    RBASIC_CLEAR_CLASS(list);
2028
2029
117
    for (i = 0; i < enc_table->count; ++i) {
2030
108
        rb_ary_push(list, enc_new(enc_table->list[i].enc));
2031
108
    }
2032
2033
9
    rb_marshal_define_compat(rb_cEncoding, Qnil, 0, enc_m_loader);
2034
9
}
2035
2036
void
2037
Init_unicode_version(void)
2038
9
{
2039
9
    extern const char onigenc_unicode_version_string[];
2040
2041
9
    VALUE str = rb_usascii_str_new_static(onigenc_unicode_version_string,
2042
9
                                          strlen(onigenc_unicode_version_string));
2043
9
    OBJ_FREEZE(str);
2044
    /* The supported Unicode version. */
2045
9
    rb_define_const(rb_cEncoding, "UNICODE_VERSION", str);
2046
9
}
2047
2048
void
2049
Init_encodings(void)
2050
9
{
2051
9
    rb_enc_init(&global_enc_table);
2052
9
}
2053
2054
/* locale insensitive ctype functions */
2055
2056
void
2057
rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
2058
0
{
2059
0
    GLOBAL_ENC_TABLE_LOCKING(enc_table) {
2060
0
        st_foreach(enc_table->names, func, arg);
2061
0
    }
2062
0
}