Line | Count | Source |
1 | | /********************************************************************** |
2 | | |
3 | | encoding.c - |
4 | | |
5 | | $Author$ |
6 | | created at: Thu May 24 17:23:27 JST 2007 |
7 | | |
8 | | Copyright (C) 2007 Yukihiro Matsumoto |
9 | | |
10 | | **********************************************************************/ |
11 | | |
12 | | #include "ruby/internal/config.h" |
13 | | |
14 | | #include <ctype.h> |
15 | | |
16 | | #include "encindex.h" |
17 | | #include "internal.h" |
18 | | #include "internal/enc.h" |
19 | | #include "internal/encoding.h" |
20 | | #include "internal/error.h" |
21 | | #include "internal/inits.h" |
22 | | #include "internal/load.h" |
23 | | #include "internal/object.h" |
24 | | #include "internal/string.h" |
25 | | #include "internal/vm.h" |
26 | | #include "regenc.h" |
27 | | #include "ruby/atomic.h" |
28 | | #include "ruby/encoding.h" |
29 | | #include "ruby/util.h" |
30 | | #include "ruby/ractor.h" |
31 | | #include "ruby_assert.h" |
32 | | #include "vm_sync.h" |
33 | | #include "ruby_atomic.h" |
34 | | |
/* ENC_DEBUG defaults to 0; define it to a non-zero value to enable ENC_ASSERT. */
#ifndef ENC_DEBUG
#define ENC_DEBUG 0
#endif
/* Assertion forwarded to RUBY_ASSERT_WHEN with ENC_DEBUG as its condition. */
#define ENC_ASSERT(expr) RUBY_ASSERT_WHEN(ENC_DEBUG, expr)
/* Yields str after ENC_ASSERT-ing that it is a T_STRING (str appears twice in the expansion). */
#define MUST_STRING(str) (ENC_ASSERT(RB_TYPE_P(str, T_STRING)), str)
40 | | |
41 | | #undef rb_ascii8bit_encindex |
42 | | #undef rb_utf8_encindex |
43 | | #undef rb_usascii_encindex |
44 | | |
45 | | typedef OnigEncodingType rb_raw_encoding; |
46 | | |
/*
 * Prototypes of the registration entry points defined later in this file.
 * With GCC 4 or newer they are declared between visibility push/pop pragmas
 * so that these symbols get default visibility.
 */
#if defined __GNUC__ && __GNUC__ >= 4
#pragma GCC visibility push(default)
int rb_enc_register(const char *name, rb_encoding *encoding);
void rb_enc_set_base(const char *name, const char *orig);
int rb_enc_set_dummy(int index);
void rb_encdb_declare(const char *name);
int rb_encdb_replicate(const char *name, const char *orig);
int rb_encdb_dummy(const char *name);
int rb_encdb_alias(const char *alias, const char *orig);
#pragma GCC visibility pop
#endif
58 | | |
/*
 * id_encoding is filled in lazily by rb_id_encoding(); id_i_name is the ivar
 * that enc_new() sets to the encoding's name.
 */
static ID id_encoding, id_i_name;
/* Class handed to TypedData_Wrap_Struct when enc_new() wraps an encoding. */
VALUE rb_cEncoding;

/* Fixed number of slots in the encoding table; enc_table_expand() raises beyond it. */
#define ENCODING_LIST_CAPA 256
/*
 * Array of Encoding objects indexed by encoding index.  It is read with
 * RUBY_ATOMIC_VALUE_LOAD and replaced wholesale by enc_list_update().
 */
static VALUE rb_encoding_list;
64 | | |
/* One slot of the encoding table. */
struct rb_encoding_entry {
    /* Zero until enc_load_from_base() stores the encoding's max_enc_len;
     * rb_enc_autoload_p() reports "needs loading" while it is zero. */
    rb_atomic_t loaded;
    /* Copy of the name made by enc_register_at(); also the key in enc_table.names. */
    const char *name;
    /* Encoding struct allocated by enc_register_at(). */
    rb_encoding *enc;
    /* Base encoding recorded by set_base_encoding(), or NULL. */
    rb_encoding *base;
};
71 | | |
/* The process-wide encoding table. */
static struct enc_table {
    /* Fixed-capacity slot array, indexed by encoding index. */
    struct rb_encoding_entry list[ENCODING_LIST_CAPA];
    /* Number of slots in use; enc_from_index() rejects indexes >= count. */
    int count;
    /* Name/alias -> index map created with st_init_strcasetable_with_size(). */
    st_table *names;
} global_enc_table;
77 | | |
78 | | static int |
79 | | enc_names_free_i(st_data_t name, st_data_t idx, st_data_t args) |
80 | 0 | { |
81 | 0 | ruby_xfree((void *)name); |
82 | 0 | return ST_DELETE; |
83 | 0 | } |
84 | | |
85 | | void |
86 | | rb_free_global_enc_table(void) |
87 | 0 | { |
88 | 0 | for (size_t i = 0; i < ENCODING_LIST_CAPA; i++) { |
89 | 0 | xfree((void *)global_enc_table.list[i].enc); |
90 | 0 | } |
91 | |
|
92 | 0 | st_foreach(global_enc_table.names, enc_names_free_i, (st_data_t)0); |
93 | 0 | st_free_table(global_enc_table.names); |
94 | 0 | } |
95 | | |
/* Cached pointers to the ASCII-8BIT, UTF-8 and US-ASCII slots; set by rb_enc_init(). */
static rb_encoding *global_enc_ascii,
                   *global_enc_utf_8,
                   *global_enc_us_ascii;

/* NOTE(review): presumably the index of the filesystem encoding; starts as ASCII-8BIT. */
static int filesystem_encindex = ENCINDEX_ASCII_8BIT;
101 | | |
/*
 * Statement prefix: declares `tbl` as a pointer to global_enc_table and runs
 * the following block exactly once inside RB_VM_LOCKING().  The `locking`
 * variable only exists to make the for-loop execute a single iteration.
 */
#define GLOBAL_ENC_TABLE_LOCKING(tbl) \
    for (struct enc_table *tbl = &global_enc_table, **locking = &tbl; \
         locking; \
         locking = NULL) \
        RB_VM_LOCKING()
107 | | |
108 | | |
/* Bit 24 of ruby_encoding_index marks a dummy encoding. */
#define ENC_DUMMY_FLAG (1<<24)
/* The low 24 bits of ruby_encoding_index hold the table index. */
#define ENC_INDEX_MASK (~(~0U<<24))

/* Table index of enc with the flag bits masked off. */
#define ENC_TO_ENCINDEX(enc) (int)((enc)->ruby_encoding_index & ENC_INDEX_MASK)
/* Non-zero when enc carries the dummy flag. */
#define ENC_DUMMY_P(enc) ((enc)->ruby_encoding_index & ENC_DUMMY_FLAG)
/* Sets the dummy flag on enc (enc must be a writable rb_raw_encoding). */
#define ENC_SET_DUMMY(enc) ((enc)->ruby_encoding_index |= ENC_DUMMY_FLAG)

/* Number of built-in slots reserved by rb_enc_init(). */
#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
/* Sentinel index compared against in rb_enc_find_index(). */
#define UNSPECIFIED_ENCODING INT_MAX

/* Longest accepted encoding name, excluding the terminating NUL. */
#define ENCODING_NAMELEN_MAX 63
/* True for a non-NULL name of at most ENCODING_NAMELEN_MAX bytes. */
#define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
121 | | |
/*
 * Typed-data descriptor for Encoding objects.  All function slots are zero,
 * so no mark/free/size callbacks are registered here.
 */
static const rb_data_type_t encoding_data_type = {
    "encoding",
    {0, 0, 0,},
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};

/* True when obj's typed-data descriptor is encoding_data_type. */
#define is_encoding_type(obj) (RTYPEDDATA_TYPE(obj) == &encoding_data_type)
/* Same check, guarded by rbimpl_rtypeddata_p(). */
#define is_data_encoding(obj) (rbimpl_rtypeddata_p(obj) && is_encoding_type(obj))
/* Same check, guarded by rbimpl_obj_typeddata_p(). */
#define is_obj_encoding(obj) (rbimpl_obj_typeddata_p(obj) && is_encoding_type(obj))
131 | | |
132 | | int |
133 | | rb_data_is_encoding(VALUE obj) |
134 | 0 | { |
135 | 0 | return is_data_encoding(obj); |
136 | 0 | } |
137 | | |
138 | | static VALUE |
139 | | enc_new(rb_encoding *encoding) |
140 | 108 | { |
141 | 108 | VALUE enc = TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, (void *)encoding); |
142 | 108 | rb_ivar_set(enc, id_i_name, rb_fstring_cstr(encoding->name)); |
143 | 108 | RB_OBJ_SET_FROZEN_SHAREABLE(enc); |
144 | 108 | return enc; |
145 | 108 | } |
146 | | |
147 | | static void |
148 | | enc_list_update(int index, rb_raw_encoding *encoding) |
149 | 108 | { |
150 | 108 | RUBY_ASSERT(index < ENCODING_LIST_CAPA); |
151 | | |
152 | 108 | VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list); |
153 | | |
154 | 108 | if (list && NIL_P(rb_ary_entry(list, index))) { |
155 | 0 | VALUE new_list = rb_ary_dup(list); |
156 | 0 | RBASIC_CLEAR_CLASS(new_list); |
157 | | /* initialize encoding data */ |
158 | 0 | rb_ary_store(new_list, index, enc_new(encoding)); |
159 | 0 | rb_ary_freeze(new_list); |
160 | 0 | FL_SET_RAW(new_list, RUBY_FL_SHAREABLE); |
161 | 0 | RUBY_ATOMIC_VALUE_SET(rb_encoding_list, new_list); |
162 | 0 | } |
163 | 108 | } |
164 | | |
165 | | static VALUE |
166 | | enc_list_lookup(int idx) |
167 | 0 | { |
168 | 0 | VALUE list, enc = Qnil; |
169 | |
|
170 | 0 | if (idx < ENCODING_LIST_CAPA) { |
171 | 0 | list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list); |
172 | 0 | RUBY_ASSERT(list); |
173 | 0 | enc = rb_ary_entry(list, idx); |
174 | 0 | } |
175 | |
|
176 | 0 | if (NIL_P(enc)) { |
177 | 0 | rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx); |
178 | 0 | } |
179 | 0 | else { |
180 | 0 | return enc; |
181 | 0 | } |
182 | 0 | } |
183 | | |
184 | | static VALUE |
185 | | rb_enc_from_encoding_index(int idx) |
186 | 0 | { |
187 | 0 | return enc_list_lookup(idx); |
188 | 0 | } |
189 | | |
190 | | VALUE |
191 | | rb_enc_from_encoding(rb_encoding *encoding) |
192 | 0 | { |
193 | 0 | int idx; |
194 | 0 | if (!encoding) return Qnil; |
195 | 0 | idx = ENC_TO_ENCINDEX(encoding); |
196 | 0 | return rb_enc_from_encoding_index(idx); |
197 | 0 | } |
198 | | |
199 | | int |
200 | | rb_enc_to_index(rb_encoding *enc) |
201 | 9.13M | { |
202 | 9.13M | return enc ? ENC_TO_ENCINDEX(enc) : 0; |
203 | 9.13M | } |
204 | | |
205 | | int |
206 | | rb_enc_dummy_p(rb_encoding *enc) |
207 | 10.2M | { |
208 | 10.2M | return ENC_DUMMY_P(enc) != 0; |
209 | 10.2M | } |
210 | | |
211 | | static int |
212 | | check_encoding(rb_encoding *enc) |
213 | 0 | { |
214 | 0 | int index = rb_enc_to_index(enc); |
215 | 0 | if (rb_enc_from_index(index) != enc) |
216 | 0 | return -1; |
217 | 0 | if (rb_enc_autoload_p(enc)) { |
218 | 0 | index = rb_enc_autoload(enc); |
219 | 0 | } |
220 | 0 | return index; |
221 | 0 | } |
222 | | |
223 | | static int |
224 | | enc_check_encoding(VALUE obj) |
225 | 0 | { |
226 | 0 | if (!is_obj_encoding(obj)) { |
227 | 0 | return -1; |
228 | 0 | } |
229 | 0 | return check_encoding(RTYPEDDATA_GET_DATA(obj)); |
230 | 0 | } |
231 | | |
232 | | NORETURN(static void not_encoding(VALUE enc)); |
233 | | static void |
234 | | not_encoding(VALUE enc) |
235 | 0 | { |
236 | 0 | rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Encoding)", |
237 | 0 | rb_obj_class(enc)); |
238 | 0 | } |
239 | | |
240 | | static rb_encoding * |
241 | | must_encoding(VALUE enc) |
242 | 0 | { |
243 | 0 | int index = enc_check_encoding(enc); |
244 | 0 | if (index < 0) { |
245 | 0 | not_encoding(enc); |
246 | 0 | } |
247 | 0 | return RTYPEDDATA_GET_DATA(enc); |
248 | 0 | } |
249 | | |
250 | | static rb_encoding * |
251 | | must_encindex(int index) |
252 | 2.61M | { |
253 | 2.61M | rb_encoding *enc = rb_enc_from_index(index); |
254 | 2.61M | if (!enc) { |
255 | 0 | rb_raise(rb_eEncodingError, "encoding index out of bound: %d", |
256 | 0 | index); |
257 | 0 | } |
258 | 2.61M | if (rb_enc_autoload_p(enc) && rb_enc_autoload(enc) == -1) { |
259 | 0 | rb_loaderror("failed to load encoding (%s)", |
260 | 0 | rb_enc_name(enc)); |
261 | 0 | } |
262 | 2.61M | if (ENC_TO_ENCINDEX(enc) != (int)(index & ENC_INDEX_MASK)) { |
263 | 0 | rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)", |
264 | 0 | index, rb_enc_name(enc), ENC_TO_ENCINDEX(enc)); |
265 | 0 | } |
266 | 2.61M | return enc; |
267 | 2.61M | } |
268 | | |
269 | | int |
270 | | rb_to_encoding_index(VALUE enc) |
271 | 0 | { |
272 | 0 | ASSERT_vm_unlocking(); // can load encoding, so must not hold VM lock |
273 | 0 | int idx; |
274 | 0 | const char *name; |
275 | |
|
276 | 0 | idx = enc_check_encoding(enc); |
277 | 0 | if (idx >= 0) { |
278 | 0 | return idx; |
279 | 0 | } |
280 | 0 | else if (NIL_P(enc = rb_check_string_type(enc))) { |
281 | 0 | return -1; |
282 | 0 | } |
283 | 0 | if (!rb_enc_asciicompat(rb_enc_get(enc))) { |
284 | 0 | return -1; |
285 | 0 | } |
286 | 0 | if (!(name = rb_str_to_cstr(enc))) { |
287 | 0 | return -1; |
288 | 0 | } |
289 | 0 | return rb_enc_find_index(name); |
290 | 0 | } |
291 | | |
292 | | static const char * |
293 | | name_for_encoding(volatile VALUE *enc) |
294 | 0 | { |
295 | 0 | VALUE name = StringValue(*enc); |
296 | 0 | const char *n; |
297 | |
|
298 | 0 | if (!rb_enc_asciicompat(rb_enc_get(name))) { |
299 | 0 | rb_raise(rb_eArgError, "invalid encoding name (non ASCII)"); |
300 | 0 | } |
301 | 0 | if (!(n = rb_str_to_cstr(name))) { |
302 | 0 | rb_raise(rb_eArgError, "invalid encoding name (NUL byte)"); |
303 | 0 | } |
304 | 0 | return n; |
305 | 0 | } |
306 | | |
307 | | /* Returns encoding index or UNSPECIFIED_ENCODING */ |
308 | | static int |
309 | | str_find_encindex(VALUE enc) |
310 | 0 | { |
311 | 0 | int idx = rb_enc_find_index(name_for_encoding(&enc)); |
312 | 0 | RB_GC_GUARD(enc); |
313 | 0 | return idx; |
314 | 0 | } |
315 | | |
316 | | static int |
317 | | str_to_encindex(VALUE enc) |
318 | 0 | { |
319 | 0 | int idx = str_find_encindex(enc); |
320 | 0 | if (idx < 0) { |
321 | 0 | rb_raise(rb_eArgError, "unknown encoding name - %"PRIsVALUE, enc); |
322 | 0 | } |
323 | 0 | return idx; |
324 | 0 | } |
325 | | |
326 | | static rb_encoding * |
327 | | str_to_encoding(VALUE enc) |
328 | 0 | { |
329 | 0 | return rb_enc_from_index(str_to_encindex(enc)); |
330 | 0 | } |
331 | | |
332 | | rb_encoding * |
333 | | rb_to_encoding(VALUE enc) |
334 | 0 | { |
335 | 0 | if (enc_check_encoding(enc) >= 0) return RTYPEDDATA_GET_DATA(enc); |
336 | 0 | return str_to_encoding(enc); |
337 | 0 | } |
338 | | |
339 | | rb_encoding * |
340 | | rb_find_encoding(VALUE enc) |
341 | 0 | { |
342 | 0 | int idx; |
343 | 0 | if (enc_check_encoding(enc) >= 0) return RTYPEDDATA_GET_DATA(enc); |
344 | 0 | idx = str_find_encindex(enc); |
345 | 0 | if (idx < 0) return NULL; |
346 | 0 | return rb_enc_from_index(idx); |
347 | 0 | } |
348 | | |
349 | | static int |
350 | | enc_table_expand(struct enc_table *enc_table, int newsize) |
351 | 9 | { |
352 | 9 | if (newsize > ENCODING_LIST_CAPA) { |
353 | 0 | rb_raise(rb_eEncodingError, "too many encoding (> %d)", ENCODING_LIST_CAPA); |
354 | 0 | } |
355 | 9 | return newsize; |
356 | 9 | } |
357 | | |
358 | | /* Load an encoding using the values from base_encoding */ |
359 | | static void |
360 | | enc_load_from_base(struct enc_table *enc_table, int index, rb_encoding *base_encoding) |
361 | 27 | { |
362 | 27 | ASSERT_vm_locking(); |
363 | | |
364 | 27 | struct rb_encoding_entry *ent = &enc_table->list[index]; |
365 | | |
366 | 27 | if (ent->loaded) { |
367 | 0 | return; |
368 | 0 | } |
369 | | |
370 | 27 | rb_raw_encoding *encoding = (rb_raw_encoding *)ent->enc; |
371 | 27 | RUBY_ASSERT(encoding); |
372 | | |
373 | | // FIXME: Before the base is loaded, the encoding may be accessed |
374 | | // concurrently by other Ractors. |
375 | | // We're copying all fields from base_encoding except name and |
376 | | // ruby_encoding_index which we preserve from the original. Since these are |
377 | | // the only fields other threads should read it is likely safe despite |
378 | | // technically being a data race. |
379 | 27 | rb_raw_encoding tmp_encoding = *base_encoding; |
380 | 27 | tmp_encoding.name = encoding->name; |
381 | 27 | tmp_encoding.ruby_encoding_index = encoding->ruby_encoding_index; |
382 | 27 | *encoding = tmp_encoding; |
383 | | |
384 | 27 | RUBY_ATOMIC_SET(ent->loaded, encoding->max_enc_len); |
385 | 27 | } |
386 | | |
387 | | static int |
388 | | enc_register_at(struct enc_table *enc_table, int index, const char *name, rb_encoding *base_encoding) |
389 | 108 | { |
390 | 108 | ASSERT_vm_locking(); |
391 | | |
392 | 108 | struct rb_encoding_entry *ent = &enc_table->list[index]; |
393 | 108 | rb_raw_encoding *encoding; |
394 | | |
395 | 108 | RUBY_ASSERT(!ent->loaded); |
396 | 108 | RUBY_ASSERT(!ent->name); |
397 | 108 | RUBY_ASSERT(!ent->enc); |
398 | 108 | RUBY_ASSERT(!ent->base); |
399 | | |
400 | 108 | RUBY_ASSERT(valid_encoding_name_p(name)); |
401 | | |
402 | 108 | ent->name = name = strdup(name); |
403 | | |
404 | 108 | encoding = ZALLOC(rb_raw_encoding); |
405 | 108 | encoding->name = name; |
406 | 108 | encoding->ruby_encoding_index = index; |
407 | 108 | ent->enc = encoding; |
408 | | |
409 | 108 | if (st_insert(enc_table->names, (st_data_t)name, (st_data_t)index)) { |
410 | 0 | rb_bug("encoding name was somehow registered twice"); |
411 | 0 | } |
412 | | |
413 | 108 | enc_list_update(index, encoding); |
414 | | |
415 | 108 | if (base_encoding) { |
416 | 27 | enc_load_from_base(enc_table, index, base_encoding); |
417 | 27 | } |
418 | 81 | else { |
419 | | /* it should not be loaded yet */ |
420 | 81 | RUBY_ASSERT(!encoding->max_enc_len); |
421 | 81 | } |
422 | | |
423 | 108 | return index; |
424 | 108 | } |
425 | | |
426 | | static int |
427 | | enc_register(struct enc_table *enc_table, const char *name, rb_encoding *encoding) |
428 | 0 | { |
429 | 0 | ASSERT_vm_locking(); |
430 | |
|
431 | 0 | if (!valid_encoding_name_p(name)) return -1; |
432 | | |
433 | 0 | int index = enc_table->count; |
434 | |
|
435 | 0 | enc_table->count = enc_table_expand(enc_table, index + 1); |
436 | 0 | return enc_register_at(enc_table, index, name, encoding); |
437 | 0 | } |
438 | | |
439 | | static void set_encoding_const(const char *, rb_encoding *); |
440 | | static int enc_registered(struct enc_table *enc_table, const char *name); |
441 | | |
442 | | static rb_encoding * |
443 | | enc_from_index(struct enc_table *enc_table, int index) |
444 | 17.1M | { |
445 | 17.1M | if (UNLIKELY(index < 0 || enc_table->count <= (index &= ENC_INDEX_MASK))) { |
446 | 0 | return 0; |
447 | 0 | } |
448 | 17.1M | rb_encoding *enc = enc_table->list[index].enc; |
449 | 17.1M | RUBY_ASSERT(ENC_TO_ENCINDEX(enc) == index); |
450 | 17.1M | return enc; |
451 | 17.1M | } |
452 | | |
453 | | rb_encoding * |
454 | | rb_enc_from_index(int index) |
455 | 17.1M | { |
456 | 17.1M | return enc_from_index(&global_enc_table, index); |
457 | 17.1M | } |
458 | | |
459 | | int |
460 | | rb_enc_register(const char *name, rb_encoding *encoding) |
461 | 0 | { |
462 | 0 | int index; |
463 | |
|
464 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
465 | 0 | index = enc_registered(enc_table, name); |
466 | |
|
467 | 0 | if (index >= 0) { |
468 | 0 | rb_encoding *oldenc = enc_from_index(enc_table, index); |
469 | 0 | if (STRCASECMP(name, rb_enc_name(oldenc))) { |
470 | 0 | index = enc_register(enc_table, name, encoding); |
471 | 0 | } |
472 | 0 | else if (rb_enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) { |
473 | 0 | enc_load_from_base(enc_table, index, encoding); |
474 | 0 | } |
475 | 0 | else { |
476 | 0 | rb_raise(rb_eArgError, "encoding %s is already registered", name); |
477 | 0 | } |
478 | 0 | } |
479 | 0 | else { |
480 | 0 | index = enc_register(enc_table, name, encoding); |
481 | 0 | set_encoding_const(name, rb_enc_from_index(index)); |
482 | 0 | } |
483 | 0 | } |
484 | 0 | return index; |
485 | 0 | } |
486 | | |
487 | | int |
488 | | enc_registered(struct enc_table *enc_table, const char *name) |
489 | 29.5k | { |
490 | 29.5k | ASSERT_vm_locking(); |
491 | 29.5k | st_data_t idx = 0; |
492 | | |
493 | 29.5k | if (!name) return -1; |
494 | 29.5k | if (!enc_table->names) return -1; |
495 | 29.5k | if (st_lookup(enc_table->names, (st_data_t)name, &idx)) { |
496 | 29.3k | return (int)idx; |
497 | 29.3k | } |
498 | 190 | return -1; |
499 | 29.5k | } |
500 | | |
501 | | int |
502 | | rb_enc_registered(const char *name) |
503 | 0 | { |
504 | 0 | int idx; |
505 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
506 | 0 | idx = enc_registered(enc_table, name); |
507 | 0 | } |
508 | 0 | return idx; |
509 | 0 | } |
510 | | |
511 | | void |
512 | | rb_encdb_declare(const char *name) |
513 | 0 | { |
514 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
515 | 0 | int idx = enc_registered(enc_table, name); |
516 | 0 | if (idx < 0) { |
517 | 0 | idx = enc_register(enc_table, name, 0); |
518 | 0 | } |
519 | 0 | set_encoding_const(name, rb_enc_from_index(idx)); |
520 | 0 | } |
521 | 0 | } |
522 | | |
523 | | static void |
524 | | enc_check_addable(struct enc_table *enc_table, const char *name) |
525 | 0 | { |
526 | 0 | if (enc_registered(enc_table, name) >= 0) { |
527 | 0 | rb_raise(rb_eArgError, "encoding %s is already registered", name); |
528 | 0 | } |
529 | 0 | else if (!valid_encoding_name_p(name)) { |
530 | 0 | rb_raise(rb_eArgError, "invalid encoding name: %s", name); |
531 | 0 | } |
532 | 0 | } |
533 | | |
534 | | static rb_encoding* |
535 | | set_base_encoding(struct enc_table *enc_table, int index, rb_encoding *base) |
536 | 0 | { |
537 | 0 | rb_encoding *enc = enc_table->list[index].enc; |
538 | |
|
539 | 0 | ASSUME(enc); |
540 | 0 | enc_table->list[index].base = base; |
541 | 0 | if (ENC_DUMMY_P(base)) ENC_SET_DUMMY((rb_raw_encoding *)enc); |
542 | 0 | return enc; |
543 | 0 | } |
544 | | |
545 | | /* for encdb.h |
546 | | * Set base encoding for encodings which are not replicas |
547 | | * but not in their own files. |
548 | | */ |
549 | | void |
550 | | rb_enc_set_base(const char *name, const char *orig) |
551 | 0 | { |
552 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
553 | 0 | int idx = enc_registered(enc_table, name); |
554 | 0 | int origidx = enc_registered(enc_table, orig); |
555 | 0 | set_base_encoding(enc_table, idx, rb_enc_from_index(origidx)); |
556 | 0 | } |
557 | 0 | } |
558 | | |
559 | | /* for encdb.h |
560 | | * Set encoding dummy. |
561 | | */ |
562 | | int |
563 | | rb_enc_set_dummy(int index) |
564 | 0 | { |
565 | 0 | rb_encoding *enc = global_enc_table.list[index].enc; |
566 | 0 | ENC_SET_DUMMY((rb_raw_encoding *)enc); |
567 | 0 | return index; |
568 | 0 | } |
569 | | |
570 | | static int |
571 | | enc_replicate(struct enc_table *enc_table, const char *name, rb_encoding *encoding) |
572 | 0 | { |
573 | 0 | int idx; |
574 | |
|
575 | 0 | enc_check_addable(enc_table, name); |
576 | 0 | idx = enc_register(enc_table, name, encoding); |
577 | 0 | if (idx < 0) rb_raise(rb_eArgError, "invalid encoding name: %s", name); |
578 | 0 | set_base_encoding(enc_table, idx, encoding); |
579 | 0 | set_encoding_const(name, rb_enc_from_index(idx)); |
580 | 0 | return idx; |
581 | 0 | } |
582 | | |
583 | | static int |
584 | | enc_replicate_with_index(struct enc_table *enc_table, const char *name, rb_encoding *origenc, int idx) |
585 | 0 | { |
586 | 0 | if (idx < 0) { |
587 | 0 | idx = enc_register(enc_table, name, origenc); |
588 | 0 | } |
589 | 0 | else { |
590 | 0 | enc_load_from_base(enc_table, idx, origenc); |
591 | 0 | } |
592 | 0 | if (idx >= 0) { |
593 | 0 | set_base_encoding(enc_table, idx, origenc); |
594 | 0 | set_encoding_const(name, rb_enc_from_index(idx)); |
595 | 0 | } |
596 | 0 | else { |
597 | 0 | rb_raise(rb_eArgError, "failed to replicate encoding"); |
598 | 0 | } |
599 | 0 | return idx; |
600 | 0 | } |
601 | | |
602 | | int |
603 | | rb_encdb_replicate(const char *name, const char *orig) |
604 | 0 | { |
605 | 0 | int r; |
606 | |
|
607 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
608 | 0 | int origidx = enc_registered(enc_table, orig); |
609 | 0 | int idx = enc_registered(enc_table, name); |
610 | |
|
611 | 0 | if (origidx < 0) { |
612 | 0 | origidx = enc_register(enc_table, orig, 0); |
613 | 0 | } |
614 | 0 | r = enc_replicate_with_index(enc_table, name, rb_enc_from_index(origidx), idx); |
615 | 0 | } |
616 | |
|
617 | 0 | return r; |
618 | 0 | } |
619 | | |
620 | | int |
621 | | rb_define_dummy_encoding(const char *name) |
622 | 0 | { |
623 | 0 | int index; |
624 | |
|
625 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
626 | 0 | index = enc_replicate(enc_table, name, rb_ascii8bit_encoding()); |
627 | 0 | rb_encoding *enc = enc_table->list[index].enc; |
628 | 0 | ENC_SET_DUMMY((rb_raw_encoding *)enc); |
629 | 0 | } |
630 | |
|
631 | 0 | return index; |
632 | 0 | } |
633 | | |
634 | | int |
635 | | rb_encdb_dummy(const char *name) |
636 | 0 | { |
637 | 0 | int index; |
638 | |
|
639 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
640 | 0 | index = enc_replicate_with_index(enc_table, name, |
641 | 0 | rb_ascii8bit_encoding(), |
642 | 0 | enc_registered(enc_table, name)); |
643 | 0 | rb_encoding *enc = enc_table->list[index].enc; |
644 | 0 | ENC_SET_DUMMY((rb_raw_encoding *)enc); |
645 | 0 | } |
646 | |
|
647 | 0 | return index; |
648 | 0 | } |
649 | | |
650 | | /* |
651 | | * call-seq: |
652 | | * enc.dummy? -> true or false |
653 | | * |
654 | | * Returns true for dummy encodings. |
655 | | * A dummy encoding is an encoding for which character handling is not properly |
656 | | * implemented. |
657 | | * It is used for stateful encodings. |
658 | | * |
659 | | * Encoding::ISO_2022_JP.dummy? #=> true |
660 | | * Encoding::UTF_8.dummy? #=> false |
661 | | * |
662 | | */ |
663 | | static VALUE |
664 | | enc_dummy_p(VALUE enc) |
665 | 0 | { |
666 | 0 | return RBOOL(ENC_DUMMY_P(must_encoding(enc))); |
667 | 0 | } |
668 | | |
669 | | /* |
670 | | * call-seq: |
671 | | * enc.ascii_compatible? -> true or false |
672 | | * |
673 | | * Returns whether ASCII-compatible or not. |
674 | | * |
675 | | * Encoding::UTF_8.ascii_compatible? #=> true |
676 | | * Encoding::UTF_16BE.ascii_compatible? #=> false |
677 | | * |
678 | | */ |
679 | | static VALUE |
680 | | enc_ascii_compatible_p(VALUE enc) |
681 | 0 | { |
682 | 0 | return RBOOL(rb_enc_asciicompat(must_encoding(enc))); |
683 | 0 | } |
684 | | |
685 | | /* |
686 | | * Returns non-zero when the encoding is Unicode series other than UTF-7 else 0. |
687 | | */ |
688 | | int |
689 | | rb_enc_unicode_p(rb_encoding *enc) |
690 | 66.5k | { |
691 | 66.5k | return ONIGENC_IS_UNICODE(enc); |
692 | 66.5k | } |
693 | | |
694 | | static st_data_t |
695 | | enc_dup_name(st_data_t name) |
696 | 0 | { |
697 | 0 | return (st_data_t)strdup((const char *)name); |
698 | 0 | } |
699 | | |
700 | | /* |
701 | | * Returns copied alias name when the key is added for st_table, |
702 | | * else returns NULL. |
703 | | */ |
704 | | static int |
705 | | enc_alias_internal(struct enc_table *enc_table, const char *alias, int idx) |
706 | 0 | { |
707 | 0 | ASSERT_vm_locking(); |
708 | 0 | return st_insert2(enc_table->names, (st_data_t)alias, (st_data_t)idx, |
709 | 0 | enc_dup_name); |
710 | 0 | } |
711 | | |
/*
 * Adds `alias` for the encoding at idx.  Returns -1 for an invalid alias
 * name, otherwise idx.  The alias constant is defined only when the alias
 * was newly inserted.
 */
static int
enc_alias(struct enc_table *enc_table, const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) {
        return -1;
    }

    const int existed = enc_alias_internal(enc_table, alias, idx);
    if (!existed) {
        set_encoding_const(alias, enc_from_index(enc_table, idx));
    }
    return idx;
}
720 | | |
721 | | int |
722 | | rb_enc_alias(const char *alias, const char *orig) |
723 | 0 | { |
724 | 0 | int idx, r; |
725 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
726 | 0 | enc_check_addable(enc_table, alias); // can raise |
727 | 0 | } |
728 | |
|
729 | 0 | idx = rb_enc_find_index(orig); |
730 | 0 | if (idx < 0) return -1; |
731 | | |
732 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
733 | 0 | r = enc_alias(enc_table, alias, idx); |
734 | 0 | } |
735 | |
|
736 | 0 | return r; |
737 | 0 | } |
738 | | |
739 | | int |
740 | | rb_encdb_alias(const char *alias, const char *orig) |
741 | 0 | { |
742 | 0 | int r; |
743 | |
|
744 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
745 | 0 | int idx = enc_registered(enc_table, orig); |
746 | |
|
747 | 0 | if (idx < 0) { |
748 | 0 | idx = enc_register(enc_table, orig, 0); |
749 | 0 | } |
750 | 0 | r = enc_alias(enc_table, alias, idx); |
751 | 0 | } |
752 | |
|
753 | 0 | return r; |
754 | 0 | } |
755 | | |
/*
 * Populates the built-in part of the encoding table.  Must be called with
 * the VM lock held.
 *
 * ASCII-8BIT, UTF-8 and US-ASCII are registered together with their Onigmo
 * definitions and are therefore loaded immediately.  The remaining built-in
 * indexes are registered by name only (NULL base) and stay unloaded.
 */
static void
rb_enc_init(struct enc_table *enc_table)
{
    ASSERT_vm_locking();
    /* only validates that the built-in range fits into the fixed table */
    enc_table_expand(enc_table, ENCODING_COUNT + 1);
    if (!enc_table->names) {
        enc_table->names = st_init_strcasetable_with_size(ENCODING_LIST_CAPA);
    }
    /* lets ENC_REGISTER(ASCII_8BIT) resolve to the OnigEncodingASCII definition */
#define OnigEncodingASCII_8BIT OnigEncodingASCII
#define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
    ENC_REGISTER(ASCII_8BIT);
    ENC_REGISTER(UTF_8);
    ENC_REGISTER(US_ASCII);
    global_enc_ascii = enc_table->list[ENCINDEX_ASCII_8BIT].enc;
    global_enc_utf_8 = enc_table->list[ENCINDEX_UTF_8].enc;
    global_enc_us_ascii = enc_table->list[ENCINDEX_US_ASCII].enc;
#undef ENC_REGISTER
#undef OnigEncodingASCII_8BIT
    /* name-only registration: these entries have no base and are not loaded here */
#define ENCDB_REGISTER(name, enc) enc_register_at(enc_table, ENCINDEX_##enc, name, NULL)
    ENCDB_REGISTER("UTF-16BE", UTF_16BE);
    ENCDB_REGISTER("UTF-16LE", UTF_16LE);
    ENCDB_REGISTER("UTF-32BE", UTF_32BE);
    ENCDB_REGISTER("UTF-32LE", UTF_32LE);
    ENCDB_REGISTER("UTF-16", UTF_16);
    ENCDB_REGISTER("UTF-32", UTF_32);
    ENCDB_REGISTER("UTF8-MAC", UTF8_MAC);

    ENCDB_REGISTER("EUC-JP", EUC_JP);
    ENCDB_REGISTER("Windows-31J", Windows_31J);
#undef ENCDB_REGISTER
    /* later enc_register() calls append after the built-in range */
    enc_table->count = ENCINDEX_BUILTIN_MAX;
}
788 | | |
789 | | rb_encoding * |
790 | | rb_enc_get_from_index(int index) |
791 | 0 | { |
792 | 0 | return must_encindex(index); |
793 | 0 | } |
794 | | |
795 | | int rb_require_internal_silent(VALUE fname); |
796 | | |
797 | | static int |
798 | | load_encoding(const char *name) |
799 | 28.9k | { |
800 | 28.9k | ASSERT_vm_unlocking(); |
801 | 28.9k | VALUE enclib = rb_sprintf("enc/%s.so", name); |
802 | 28.9k | VALUE debug = ruby_debug; |
803 | 28.9k | VALUE errinfo; |
804 | 28.9k | char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3; |
805 | 28.9k | int loaded; |
806 | 28.9k | int idx; |
807 | | |
808 | 597k | while (s < e) { |
809 | 568k | if (!ISALNUM(*s)) *s = '_'; |
810 | 414k | else if (ISUPPER(*s)) *s = (char)TOLOWER(*s); |
811 | 568k | ++s; |
812 | 568k | } |
813 | 28.9k | enclib = rb_fstring(enclib); |
814 | 28.9k | ruby_debug = Qfalse; |
815 | 28.9k | errinfo = rb_errinfo(); |
816 | 28.9k | loaded = rb_require_internal_silent(enclib); // must run without VM_LOCK |
817 | 28.9k | ruby_debug = debug; |
818 | 28.9k | rb_set_errinfo(errinfo); |
819 | | |
820 | 28.9k | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
821 | 28.9k | if (loaded < 0 || 1 < loaded) { |
822 | 28.9k | idx = -1; |
823 | 28.9k | } |
824 | 0 | else if ((idx = enc_registered(enc_table, name)) < 0) { |
825 | 0 | idx = -1; |
826 | 0 | } |
827 | 0 | else if (rb_enc_autoload_p(enc_table->list[idx].enc)) { |
828 | 0 | idx = -1; |
829 | 0 | } |
830 | 28.9k | } |
831 | | |
832 | 28.9k | return idx; |
833 | 28.9k | } |
834 | | |
835 | | static int |
836 | | enc_autoload_body(rb_encoding *enc) |
837 | 28.7k | { |
838 | 28.7k | rb_encoding *base; |
839 | 28.7k | int i = 0; |
840 | 28.7k | ASSERT_vm_unlocking(); |
841 | | |
842 | 28.7k | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
843 | 28.7k | base = enc_table->list[ENC_TO_ENCINDEX(enc)].base; |
844 | 28.7k | } |
845 | | |
846 | 28.7k | if (base) { |
847 | 0 | bool do_register = true; |
848 | 0 | if (rb_enc_autoload_p(base)) { |
849 | 0 | if (rb_enc_autoload(base) < 0) { |
850 | 0 | do_register = false; |
851 | 0 | i = -1; |
852 | 0 | } |
853 | 0 | } |
854 | |
|
855 | 0 | if (do_register) { |
856 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
857 | 0 | i = ENC_TO_ENCINDEX(enc); |
858 | 0 | enc_load_from_base(enc_table, i, base); |
859 | 0 | RUBY_ASSERT(((rb_raw_encoding *)enc)->ruby_encoding_index == i); |
860 | 0 | } |
861 | 0 | } |
862 | 0 | } |
863 | 28.7k | else { |
864 | 28.7k | i = -2; |
865 | 28.7k | } |
866 | | |
867 | 28.7k | return i; |
868 | 28.7k | } |
869 | | |
870 | | int |
871 | | rb_enc_autoload(rb_encoding *enc) |
872 | 28.7k | { |
873 | 28.7k | ASSERT_vm_unlocking(); |
874 | 28.7k | int i = enc_autoload_body(enc); |
875 | 28.7k | if (i == -2) { |
876 | 28.7k | i = load_encoding(rb_enc_name(enc)); |
877 | 28.7k | } |
878 | 28.7k | return i; |
879 | 28.7k | } |
880 | | |
881 | | bool |
882 | | rb_enc_autoload_p(rb_encoding *enc) |
883 | 2.80M | { |
884 | 2.80M | int idx = ENC_TO_ENCINDEX(enc); |
885 | 2.80M | RUBY_ASSERT(rb_enc_from_index(idx) == enc); |
886 | 2.80M | return !RUBY_ATOMIC_LOAD(global_enc_table.list[idx].loaded); |
887 | 2.80M | } |
888 | | |
889 | | /* Return encoding index or UNSPECIFIED_ENCODING from encoding name */ |
890 | | int |
891 | | rb_enc_find_index(const char *name) |
892 | 29.5k | { |
893 | 29.5k | int i; |
894 | 29.5k | ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary |
895 | 29.5k | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
896 | 29.5k | i = enc_registered(enc_table, name); |
897 | 29.5k | } |
898 | 29.5k | rb_encoding *enc; |
899 | | |
900 | 29.5k | if (i < 0) { |
901 | 190 | i = load_encoding(name); |
902 | 190 | } |
903 | 29.3k | else if (!(enc = rb_enc_from_index(i))) { |
904 | 0 | if (i != UNSPECIFIED_ENCODING) { |
905 | 0 | rb_raise(rb_eArgError, "encoding %s is not registered", name); |
906 | 0 | } |
907 | 0 | } |
908 | 29.3k | else if (rb_enc_autoload_p(enc)) { |
909 | 28.7k | if (rb_enc_autoload(enc) < 0) { |
910 | 28.7k | rb_warn("failed to load encoding (%s); use ASCII-8BIT instead", |
911 | 28.7k | name); |
912 | 28.7k | return 0; |
913 | 28.7k | } |
914 | 28.7k | } |
915 | 763 | return i; |
916 | 29.5k | } |
917 | | |
918 | | int |
919 | | rb_enc_find_index2(const char *name, long len) |
920 | 0 | { |
921 | 0 | char buf[ENCODING_NAMELEN_MAX+1]; |
922 | |
|
923 | 0 | if (len > ENCODING_NAMELEN_MAX) return -1; |
924 | 0 | memcpy(buf, name, len); |
925 | 0 | buf[len] = '\0'; |
926 | 0 | return rb_enc_find_index(buf); |
927 | 0 | } |
928 | | |
929 | | rb_encoding * |
930 | | rb_enc_find(const char *name) |
931 | 0 | { |
932 | 0 | int idx = rb_enc_find_index(name); |
933 | 0 | if (idx < 0) idx = 0; |
934 | 0 | return rb_enc_from_index(idx); |
935 | 0 | } |
936 | | |
937 | | static inline int |
938 | | enc_capable(VALUE obj) |
939 | 2.61M | { |
940 | 2.61M | if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj); |
941 | 2.61M | switch (BUILTIN_TYPE(obj)) { |
942 | 2.54M | case T_STRING: |
943 | 2.61M | case T_REGEXP: |
944 | 2.61M | case T_FILE: |
945 | 2.61M | case T_SYMBOL: |
946 | 2.61M | return TRUE; |
947 | 0 | case T_DATA: |
948 | 0 | if (is_data_encoding(obj)) return TRUE; |
949 | 0 | default: |
950 | 0 | return FALSE; |
951 | 2.61M | } |
952 | 2.61M | } |
953 | | |
954 | | int |
955 | | rb_enc_capable(VALUE obj) |
956 | 0 | { |
957 | 0 | return enc_capable(obj); |
958 | 0 | } |
959 | | |
960 | | ID |
961 | | rb_id_encoding(void) |
962 | 9 | { |
963 | 9 | CONST_ID(id_encoding, "encoding"); |
964 | 9 | return id_encoding; |
965 | 9 | } |
966 | | |
967 | | static int |
968 | | enc_get_index_str(VALUE str) |
969 | 12.5M | { |
970 | 12.5M | int i = ENCODING_GET_INLINED(str); |
971 | 12.5M | if (i == ENCODING_INLINE_MAX) { |
972 | 0 | VALUE iv; |
973 | |
|
974 | | #if 0 |
975 | | iv = rb_ivar_get(str, rb_id_encoding()); |
976 | | i = NUM2INT(iv); |
977 | | #else |
978 | | /* |
979 | | * Tentatively, assume ASCII-8BIT, if encoding index instance |
980 | | * variable is not found. This can happen when freeing after |
981 | | * all instance variables are removed in `obj_free`. |
982 | | */ |
983 | 0 | iv = rb_attr_get(str, rb_id_encoding()); |
984 | 0 | i = NIL_P(iv) ? ENCINDEX_ASCII_8BIT : NUM2INT(iv); |
985 | 0 | #endif |
986 | 0 | } |
987 | 12.5M | return i; |
988 | 12.5M | } |
989 | | |
990 | | int |
991 | | rb_enc_get_index(VALUE obj) |
992 | 9.75M | { |
993 | 9.75M | int i = -1; |
994 | 9.75M | VALUE tmp; |
995 | | |
996 | 9.75M | if (SPECIAL_CONST_P(obj)) { |
997 | 0 | if (!SYMBOL_P(obj)) return -1; |
998 | 0 | obj = rb_sym2str(obj); |
999 | 0 | } |
1000 | 9.75M | switch (BUILTIN_TYPE(obj)) { |
1001 | 9.56M | case T_STRING: |
1002 | 9.56M | case T_SYMBOL: |
1003 | 9.75M | case T_REGEXP: |
1004 | 9.75M | i = enc_get_index_str(obj); |
1005 | 9.75M | break; |
1006 | 0 | case T_FILE: |
1007 | 0 | tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0); |
1008 | 0 | if (NIL_P(tmp)) { |
1009 | 0 | tmp = rb_funcallv(obj, rb_intern("external_encoding"), 0, 0); |
1010 | 0 | } |
1011 | 0 | if (is_obj_encoding(tmp)) { |
1012 | 0 | i = enc_check_encoding(tmp); |
1013 | 0 | } |
1014 | 0 | break; |
1015 | 0 | case T_DATA: |
1016 | 0 | if (is_data_encoding(obj)) { |
1017 | 0 | i = enc_check_encoding(obj); |
1018 | 0 | } |
1019 | 0 | break; |
1020 | 0 | default: |
1021 | 0 | break; |
1022 | 9.75M | } |
1023 | 9.75M | return i; |
1024 | 9.75M | } |
1025 | | |
1026 | | static void |
1027 | | enc_set_index(VALUE obj, int idx) |
1028 | 2.61M | { |
1029 | 2.61M | if (!enc_capable(obj)) { |
1030 | 0 | rb_raise(rb_eArgError, "cannot set encoding on non-encoding capable object"); |
1031 | 0 | } |
1032 | | |
1033 | 2.61M | if (idx < ENCODING_INLINE_MAX) { |
1034 | 2.61M | ENCODING_SET_INLINED(obj, idx); |
1035 | 2.61M | return; |
1036 | 2.61M | } |
1037 | 2.61M | ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX); |
1038 | 0 | rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx)); |
1039 | 0 | } |
1040 | | |
1041 | | void |
1042 | | rb_enc_raw_set(VALUE obj, rb_encoding *enc) |
1043 | 2.84M | { |
1044 | 2.84M | RUBY_ASSERT(enc_capable(obj)); |
1045 | | |
1046 | 2.84M | int idx = enc ? ENC_TO_ENCINDEX(enc) : 0; |
1047 | | |
1048 | 2.84M | if (idx < ENCODING_INLINE_MAX) { |
1049 | 2.84M | ENCODING_SET_INLINED(obj, idx); |
1050 | 2.84M | return; |
1051 | 2.84M | } |
1052 | 2.84M | ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX); |
1053 | 0 | rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx)); |
1054 | 0 | } |
1055 | | |
1056 | | void |
1057 | | rb_enc_set_index(VALUE obj, int idx) |
1058 | 2.33M | { |
1059 | 2.33M | rb_check_frozen(obj); |
1060 | 2.33M | must_encindex(idx); |
1061 | 2.33M | enc_set_index(obj, idx); |
1062 | 2.33M | } |
1063 | | |
1064 | | VALUE |
1065 | | rb_enc_associate_index(VALUE obj, int idx) |
1066 | 1.95M | { |
1067 | 1.95M | rb_encoding *enc; |
1068 | 1.95M | int oldidx, oldtermlen, termlen; |
1069 | | |
1070 | 1.95M | rb_check_frozen(obj); |
1071 | 1.95M | oldidx = rb_enc_get_index(obj); |
1072 | 1.95M | if (oldidx == idx) |
1073 | 1.66M | return obj; |
1074 | 289k | if (SPECIAL_CONST_P(obj)) { |
1075 | 0 | rb_raise(rb_eArgError, "cannot set encoding"); |
1076 | 0 | } |
1077 | 289k | enc = must_encindex(idx); |
1078 | 289k | if (!ENC_CODERANGE_ASCIIONLY(obj) || |
1079 | 178k | !rb_enc_asciicompat(enc)) { |
1080 | 178k | ENC_CODERANGE_CLEAR(obj); |
1081 | 178k | } |
1082 | 289k | termlen = rb_enc_mbminlen(enc); |
1083 | 289k | oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx)); |
1084 | 289k | if (oldtermlen != termlen && RB_TYPE_P(obj, T_STRING)) { |
1085 | 0 | rb_str_change_terminator_length(obj, oldtermlen, termlen); |
1086 | 0 | } |
1087 | 289k | enc_set_index(obj, idx); |
1088 | 289k | return obj; |
1089 | 289k | } |
1090 | | |
1091 | | VALUE |
1092 | | rb_enc_associate(VALUE obj, rb_encoding *enc) |
1093 | 1.79M | { |
1094 | 1.79M | return rb_enc_associate_index(obj, rb_enc_to_index(enc)); |
1095 | 1.79M | } |
1096 | | |
1097 | | rb_encoding* |
1098 | | rb_enc_get(VALUE obj) |
1099 | 4.44M | { |
1100 | 4.44M | return rb_enc_from_index(rb_enc_get_index(obj)); |
1101 | 4.44M | } |
1102 | | |
1103 | | const char * |
1104 | | rb_enc_inspect_name(rb_encoding *enc) |
1105 | 190 | { |
1106 | 190 | if (enc == global_enc_ascii) { |
1107 | 95 | return "BINARY (ASCII-8BIT)"; |
1108 | 95 | } |
1109 | 95 | return enc->name; |
1110 | 190 | } |
1111 | | |
1112 | | static rb_encoding* |
1113 | | rb_encoding_check(rb_encoding* enc, VALUE str1, VALUE str2) |
1114 | 2.77M | { |
1115 | 2.77M | if (!enc) |
1116 | 0 | rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", |
1117 | 0 | rb_enc_inspect_name(rb_enc_get(str1)), |
1118 | 0 | rb_enc_inspect_name(rb_enc_get(str2))); |
1119 | 2.77M | return enc; |
1120 | 2.77M | } |
1121 | | |
1122 | | static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2); |
1123 | | |
1124 | | rb_encoding* |
1125 | | rb_enc_check_str(VALUE str1, VALUE str2) |
1126 | 1.38M | { |
1127 | 1.38M | rb_encoding *enc = enc_compatible_str(MUST_STRING(str1), MUST_STRING(str2)); |
1128 | 0 | return rb_encoding_check(enc, str1, str2); |
1129 | 1.38M | } |
1130 | | |
1131 | | rb_encoding* |
1132 | | rb_enc_check(VALUE str1, VALUE str2) |
1133 | 1.38M | { |
1134 | 1.38M | rb_encoding *enc = rb_enc_compatible(str1, str2); |
1135 | 1.38M | return rb_encoding_check(enc, str1, str2); |
1136 | 1.38M | } |
1137 | | |
1138 | | static rb_encoding* |
1139 | | enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2) |
1140 | 2.86M | { |
1141 | 2.86M | if (idx1 < 0 || idx2 < 0) |
1142 | 0 | return 0; |
1143 | | |
1144 | 2.86M | if (idx1 == idx2) { |
1145 | 2.80M | return rb_enc_from_index(idx1); |
1146 | 2.80M | } |
1147 | | |
1148 | 69.1k | int isstr1, isstr2; |
1149 | 69.1k | rb_encoding *enc1 = rb_enc_from_index(idx1); |
1150 | 69.1k | rb_encoding *enc2 = rb_enc_from_index(idx2); |
1151 | | |
1152 | 69.1k | isstr2 = RB_TYPE_P(str2, T_STRING); |
1153 | 69.1k | if (isstr2 && RSTRING_LEN(str2) == 0) |
1154 | 4 | return enc1; |
1155 | 69.1k | isstr1 = RB_TYPE_P(str1, T_STRING); |
1156 | 69.1k | if (isstr1 && isstr2 && RSTRING_LEN(str1) == 0) |
1157 | 0 | return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2; |
1158 | 69.1k | if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) { |
1159 | 0 | return 0; |
1160 | 0 | } |
1161 | | |
1162 | | /* objects whose encoding is the same of contents */ |
1163 | 69.1k | if (!isstr2 && idx2 == ENCINDEX_US_ASCII) |
1164 | 0 | return enc1; |
1165 | 69.1k | if (!isstr1 && idx1 == ENCINDEX_US_ASCII) |
1166 | 0 | return enc2; |
1167 | | |
1168 | 69.1k | if (!isstr1) { |
1169 | 0 | VALUE tmp = str1; |
1170 | 0 | int idx0 = idx1; |
1171 | 0 | str1 = str2; |
1172 | 0 | str2 = tmp; |
1173 | 0 | idx1 = idx2; |
1174 | 0 | idx2 = idx0; |
1175 | 0 | idx0 = isstr1; |
1176 | 0 | isstr1 = isstr2; |
1177 | 0 | isstr2 = idx0; |
1178 | 0 | } |
1179 | 69.1k | if (isstr1) { |
1180 | 69.1k | int cr1, cr2; |
1181 | | |
1182 | 69.1k | cr1 = rb_enc_str_coderange(str1); |
1183 | 69.1k | if (isstr2) { |
1184 | 69.1k | cr2 = rb_enc_str_coderange(str2); |
1185 | 69.1k | if (cr1 != cr2) { |
1186 | | /* may need to handle ENC_CODERANGE_BROKEN */ |
1187 | 383 | if (cr1 == ENC_CODERANGE_7BIT) return enc2; |
1188 | 0 | if (cr2 == ENC_CODERANGE_7BIT) return enc1; |
1189 | 0 | } |
1190 | 68.7k | if (cr2 == ENC_CODERANGE_7BIT) { |
1191 | 68.7k | return enc1; |
1192 | 68.7k | } |
1193 | 68.7k | } |
1194 | 0 | if (cr1 == ENC_CODERANGE_7BIT) |
1195 | 0 | return enc2; |
1196 | 0 | } |
1197 | 0 | return 0; |
1198 | 69.1k | } |
1199 | | |
1200 | | static rb_encoding* |
1201 | | enc_compatible_str(VALUE str1, VALUE str2) |
1202 | 1.38M | { |
1203 | 1.38M | int idx1 = enc_get_index_str(str1); |
1204 | 1.38M | int idx2 = enc_get_index_str(str2); |
1205 | | |
1206 | 1.38M | return enc_compatible_latter(str1, str2, idx1, idx2); |
1207 | 1.38M | } |
1208 | | |
1209 | | rb_encoding* |
1210 | | rb_enc_compatible(VALUE str1, VALUE str2) |
1211 | 1.48M | { |
1212 | 1.48M | int idx1 = rb_enc_get_index(str1); |
1213 | 1.48M | int idx2 = rb_enc_get_index(str2); |
1214 | | |
1215 | 1.48M | return enc_compatible_latter(str1, str2, idx1, idx2); |
1216 | 1.48M | } |
1217 | | |
1218 | | void |
1219 | | rb_enc_copy(VALUE obj1, VALUE obj2) |
1220 | 49.3k | { |
1221 | 49.3k | rb_enc_associate_index(obj1, rb_enc_get_index(obj2)); |
1222 | 49.3k | } |
1223 | | |
1224 | | |
1225 | | /* |
1226 | | * call-seq: |
1227 | | * encoding -> encoding |
1228 | | * |
1229 | | * Returns an Encoding object that represents the encoding of +self+; |
1230 | | * see {Encodings}[rdoc-ref:encodings.rdoc]. |
1231 | | * |
1232 | | * Related: see {Querying}[rdoc-ref:String@Querying]. |
1233 | | */ |
1234 | | |
1235 | | VALUE |
1236 | | rb_obj_encoding(VALUE obj) |
1237 | 0 | { |
1238 | 0 | int idx = rb_enc_get_index(obj); |
1239 | 0 | if (idx < 0) { |
1240 | 0 | rb_raise(rb_eTypeError, "unknown encoding"); |
1241 | 0 | } |
1242 | 0 | return rb_enc_from_encoding_index(idx & ENC_INDEX_MASK); |
1243 | 0 | } |
1244 | | |
1245 | | int |
1246 | | rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc) |
1247 | 843k | { |
1248 | 843k | return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); |
1249 | 843k | } |
1250 | | |
1251 | | int |
1252 | | rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) |
1253 | 74.1M | { |
1254 | 74.1M | int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); |
1255 | 74.1M | if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p) |
1256 | 74.1M | return MBCLEN_CHARFOUND_LEN(n); |
1257 | 729 | else { |
1258 | 729 | int min = rb_enc_mbminlen(enc); |
1259 | 729 | return min <= e-p ? min : (int)(e-p); |
1260 | 729 | } |
1261 | 74.1M | } |
1262 | | |
1263 | | int |
1264 | | rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) |
1265 | 64.6M | { |
1266 | 64.6M | int n; |
1267 | 64.6M | if (e <= p) |
1268 | 5 | return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); |
1269 | 64.6M | n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); |
1270 | 64.6M | if (e-p < n) |
1271 | 0 | return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p)); |
1272 | 64.6M | return n; |
1273 | 64.6M | } |
1274 | | |
1275 | | int |
1276 | | rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc) |
1277 | 6.00M | { |
1278 | 6.00M | unsigned int c; |
1279 | 6.00M | int l; |
1280 | 6.00M | if (e <= p) |
1281 | 0 | return -1; |
1282 | 6.00M | if (rb_enc_asciicompat(enc)) { |
1283 | 6.00M | c = (unsigned char)*p; |
1284 | 6.00M | if (!ISASCII(c)) |
1285 | 928k | return -1; |
1286 | 5.07M | if (len) *len = 1; |
1287 | 5.07M | return c; |
1288 | 6.00M | } |
1289 | 0 | l = rb_enc_precise_mbclen(p, e, enc); |
1290 | 0 | if (!MBCLEN_CHARFOUND_P(l)) |
1291 | 0 | return -1; |
1292 | 0 | c = rb_enc_mbc_to_codepoint(p, e, enc); |
1293 | 0 | if (!rb_enc_isascii(c, enc)) |
1294 | 0 | return -1; |
1295 | 0 | if (len) *len = l; |
1296 | 0 | return c; |
1297 | 0 | } |
1298 | | |
1299 | | unsigned int |
1300 | | rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc) |
1301 | 53.3k | { |
1302 | 53.3k | int r; |
1303 | 53.3k | if (e <= p) |
1304 | 0 | rb_raise(rb_eArgError, "empty string"); |
1305 | 53.3k | r = rb_enc_precise_mbclen(p, e, enc); |
1306 | 53.3k | if (!MBCLEN_CHARFOUND_P(r)) { |
1307 | 0 | rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc)); |
1308 | 0 | } |
1309 | 53.3k | if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r); |
1310 | 53.3k | return rb_enc_mbc_to_codepoint(p, e, enc); |
1311 | 53.3k | } |
1312 | | |
1313 | | int |
1314 | | rb_enc_codelen(int c, rb_encoding *enc) |
1315 | 30.6k | { |
1316 | 30.6k | int n = ONIGENC_CODE_TO_MBCLEN(enc,c); |
1317 | 30.6k | if (n == 0) { |
1318 | 0 | rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc)); |
1319 | 0 | } |
1320 | 30.6k | return n; |
1321 | 30.6k | } |
1322 | | |
1323 | | int |
1324 | | rb_enc_toupper(int c, rb_encoding *enc) |
1325 | 0 | { |
1326 | 0 | return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c)); |
1327 | 0 | } |
1328 | | |
1329 | | int |
1330 | | rb_enc_tolower(int c, rb_encoding *enc) |
1331 | 0 | { |
1332 | 0 | return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c)); |
1333 | 0 | } |
1334 | | |
1335 | | /* |
1336 | | * call-seq: |
1337 | | * enc.inspect -> string |
1338 | | * |
1339 | | * Returns a string which represents the encoding for programmers. |
1340 | | * |
1341 | | * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>" |
1342 | | * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>" |
1343 | | */ |
1344 | | static VALUE |
1345 | | enc_inspect(VALUE self) |
1346 | 0 | { |
1347 | 0 | rb_encoding *enc; |
1348 | |
|
1349 | 0 | if (!is_obj_encoding(self)) { /* do not resolve autoload */ |
1350 | 0 | not_encoding(self); |
1351 | 0 | } |
1352 | 0 | if (!(enc = RTYPEDDATA_GET_DATA(self)) || rb_enc_from_index(rb_enc_to_index(enc)) != enc) { |
1353 | 0 | rb_raise(rb_eTypeError, "broken Encoding"); |
1354 | 0 | } |
1355 | | |
1356 | 0 | return rb_enc_sprintf(rb_usascii_encoding(), |
1357 | 0 | "#<%"PRIsVALUE":%s%s%s>", rb_obj_class(self), |
1358 | 0 | rb_enc_inspect_name(enc), |
1359 | 0 | (ENC_DUMMY_P(enc) ? " (dummy)" : ""), |
1360 | 0 | rb_enc_autoload_p(enc) ? " (autoload)" : ""); |
1361 | 0 | } |
1362 | | |
1363 | | |
1364 | | static int |
1365 | | enc_names_i(st_data_t name, st_data_t idx, st_data_t args) |
1366 | 0 | { |
1367 | 0 | VALUE *arg = (VALUE *)args; |
1368 | |
|
1369 | 0 | if ((int)idx == (int)arg[0]) { |
1370 | 0 | VALUE str = rb_interned_str_cstr((char *)name); |
1371 | 0 | rb_ary_push(arg[1], str); |
1372 | 0 | } |
1373 | 0 | return ST_CONTINUE; |
1374 | 0 | } |
1375 | | |
1376 | | /* |
1377 | | * call-seq: |
1378 | | * enc.names -> array |
1379 | | * |
1380 | | * Returns the list of names and aliases of the encoding. |
1381 | | * |
1382 | | * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"] |
1383 | | */ |
1384 | | static VALUE |
1385 | | enc_names(VALUE self) |
1386 | 0 | { |
1387 | 0 | VALUE args[2]; |
1388 | |
|
1389 | 0 | args[0] = (VALUE)rb_to_encoding_index(self); |
1390 | 0 | args[1] = rb_ary_new2(0); |
1391 | |
|
1392 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
1393 | 0 | st_foreach(enc_table->names, enc_names_i, (st_data_t)args); |
1394 | 0 | } |
1395 | 0 | return args[1]; |
1396 | 0 | } |
1397 | | |
1398 | | /* |
1399 | | * call-seq: |
1400 | | * Encoding.list -> [enc1, enc2, ...] |
1401 | | * |
1402 | | * Returns the list of loaded encodings. |
1403 | | * |
1404 | | * Encoding.list |
1405 | | * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, |
1406 | | * #<Encoding:ISO-2022-JP (dummy)>] |
1407 | | * |
1408 | | * Encoding.find("US-ASCII") |
1409 | | * #=> #<Encoding:US-ASCII> |
1410 | | * |
1411 | | * Encoding.list |
1412 | | * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, |
1413 | | * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>] |
1414 | | * |
1415 | | */ |
1416 | | static VALUE |
1417 | | enc_list(VALUE klass) |
1418 | 0 | { |
1419 | 0 | VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list); |
1420 | 0 | return rb_ary_dup(list); |
1421 | 0 | } |
1422 | | |
1423 | | /* |
1424 | | * call-seq: |
1425 | | * Encoding.find(string) -> enc |
1426 | | * |
1427 | | * Search the encoding with specified <i>name</i>. |
1428 | | * <i>name</i> should be a string. |
1429 | | * |
1430 | | * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII> |
1431 | | * |
1432 | | * Names which this method accepts are encoding names and aliases, |
1433 | | * including the following special aliases: |
1434 | | * |
1435 | | * "external":: default external encoding |
1436 | | * "internal":: default internal encoding |
1437 | | * "locale":: locale encoding |
1438 | | * "filesystem":: filesystem encoding |
1439 | | * |
1440 | | * An ArgumentError is raised when there is no encoding with <i>name</i>. |
1441 | | * Only <code>Encoding.find("internal")</code> however returns nil |
1442 | | * when no encoding named "internal", in other words, when Ruby has no |
1443 | | * default internal encoding. |
1444 | | */ |
1445 | | static VALUE |
1446 | | enc_find(VALUE klass, VALUE enc) |
1447 | 0 | { |
1448 | 0 | int idx; |
1449 | 0 | if (is_obj_encoding(enc)) |
1450 | 0 | return enc; |
1451 | 0 | idx = str_to_encindex(enc); |
1452 | 0 | if (idx == UNSPECIFIED_ENCODING) return Qnil; |
1453 | 0 | return rb_enc_from_encoding_index(idx); |
1454 | 0 | } |
1455 | | |
1456 | | /* |
1457 | | * call-seq: |
1458 | | * Encoding.compatible?(obj1, obj2) -> enc or nil |
1459 | | * |
1460 | | * Checks the compatibility of two objects. |
1461 | | * |
1462 | | * If the objects are both strings they are compatible when they are |
1463 | | * concatenatable. The encoding of the concatenated string will be returned |
1464 | | * if they are compatible, nil if they are not. |
1465 | | * |
1466 | | * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b") |
1467 | | * #=> #<Encoding:ISO-8859-1> |
1468 | | * |
1469 | | * Encoding.compatible?( |
1470 | | * "\xa1".force_encoding("iso-8859-1"), |
1471 | | * "\xa1\xa1".force_encoding("euc-jp")) |
1472 | | * #=> nil |
1473 | | * |
1474 | | * If the objects are non-strings their encodings are compatible when they |
1475 | | * have an encoding and: |
1476 | | * * Either encoding is US-ASCII compatible |
1477 | | * * One of the encodings is a 7-bit encoding |
1478 | | * |
1479 | | */ |
1480 | | static VALUE |
1481 | | enc_compatible_p(VALUE klass, VALUE str1, VALUE str2) |
1482 | 0 | { |
1483 | 0 | rb_encoding *enc; |
1484 | |
|
1485 | 0 | if (!enc_capable(str1)) return Qnil; |
1486 | 0 | if (!enc_capable(str2)) return Qnil; |
1487 | 0 | enc = rb_enc_compatible(str1, str2); |
1488 | 0 | if (!enc) return Qnil; |
1489 | 0 | return rb_enc_from_encoding(enc); |
1490 | 0 | } |
1491 | | |
1492 | | NORETURN(static VALUE enc_s_alloc(VALUE klass)); |
1493 | | /* :nodoc: */ |
1494 | | static VALUE |
1495 | | enc_s_alloc(VALUE klass) |
1496 | 0 | { |
1497 | 0 | rb_undefined_alloc(klass); |
1498 | 0 | UNREACHABLE_RETURN(Qnil); |
1499 | 0 | } |
1500 | | |
1501 | | /* :nodoc: */ |
1502 | | static VALUE |
1503 | | enc_dump(int argc, VALUE *argv, VALUE self) |
1504 | 0 | { |
1505 | 0 | rb_check_arity(argc, 0, 1); |
1506 | 0 | return rb_attr_get(self, id_i_name); |
1507 | 0 | } |
1508 | | |
1509 | | /* :nodoc: */ |
1510 | | static VALUE |
1511 | | enc_load(VALUE klass, VALUE str) |
1512 | 0 | { |
1513 | 0 | return str; |
1514 | 0 | } |
1515 | | |
1516 | | /* :nodoc: */ |
1517 | | static VALUE |
1518 | | enc_m_loader(VALUE klass, VALUE str) |
1519 | 0 | { |
1520 | 0 | return enc_find(klass, str); |
1521 | 0 | } |
1522 | | |
1523 | | rb_encoding * |
1524 | | rb_ascii8bit_encoding(void) |
1525 | 710k | { |
1526 | 710k | return global_enc_ascii; |
1527 | 710k | } |
1528 | | |
1529 | | int |
1530 | | rb_ascii8bit_encindex(void) |
1531 | 0 | { |
1532 | 0 | return ENCINDEX_ASCII_8BIT; |
1533 | 0 | } |
1534 | | |
1535 | | rb_encoding * |
1536 | | rb_utf8_encoding(void) |
1537 | 237k | { |
1538 | 237k | return global_enc_utf_8; |
1539 | 237k | } |
1540 | | |
1541 | | int |
1542 | | rb_utf8_encindex(void) |
1543 | 0 | { |
1544 | 0 | return ENCINDEX_UTF_8; |
1545 | 0 | } |
1546 | | |
1547 | | rb_encoding * |
1548 | | rb_usascii_encoding(void) |
1549 | 432k | { |
1550 | 432k | return global_enc_us_ascii; |
1551 | 432k | } |
1552 | | |
1553 | | int |
1554 | | rb_usascii_encindex(void) |
1555 | 0 | { |
1556 | 0 | return ENCINDEX_US_ASCII; |
1557 | 0 | } |
1558 | | |
1559 | | int rb_locale_charmap_index(void); |
1560 | | |
1561 | | int |
1562 | | rb_locale_encindex(void) |
1563 | 0 | { |
1564 | | // `rb_locale_charmap_index` can call `enc_find_index`, which can |
1565 | | // load an encoding. This needs to be done without VM lock held. |
1566 | 0 | ASSERT_vm_unlocking(); |
1567 | 0 | int idx = rb_locale_charmap_index(); |
1568 | |
|
1569 | 0 | if (idx < 0) idx = ENCINDEX_UTF_8; |
1570 | |
|
1571 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
1572 | 0 | if (enc_registered(enc_table, "locale") < 0) { |
1573 | | # if defined _WIN32 |
1574 | | void Init_w32_codepage(void); |
1575 | | Init_w32_codepage(); |
1576 | | # endif |
1577 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
1578 | 0 | enc_alias_internal(enc_table, "locale", idx); |
1579 | 0 | } |
1580 | 0 | } |
1581 | 0 | } |
1582 | |
|
1583 | 0 | return idx; |
1584 | 0 | } |
1585 | | |
1586 | | rb_encoding * |
1587 | | rb_locale_encoding(void) |
1588 | 0 | { |
1589 | 0 | return rb_enc_from_index(rb_locale_encindex()); |
1590 | 0 | } |
1591 | | |
1592 | | int |
1593 | | rb_filesystem_encindex(void) |
1594 | 1.40M | { |
1595 | 1.40M | return filesystem_encindex; |
1596 | 1.40M | } |
1597 | | |
1598 | | rb_encoding * |
1599 | | rb_filesystem_encoding(void) |
1600 | 1.40M | { |
1601 | 1.40M | return rb_enc_from_index(rb_filesystem_encindex()); |
1602 | 1.40M | } |
1603 | | |
1604 | | struct default_encoding { |
1605 | | int index; /* -2 => not yet set, -1 => nil */ |
1606 | | rb_encoding *enc; |
1607 | | }; |
1608 | | |
1609 | | static struct default_encoding default_external = {0}; |
1610 | | |
1611 | | static int |
1612 | | enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name) |
1613 | 0 | { |
1614 | 0 | int overridden = FALSE; |
1615 | |
|
1616 | 0 | if (def->index != -2) |
1617 | | /* Already set */ |
1618 | 0 | overridden = TRUE; |
1619 | |
|
1620 | 0 | int index = 0; |
1621 | 0 | if (!NIL_P(encoding)) { |
1622 | 0 | enc_check_encoding(encoding); // loads it if necessary. Needs to be done outside of VM lock. |
1623 | 0 | index = rb_enc_to_index(rb_to_encoding(encoding)); |
1624 | 0 | } |
1625 | |
|
1626 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
1627 | 0 | if (NIL_P(encoding)) { |
1628 | 0 | def->index = -1; |
1629 | 0 | def->enc = 0; |
1630 | 0 | char *name_dup = strdup(name); |
1631 | |
|
1632 | 0 | st_data_t existing_name = (st_data_t)name_dup; |
1633 | 0 | if (st_delete(enc_table->names, &existing_name, NULL)) { |
1634 | 0 | xfree((void *)existing_name); |
1635 | 0 | } |
1636 | |
|
1637 | 0 | st_insert(enc_table->names, (st_data_t)name_dup, |
1638 | 0 | (st_data_t)UNSPECIFIED_ENCODING); |
1639 | 0 | } |
1640 | 0 | else { |
1641 | 0 | def->index = index; |
1642 | 0 | def->enc = 0; |
1643 | 0 | enc_alias_internal(enc_table, name, def->index); |
1644 | 0 | } |
1645 | |
|
1646 | 0 | if (def == &default_external) { |
1647 | 0 | int fs_idx = Init_enc_set_filesystem_encoding(); |
1648 | 0 | enc_alias_internal(enc_table, "filesystem", fs_idx); |
1649 | 0 | filesystem_encindex = fs_idx; |
1650 | 0 | } |
1651 | 0 | } |
1652 | |
|
1653 | 0 | return overridden; |
1654 | 0 | } |
1655 | | |
1656 | | rb_encoding * |
1657 | | rb_default_external_encoding(void) |
1658 | 178k | { |
1659 | 178k | if (default_external.enc) return default_external.enc; |
1660 | | |
1661 | 9 | if (default_external.index >= 0) { |
1662 | 9 | default_external.enc = rb_enc_from_index(default_external.index); |
1663 | 9 | return default_external.enc; |
1664 | 9 | } |
1665 | 0 | else { |
1666 | 0 | return rb_locale_encoding(); |
1667 | 0 | } |
1668 | 9 | } |
1669 | | |
1670 | | VALUE |
1671 | | rb_enc_default_external(void) |
1672 | 0 | { |
1673 | 0 | return rb_enc_from_encoding(rb_default_external_encoding()); |
1674 | 0 | } |
1675 | | |
1676 | | /* |
1677 | | * call-seq: |
1678 | | * Encoding.default_external -> enc |
1679 | | * |
1680 | | * Returns default external encoding. |
1681 | | * |
1682 | | * The default external encoding is used by default for strings created from |
1683 | | * the following locations: |
1684 | | * |
1685 | | * * CSV |
1686 | | * * File data read from disk |
1687 | | * * SDBM |
1688 | | * * StringIO |
1689 | | * * Zlib::GzipReader |
1690 | | * * Zlib::GzipWriter |
1691 | | * * String#inspect |
1692 | | * * Regexp#inspect |
1693 | | * |
1694 | | * While strings created from these locations will have this encoding, the |
1695 | | * encoding may not be valid. Be sure to check String#valid_encoding?. |
1696 | | * |
1697 | | * File data written to disk will be transcoded to the default external |
1698 | | * encoding when written, if default_internal is not nil. |
1699 | | * |
1700 | | * The default external encoding is initialized by the -E option. |
1701 | | * If -E isn't set, it is initialized to UTF-8 on Windows and the locale on |
1702 | | * other operating systems. |
1703 | | */ |
1704 | | static VALUE |
1705 | | get_default_external(VALUE klass) |
1706 | 0 | { |
1707 | 0 | return rb_enc_default_external(); |
1708 | 0 | } |
1709 | | |
1710 | | void |
1711 | | rb_enc_set_default_external(VALUE encoding) |
1712 | 0 | { |
1713 | 0 | if (NIL_P(encoding)) { |
1714 | 0 | rb_raise(rb_eArgError, "default external can not be nil"); |
1715 | 0 | } |
1716 | 0 | enc_set_default_encoding(&default_external, encoding, |
1717 | 0 | "external"); |
1718 | 0 | } |
1719 | | |
1720 | | /* |
1721 | | * call-seq: |
1722 | | * Encoding.default_external = enc |
1723 | | * |
1724 | | * Sets default external encoding. You should not set |
1725 | | * Encoding::default_external in ruby code as strings created before changing |
1726 | | * the value may have a different encoding from strings created after the value |
1727 | | * was changed. Instead, you should use <tt>ruby -E</tt> to invoke ruby with |
1728 | | * the correct default_external. |
1729 | | * |
1730 | | * See Encoding::default_external for information on how the default external |
1731 | | * encoding is used. |
1732 | | */ |
1733 | | static VALUE |
1734 | | set_default_external(VALUE klass, VALUE encoding) |
1735 | 0 | { |
1736 | 0 | rb_warning("setting Encoding.default_external"); |
1737 | 0 | rb_enc_set_default_external(encoding); |
1738 | 0 | return encoding; |
1739 | 0 | } |
1740 | | |
1741 | | static struct default_encoding default_internal = {-2}; |
1742 | | |
1743 | | rb_encoding * |
1744 | | rb_default_internal_encoding(void) |
1745 | 178k | { |
1746 | 178k | if (!default_internal.enc && default_internal.index >= 0) { |
1747 | 0 | default_internal.enc = rb_enc_from_index(default_internal.index); |
1748 | 0 | } |
1749 | 178k | return default_internal.enc; /* can be NULL */ |
1750 | 178k | } |
1751 | | |
1752 | | VALUE |
1753 | | rb_enc_default_internal(void) |
1754 | 0 | { |
1755 | | /* Note: These functions cope with default_internal not being set */ |
1756 | 0 | return rb_enc_from_encoding(rb_default_internal_encoding()); |
1757 | 0 | } |
1758 | | |
1759 | | /* |
1760 | | * call-seq: |
1761 | | * Encoding.default_internal -> enc |
1762 | | * |
1763 | | * Returns default internal encoding. Strings will be transcoded to the |
1764 | | * default internal encoding in the following places if the default internal |
1765 | | * encoding is not nil: |
1766 | | * |
1767 | | * * CSV |
1768 | | * * Etc.sysconfdir and Etc.systmpdir |
1769 | | * * File data read from disk |
1770 | | * * File names from Dir |
1771 | | * * Integer#chr |
1772 | | * * String#inspect and Regexp#inspect |
1773 | | * * Strings returned from Readline |
1774 | | * * Strings returned from SDBM |
1775 | | * * Time#zone |
1776 | | * * Values from ENV |
1777 | | * * Values in ARGV including $PROGRAM_NAME |
1778 | | * |
1779 | | * Additionally String#encode and String#encode! use the default internal |
1780 | | * encoding if no encoding is given. |
1781 | | * |
1782 | | * The script encoding (__ENCODING__), not default_internal, is used as the |
1783 | | * encoding of created strings. |
1784 | | * |
1785 | | * Encoding::default_internal is initialized with -E option or nil otherwise. |
1786 | | */ |
1787 | | static VALUE |
1788 | | get_default_internal(VALUE klass) |
1789 | 0 | { |
1790 | 0 | return rb_enc_default_internal(); |
1791 | 0 | } |
1792 | | |
1793 | | void |
1794 | | rb_enc_set_default_internal(VALUE encoding) |
1795 | 0 | { |
1796 | 0 | enc_set_default_encoding(&default_internal, encoding, |
1797 | 0 | "internal"); |
1798 | 0 | } |
1799 | | |
1800 | | /* |
1801 | | * call-seq: |
1802 | | * Encoding.default_internal = enc or nil |
1803 | | * |
1804 | | * Sets default internal encoding or removes default internal encoding when |
1805 | | * passed nil. You should not set Encoding::default_internal in ruby code as |
1806 | | * strings created before changing the value may have a different encoding |
1807 | | * from strings created after the change. Instead you should use |
1808 | | * <tt>ruby -E</tt> to invoke ruby with the correct default_internal. |
1809 | | * |
1810 | | * See Encoding::default_internal for information on how the default internal |
1811 | | * encoding is used. |
1812 | | */ |
1813 | | static VALUE |
1814 | | set_default_internal(VALUE klass, VALUE encoding) |
1815 | 0 | { |
1816 | 0 | rb_warning("setting Encoding.default_internal"); |
1817 | 0 | rb_enc_set_default_internal(encoding); |
1818 | 0 | return encoding; |
1819 | 0 | } |
1820 | | |
1821 | | static void |
1822 | | set_encoding_const(const char *name, rb_encoding *enc) |
1823 | 0 | { |
1824 | 0 | VALUE encoding = rb_enc_from_encoding(enc); |
1825 | 0 | char *s = (char *)name; |
1826 | 0 | int haslower = 0, hasupper = 0, valid = 0; |
1827 | |
|
1828 | 0 | if (ISDIGIT(*s)) return; |
1829 | 0 | if (ISUPPER(*s)) { |
1830 | 0 | hasupper = 1; |
1831 | 0 | while (*++s && (ISALNUM(*s) || *s == '_')) { |
1832 | 0 | if (ISLOWER(*s)) haslower = 1; |
1833 | 0 | } |
1834 | 0 | } |
1835 | 0 | if (!*s) { |
1836 | 0 | if (s - name > ENCODING_NAMELEN_MAX) return; |
1837 | 0 | valid = 1; |
1838 | 0 | rb_define_const(rb_cEncoding, name, encoding); |
1839 | 0 | } |
1840 | 0 | if (!valid || haslower) { |
1841 | 0 | size_t len = s - name; |
1842 | 0 | if (len > ENCODING_NAMELEN_MAX) return; |
1843 | 0 | if (!haslower || !hasupper) { |
1844 | 0 | do { |
1845 | 0 | if (ISLOWER(*s)) haslower = 1; |
1846 | 0 | if (ISUPPER(*s)) hasupper = 1; |
1847 | 0 | } while (*++s && (!haslower || !hasupper)); |
1848 | 0 | len = s - name; |
1849 | 0 | } |
1850 | 0 | len += strlen(s); |
1851 | 0 | if (len++ > ENCODING_NAMELEN_MAX) return; |
1852 | 0 | MEMCPY(s = ALLOCA_N(char, len), name, char, len); |
1853 | 0 | name = s; |
1854 | 0 | if (!valid) { |
1855 | 0 | if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); |
1856 | 0 | for (; *s; ++s) { |
1857 | 0 | if (!ISALNUM(*s)) *s = '_'; |
1858 | 0 | } |
1859 | 0 | if (hasupper) { |
1860 | 0 | rb_define_const(rb_cEncoding, name, encoding); |
1861 | 0 | } |
1862 | 0 | } |
1863 | 0 | if (haslower) { |
1864 | 0 | for (s = (char *)name; *s; ++s) { |
1865 | 0 | if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); |
1866 | 0 | } |
1867 | 0 | rb_define_const(rb_cEncoding, name, encoding); |
1868 | 0 | } |
1869 | 0 | } |
1870 | 0 | } |
1871 | | |
1872 | | static int |
1873 | | rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg) |
1874 | 0 | { |
1875 | 0 | VALUE ary = (VALUE)arg; |
1876 | 0 | VALUE str = rb_interned_str_cstr((char *)name); |
1877 | 0 | rb_ary_push(ary, str); |
1878 | 0 | return ST_CONTINUE; |
1879 | 0 | } |
1880 | | |
1881 | | /* |
1882 | | * call-seq: |
1883 | | * Encoding.name_list -> ["enc1", "enc2", ...] |
1884 | | * |
1885 | | * Returns the list of available encoding names. |
1886 | | * |
1887 | | * Encoding.name_list |
1888 | | * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8", |
1889 | | * "ISO-8859-1", "Shift_JIS", "EUC-JP", |
1890 | | * "Windows-31J", |
1891 | | * "BINARY", "CP932", "eucJP"] |
1892 | | * |
1893 | | */ |
1894 | | |
1895 | | static VALUE |
1896 | | rb_enc_name_list(VALUE klass) |
1897 | 0 | { |
1898 | 0 | VALUE ary; |
1899 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
1900 | 0 | ary = rb_ary_new2(enc_table->names->num_entries); |
1901 | 0 | st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary); |
1902 | 0 | } |
1903 | 0 | return ary; |
1904 | 0 | } |
1905 | | |
1906 | | static int |
1907 | | rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg) |
1908 | 0 | { |
1909 | 0 | VALUE *p = (VALUE *)arg; |
1910 | 0 | VALUE aliases = p[0], ary = p[1]; |
1911 | 0 | int idx = (int)orig; |
1912 | 0 | VALUE key, str = rb_ary_entry(ary, idx); |
1913 | |
|
1914 | 0 | if (NIL_P(str)) { |
1915 | 0 | rb_encoding *enc = rb_enc_from_index(idx); |
1916 | |
|
1917 | 0 | if (!enc) return ST_CONTINUE; |
1918 | 0 | if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) { |
1919 | 0 | return ST_CONTINUE; |
1920 | 0 | } |
1921 | 0 | str = rb_fstring_cstr(rb_enc_name(enc)); |
1922 | 0 | rb_ary_store(ary, idx, str); |
1923 | 0 | } |
1924 | 0 | key = rb_interned_str_cstr((char *)name); |
1925 | 0 | rb_hash_aset(aliases, key, str); |
1926 | 0 | return ST_CONTINUE; |
1927 | 0 | } |
1928 | | |
1929 | | /* |
1930 | | * call-seq: |
1931 | | * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...} |
1932 | | * |
1933 | | * Returns a hash that maps each available encoding alias to its original encoding name. |
1934 | | * |
1935 | | * Encoding.aliases |
1936 | | * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII", |
1937 | | * "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"} |
1938 | | * |
1939 | | */ |
1940 | | |
1941 | | static VALUE |
1942 | | rb_enc_aliases(VALUE klass) |
1943 | 0 | { |
1944 | 0 | VALUE aliases[2]; |
1945 | 0 | aliases[0] = rb_hash_new(); |
1946 | 0 | aliases[1] = rb_ary_new(); |
1947 | |
|
1948 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
1949 | 0 | st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases); |
1950 | 0 | } |
1951 | |
|
1952 | 0 | return aliases[0]; |
1953 | 0 | } |
1954 | | |
1955 | | /* |
1956 | | * An \Encoding instance represents a character encoding usable in Ruby. |
1957 | | * It is defined as a constant under the \Encoding namespace. |
1958 | | * It has a name and, optionally, aliases: |
1959 | | * |
1960 | | * Encoding::US_ASCII.name # => "US-ASCII" |
1961 | | * Encoding::US_ASCII.names # => ["US-ASCII", "ASCII", "ANSI_X3.4-1968", "646"] |
1962 | | * |
1963 | | * A Ruby method that accepts an encoding as an argument will accept: |
1964 | | * |
1965 | | * - An \Encoding object. |
1966 | | * - The name of an encoding. |
1967 | | * - An alias for an encoding name. |
1968 | | * |
1969 | | * These are equivalent: |
1970 | | * |
1971 | | * 'foo'.encode(Encoding::US_ASCII) # Encoding object. |
1972 | | * 'foo'.encode('US-ASCII') # Encoding name. |
1973 | | * 'foo'.encode('ASCII') # Encoding alias. |
1974 | | * |
1975 | | * For a full discussion of encodings and their uses, |
1976 | | * see {the Encodings document}[rdoc-ref:encodings.rdoc]. |
1977 | | * |
1978 | | * Encoding::ASCII_8BIT is a special-purpose encoding that is usually used for |
1979 | | * a string of bytes, not a string of characters. |
1980 | | * But as the name indicates, its characters in the ASCII range |
1981 | | * are considered as ASCII characters. |
1982 | | * This is useful when you use other ASCII-compatible encodings. |
1983 | | * |
1984 | | */ |
1985 | | |
1986 | | void |
1987 | | Init_Encoding(void) |
1988 | 9 | { |
1989 | 9 | VALUE list; |
1990 | 9 | int i; |
1991 | | |
1992 | 9 | id_i_name = rb_intern_const("@name"); |
1993 | 9 | rb_cEncoding = rb_define_class("Encoding", rb_cObject); |
1994 | 9 | rb_define_alloc_func(rb_cEncoding, enc_s_alloc); |
1995 | 9 | rb_undef_method(CLASS_OF(rb_cEncoding), "new"); |
1996 | | |
1997 | | /* The name of the encoding. |
1998 | | * |
1999 | | * Encoding::UTF_8.name #=> "UTF-8" |
2000 | | */ |
2001 | 9 | rb_attr(rb_cEncoding, rb_intern("name"), TRUE, FALSE, Qfalse); |
2002 | 9 | rb_define_alias(rb_cEncoding, "to_s", "name"); |
2003 | | |
2004 | 9 | rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0); |
2005 | 9 | rb_define_method(rb_cEncoding, "names", enc_names, 0); |
2006 | 9 | rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0); |
2007 | 9 | rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0); |
2008 | 9 | rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0); |
2009 | 9 | rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0); |
2010 | 9 | rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0); |
2011 | 9 | rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1); |
2012 | 9 | rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2); |
2013 | | |
2014 | 9 | rb_define_method(rb_cEncoding, "_dump", enc_dump, -1); |
2015 | 9 | rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1); |
2016 | | |
2017 | 9 | rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0); |
2018 | 9 | rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1); |
2019 | 9 | rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0); |
2020 | 9 | rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1); |
2021 | 9 | rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); /* in localeinit.c */ |
2022 | | |
2023 | 9 | struct enc_table *enc_table = &global_enc_table; |
2024 | | |
2025 | 9 | rb_gc_register_address(&rb_encoding_list); |
2026 | 9 | list = rb_encoding_list = rb_ary_new2(ENCODING_LIST_CAPA); |
2027 | 9 | RBASIC_CLEAR_CLASS(list); |
2028 | | |
2029 | 117 | for (i = 0; i < enc_table->count; ++i) { |
2030 | 108 | rb_ary_push(list, enc_new(enc_table->list[i].enc)); |
2031 | 108 | } |
2032 | | |
2033 | 9 | rb_marshal_define_compat(rb_cEncoding, Qnil, 0, enc_m_loader); |
2034 | 9 | } |
2035 | | |
2036 | | void |
2037 | | Init_unicode_version(void) |
2038 | 9 | { |
2039 | 9 | extern const char onigenc_unicode_version_string[]; |
2040 | | |
2041 | 9 | VALUE str = rb_usascii_str_new_static(onigenc_unicode_version_string, |
2042 | 9 | strlen(onigenc_unicode_version_string)); |
2043 | 9 | OBJ_FREEZE(str); |
2044 | | /* The supported Unicode version. */ |
2045 | 9 | rb_define_const(rb_cEncoding, "UNICODE_VERSION", str); |
2046 | 9 | } |
2047 | | |
2048 | | void |
2049 | | Init_encodings(void) |
2050 | 9 | { |
2051 | 9 | rb_enc_init(&global_enc_table); |
2052 | 9 | } |
2053 | | |
2054 | | /* locale insensitive ctype functions */ |
2055 | | |
2056 | | void |
2057 | | rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg) |
2058 | 0 | { |
2059 | 0 | GLOBAL_ENC_TABLE_LOCKING(enc_table) { |
2060 | 0 | st_foreach(enc_table->names, func, arg); |
2061 | 0 | } |
2062 | 0 | } |