Coverage Report

Created: 2025-08-28 06:26

/src/serenity/Userland/Libraries/LibTextCodec/Encoder.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/BinarySearch.h>
8
#include <AK/Error.h>
9
#include <AK/Utf8View.h>
10
#include <LibTextCodec/Decoder.h>
11
#include <LibTextCodec/Encoder.h>
12
#include <LibTextCodec/LookupTables.h>
13
14
namespace TextCodec {
15
16
namespace {
17
UTF8Encoder s_utf8_encoder;
18
GB18030Encoder s_gb18030_encoder;
19
GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes);
20
Big5Encoder s_big5_encoder;
21
EUCJPEncoder s_euc_jp_encoder;
22
ISO2022JPEncoder s_iso_2022_jp_encoder;
23
ShiftJISEncoder s_shift_jis_encoder;
24
EUCKREncoder s_euc_kr_encoder;
25
26
// s_{encoding}_index is generated from https://encoding.spec.whatwg.org/indexes.json
27
// Found separately in https://encoding.spec.whatwg.org/index-{encoding}.txt
28
SingleByteEncoder s_ibm866_encoder { s_ibm866_index };
29
SingleByteEncoder s_latin2_encoder { s_iso_8859_2_index };
30
SingleByteEncoder s_latin3_encoder { s_iso_8859_3_index };
31
SingleByteEncoder s_latin4_encoder { s_iso_8859_4_index };
32
SingleByteEncoder s_latin_cyrillic_encoder { s_iso_8859_5_index };
33
SingleByteEncoder s_latin_arabic_encoder { s_iso_8859_6_index };
34
SingleByteEncoder s_latin_greek_encoder { s_iso_8859_7_index };
35
SingleByteEncoder s_latin_hebrew_encoder { s_iso_8859_8_index };
36
SingleByteEncoder s_latin6_encoder { s_iso_8859_10_index };
37
SingleByteEncoder s_latin7_encoder { s_iso_8859_13_index };
38
SingleByteEncoder s_latin8_encoder { s_iso_8859_14_index };
39
SingleByteEncoder s_latin9_encoder { s_iso_8859_15_index };
40
SingleByteEncoder s_latin10_encoder { s_iso_8859_16_index };
41
SingleByteEncoder s_centraleurope_encoder { s_windows_1250_index };
42
SingleByteEncoder s_cyrillic_encoder { s_windows_1251_index };
43
SingleByteEncoder s_hebrew_encoder { s_windows_1255_index };
44
SingleByteEncoder s_koi8r_encoder { s_koi8_r_index };
45
SingleByteEncoder s_koi8u_encoder { s_koi8_u_index };
46
SingleByteEncoder s_mac_roman_encoder { s_macintosh_index };
47
SingleByteEncoder s_windows874_encoder { s_windows_874_index };
48
SingleByteEncoder s_windows1252_encoder { s_windows_1252_index };
49
SingleByteEncoder s_windows1253_encoder { s_windows_1253_index };
50
SingleByteEncoder s_turkish_encoder { s_windows_1254_index };
51
SingleByteEncoder s_windows1256_encoder { s_windows_1256_index };
52
SingleByteEncoder s_windows1257_encoder { s_windows_1257_index };
53
SingleByteEncoder s_windows1258_encoder { s_windows_1258_index };
54
SingleByteEncoder s_mac_cyrillic_encoder { s_x_mac_cyrillic_index };
55
56
}
57
58
// Returns the encoder registered for the given encoding name, compared ASCII case-insensitively.
// Logs a debug message and returns an empty Optional when no encoder matches.
Optional<Encoder&> encoder_for_exact_name(StringView encoding)
{
    auto const named = [&encoding](StringView candidate) {
        return encoding.equals_ignoring_ascii_case(candidate);
    };

    // Unicode and multi-byte encodings.
    if (named("utf-8"sv))
        return s_utf8_encoder;
    if (named("big5"sv))
        return s_big5_encoder;
    if (named("euc-jp"sv))
        return s_euc_jp_encoder;
    if (named("iso-2022-jp"sv))
        return s_iso_2022_jp_encoder;
    if (named("shift_jis"sv))
        return s_shift_jis_encoder;
    if (named("euc-kr"sv))
        return s_euc_kr_encoder;
    if (named("gb18030"sv))
        return s_gb18030_encoder;
    if (named("gbk"sv))
        return s_gbk_encoder;

    // Single-byte encodings.
    if (named("ibm866"sv))
        return s_ibm866_encoder;
    if (named("iso-8859-2"sv))
        return s_latin2_encoder;
    if (named("iso-8859-3"sv))
        return s_latin3_encoder;
    if (named("iso-8859-4"sv))
        return s_latin4_encoder;
    if (named("iso-8859-5"sv))
        return s_latin_cyrillic_encoder;
    if (named("iso-8859-6"sv))
        return s_latin_arabic_encoder;
    if (named("iso-8859-7"sv))
        return s_latin_greek_encoder;
    // Both Hebrew labels share the same encoder.
    if (named("iso-8859-8"sv) || named("iso-8859-8-i"sv))
        return s_latin_hebrew_encoder;
    if (named("iso-8859-10"sv))
        return s_latin6_encoder;
    if (named("iso-8859-13"sv))
        return s_latin7_encoder;
    if (named("iso-8859-14"sv))
        return s_latin8_encoder;
    if (named("iso-8859-15"sv))
        return s_latin9_encoder;
    if (named("iso-8859-16"sv))
        return s_latin10_encoder;
    if (named("koi8-r"sv))
        return s_koi8r_encoder;
    if (named("koi8-u"sv))
        return s_koi8u_encoder;
    if (named("macintosh"sv))
        return s_mac_roman_encoder;
    if (named("windows-874"sv))
        return s_windows874_encoder;
    if (named("windows-1250"sv))
        return s_centraleurope_encoder;
    if (named("windows-1251"sv))
        return s_cyrillic_encoder;
    if (named("windows-1252"sv))
        return s_windows1252_encoder;
    if (named("windows-1253"sv))
        return s_windows1253_encoder;
    if (named("windows-1254"sv))
        return s_turkish_encoder;
    if (named("windows-1255"sv))
        return s_hebrew_encoder;
    if (named("windows-1256"sv))
        return s_windows1256_encoder;
    if (named("windows-1257"sv))
        return s_windows1257_encoder;
    if (named("windows-1258"sv))
        return s_windows1258_encoder;
    if (named("x-mac-cyrillic"sv))
        return s_mac_cyrillic_encoder;

    dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
    return {};
}
133
134
// Resolves a label through get_standardized_encoding() and then looks up the encoder for the
// resulting name. Returns an empty Optional when the label does not standardize.
Optional<Encoder&> encoder_for(StringView label)
{
    auto const standardized_name = get_standardized_encoding(label);
    if (!standardized_name.has_value())
        return {};
    return encoder_for_exact_name(standardized_name.value());
}
139
140
// https://encoding.spec.whatwg.org/#utf-8-encoder
141
// Forwards the raw bytes of the UTF-8 input to on_byte unchanged; the error callback is never used.
// Stops and propagates the error as soon as on_byte fails.
ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>)
{
    auto const* data = input.bytes();
    auto const length = input.byte_length();
    for (size_t offset = 0; offset < length; ++offset)
        TRY(on_byte(data[offset]));
    return {};
}
148
149
// https://encoding.spec.whatwg.org/#euc-jp-encoder
150
// Encodes every code point of input as EUC-JP, emitting bytes through on_byte and reporting
// unmappable code points through on_error. Any callback error aborts the run and is propagated.
ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    // 1. End-of-queue is handled by the loop terminating.
    for (u32 code_point : input) {
        // 2. ASCII code points map to themselves.
        if (is_ascii(code_point)) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. U+00A5 becomes byte 0x5C.
        // 4. U+203E becomes byte 0x7E.
        if (code_point == 0x00A5 || code_point == 0x203E) {
            TRY(on_byte(static_cast<u8>(code_point == 0x00A5 ? 0x5C : 0x7E)));
            continue;
        }

        // 5. U+FF61 to U+FF9F, inclusive, become 0x8E followed by code point − 0xFF61 + 0xA1.
        bool const is_halfwidth_katakana = code_point >= 0xFF61 && code_point <= 0xFF9F;
        if (is_halfwidth_katakana) {
            TRY(on_byte(0x8E));
            TRY(on_byte(static_cast<u8>(code_point - 0xFF61 + 0xA1)));
            continue;
        }

        // 6. U+2212 is looked up as U+FF0D.
        if (code_point == 0x2212)
            code_point = 0xFF0D;

        // 7. Look the code point up in index jis0208.
        auto const jis0208_pointer = code_point_jis0208_index(code_point);

        // 8. No pointer: report the error and move on to the next code point.
        if (!jis0208_pointer.has_value()) {
            TRY(on_error(code_point));
            continue;
        }

        // 9-11. Emit lead (pointer / 94 + 0xA1) and trail (pointer % 94 + 0xA1).
        TRY(on_byte(static_cast<u8>(*jis0208_pointer / 94 + 0xA1)));
        TRY(on_byte(static_cast<u8>(*jis0208_pointer % 94 + 0xA1)));
    }

    return {};
}
206
207
// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder
208
// Encodes one code point given the current encoder state and returns the state to use for the
// next code point.
//
// Bytes are emitted through on_byte; unencodable code points are reported through on_error.
// Where the spec says "restore code point to ioQueue" after switching state, this function
// emits the escape sequence and then calls itself again with the new state.
// Any error returned by a callback is propagated immediately.
ErrorOr<ISO2022JPEncoder::State> ISO2022JPEncoder::process_item(u32 item, State state, Function<ErrorOr<void>(u8)>& on_byte, Function<ErrorOr<void>(u32)>& on_error)
{
    // Every state switch is announced with ESC (0x1B) followed by two designator bytes:
    //   0x28 0x42 -> ASCII, 0x28 0x4A -> Roman, 0x24 0x42 -> jis0208.
    auto emit_escape = [&](u8 first, u8 second) -> ErrorOr<void> {
        TRY(on_byte(0x1B));
        TRY(on_byte(first));
        TRY(on_byte(second));
        return {};
    };

    // 3. If ISO-2022-JP encoder state is ASCII or Roman, and code point is U+000E, U+000F, or U+001B, return error with U+FFFD.
    if (state == State::ASCII || state == State::Roman) {
        if (item == 0x000E || item == 0x000F || item == 0x001B) {
            TRY(on_error(0xFFFD));
            return state;
        }
    }

    // 4. If ISO-2022-JP encoder state is ASCII and code point is an ASCII code point, return a byte whose value is code point.
    if (state == State::ASCII && is_ascii(item)) {
        TRY(on_byte(static_cast<u8>(item)));
        return state;
    }

    // 5. If ISO-2022-JP encoder state is Roman and code point is an ASCII code point, excluding U+005C and U+007E, or is U+00A5 or U+203E, then:
    if (state == State::Roman && ((is_ascii(item) && item != 0x005C && item != 0x007E) || (item == 0x00A5 || item == 0x203E))) {
        // 1. If code point is an ASCII code point, return a byte whose value is code point.
        if (is_ascii(item)) {
            TRY(on_byte(static_cast<u8>(item)));
            return state;
        }

        // 2. If code point is U+00A5, return byte 0x5C.
        if (item == 0x00A5) {
            TRY(on_byte(0x5C));
            return state;
        }

        // 3. If code point is U+203E, return byte 0x7E.
        if (item == 0x203E) {
            TRY(on_byte(0x7E));
            return state;
        }
    }

    // 6. If code point is an ASCII code point, and ISO-2022-JP encoder state is not ASCII, restore code point to ioQueue, set
    //    ISO-2022-JP encoder state to ASCII, and return three bytes 0x1B 0x28 0x42.
    if (is_ascii(item) && state != State::ASCII) {
        TRY(emit_escape(0x28, 0x42));
        return process_item(item, State::ASCII, on_byte, on_error);
    }

    // 7. If code point is either U+00A5 or U+203E, and ISO-2022-JP encoder state is not Roman, restore code point to ioQueue,
    //    set ISO-2022-JP encoder state to Roman, and return three bytes 0x1B 0x28 0x4A.
    if ((item == 0x00A5 || item == 0x203E) && state != State::Roman) {
        TRY(emit_escape(0x28, 0x4A));
        return process_item(item, State::Roman, on_byte, on_error);
    }

    // 8. If code point is U+2212, set it to U+FF0D.
    if (item == 0x2212)
        item = 0xFF0D;

    // 9. If code point is in the range U+FF61 to U+FF9F, inclusive, set it to the index code point for code point − 0xFF61
    //    in index ISO-2022-JP katakana.
    if (item >= 0xFF61 && item <= 0xFF9F) {
        item = *index_iso_2022_jp_katakana_code_point(item - 0xFF61);
    }

    // 10. Let pointer be the index pointer for code point in index jis0208.
    auto pointer = code_point_jis0208_index(item);

    // 11. If pointer is null, then:
    if (!pointer.has_value()) {
        // 1. If ISO-2022-JP encoder state is jis0208, then restore code point to ioQueue, set ISO-2022-JP encoder state to
        //    ASCII, and return three bytes 0x1B 0x28 0x42.
        //    The escape must be the ASCII designator (0x42), matching the state we switch to; emitting the Roman
        //    designator (0x4A) here would leave a decoder in a different state than this encoder.
        if (state == State::jis0208) {
            TRY(emit_escape(0x28, 0x42));
            return process_item(item, State::ASCII, on_byte, on_error);
        }

        // 2. Return error with code point.
        TRY(on_error(item));
        return state;
    }

    // 12. If ISO-2022-JP encoder state is not jis0208, restore code point to ioQueue, set ISO-2022-JP encoder state to
    //     jis0208, and return three bytes 0x1B 0x24 0x42.
    if (state != State::jis0208) {
        TRY(emit_escape(0x24, 0x42));
        return process_item(item, State::jis0208, on_byte, on_error);
    }

    // 13. Let lead be pointer / 94 + 0x21.
    auto lead = *pointer / 94 + 0x21;

    // 14. Let trail be pointer % 94 + 0x21.
    auto trail = *pointer % 94 + 0x21;

    // 15. Return two bytes whose values are lead and trail.
    TRY(on_byte(static_cast<u8>(lead)));
    TRY(on_byte(static_cast<u8>(trail)));
    return state;
}
312
313
// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder
314
// Encodes input as ISO-2022-JP by feeding each code point through process_item() and, at the
// end of input, switching the output back to ASCII if it was left in another state.
ErrorOr<void> ISO2022JPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    // The ISO-2022-JP encoder state is ASCII, Roman, or jis0208 (initially ASCII).
    State current_state = State::ASCII;

    for (u32 code_point : input)
        current_state = TRY(process_item(code_point, current_state, on_byte, on_error));

    // 2. End-of-queue while in the ASCII state: finished, nothing more to emit.
    if (current_state == State::ASCII)
        return {};

    // 1. End-of-queue while not in the ASCII state: emit the three bytes 0x1B 0x28 0x42.
    TRY(on_byte(0x1B));
    TRY(on_byte(0x28));
    TRY(on_byte(0x42));
    return {};
}
336
337
// Linear search of s_jis0208_index for code_point that ignores every entry whose pointer lies in
// [skip_from, skip_to]. Returns the first matching pointer, or an empty Optional if none matches.
// skip_to must not be smaller than skip_from.
static Optional<u32> code_point_jis0208_index_skipping_range(u32 code_point, u32 skip_from, u32 skip_to)
{
    VERIFY(skip_to >= skip_from);

    for (u32 pointer = 0; pointer < s_jis0208_index.size(); ++pointer) {
        bool const is_skipped = pointer >= skip_from && pointer <= skip_to;
        if (!is_skipped && s_jis0208_index[pointer] == code_point)
            return pointer;
    }

    return {};
}
348
349
// https://encoding.spec.whatwg.org/#index-shift_jis-pointer
350
// Returns the index Shift_JIS pointer for code_point, or an empty Optional if there is none.
static Optional<u32> index_shift_jis_pointer(u32 code_point)
{
    // 1. Let index be index jis0208 excluding all entries whose pointer is in the range 8272 to 8835, inclusive.
    // 2. Return the index pointer for code point in index.
    return code_point_jis0208_index_skipping_range(code_point, 8272, 8835);
}
360
361
// https://encoding.spec.whatwg.org/#shift_jis-encoder
362
// Encodes every code point of input as Shift_JIS, emitting bytes through on_byte and reporting
// unmappable code points through on_error. Any callback error aborts the run and is propagated.
ErrorOr<void> ShiftJISEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    // 1. End-of-queue is handled by the loop terminating.
    for (u32 code_point : input) {
        // 2. ASCII code points and U+0080 map to themselves.
        if (code_point == 0x0080 || is_ascii(code_point)) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. U+00A5 becomes byte 0x5C.
        if (code_point == 0x00A5) {
            TRY(on_byte(0x5C));
            continue;
        }

        // 4. U+203E becomes byte 0x7E.
        if (code_point == 0x203E) {
            TRY(on_byte(0x7E));
            continue;
        }

        // 5. U+FF61 to U+FF9F, inclusive, become the single byte code point − 0xFF61 + 0xA1.
        bool const is_halfwidth_katakana = code_point >= 0xFF61 && code_point <= 0xFF9F;
        if (is_halfwidth_katakana) {
            TRY(on_byte(static_cast<u8>(code_point - 0xFF61 + 0xA1)));
            continue;
        }

        // 6. U+2212 is looked up as U+FF0D.
        if (code_point == 0x2212)
            code_point = 0xFF0D;

        // 7. Look up the index Shift_JIS pointer.
        auto const shift_jis_pointer = index_shift_jis_pointer(code_point);

        // 8. No pointer: report the error and move on to the next code point.
        if (!shift_jis_pointer.has_value()) {
            TRY(on_error(code_point));
            continue;
        }

        // 9. lead is pointer / 188.
        // 10. lead offset is 0x81 if lead is less than 0x1F, otherwise 0xC1.
        auto const lead = *shift_jis_pointer / 188;
        auto const lead_offset = lead < 0x1F ? 0x81 : 0xC1;

        // 11. trail is pointer % 188.
        // 12. offset is 0x40 if trail is less than 0x3F, otherwise 0x41.
        auto const trail = *shift_jis_pointer % 188;
        auto const trail_offset = trail < 0x3F ? 0x40 : 0x41;

        // 13. Emit lead + lead offset, then trail + offset.
        TRY(on_byte(static_cast<u8>(lead + lead_offset)));
        TRY(on_byte(static_cast<u8>(trail + trail_offset)));
    }

    return {};
}
427
428
// https://encoding.spec.whatwg.org/#euc-kr-encoder
429
// Encodes every code point of input as EUC-KR, emitting bytes through on_byte and reporting
// unmappable code points through on_error. Any callback error aborts the run and is propagated.
ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    // 1. End-of-queue is handled by the loop terminating.
    for (u32 code_point : input) {
        // 2. ASCII code points map to themselves.
        if (is_ascii(code_point)) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. Look the code point up in index EUC-KR.
        auto const euc_kr_pointer = code_point_euc_kr_index(code_point);

        // 4. No pointer: report the error and move on to the next code point.
        if (!euc_kr_pointer.has_value()) {
            TRY(on_error(code_point));
            continue;
        }

        // 5-7. Emit lead (pointer / 190 + 0x81) and trail (pointer % 190 + 0x41).
        TRY(on_byte(static_cast<u8>(*euc_kr_pointer / 190 + 0x81)));
        TRY(on_byte(static_cast<u8>(*euc_kr_pointer % 190 + 0x41)));
    }

    return {};
}
462
463
// https://encoding.spec.whatwg.org/#index-big5-pointer
464
// Returns the index Big5 pointer for code_point, or an empty Optional if there is none.
// Array positions in s_big5_index are offset from pointers by s_big5_index_first_pointer.
static Optional<u32> index_big5_pointer(u32 code_point)
{
    // 1. Let index be index Big5 excluding all entries whose pointer is less than (0xA1 - 0x81) × 157.
    //    start_index is the array position of the first entry that is kept.
    auto start_index = (0xA1 - 0x81) * 157 - s_big5_index_first_pointer;

    // 2. For U+2550, U+255E, U+2561, U+256A, U+5341 and U+5345 the last matching pointer is wanted;
    //    for everything else (step 3) the first matching pointer is.
    bool const wants_last_pointer = Array<u32, 6> { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 }.contains_slow(code_point);

    if (!wants_last_pointer) {
        // 3. Return the index pointer for code point in index (forward scan, first match).
        for (u32 slot = start_index; slot < s_big5_index.size(); ++slot) {
            if (s_big5_index[slot] == code_point)
                return s_big5_index_first_pointer + slot;
        }
        return {};
    }

    // Backward scan from the end of the table, so the first hit is the last pointer.
    for (u32 slot = s_big5_index.size() - 1; slot >= start_index; --slot) {
        if (s_big5_index[slot] == code_point)
            return s_big5_index_first_pointer + slot;
    }
    return {};
}
488
489
// https://encoding.spec.whatwg.org/#big5-encoder
490
// Encodes every code point of input as Big5, emitting bytes through on_byte and reporting
// unmappable code points through on_error. Any callback error aborts the run and is propagated.
ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    // 1. End-of-queue is handled by the loop terminating.
    for (u32 code_point : input) {
        // 2. ASCII code points map to themselves.
        if (is_ascii(code_point)) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. Look up the index Big5 pointer.
        auto const big5_pointer = index_big5_pointer(code_point);

        // 4. No pointer: report the error and move on to the next code point.
        if (!big5_pointer.has_value()) {
            TRY(on_error(code_point));
            continue;
        }

        // 5. lead is pointer / 157 + 0x81.
        auto const lead = *big5_pointer / 157 + 0x81;

        // 6. trail is pointer % 157.
        // 7. offset is 0x40 if trail is less than 0x3F, otherwise 0x62.
        auto const trail = *big5_pointer % 157;
        auto const trail_offset = trail < 0x3f ? 0x40 : 0x62;

        // 8. Emit lead, then trail + offset.
        TRY(on_byte(static_cast<u8>(lead)));
        TRY(on_byte(static_cast<u8>(trail + trail_offset)));
    }

    return {};
}
528
529
// https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
530
// Computes the index gb18030 ranges pointer for code_point from the s_gb18030_ranges table.
static u32 index_gb18030_ranges_pointer(u32 code_point)
{
    // 1. U+E7C7 is special-cased to pointer 7457.
    if (code_point == 0xe7c7)
        return 7457;

    // 2. Find the last range entry whose code point is less than or equal to code_point.
    //    The search result itself is not used, only the index it reports via nearest_index.
    size_t nearest_index = 0;
    binary_search(s_gb18030_ranges, code_point, &nearest_index, [](auto const needle, auto const& range) {
        return needle - range.code_point;
    });
    auto const& matched_range = s_gb18030_ranges[nearest_index];

    // 3. The result is pointer offset + code point − offset.
    return matched_range.pointer + code_point - matched_range.code_point;
}
548
549
// Stores whether this encoder instance should behave as GBK; process() reads the flag.
GB18030Encoder::GB18030Encoder(IsGBK is_gbk)
    : m_is_gbk(is_gbk)
{
}
553
554
// https://encoding.spec.whatwg.org/#gb18030-encoder
555
// Encodes every code point of input as gb18030 (or GBK when constructed with IsGBK::Yes),
// emitting bytes through on_byte and reporting unencodable code points through on_error.
// Any callback error aborts the run and is propagated.
ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    bool const is_gbk = m_is_gbk == IsGBK::Yes;

    // 1. End-of-queue is handled by the loop terminating.
    for (u32 code_point : input) {
        // 2. ASCII code points map to themselves.
        if (is_ascii(code_point)) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. U+E5E5 is always an error.
        if (code_point == 0xE5E5) {
            TRY(on_error(code_point));
            continue;
        }

        // 4. In GBK mode, U+20AC becomes byte 0x80.
        if (is_gbk && code_point == 0x20AC) {
            TRY(on_byte(0x80));
            continue;
        }

        // 5. Look the code point up in index gb18030.
        auto const index_pointer = code_point_gb18030_index(code_point);

        // 6. A pointer was found: emit the two-byte form.
        if (index_pointer.has_value()) {
            // 1. lead is pointer / 190 + 0x81.
            auto const lead = *index_pointer / 190 + 0x81;

            // 2. trail is pointer % 190.
            // 3. offset is 0x40 if trail is less than 0x3F, otherwise 0x41.
            auto const trail = *index_pointer % 190;
            auto const trail_offset = trail < 0x3f ? 0x40 : 0x41;

            // 4. Emit lead, then trail + offset.
            TRY(on_byte(static_cast<u8>(lead)));
            TRY(on_byte(static_cast<u8>(trail + trail_offset)));
            continue;
        }

        // 7. GBK has no four-byte form, so this is an error.
        if (is_gbk) {
            TRY(on_error(code_point));
            continue;
        }

        // 8. Fall back to the index gb18030 ranges pointer.
        u32 remaining = index_gb18030_ranges_pointer(code_point);

        // 9-10. byte1 is pointer / (10 × 126 × 10); keep the remainder.
        auto const byte1 = remaining / (10 * 126 * 10);
        remaining %= (10 * 126 * 10);

        // 11-12. byte2 is pointer / (10 × 126); keep the remainder.
        auto const byte2 = remaining / (10 * 126);
        remaining %= (10 * 126);

        // 13. byte3 is pointer / 10.
        auto const byte3 = remaining / 10;

        // 14. byte4 is pointer % 10.
        auto const byte4 = remaining % 10;

        // 15. Emit byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
        TRY(on_byte(static_cast<u8>(byte1 + 0x81)));
        TRY(on_byte(static_cast<u8>(byte2 + 0x30)));
        TRY(on_byte(static_cast<u8>(byte3 + 0x81)));
        TRY(on_byte(static_cast<u8>(byte4 + 0x30)));
    }

    return {};
}
638
639
// https://encoding.spec.whatwg.org/#single-byte-encoder
640
template<Integral ArrayType>
641
ErrorOr<void> SingleByteEncoder<ArrayType>::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
642
0
{
643
0
    for (u32 const code_point : input) {
644
0
        if (code_point < 0x80) {
645
            // 2. If code point is an ASCII code point, return a byte whose value is code point.
646
0
            TRY(on_byte(static_cast<u8>(code_point)));
647
0
        } else {
648
0
            Optional<u8> pointer = {};
649
0
            for (u8 i = 0; i < m_translation_table.size(); i++) {
650
0
                if (m_translation_table[i] == code_point) {
651
                    // 3. Let pointer be the index pointer for code point in index single-byte.
652
0
                    pointer = i;
653
0
                    break;
654
0
                }
655
0
            }
656
0
            if (pointer.has_value()) {
657
                // 5. Return a byte whose value is pointer + 0x80.
658
0
                TRY(on_byte(pointer.value() + 0x80));
659
0
            } else {
660
                // 4. If pointer is null, return error with code point.
661
0
                TRY(on_error(code_point));
662
0
            }
663
0
        }
664
0
    }
665
    // 1. If code point is end-of-queue, return finished.
666
0
    return {};
667
0
}
668
669
}