Coverage Report

Created: 2025-11-02 07:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibTextCodec/Encoder.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/BinarySearch.h>
8
#include <AK/Error.h>
9
#include <AK/Utf16View.h>
10
#include <AK/Utf8View.h>
11
#include <LibTextCodec/Decoder.h>
12
#include <LibTextCodec/Encoder.h>
13
#include <LibTextCodec/LookupTables.h>
14
15
namespace TextCodec {
16
17
namespace {
18
Latin1Encoder s_latin1_encoder;
19
UTF8Encoder s_utf8_encoder;
20
UTF16BEEncoder s_utf16be_encoder;
21
UTF16LEEncoder s_utf16le_encoder;
22
GB18030Encoder s_gb18030_encoder;
23
GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes);
24
Big5Encoder s_big5_encoder;
25
EUCJPEncoder s_euc_jp_encoder;
26
ISO2022JPEncoder s_iso_2022_jp_encoder;
27
ShiftJISEncoder s_shift_jis_encoder;
28
EUCKREncoder s_euc_kr_encoder;
29
30
// s_{encoding}_index is generated from https://encoding.spec.whatwg.org/indexes.json
31
// Found separately in https://encoding.spec.whatwg.org/index-{encoding}.txt
32
SingleByteEncoder s_ibm866_encoder { s_ibm866_index };
33
SingleByteEncoder s_latin2_encoder { s_iso_8859_2_index };
34
SingleByteEncoder s_latin3_encoder { s_iso_8859_3_index };
35
SingleByteEncoder s_latin4_encoder { s_iso_8859_4_index };
36
SingleByteEncoder s_latin_cyrillic_encoder { s_iso_8859_5_index };
37
SingleByteEncoder s_latin_arabic_encoder { s_iso_8859_6_index };
38
SingleByteEncoder s_latin_greek_encoder { s_iso_8859_7_index };
39
SingleByteEncoder s_latin_hebrew_encoder { s_iso_8859_8_index };
40
SingleByteEncoder s_latin6_encoder { s_iso_8859_10_index };
41
SingleByteEncoder s_latin7_encoder { s_iso_8859_13_index };
42
SingleByteEncoder s_latin8_encoder { s_iso_8859_14_index };
43
SingleByteEncoder s_latin9_encoder { s_iso_8859_15_index };
44
SingleByteEncoder s_latin10_encoder { s_iso_8859_16_index };
45
SingleByteEncoder s_centraleurope_encoder { s_windows_1250_index };
46
SingleByteEncoder s_cyrillic_encoder { s_windows_1251_index };
47
SingleByteEncoder s_hebrew_encoder { s_windows_1255_index };
48
SingleByteEncoder s_koi8r_encoder { s_koi8_r_index };
49
SingleByteEncoder s_koi8u_encoder { s_koi8_u_index };
50
SingleByteEncoder s_mac_roman_encoder { s_macintosh_index };
51
SingleByteEncoder s_windows874_encoder { s_windows_874_index };
52
SingleByteEncoder s_windows1252_encoder { s_windows_1252_index };
53
SingleByteEncoder s_windows1253_encoder { s_windows_1253_index };
54
SingleByteEncoder s_turkish_encoder { s_windows_1254_index };
55
SingleByteEncoder s_windows1256_encoder { s_windows_1256_index };
56
SingleByteEncoder s_windows1257_encoder { s_windows_1257_index };
57
SingleByteEncoder s_windows1258_encoder { s_windows_1258_index };
58
SingleByteEncoder s_mac_cyrillic_encoder { s_x_mac_cyrillic_index };
59
60
}
61
62
// Maps an encoding name to the matching file-local encoder instance.
// Names are compared ignoring ASCII case. When no name matches, a debug message is logged and an
// empty Optional is returned.
Optional<Encoder&> encoder_for_exact_name(StringView encoding)
{
    auto const is_named = [&encoding](StringView candidate) {
        return encoding.equals_ignoring_ascii_case(candidate);
    };

    // Unicode and Latin-1.
    if (is_named("iso-8859-1"sv))
        return s_latin1_encoder;
    if (is_named("utf-8"sv))
        return s_utf8_encoder;
    if (is_named("utf-16be"sv))
        return s_utf16be_encoder;
    if (is_named("utf-16le"sv))
        return s_utf16le_encoder;

    // Multi-byte encodings.
    if (is_named("big5"sv))
        return s_big5_encoder;
    if (is_named("euc-jp"sv))
        return s_euc_jp_encoder;
    if (is_named("iso-2022-jp"sv))
        return s_iso_2022_jp_encoder;
    if (is_named("shift_jis"sv))
        return s_shift_jis_encoder;
    if (is_named("euc-kr"sv))
        return s_euc_kr_encoder;
    if (is_named("gb18030"sv))
        return s_gb18030_encoder;
    if (is_named("gbk"sv))
        return s_gbk_encoder;

    // Single-byte encodings.
    if (is_named("ibm866"sv))
        return s_ibm866_encoder;
    if (is_named("iso-8859-2"sv))
        return s_latin2_encoder;
    if (is_named("iso-8859-3"sv))
        return s_latin3_encoder;
    if (is_named("iso-8859-4"sv))
        return s_latin4_encoder;
    if (is_named("iso-8859-5"sv))
        return s_latin_cyrillic_encoder;
    if (is_named("iso-8859-6"sv))
        return s_latin_arabic_encoder;
    if (is_named("iso-8859-7"sv))
        return s_latin_greek_encoder;
    // Both ISO-8859-8 names share one encoder.
    if (is_named("iso-8859-8"sv) || is_named("iso-8859-8-i"sv))
        return s_latin_hebrew_encoder;
    if (is_named("iso-8859-10"sv))
        return s_latin6_encoder;
    if (is_named("iso-8859-13"sv))
        return s_latin7_encoder;
    if (is_named("iso-8859-14"sv))
        return s_latin8_encoder;
    if (is_named("iso-8859-15"sv))
        return s_latin9_encoder;
    if (is_named("iso-8859-16"sv))
        return s_latin10_encoder;
    if (is_named("koi8-r"sv))
        return s_koi8r_encoder;
    if (is_named("koi8-u"sv))
        return s_koi8u_encoder;
    if (is_named("macintosh"sv))
        return s_mac_roman_encoder;
    if (is_named("windows-874"sv))
        return s_windows874_encoder;
    if (is_named("windows-1250"sv))
        return s_centraleurope_encoder;
    if (is_named("windows-1251"sv))
        return s_cyrillic_encoder;
    if (is_named("windows-1252"sv))
        return s_windows1252_encoder;
    if (is_named("windows-1253"sv))
        return s_windows1253_encoder;
    if (is_named("windows-1254"sv))
        return s_turkish_encoder;
    if (is_named("windows-1255"sv))
        return s_hebrew_encoder;
    if (is_named("windows-1256"sv))
        return s_windows1256_encoder;
    if (is_named("windows-1257"sv))
        return s_windows1257_encoder;
    if (is_named("windows-1258"sv))
        return s_windows1258_encoder;
    if (is_named("x-mac-cyrillic"sv))
        return s_mac_cyrillic_encoder;

    dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
    return {};
}
143
144
// Resolves an encoding label via get_standardized_encoding() and returns the encoder for the
// resulting name, or an empty Optional when the label cannot be standardized.
Optional<Encoder&> encoder_for(StringView label)
{
    auto standardized_name = get_standardized_encoding(label);
    if (!standardized_name.has_value())
        return {};
    return encoder_for_exact_name(standardized_name.value());
}
149
150
// https://encoding.spec.whatwg.org/#utf-8-encoder
151
ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>)
152
1.34k
{
153
1.34k
    ReadonlyBytes bytes { input.bytes(), input.byte_length() };
154
1.34k
    for (auto byte : bytes)
155
49.8M
        TRY(on_byte(byte));
156
1.34k
    return {};
157
1.34k
}
158
159
// Converts the input to UTF-16 and emits each code unit as two bytes, most significant byte first.
// The error callback is never used.
ErrorOr<void> UTF16BEEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>)
{
    auto code_units = TRY(utf8_to_utf16(input));

    for (auto code_unit : code_units) {
        // The cast to u8 keeps only the low eight bits of each operand.
        TRY(on_byte(static_cast<u8>(code_unit >> 8)));
        TRY(on_byte(static_cast<u8>(code_unit)));
    }

    return {};
}
170
171
// Converts the input to UTF-16 and emits each code unit as two bytes, least significant byte first.
// The error callback is never used.
ErrorOr<void> UTF16LEEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>)
{
    auto code_units = TRY(utf8_to_utf16(input));

    for (auto code_unit : code_units) {
        // The cast to u8 keeps only the low eight bits of each operand.
        TRY(on_byte(static_cast<u8>(code_unit)));
        TRY(on_byte(static_cast<u8>(code_unit >> 8)));
    }

    return {};
}
182
183
// Emits code points up to 255 as a single byte of the same value; anything larger is reported
// through on_error.
ErrorOr<void> Latin1Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    for (auto code_point : input) {
        // Latin1 is the same as the first 256 Unicode code points.
        if (code_point > 255) {
            TRY(on_error(code_point));
            continue;
        }
        TRY(on_byte(static_cast<u8>(code_point)));
    }

    return {};
}
195
196
// https://encoding.spec.whatwg.org/#euc-jp-encoder
197
ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
198
0
{
199
0
    for (auto item : input) {
200
        // 1. If code point is end-of-queue, return finished.
201
202
        // 2. If code point is an ASCII code point, return a byte whose value is code point.
203
0
        if (is_ascii(item)) {
204
0
            TRY(on_byte(static_cast<u8>(item)));
205
0
            continue;
206
0
        }
207
208
        // 3. If code point is U+00A5, return byte 0x5C.
209
0
        if (item == 0x00A5) {
210
0
            TRY(on_byte(static_cast<u8>(0x5C)));
211
0
            continue;
212
0
        }
213
214
        // 4. If code point is U+203E, return byte 0x7E.
215
0
        if (item == 0x203E) {
216
0
            TRY(on_byte(static_cast<u8>(0x7E)));
217
0
            continue;
218
0
        }
219
220
        // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes whose values are 0x8E and code point − 0xFF61 + 0xA1.
221
0
        if (item >= 0xFF61 && item <= 0xFF9F) {
222
0
            TRY(on_byte(0x8E));
223
0
            TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1)));
224
0
            continue;
225
0
        }
226
227
        // 6. If code point is U+2212, set it to U+FF0D.
228
0
        if (item == 0x2212)
229
0
            item = 0xFF0D;
230
231
        // 7. Let pointer be the index pointer for code point in index jis0208.
232
0
        auto pointer = code_point_jis0208_index(item);
233
234
        // 8. If pointer is null, return error with code point.
235
0
        if (!pointer.has_value()) {
236
0
            TRY(on_error(item));
237
0
            continue;
238
0
        }
239
240
        // 9. Let lead be pointer / 94 + 0xA1.
241
0
        auto lead = *pointer / 94 + 0xA1;
242
243
        // 10. Let trail be pointer % 94 + 0xA1.
244
0
        auto trail = *pointer % 94 + 0xA1;
245
246
        // 11. Return two bytes whose values are lead and trail.
247
0
        TRY(on_byte(static_cast<u8>(lead)));
248
0
        TRY(on_byte(static_cast<u8>(trail)));
249
0
    }
250
251
0
    return {};
252
0
}
253
254
// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder
255
ErrorOr<ISO2022JPEncoder::State> ISO2022JPEncoder::process_item(u32 item, State state, Function<ErrorOr<void>(u8)>& on_byte, Function<ErrorOr<void>(u32)>& on_error)
256
0
{
257
    // 3. If ISO-2022-JP encoder state is ASCII or Roman, and code point is U+000E, U+000F, or U+001B, return error with U+FFFD.
258
0
    if (state == State::ASCII || state == State::Roman) {
259
0
        if (item == 0x000E || item == 0x000F || item == 0x001B) {
260
0
            TRY(on_error(0xFFFD));
261
0
            return state;
262
0
        }
263
0
    }
264
265
    // 4. If ISO-2022-JP encoder state is ASCII and code point is an ASCII code point, return a byte whose value is code point.
266
0
    if (state == State::ASCII && is_ascii(item)) {
267
0
        TRY(on_byte(static_cast<u8>(item)));
268
0
        return state;
269
0
    }
270
271
    // 5. If ISO-2022-JP encoder state is Roman and code point is an ASCII code point, excluding U+005C and U+007E, or is U+00A5 or U+203E, then:
272
0
    if (state == State::Roman && ((is_ascii(item) && item != 0x005C && item != 0x007E) || (item == 0x00A5 || item == 0x203E))) {
273
        // 1. If code point is an ASCII code point, return a byte whose value is code point.
274
0
        if (is_ascii(item)) {
275
0
            TRY(on_byte(static_cast<u8>(item)));
276
0
            return state;
277
0
        }
278
279
        // 2. If code point is U+00A5, return byte 0x5C.
280
0
        if (item == 0x00A5) {
281
0
            TRY(on_byte(0x5C));
282
0
            return state;
283
0
        }
284
285
        // 3. If code point is U+203E, return byte 0x7E.
286
0
        if (item == 0x203E) {
287
0
            TRY(on_byte(0x7E));
288
0
            return state;
289
0
        }
290
0
    }
291
292
    // 6. If code point is an ASCII code point, and ISO-2022-JP encoder state is not ASCII, restore code point to ioQueue, set
293
    //    ISO-2022-JP encoder state to ASCII, and return three bytes 0x1B 0x28 0x42.
294
0
    if (is_ascii(item) && state != State::ASCII) {
295
0
        TRY(on_byte(0x1B));
296
0
        TRY(on_byte(0x28));
297
0
        TRY(on_byte(0x42));
298
0
        return process_item(item, State::ASCII, on_byte, on_error);
299
0
    }
300
301
    // 7. If code point is either U+00A5 or U+203E, and ISO-2022-JP encoder state is not Roman, restore code point to ioQueue,
302
    //    set ISO-2022-JP encoder state to Roman, and return three bytes 0x1B 0x28 0x4A.
303
0
    if ((item == 0x00A5 || item == 0x203E) && state != State::Roman) {
304
0
        TRY(on_byte(0x1B));
305
0
        TRY(on_byte(0x28));
306
0
        TRY(on_byte(0x4A));
307
0
        return process_item(item, State::Roman, on_byte, on_error);
308
0
    }
309
310
    // 8. If code point is U+2212, set it to U+FF0D.
311
0
    if (item == 0x2212)
312
0
        item = 0xFF0D;
313
314
    // 9. If code point is in the range U+FF61 to U+FF9F, inclusive, set it to the index code point for code point − 0xFF61
315
    //    in index ISO-2022-JP katakana.
316
0
    if (item >= 0xFF61 && item <= 0xFF9F) {
317
0
        item = *index_iso_2022_jp_katakana_code_point(item - 0xFF61);
318
0
    }
319
320
    // 10. Let pointer be the index pointer for code point in index jis0208.
321
0
    auto pointer = code_point_jis0208_index(item);
322
323
    // 11. If pointer is null, then:
324
0
    if (!pointer.has_value()) {
325
        // 1. If ISO-2022-JP encoder state is jis0208, then restore code point to ioQueue, set ISO-2022-JP encoder state to
326
        //    ASCII, and return three bytes 0x1B 0x28 0x42.
327
0
        if (state == State::jis0208) {
328
0
            TRY(on_byte(0x1B));
329
0
            TRY(on_byte(0x28));
330
0
            TRY(on_byte(0x4A));
331
0
            return process_item(item, State::ASCII, on_byte, on_error);
332
0
        }
333
334
        // 2. Return error with code point.
335
0
        TRY(on_error(item));
336
0
        return state;
337
0
    }
338
339
    // 12. If ISO-2022-JP encoder state is not jis0208, restore code point to ioQueue, set ISO-2022-JP encoder state to
340
    //     jis0208, and return three bytes 0x1B 0x24 0x42.
341
0
    if (state != State::jis0208) {
342
0
        TRY(on_byte(0x1B));
343
0
        TRY(on_byte(0x24));
344
0
        TRY(on_byte(0x42));
345
0
        return process_item(item, State::jis0208, on_byte, on_error);
346
0
    }
347
348
    // 13. Let lead be pointer / 94 + 0x21.
349
0
    auto lead = *pointer / 94 + 0x21;
350
351
    // 14. Let trail be pointer % 94 + 0x21.
352
0
    auto trail = *pointer % 94 + 0x21;
353
354
    // 15. Return two bytes whose values are lead and trail.
355
0
    TRY(on_byte(static_cast<u8>(lead)));
356
0
    TRY(on_byte(static_cast<u8>(trail)));
357
0
    return state;
358
0
}
359
360
// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder
361
ErrorOr<void> ISO2022JPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
362
0
{
363
    // ISO-2022-JP’s encoder has an associated ISO-2022-JP encoder state which is ASCII, Roman, or jis0208 (initially ASCII).
364
0
    auto state = State::ASCII;
365
366
0
    for (u32 item : input) {
367
0
        state = TRY(process_item(item, state, on_byte, on_error));
368
0
    }
369
370
    // 1. If code point is end-of-queue and ISO-2022-JP encoder state is not ASCII, set ISO-2022-JP
371
    //    encoder state to ASCII, and return three bytes 0x1B 0x28 0x42.
372
0
    if (state != State::ASCII) {
373
0
        state = State::ASCII;
374
0
        TRY(on_byte(0x1B));
375
0
        TRY(on_byte(0x28));
376
0
        TRY(on_byte(0x42));
377
0
        return {};
378
0
    }
379
380
    // 2. If code point is end-of-queue and ISO-2022-JP encoder state is ASCII, return finished.
381
0
    return {};
382
0
}
383
384
// Returns the first position in s_jis0208_index holding code_point, ignoring every position in
// [skip_from, skip_to] (inclusive). Returns an empty Optional when there is no such position.
static Optional<u32> code_point_jis0208_index_skipping_range(u32 code_point, u32 skip_from, u32 skip_to)
{
    VERIFY(skip_to >= skip_from);

    u32 position = 0;
    while (position < s_jis0208_index.size()) {
        bool const is_skipped = position >= skip_from && position <= skip_to;
        if (!is_skipped && s_jis0208_index[position] == code_point)
            return position;
        ++position;
    }

    return {};
}
395
396
// https://encoding.spec.whatwg.org/#index-shift_jis-pointer
397
static Optional<u32> index_shift_jis_pointer(u32 code_point)
398
0
{
399
    // 1. Let index be index jis0208 excluding all entries whose pointer is in the range 8272 to 8835, inclusive.
400
0
    auto pointer = code_point_jis0208_index_skipping_range(code_point, 8272, 8835);
401
0
    if (!pointer.has_value())
402
0
        return {};
403
404
    // 2. Return the index pointer for code point in index.
405
0
    return *pointer;
406
0
}
407
408
// https://encoding.spec.whatwg.org/#shift_jis-encoder
409
ErrorOr<void> ShiftJISEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
410
0
{
411
0
    for (u32 item : input) {
412
        // 1. If code point is end-of-queue, return finished.
413
414
        // 2. If code point is an ASCII code point or U+0080, return a byte whose value is code point.
415
0
        if (is_ascii(item) || item == 0x0080) {
416
0
            TRY(on_byte(static_cast<u8>(item)));
417
0
            continue;
418
0
        }
419
420
        // 3. If code point is U+00A5, return byte 0x5C.
421
0
        if (item == 0x00A5) {
422
0
            TRY(on_byte(0x5C));
423
0
            continue;
424
0
        }
425
426
        // 4. If code point is U+203E, return byte 0x7E.
427
0
        if (item == 0x203E) {
428
0
            TRY(on_byte(0x7E));
429
0
            continue;
430
0
        }
431
432
        // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return a byte whose value is code point − 0xFF61 + 0xA1.
433
0
        if (item >= 0xFF61 && item <= 0xFF9F) {
434
0
            TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1)));
435
0
            continue;
436
0
        }
437
438
        // 6. If code point is U+2212, set it to U+FF0D.
439
0
        if (item == 0x2212)
440
0
            item = 0xFF0D;
441
442
        // 7. Let pointer be the index Shift_JIS pointer for code point.
443
0
        auto pointer = index_shift_jis_pointer(item);
444
445
        // 8. If pointer is null, return error with code point.
446
0
        if (!pointer.has_value()) {
447
0
            TRY(on_error(item));
448
0
            continue;
449
0
        }
450
451
        // 9. Let lead be pointer / 188.
452
0
        auto lead = *pointer / 188;
453
454
        // 10. Let lead offset be 0x81 if lead is less than 0x1F, otherwise 0xC1.
455
0
        auto lead_offset = 0xC1;
456
0
        if (lead < 0x1F)
457
0
            lead_offset = 0x81;
458
459
        // 11. Let trail be pointer % 188.
460
0
        auto trail = *pointer % 188;
461
462
        // 12. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
463
0
        auto offset = 0x41;
464
0
        if (trail < 0x3F)
465
0
            offset = 0x40;
466
467
        // 13. Return two bytes whose values are lead + lead offset and trail + offset.
468
0
        TRY(on_byte(static_cast<u8>(lead + lead_offset)));
469
0
        TRY(on_byte(static_cast<u8>(trail + offset)));
470
0
    }
471
472
0
    return {};
473
0
}
474
475
// https://encoding.spec.whatwg.org/#euc-kr-encoder
476
ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
477
0
{
478
0
    for (u32 item : input) {
479
        // 1. If code point is end-of-queue, return finished.
480
481
        // 2. If code point is an ASCII code point, return a byte whose value is code point.
482
0
        if (is_ascii(item)) {
483
0
            TRY(on_byte(static_cast<u8>(item)));
484
0
            continue;
485
0
        }
486
487
        // 3. Let pointer be the index pointer for code point in index EUC-KR.
488
0
        auto pointer = code_point_euc_kr_index(item);
489
490
        // 4. If pointer is null, return error with code point.
491
0
        if (!pointer.has_value()) {
492
0
            TRY(on_error(item));
493
0
            continue;
494
0
        }
495
496
        // 5. Let lead be pointer / 190 + 0x81.
497
0
        auto lead = *pointer / 190 + 0x81;
498
499
        // 6. Let trail be pointer % 190 + 0x41.
500
0
        auto trail = *pointer % 190 + 0x41;
501
502
        // 7. Return two bytes whose values are lead and trail.
503
0
        TRY(on_byte(static_cast<u8>(lead)));
504
0
        TRY(on_byte(static_cast<u8>(trail)));
505
0
    }
506
507
0
    return {};
508
0
}
509
510
// https://encoding.spec.whatwg.org/#index-big5-pointer
511
static Optional<u32> index_big5_pointer(u32 code_point)
512
0
{
513
    // 1. Let index be index Big5 excluding all entries whose pointer is less than (0xA1 - 0x81) × 157.
514
0
    auto start_index = (0xA1 - 0x81) * 157 - s_big5_index_first_pointer;
515
516
    // 2. If code point is U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345, return the last pointer
517
    //    corresponding to code point in index.
518
0
    if (Array<u32, 6> { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 }.contains_slow(code_point)) {
519
0
        for (u32 i = s_big5_index.size() - 1; i >= start_index; --i) {
520
0
            if (s_big5_index[i] == code_point) {
521
0
                return s_big5_index_first_pointer + i;
522
0
            }
523
0
        }
524
0
        return {};
525
0
    }
526
527
    // 3. Return the index pointer for code point in index.
528
0
    for (u32 i = start_index; i < s_big5_index.size(); ++i) {
529
0
        if (s_big5_index[i] == code_point) {
530
0
            return s_big5_index_first_pointer + i;
531
0
        }
532
0
    }
533
0
    return {};
534
0
}
535
536
// https://encoding.spec.whatwg.org/#big5-encoder
537
ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
538
0
{
539
0
    for (u32 item : input) {
540
        // 1. If code point is end-of-queue, return finished.
541
542
        // 2. If code point is an ASCII code point, return a byte whose value is code point.
543
0
        if (is_ascii(item)) {
544
0
            TRY(on_byte(static_cast<u8>(item)));
545
0
            continue;
546
0
        }
547
548
        // 3. Let pointer be the index Big5 pointer for code point.
549
0
        auto pointer = index_big5_pointer(item);
550
551
        // 4. If pointer is null, return error with code point.
552
0
        if (!pointer.has_value()) {
553
0
            TRY(on_error(item));
554
0
            continue;
555
0
        }
556
557
        // 5. Let lead be pointer / 157 + 0x81.
558
0
        auto lead = *pointer / 157 + 0x81;
559
560
        // 6. Let trail be pointer % 157.
561
0
        auto trail = *pointer % 157;
562
563
        // 7. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x62.
564
0
        auto offset = 0x62;
565
0
        if (trail < 0x3f)
566
0
            offset = 0x40;
567
568
        // 8. Return two bytes whose values are lead and trail + offset.
569
0
        TRY(on_byte(static_cast<u8>(lead)));
570
0
        TRY(on_byte(static_cast<u8>(trail + offset)));
571
0
    }
572
573
0
    return {};
574
0
}
575
576
// https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
577
static u32 index_gb18030_ranges_pointer(u32 code_point)
578
0
{
579
    // 1. If code point is U+E7C7, return pointer 7457.
580
0
    if (code_point == 0xe7c7)
581
0
        return 7457;
582
583
    // 2. Let offset be the last code point in index gb18030 ranges that is less than
584
    //    or equal to code point and let pointer offset be its corresponding pointer.
585
0
    size_t last_index;
586
0
    binary_search(s_gb18030_ranges, code_point, &last_index, [](auto const code_point, auto const& entry) {
587
0
        return code_point - entry.code_point;
588
0
    });
589
0
    auto offset = s_gb18030_ranges[last_index].code_point;
590
0
    auto pointer_offset = s_gb18030_ranges[last_index].pointer;
591
592
    // 3. Return a pointer whose value is pointer offset + code point − offset.
593
0
    return pointer_offset + code_point - offset;
594
0
}
595
596
GB18030Encoder::GB18030Encoder(IsGBK is_gbk)
597
124
    : m_is_gbk(is_gbk)
598
124
{
599
124
}
600
601
// https://encoding.spec.whatwg.org/#gb18030-encoder
602
ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
603
0
{
604
0
    bool gbk = (m_is_gbk == IsGBK::Yes);
605
606
0
    for (u32 item : input) {
607
        // 1. If code point is end-of-queue, return finished.
608
609
        // 2. If code point is an ASCII code point, return a byte whose value is code point.
610
0
        if (is_ascii(item)) {
611
0
            TRY(on_byte(static_cast<u8>(item)));
612
0
            continue;
613
0
        }
614
615
        // 3. If code point is U+E5E5, return error with code point.
616
0
        if (item == 0xE5E5) {
617
0
            TRY(on_error(item));
618
0
            continue;
619
0
        }
620
621
        // 4. If is GBK is true and code point is U+20AC, return byte 0x80.
622
0
        if (gbk && item == 0x20AC) {
623
0
            TRY(on_byte(0x80));
624
0
            continue;
625
0
        }
626
627
        // 5. Let pointer be the index pointer for code point in index gb18030.
628
0
        auto pointer = code_point_gb18030_index(item);
629
630
        // 6. If pointer is non-null, then:
631
0
        if (pointer.has_value()) {
632
            // 1. Let lead be pointer / 190 + 0x81.
633
0
            auto lead = *pointer / 190 + 0x81;
634
635
            // 2. Let trail be pointer % 190.
636
0
            auto trail = *pointer % 190;
637
638
            // 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
639
0
            auto offset = 0x41;
640
0
            if (trail < 0x3f)
641
0
                offset = 0x40;
642
643
            // 4. Return two bytes whose values are lead and trail + offset.
644
0
            TRY(on_byte(static_cast<u8>(lead)));
645
0
            TRY(on_byte(static_cast<u8>(trail + offset)));
646
0
            continue;
647
0
        }
648
649
        // 7. If is GBK is true, return error with code point.
650
0
        if (gbk) {
651
0
            TRY(on_error(item));
652
0
            continue;
653
0
        }
654
655
        // 8. Set pointer to the index gb18030 ranges pointer for code point.
656
0
        pointer = index_gb18030_ranges_pointer(item);
657
658
        // 9. Let byte1 be pointer / (10 × 126 × 10).
659
0
        auto byte1 = *pointer / (10 * 126 * 10);
660
661
        // 10. Set pointer to pointer % (10 × 126 × 10).
662
0
        pointer = *pointer % (10 * 126 * 10);
663
664
        // 11. Let byte2 be pointer / (10 × 126).
665
0
        auto byte2 = *pointer / (10 * 126);
666
667
        // 12. Set pointer to pointer % (10 × 126).
668
0
        pointer = *pointer % (10 * 126);
669
670
        // 13. Let byte3 be pointer / 10.
671
0
        auto byte3 = *pointer / 10;
672
673
        // 14. Let byte4 be pointer % 10.
674
0
        auto byte4 = *pointer % 10;
675
676
        // 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
677
0
        TRY(on_byte(static_cast<u8>(byte1 + 0x81)));
678
0
        TRY(on_byte(static_cast<u8>(byte2 + 0x30)));
679
0
        TRY(on_byte(static_cast<u8>(byte3 + 0x81)));
680
0
        TRY(on_byte(static_cast<u8>(byte4 + 0x30)));
681
0
    }
682
683
0
    return {};
684
0
}
685
686
// https://encoding.spec.whatwg.org/#single-byte-encoder
687
template<Integral ArrayType>
688
ErrorOr<void> SingleByteEncoder<ArrayType>::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
689
0
{
690
0
    for (u32 const code_point : input) {
691
0
        if (code_point < 0x80) {
692
            // 2. If code point is an ASCII code point, return a byte whose value is code point.
693
0
            TRY(on_byte(static_cast<u8>(code_point)));
694
0
        } else {
695
0
            Optional<u8> pointer = {};
696
0
            for (u8 i = 0; i < m_translation_table.size(); i++) {
697
0
                if (m_translation_table[i] == code_point) {
698
                    // 3. Let pointer be the index pointer for code point in index single-byte.
699
0
                    pointer = i;
700
0
                    break;
701
0
                }
702
0
            }
703
0
            if (pointer.has_value()) {
704
                // 5. Return a byte whose value is pointer + 0x80.
705
0
                TRY(on_byte(pointer.value() + 0x80));
706
0
            } else {
707
                // 4. If pointer is null, return error with code point.
708
0
                TRY(on_error(code_point));
709
0
            }
710
0
        }
711
0
    }
712
    // 1. If code point is end-of-queue, return finished.
713
0
    return {};
714
0
}
715
716
}