/src/serenity/Userland/Libraries/LibTextCodec/Encoder.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/BinarySearch.h> |
8 | | #include <AK/Error.h> |
9 | | #include <AK/Utf8View.h> |
10 | | #include <LibTextCodec/Decoder.h> |
11 | | #include <LibTextCodec/Encoder.h> |
12 | | #include <LibTextCodec/LookupTables.h> |
13 | | |
14 | | namespace TextCodec { |
15 | | |
16 | | namespace { |
17 | | UTF8Encoder s_utf8_encoder; |
18 | | GB18030Encoder s_gb18030_encoder; |
19 | | GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes); |
20 | | Big5Encoder s_big5_encoder; |
21 | | EUCJPEncoder s_euc_jp_encoder; |
22 | | ISO2022JPEncoder s_iso_2022_jp_encoder; |
23 | | ShiftJISEncoder s_shift_jis_encoder; |
24 | | EUCKREncoder s_euc_kr_encoder; |
25 | | |
26 | | // s_{encoding}_index is generated from https://encoding.spec.whatwg.org/indexes.json |
27 | | // Found separately in https://encoding.spec.whatwg.org/index-{encoding}.txt |
28 | | SingleByteEncoder s_ibm866_encoder { s_ibm866_index }; |
29 | | SingleByteEncoder s_latin2_encoder { s_iso_8859_2_index }; |
30 | | SingleByteEncoder s_latin3_encoder { s_iso_8859_3_index }; |
31 | | SingleByteEncoder s_latin4_encoder { s_iso_8859_4_index }; |
32 | | SingleByteEncoder s_latin_cyrillic_encoder { s_iso_8859_5_index }; |
33 | | SingleByteEncoder s_latin_arabic_encoder { s_iso_8859_6_index }; |
34 | | SingleByteEncoder s_latin_greek_encoder { s_iso_8859_7_index }; |
35 | | SingleByteEncoder s_latin_hebrew_encoder { s_iso_8859_8_index }; |
36 | | SingleByteEncoder s_latin6_encoder { s_iso_8859_10_index }; |
37 | | SingleByteEncoder s_latin7_encoder { s_iso_8859_13_index }; |
38 | | SingleByteEncoder s_latin8_encoder { s_iso_8859_14_index }; |
39 | | SingleByteEncoder s_latin9_encoder { s_iso_8859_15_index }; |
40 | | SingleByteEncoder s_latin10_encoder { s_iso_8859_16_index }; |
41 | | SingleByteEncoder s_centraleurope_encoder { s_windows_1250_index }; |
42 | | SingleByteEncoder s_cyrillic_encoder { s_windows_1251_index }; |
43 | | SingleByteEncoder s_hebrew_encoder { s_windows_1255_index }; |
44 | | SingleByteEncoder s_koi8r_encoder { s_koi8_r_index }; |
45 | | SingleByteEncoder s_koi8u_encoder { s_koi8_u_index }; |
46 | | SingleByteEncoder s_mac_roman_encoder { s_macintosh_index }; |
47 | | SingleByteEncoder s_windows874_encoder { s_windows_874_index }; |
48 | | SingleByteEncoder s_windows1252_encoder { s_windows_1252_index }; |
49 | | SingleByteEncoder s_windows1253_encoder { s_windows_1253_index }; |
50 | | SingleByteEncoder s_turkish_encoder { s_windows_1254_index }; |
51 | | SingleByteEncoder s_windows1256_encoder { s_windows_1256_index }; |
52 | | SingleByteEncoder s_windows1257_encoder { s_windows_1257_index }; |
53 | | SingleByteEncoder s_windows1258_encoder { s_windows_1258_index }; |
54 | | SingleByteEncoder s_mac_cyrillic_encoder { s_x_mac_cyrillic_index }; |
55 | | |
56 | | } |
57 | | |
58 | | Optional<Encoder&> encoder_for_exact_name(StringView encoding) |
59 | 5.18M | { |
60 | 5.18M | if (encoding.equals_ignoring_ascii_case("utf-8"sv)) |
61 | 5.18M | return s_utf8_encoder; |
62 | 0 | if (encoding.equals_ignoring_ascii_case("big5"sv)) |
63 | 0 | return s_big5_encoder; |
64 | 0 | if (encoding.equals_ignoring_ascii_case("euc-jp"sv)) |
65 | 0 | return s_euc_jp_encoder; |
66 | 0 | if (encoding.equals_ignoring_ascii_case("iso-2022-jp"sv)) |
67 | 0 | return s_iso_2022_jp_encoder; |
68 | 0 | if (encoding.equals_ignoring_ascii_case("shift_jis"sv)) |
69 | 0 | return s_shift_jis_encoder; |
70 | 0 | if (encoding.equals_ignoring_ascii_case("euc-kr"sv)) |
71 | 0 | return s_euc_kr_encoder; |
72 | 0 | if (encoding.equals_ignoring_ascii_case("gb18030"sv)) |
73 | 0 | return s_gb18030_encoder; |
74 | 0 | if (encoding.equals_ignoring_ascii_case("gbk"sv)) |
75 | 0 | return s_gbk_encoder; |
76 | 0 | if (encoding.equals_ignoring_ascii_case("ibm866"sv)) |
77 | 0 | return s_ibm866_encoder; |
78 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-2"sv)) |
79 | 0 | return s_latin2_encoder; |
80 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-3"sv)) |
81 | 0 | return s_latin3_encoder; |
82 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-4"sv)) |
83 | 0 | return s_latin4_encoder; |
84 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-5"sv)) |
85 | 0 | return s_latin_cyrillic_encoder; |
86 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-6"sv)) |
87 | 0 | return s_latin_arabic_encoder; |
88 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-7"sv)) |
89 | 0 | return s_latin_greek_encoder; |
90 | 0 | if (encoding.is_one_of_ignoring_ascii_case("iso-8859-8"sv, "iso-8859-8-i"sv)) |
91 | 0 | return s_latin_hebrew_encoder; |
92 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-10"sv)) |
93 | 0 | return s_latin6_encoder; |
94 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-13"sv)) |
95 | 0 | return s_latin7_encoder; |
96 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-14"sv)) |
97 | 0 | return s_latin8_encoder; |
98 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-15"sv)) |
99 | 0 | return s_latin9_encoder; |
100 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-16"sv)) |
101 | 0 | return s_latin10_encoder; |
102 | 0 | if (encoding.equals_ignoring_ascii_case("koi8-r"sv)) |
103 | 0 | return s_koi8r_encoder; |
104 | 0 | if (encoding.equals_ignoring_ascii_case("koi8-u"sv)) |
105 | 0 | return s_koi8u_encoder; |
106 | 0 | if (encoding.equals_ignoring_ascii_case("macintosh"sv)) |
107 | 0 | return s_mac_roman_encoder; |
108 | 0 | if (encoding.equals_ignoring_ascii_case("windows-874"sv)) |
109 | 0 | return s_windows874_encoder; |
110 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1250"sv)) |
111 | 0 | return s_centraleurope_encoder; |
112 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1251"sv)) |
113 | 0 | return s_cyrillic_encoder; |
114 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1252"sv)) |
115 | 0 | return s_windows1252_encoder; |
116 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1253"sv)) |
117 | 0 | return s_windows1253_encoder; |
118 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1254"sv)) |
119 | 0 | return s_turkish_encoder; |
120 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1255"sv)) |
121 | 0 | return s_hebrew_encoder; |
122 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1256"sv)) |
123 | 0 | return s_windows1256_encoder; |
124 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1257"sv)) |
125 | 0 | return s_windows1257_encoder; |
126 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1258"sv)) |
127 | 0 | return s_windows1258_encoder; |
128 | 0 | if (encoding.equals_ignoring_ascii_case("x-mac-cyrillic"sv)) |
129 | 0 | return s_mac_cyrillic_encoder; |
130 | 0 | dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding); |
131 | 0 | return {}; |
132 | 0 | } |
133 | | |
134 | | Optional<Encoder&> encoder_for(StringView label) |
135 | 5.18M | { |
136 | 5.18M | auto encoding = get_standardized_encoding(label); |
137 | 5.18M | return encoding.has_value() ? encoder_for_exact_name(encoding.value()) : Optional<Encoder&> {}; |
138 | 5.18M | } |
139 | | |
140 | | // https://encoding.spec.whatwg.org/#utf-8-encoder |
141 | | ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>) |
142 | 1.14k | { |
143 | 1.14k | ReadonlyBytes bytes { input.bytes(), input.byte_length() }; |
144 | 1.14k | for (auto byte : bytes) |
145 | 65.6M | TRY(on_byte(byte)); |
146 | 1.14k | return {}; |
147 | 1.14k | } |
148 | | |
149 | | // https://encoding.spec.whatwg.org/#euc-jp-encoder |
150 | | ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
151 | 0 | { |
152 | 0 | for (auto item : input) { |
153 | | // 1. If code point is end-of-queue, return finished. |
154 | | |
155 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
156 | 0 | if (is_ascii(item)) { |
157 | 0 | TRY(on_byte(static_cast<u8>(item))); |
158 | 0 | continue; |
159 | 0 | } |
160 | | |
161 | | // 3. If code point is U+00A5, return byte 0x5C. |
162 | 0 | if (item == 0x00A5) { |
163 | 0 | TRY(on_byte(static_cast<u8>(0x5C))); |
164 | 0 | continue; |
165 | 0 | } |
166 | | |
167 | | // 4. If code point is U+203E, return byte 0x7E. |
168 | 0 | if (item == 0x203E) { |
169 | 0 | TRY(on_byte(static_cast<u8>(0x7E))); |
170 | 0 | continue; |
171 | 0 | } |
172 | | |
173 | | // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes whose values are 0x8E and code point − 0xFF61 + 0xA1. |
174 | 0 | if (item >= 0xFF61 && item <= 0xFF9F) { |
175 | 0 | TRY(on_byte(0x8E)); |
176 | 0 | TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1))); |
177 | 0 | continue; |
178 | 0 | } |
179 | | |
180 | | // 6. If code point is U+2212, set it to U+FF0D. |
181 | 0 | if (item == 0x2212) |
182 | 0 | item = 0xFF0D; |
183 | | |
184 | | // 7. Let pointer be the index pointer for code point in index jis0208. |
185 | 0 | auto pointer = code_point_jis0208_index(item); |
186 | | |
187 | | // 8. If pointer is null, return error with code point. |
188 | 0 | if (!pointer.has_value()) { |
189 | 0 | TRY(on_error(item)); |
190 | 0 | continue; |
191 | 0 | } |
192 | | |
193 | | // 9. Let lead be pointer / 94 + 0xA1. |
194 | 0 | auto lead = *pointer / 94 + 0xA1; |
195 | | |
196 | | // 10. Let trail be pointer % 94 + 0xA1. |
197 | 0 | auto trail = *pointer % 94 + 0xA1; |
198 | | |
199 | | // 11. Return two bytes whose values are lead and trail. |
200 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
201 | 0 | TRY(on_byte(static_cast<u8>(trail))); |
202 | 0 | } |
203 | | |
204 | 0 | return {}; |
205 | 0 | } |
206 | | |
207 | | // https://encoding.spec.whatwg.org/#iso-2022-jp-encoder |
208 | | ErrorOr<ISO2022JPEncoder::State> ISO2022JPEncoder::process_item(u32 item, State state, Function<ErrorOr<void>(u8)>& on_byte, Function<ErrorOr<void>(u32)>& on_error) |
209 | 0 | { |
210 | | // 3. If ISO-2022-JP encoder state is ASCII or Roman, and code point is U+000E, U+000F, or U+001B, return error with U+FFFD. |
211 | 0 | if (state == State::ASCII || state == State::Roman) { |
212 | 0 | if (item == 0x000E || item == 0x000F || item == 0x001B) { |
213 | 0 | TRY(on_error(0xFFFD)); |
214 | 0 | return state; |
215 | 0 | } |
216 | 0 | } |
217 | | |
218 | | // 4. If ISO-2022-JP encoder state is ASCII and code point is an ASCII code point, return a byte whose value is code point. |
219 | 0 | if (state == State::ASCII && is_ascii(item)) { |
220 | 0 | TRY(on_byte(static_cast<u8>(item))); |
221 | 0 | return state; |
222 | 0 | } |
223 | | |
224 | | // 5. If ISO-2022-JP encoder state is Roman and code point is an ASCII code point, excluding U+005C and U+007E, or is U+00A5 or U+203E, then: |
225 | 0 | if (state == State::Roman && ((is_ascii(item) && item != 0x005C && item != 0x007E) || (item == 0x00A5 || item == 0x203E))) { |
226 | | // 1. If code point is an ASCII code point, return a byte whose value is code point. |
227 | 0 | if (is_ascii(item)) { |
228 | 0 | TRY(on_byte(static_cast<u8>(item))); |
229 | 0 | return state; |
230 | 0 | } |
231 | | |
232 | | // 2. If code point is U+00A5, return byte 0x5C. |
233 | 0 | if (item == 0x00A5) { |
234 | 0 | TRY(on_byte(0x5C)); |
235 | 0 | return state; |
236 | 0 | } |
237 | | |
238 | | // 3. If code point is U+203E, return byte 0x7E. |
239 | 0 | if (item == 0x203E) { |
240 | 0 | TRY(on_byte(0x7E)); |
241 | 0 | return state; |
242 | 0 | } |
243 | 0 | } |
244 | | |
245 | | // 6. If code point is an ASCII code point, and ISO-2022-JP encoder state is not ASCII, restore code point to ioQueue, set |
246 | | // ISO-2022-JP encoder state to ASCII, and return three bytes 0x1B 0x28 0x42. |
247 | 0 | if (is_ascii(item) && state != State::ASCII) { |
248 | 0 | TRY(on_byte(0x1B)); |
249 | 0 | TRY(on_byte(0x28)); |
250 | 0 | TRY(on_byte(0x42)); |
251 | 0 | return process_item(item, State::ASCII, on_byte, on_error); |
252 | 0 | } |
253 | | |
254 | | // 7. If code point is either U+00A5 or U+203E, and ISO-2022-JP encoder state is not Roman, restore code point to ioQueue, |
255 | | // set ISO-2022-JP encoder state to Roman, and return three bytes 0x1B 0x28 0x4A. |
256 | 0 | if ((item == 0x00A5 || item == 0x203E) && state != State::Roman) { |
257 | 0 | TRY(on_byte(0x1B)); |
258 | 0 | TRY(on_byte(0x28)); |
259 | 0 | TRY(on_byte(0x4A)); |
260 | 0 | return process_item(item, State::Roman, on_byte, on_error); |
261 | 0 | } |
262 | | |
263 | | // 8. If code point is U+2212, set it to U+FF0D. |
264 | 0 | if (item == 0x2212) |
265 | 0 | item = 0xFF0D; |
266 | | |
267 | | // 9. If code point is in the range U+FF61 to U+FF9F, inclusive, set it to the index code point for code point − 0xFF61 |
268 | | // in index ISO-2022-JP katakana. |
269 | 0 | if (item >= 0xFF61 && item <= 0xFF9F) { |
270 | 0 | item = *index_iso_2022_jp_katakana_code_point(item - 0xFF61); |
271 | 0 | } |
272 | | |
273 | | // 10. Let pointer be the index pointer for code point in index jis0208. |
274 | 0 | auto pointer = code_point_jis0208_index(item); |
275 | | |
276 | | // 11. If pointer is null, then: |
277 | 0 | if (!pointer.has_value()) { |
278 | | // 1. If ISO-2022-JP encoder state is jis0208, then restore code point to ioQueue, set ISO-2022-JP encoder state to |
279 | | // ASCII, and return three bytes 0x1B 0x28 0x42. |
280 | 0 | if (state == State::jis0208) { |
281 | 0 | TRY(on_byte(0x1B)); |
282 | 0 | TRY(on_byte(0x28)); |
283 | 0 | TRY(on_byte(0x4A)); |
284 | 0 | return process_item(item, State::ASCII, on_byte, on_error); |
285 | 0 | } |
286 | | |
287 | | // 2. Return error with code point. |
288 | 0 | TRY(on_error(item)); |
289 | 0 | return state; |
290 | 0 | } |
291 | | |
292 | | // 12. If ISO-2022-JP encoder state is not jis0208, restore code point to ioQueue, set ISO-2022-JP encoder state to |
293 | | // jis0208, and return three bytes 0x1B 0x24 0x42. |
294 | 0 | if (state != State::jis0208) { |
295 | 0 | TRY(on_byte(0x1B)); |
296 | 0 | TRY(on_byte(0x24)); |
297 | 0 | TRY(on_byte(0x42)); |
298 | 0 | return process_item(item, State::jis0208, on_byte, on_error); |
299 | 0 | } |
300 | | |
301 | | // 13. Let lead be pointer / 94 + 0x21. |
302 | 0 | auto lead = *pointer / 94 + 0x21; |
303 | | |
304 | | // 14. Let trail be pointer % 94 + 0x21. |
305 | 0 | auto trail = *pointer % 94 + 0x21; |
306 | | |
307 | | // 15. Return two bytes whose values are lead and trail. |
308 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
309 | 0 | TRY(on_byte(static_cast<u8>(trail))); |
310 | 0 | return state; |
311 | 0 | } |
312 | | |
313 | | // https://encoding.spec.whatwg.org/#iso-2022-jp-encoder |
314 | | ErrorOr<void> ISO2022JPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
315 | 0 | { |
316 | | // ISO-2022-JP’s encoder has an associated ISO-2022-JP encoder state which is ASCII, Roman, or jis0208 (initially ASCII). |
317 | 0 | auto state = State::ASCII; |
318 | |
|
319 | 0 | for (u32 item : input) { |
320 | 0 | state = TRY(process_item(item, state, on_byte, on_error)); |
321 | 0 | } |
322 | | |
323 | | // 1. If code point is end-of-queue and ISO-2022-JP encoder state is not ASCII, set ISO-2022-JP |
324 | | // encoder state to ASCII, and return three bytes 0x1B 0x28 0x42. |
325 | 0 | if (state != State::ASCII) { |
326 | 0 | state = State::ASCII; |
327 | 0 | TRY(on_byte(0x1B)); |
328 | 0 | TRY(on_byte(0x28)); |
329 | 0 | TRY(on_byte(0x42)); |
330 | 0 | return {}; |
331 | 0 | } |
332 | | |
333 | | // 2. If code point is end-of-queue and ISO-2022-JP encoder state is ASCII, return finished. |
334 | 0 | return {}; |
335 | 0 | } |
336 | | |
337 | | static Optional<u32> code_point_jis0208_index_skipping_range(u32 code_point, u32 skip_from, u32 skip_to) |
338 | 0 | { |
339 | 0 | VERIFY(skip_to >= skip_from); |
340 | 0 | for (u32 i = 0; i < s_jis0208_index.size(); ++i) { |
341 | 0 | if (i >= skip_from && i <= skip_to) |
342 | 0 | continue; |
343 | 0 | if (s_jis0208_index[i] == code_point) |
344 | 0 | return i; |
345 | 0 | } |
346 | 0 | return {}; |
347 | 0 | } |
348 | | |
349 | | // https://encoding.spec.whatwg.org/#index-shift_jis-pointer |
350 | | static Optional<u32> index_shift_jis_pointer(u32 code_point) |
351 | 0 | { |
352 | | // 1. Let index be index jis0208 excluding all entries whose pointer is in the range 8272 to 8835, inclusive. |
353 | 0 | auto pointer = code_point_jis0208_index_skipping_range(code_point, 8272, 8835); |
354 | 0 | if (!pointer.has_value()) |
355 | 0 | return {}; |
356 | | |
357 | | // 2. Return the index pointer for code point in index. |
358 | 0 | return *pointer; |
359 | 0 | } |
360 | | |
361 | | // https://encoding.spec.whatwg.org/#shift_jis-encoder |
362 | | ErrorOr<void> ShiftJISEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
363 | 0 | { |
364 | 0 | for (u32 item : input) { |
365 | | // 1. If code point is end-of-queue, return finished. |
366 | | |
367 | | // 2. If code point is an ASCII code point or U+0080, return a byte whose value is code point. |
368 | 0 | if (is_ascii(item) || item == 0x0080) { |
369 | 0 | TRY(on_byte(static_cast<u8>(item))); |
370 | 0 | continue; |
371 | 0 | } |
372 | | |
373 | | // 3. If code point is U+00A5, return byte 0x5C. |
374 | 0 | if (item == 0x00A5) { |
375 | 0 | TRY(on_byte(0x5C)); |
376 | 0 | continue; |
377 | 0 | } |
378 | | |
379 | | // 4. If code point is U+203E, return byte 0x7E. |
380 | 0 | if (item == 0x203E) { |
381 | 0 | TRY(on_byte(0x7E)); |
382 | 0 | continue; |
383 | 0 | } |
384 | | |
385 | | // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return a byte whose value is code point − 0xFF61 + 0xA1. |
386 | 0 | if (item >= 0xFF61 && item <= 0xFF9F) { |
387 | 0 | TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1))); |
388 | 0 | continue; |
389 | 0 | } |
390 | | |
391 | | // 6. If code point is U+2212, set it to U+FF0D. |
392 | 0 | if (item == 0x2212) |
393 | 0 | item = 0xFF0D; |
394 | | |
395 | | // 7. Let pointer be the index Shift_JIS pointer for code point. |
396 | 0 | auto pointer = index_shift_jis_pointer(item); |
397 | | |
398 | | // 8. If pointer is null, return error with code point. |
399 | 0 | if (!pointer.has_value()) { |
400 | 0 | TRY(on_error(item)); |
401 | 0 | continue; |
402 | 0 | } |
403 | | |
404 | | // 9. Let lead be pointer / 188. |
405 | 0 | auto lead = *pointer / 188; |
406 | | |
407 | | // 10. Let lead offset be 0x81 if lead is less than 0x1F, otherwise 0xC1. |
408 | 0 | auto lead_offset = 0xC1; |
409 | 0 | if (lead < 0x1F) |
410 | 0 | lead_offset = 0x81; |
411 | | |
412 | | // 11. Let trail be pointer % 188. |
413 | 0 | auto trail = *pointer % 188; |
414 | | |
415 | | // 12. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41. |
416 | 0 | auto offset = 0x41; |
417 | 0 | if (trail < 0x3F) |
418 | 0 | offset = 0x40; |
419 | | |
420 | | // 13. Return two bytes whose values are lead + lead offset and trail + offset. |
421 | 0 | TRY(on_byte(static_cast<u8>(lead + lead_offset))); |
422 | 0 | TRY(on_byte(static_cast<u8>(trail + offset))); |
423 | 0 | } |
424 | | |
425 | 0 | return {}; |
426 | 0 | } |
427 | | |
428 | | // https://encoding.spec.whatwg.org/#euc-kr-encoder |
429 | | ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
430 | 0 | { |
431 | 0 | for (u32 item : input) { |
432 | | // 1. If code point is end-of-queue, return finished. |
433 | | |
434 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
435 | 0 | if (is_ascii(item)) { |
436 | 0 | TRY(on_byte(static_cast<u8>(item))); |
437 | 0 | continue; |
438 | 0 | } |
439 | | |
440 | | // 3. Let pointer be the index pointer for code point in index EUC-KR. |
441 | 0 | auto pointer = code_point_euc_kr_index(item); |
442 | | |
443 | | // 4. If pointer is null, return error with code point. |
444 | 0 | if (!pointer.has_value()) { |
445 | 0 | TRY(on_error(item)); |
446 | 0 | continue; |
447 | 0 | } |
448 | | |
449 | | // 5. Let lead be pointer / 190 + 0x81. |
450 | 0 | auto lead = *pointer / 190 + 0x81; |
451 | | |
452 | | // 6. Let trail be pointer % 190 + 0x41. |
453 | 0 | auto trail = *pointer % 190 + 0x41; |
454 | | |
455 | | // 7. Return two bytes whose values are lead and trail. |
456 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
457 | 0 | TRY(on_byte(static_cast<u8>(trail))); |
458 | 0 | } |
459 | | |
460 | 0 | return {}; |
461 | 0 | } |
462 | | |
463 | | // https://encoding.spec.whatwg.org/#index-big5-pointer |
464 | | static Optional<u32> index_big5_pointer(u32 code_point) |
465 | 0 | { |
466 | | // 1. Let index be index Big5 excluding all entries whose pointer is less than (0xA1 - 0x81) × 157. |
467 | 0 | auto start_index = (0xA1 - 0x81) * 157 - s_big5_index_first_pointer; |
468 | | |
469 | | // 2. If code point is U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345, return the last pointer |
470 | | // corresponding to code point in index. |
471 | 0 | if (Array<u32, 6> { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 }.contains_slow(code_point)) { |
472 | 0 | for (u32 i = s_big5_index.size() - 1; i >= start_index; --i) { |
473 | 0 | if (s_big5_index[i] == code_point) { |
474 | 0 | return s_big5_index_first_pointer + i; |
475 | 0 | } |
476 | 0 | } |
477 | 0 | return {}; |
478 | 0 | } |
479 | | |
480 | | // 3. Return the index pointer for code point in index. |
481 | 0 | for (u32 i = start_index; i < s_big5_index.size(); ++i) { |
482 | 0 | if (s_big5_index[i] == code_point) { |
483 | 0 | return s_big5_index_first_pointer + i; |
484 | 0 | } |
485 | 0 | } |
486 | 0 | return {}; |
487 | 0 | } |
488 | | |
489 | | // https://encoding.spec.whatwg.org/#big5-encoder |
490 | | ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
491 | 0 | { |
492 | 0 | for (u32 item : input) { |
493 | | // 1. If code point is end-of-queue, return finished. |
494 | | |
495 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
496 | 0 | if (is_ascii(item)) { |
497 | 0 | TRY(on_byte(static_cast<u8>(item))); |
498 | 0 | continue; |
499 | 0 | } |
500 | | |
501 | | // 3. Let pointer be the index Big5 pointer for code point. |
502 | 0 | auto pointer = index_big5_pointer(item); |
503 | | |
504 | | // 4. If pointer is null, return error with code point. |
505 | 0 | if (!pointer.has_value()) { |
506 | 0 | TRY(on_error(item)); |
507 | 0 | continue; |
508 | 0 | } |
509 | | |
510 | | // 5. Let lead be pointer / 157 + 0x81. |
511 | 0 | auto lead = *pointer / 157 + 0x81; |
512 | | |
513 | | // 6. Let trail be pointer % 157. |
514 | 0 | auto trail = *pointer % 157; |
515 | | |
516 | | // 7. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x62. |
517 | 0 | auto offset = 0x62; |
518 | 0 | if (trail < 0x3f) |
519 | 0 | offset = 0x40; |
520 | | |
521 | | // 8. Return two bytes whose values are lead and trail + offset. |
522 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
523 | 0 | TRY(on_byte(static_cast<u8>(trail + offset))); |
524 | 0 | } |
525 | | |
526 | 0 | return {}; |
527 | 0 | } |
528 | | |
529 | | // https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer |
530 | | static u32 index_gb18030_ranges_pointer(u32 code_point) |
531 | 0 | { |
532 | | // 1. If code point is U+E7C7, return pointer 7457. |
533 | 0 | if (code_point == 0xe7c7) |
534 | 0 | return 7457; |
535 | | |
536 | | // 2. Let offset be the last code point in index gb18030 ranges that is less than |
537 | | // or equal to code point and let pointer offset be its corresponding pointer. |
538 | 0 | size_t last_index; |
539 | 0 | binary_search(s_gb18030_ranges, code_point, &last_index, [](auto const code_point, auto const& entry) { |
540 | 0 | return code_point - entry.code_point; |
541 | 0 | }); |
542 | 0 | auto offset = s_gb18030_ranges[last_index].code_point; |
543 | 0 | auto pointer_offset = s_gb18030_ranges[last_index].pointer; |
544 | | |
545 | | // 3. Return a pointer whose value is pointer offset + code point − offset. |
546 | 0 | return pointer_offset + code_point - offset; |
547 | 0 | } |
548 | | |
549 | | GB18030Encoder::GB18030Encoder(IsGBK is_gbk) |
550 | 124 | : m_is_gbk(is_gbk) |
551 | 124 | { |
552 | 124 | } |
553 | | |
554 | | // https://encoding.spec.whatwg.org/#gb18030-encoder |
555 | | ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
556 | 0 | { |
557 | 0 | bool gbk = (m_is_gbk == IsGBK::Yes); |
558 | |
|
559 | 0 | for (u32 item : input) { |
560 | | // 1. If code point is end-of-queue, return finished. |
561 | | |
562 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
563 | 0 | if (is_ascii(item)) { |
564 | 0 | TRY(on_byte(static_cast<u8>(item))); |
565 | 0 | continue; |
566 | 0 | } |
567 | | |
568 | | // 3. If code point is U+E5E5, return error with code point. |
569 | 0 | if (item == 0xE5E5) { |
570 | 0 | TRY(on_error(item)); |
571 | 0 | continue; |
572 | 0 | } |
573 | | |
574 | | // 4. If is GBK is true and code point is U+20AC, return byte 0x80. |
575 | 0 | if (gbk && item == 0x20AC) { |
576 | 0 | TRY(on_byte(0x80)); |
577 | 0 | continue; |
578 | 0 | } |
579 | | |
580 | | // 5. Let pointer be the index pointer for code point in index gb18030. |
581 | 0 | auto pointer = code_point_gb18030_index(item); |
582 | | |
583 | | // 6. If pointer is non-null, then: |
584 | 0 | if (pointer.has_value()) { |
585 | | // 1. Let lead be pointer / 190 + 0x81. |
586 | 0 | auto lead = *pointer / 190 + 0x81; |
587 | | |
588 | | // 2. Let trail be pointer % 190. |
589 | 0 | auto trail = *pointer % 190; |
590 | | |
591 | | // 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41. |
592 | 0 | auto offset = 0x41; |
593 | 0 | if (trail < 0x3f) |
594 | 0 | offset = 0x40; |
595 | | |
596 | | // 4. Return two bytes whose values are lead and trail + offset. |
597 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
598 | 0 | TRY(on_byte(static_cast<u8>(trail + offset))); |
599 | 0 | continue; |
600 | 0 | } |
601 | | |
602 | | // 7. If is GBK is true, return error with code point. |
603 | 0 | if (gbk) { |
604 | 0 | TRY(on_error(item)); |
605 | 0 | continue; |
606 | 0 | } |
607 | | |
608 | | // 8. Set pointer to the index gb18030 ranges pointer for code point. |
609 | 0 | pointer = index_gb18030_ranges_pointer(item); |
610 | | |
611 | | // 9. Let byte1 be pointer / (10 × 126 × 10). |
612 | 0 | auto byte1 = *pointer / (10 * 126 * 10); |
613 | | |
614 | | // 10. Set pointer to pointer % (10 × 126 × 10). |
615 | 0 | pointer = *pointer % (10 * 126 * 10); |
616 | | |
617 | | // 11. Let byte2 be pointer / (10 × 126). |
618 | 0 | auto byte2 = *pointer / (10 * 126); |
619 | | |
620 | | // 12. Set pointer to pointer % (10 × 126). |
621 | 0 | pointer = *pointer % (10 * 126); |
622 | | |
623 | | // 13. Let byte3 be pointer / 10. |
624 | 0 | auto byte3 = *pointer / 10; |
625 | | |
626 | | // 14. Let byte4 be pointer % 10. |
627 | 0 | auto byte4 = *pointer % 10; |
628 | | |
629 | | // 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30. |
630 | 0 | TRY(on_byte(static_cast<u8>(byte1 + 0x81))); |
631 | 0 | TRY(on_byte(static_cast<u8>(byte2 + 0x30))); |
632 | 0 | TRY(on_byte(static_cast<u8>(byte3 + 0x81))); |
633 | 0 | TRY(on_byte(static_cast<u8>(byte4 + 0x30))); |
634 | 0 | } |
635 | | |
636 | 0 | return {}; |
637 | 0 | } |
638 | | |
639 | | // https://encoding.spec.whatwg.org/#single-byte-encoder |
640 | | template<Integral ArrayType> |
641 | | ErrorOr<void> SingleByteEncoder<ArrayType>::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
642 | 0 | { |
643 | 0 | for (u32 const code_point : input) { |
644 | 0 | if (code_point < 0x80) { |
645 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
646 | 0 | TRY(on_byte(static_cast<u8>(code_point))); |
647 | 0 | } else { |
648 | 0 | Optional<u8> pointer = {}; |
649 | 0 | for (u8 i = 0; i < m_translation_table.size(); i++) { |
650 | 0 | if (m_translation_table[i] == code_point) { |
651 | | // 3. Let pointer be the index pointer for code point in index single-byte. |
652 | 0 | pointer = i; |
653 | 0 | break; |
654 | 0 | } |
655 | 0 | } |
656 | 0 | if (pointer.has_value()) { |
657 | | // 5. Return a byte whose value is pointer + 0x80. |
658 | 0 | TRY(on_byte(pointer.value() + 0x80)); |
659 | 0 | } else { |
660 | | // 4. If pointer is null, return error with code point. |
661 | 0 | TRY(on_error(code_point)); |
662 | 0 | } |
663 | 0 | } |
664 | 0 | } |
665 | | // 1. If code point is end-of-queue, return finished. |
666 | 0 | return {}; |
667 | 0 | } |
668 | | |
669 | | } |