/src/serenity/Userland/Libraries/LibTextCodec/Encoder.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/BinarySearch.h> |
8 | | #include <AK/Error.h> |
9 | | #include <AK/Utf16View.h> |
10 | | #include <AK/Utf8View.h> |
11 | | #include <LibTextCodec/Decoder.h> |
12 | | #include <LibTextCodec/Encoder.h> |
13 | | #include <LibTextCodec/LookupTables.h> |
14 | | |
15 | | namespace TextCodec { |
16 | | |
17 | | namespace { |
18 | | Latin1Encoder s_latin1_encoder; |
19 | | UTF8Encoder s_utf8_encoder; |
20 | | UTF16BEEncoder s_utf16be_encoder; |
21 | | UTF16LEEncoder s_utf16le_encoder; |
22 | | GB18030Encoder s_gb18030_encoder; |
23 | | GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes); |
24 | | Big5Encoder s_big5_encoder; |
25 | | EUCJPEncoder s_euc_jp_encoder; |
26 | | ISO2022JPEncoder s_iso_2022_jp_encoder; |
27 | | ShiftJISEncoder s_shift_jis_encoder; |
28 | | EUCKREncoder s_euc_kr_encoder; |
29 | | |
30 | | // s_{encoding}_index is generated from https://encoding.spec.whatwg.org/indexes.json |
31 | | // Found separately in https://encoding.spec.whatwg.org/index-{encoding}.txt |
32 | | SingleByteEncoder s_ibm866_encoder { s_ibm866_index }; |
33 | | SingleByteEncoder s_latin2_encoder { s_iso_8859_2_index }; |
34 | | SingleByteEncoder s_latin3_encoder { s_iso_8859_3_index }; |
35 | | SingleByteEncoder s_latin4_encoder { s_iso_8859_4_index }; |
36 | | SingleByteEncoder s_latin_cyrillic_encoder { s_iso_8859_5_index }; |
37 | | SingleByteEncoder s_latin_arabic_encoder { s_iso_8859_6_index }; |
38 | | SingleByteEncoder s_latin_greek_encoder { s_iso_8859_7_index }; |
39 | | SingleByteEncoder s_latin_hebrew_encoder { s_iso_8859_8_index }; |
40 | | SingleByteEncoder s_latin6_encoder { s_iso_8859_10_index }; |
41 | | SingleByteEncoder s_latin7_encoder { s_iso_8859_13_index }; |
42 | | SingleByteEncoder s_latin8_encoder { s_iso_8859_14_index }; |
43 | | SingleByteEncoder s_latin9_encoder { s_iso_8859_15_index }; |
44 | | SingleByteEncoder s_latin10_encoder { s_iso_8859_16_index }; |
45 | | SingleByteEncoder s_centraleurope_encoder { s_windows_1250_index }; |
46 | | SingleByteEncoder s_cyrillic_encoder { s_windows_1251_index }; |
47 | | SingleByteEncoder s_hebrew_encoder { s_windows_1255_index }; |
48 | | SingleByteEncoder s_koi8r_encoder { s_koi8_r_index }; |
49 | | SingleByteEncoder s_koi8u_encoder { s_koi8_u_index }; |
50 | | SingleByteEncoder s_mac_roman_encoder { s_macintosh_index }; |
51 | | SingleByteEncoder s_windows874_encoder { s_windows_874_index }; |
52 | | SingleByteEncoder s_windows1252_encoder { s_windows_1252_index }; |
53 | | SingleByteEncoder s_windows1253_encoder { s_windows_1253_index }; |
54 | | SingleByteEncoder s_turkish_encoder { s_windows_1254_index }; |
55 | | SingleByteEncoder s_windows1256_encoder { s_windows_1256_index }; |
56 | | SingleByteEncoder s_windows1257_encoder { s_windows_1257_index }; |
57 | | SingleByteEncoder s_windows1258_encoder { s_windows_1258_index }; |
58 | | SingleByteEncoder s_mac_cyrillic_encoder { s_x_mac_cyrillic_index }; |
59 | | |
60 | | } |
61 | | |
62 | | Optional<Encoder&> encoder_for_exact_name(StringView encoding) |
63 | 6.12M | { |
64 | 6.12M | if (encoding.equals_ignoring_ascii_case("iso-8859-1"sv)) |
65 | 0 | return s_latin1_encoder; |
66 | 6.12M | if (encoding.equals_ignoring_ascii_case("utf-8"sv)) |
67 | 6.12M | return s_utf8_encoder; |
68 | 0 | if (encoding.equals_ignoring_ascii_case("utf-16be"sv)) |
69 | 0 | return s_utf16be_encoder; |
70 | 0 | if (encoding.equals_ignoring_ascii_case("utf-16le"sv)) |
71 | 0 | return s_utf16le_encoder; |
72 | 0 | if (encoding.equals_ignoring_ascii_case("big5"sv)) |
73 | 0 | return s_big5_encoder; |
74 | 0 | if (encoding.equals_ignoring_ascii_case("euc-jp"sv)) |
75 | 0 | return s_euc_jp_encoder; |
76 | 0 | if (encoding.equals_ignoring_ascii_case("iso-2022-jp"sv)) |
77 | 0 | return s_iso_2022_jp_encoder; |
78 | 0 | if (encoding.equals_ignoring_ascii_case("shift_jis"sv)) |
79 | 0 | return s_shift_jis_encoder; |
80 | 0 | if (encoding.equals_ignoring_ascii_case("euc-kr"sv)) |
81 | 0 | return s_euc_kr_encoder; |
82 | 0 | if (encoding.equals_ignoring_ascii_case("gb18030"sv)) |
83 | 0 | return s_gb18030_encoder; |
84 | 0 | if (encoding.equals_ignoring_ascii_case("gbk"sv)) |
85 | 0 | return s_gbk_encoder; |
86 | 0 | if (encoding.equals_ignoring_ascii_case("ibm866"sv)) |
87 | 0 | return s_ibm866_encoder; |
88 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-2"sv)) |
89 | 0 | return s_latin2_encoder; |
90 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-3"sv)) |
91 | 0 | return s_latin3_encoder; |
92 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-4"sv)) |
93 | 0 | return s_latin4_encoder; |
94 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-5"sv)) |
95 | 0 | return s_latin_cyrillic_encoder; |
96 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-6"sv)) |
97 | 0 | return s_latin_arabic_encoder; |
98 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-7"sv)) |
99 | 0 | return s_latin_greek_encoder; |
100 | 0 | if (encoding.is_one_of_ignoring_ascii_case("iso-8859-8"sv, "iso-8859-8-i"sv)) |
101 | 0 | return s_latin_hebrew_encoder; |
102 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-10"sv)) |
103 | 0 | return s_latin6_encoder; |
104 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-13"sv)) |
105 | 0 | return s_latin7_encoder; |
106 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-14"sv)) |
107 | 0 | return s_latin8_encoder; |
108 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-15"sv)) |
109 | 0 | return s_latin9_encoder; |
110 | 0 | if (encoding.equals_ignoring_ascii_case("iso-8859-16"sv)) |
111 | 0 | return s_latin10_encoder; |
112 | 0 | if (encoding.equals_ignoring_ascii_case("koi8-r"sv)) |
113 | 0 | return s_koi8r_encoder; |
114 | 0 | if (encoding.equals_ignoring_ascii_case("koi8-u"sv)) |
115 | 0 | return s_koi8u_encoder; |
116 | 0 | if (encoding.equals_ignoring_ascii_case("macintosh"sv)) |
117 | 0 | return s_mac_roman_encoder; |
118 | 0 | if (encoding.equals_ignoring_ascii_case("windows-874"sv)) |
119 | 0 | return s_windows874_encoder; |
120 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1250"sv)) |
121 | 0 | return s_centraleurope_encoder; |
122 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1251"sv)) |
123 | 0 | return s_cyrillic_encoder; |
124 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1252"sv)) |
125 | 0 | return s_windows1252_encoder; |
126 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1253"sv)) |
127 | 0 | return s_windows1253_encoder; |
128 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1254"sv)) |
129 | 0 | return s_turkish_encoder; |
130 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1255"sv)) |
131 | 0 | return s_hebrew_encoder; |
132 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1256"sv)) |
133 | 0 | return s_windows1256_encoder; |
134 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1257"sv)) |
135 | 0 | return s_windows1257_encoder; |
136 | 0 | if (encoding.equals_ignoring_ascii_case("windows-1258"sv)) |
137 | 0 | return s_windows1258_encoder; |
138 | 0 | if (encoding.equals_ignoring_ascii_case("x-mac-cyrillic"sv)) |
139 | 0 | return s_mac_cyrillic_encoder; |
140 | 0 | dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding); |
141 | 0 | return {}; |
142 | 0 | } |
143 | | |
144 | | Optional<Encoder&> encoder_for(StringView label) |
145 | 6.12M | { |
146 | 6.12M | auto encoding = get_standardized_encoding(label); |
147 | 6.12M | return encoding.has_value() ? encoder_for_exact_name(encoding.value()) : Optional<Encoder&> {}; |
148 | 6.12M | } |
149 | | |
150 | | // https://encoding.spec.whatwg.org/#utf-8-encoder |
151 | | ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>) |
152 | 1.34k | { |
153 | 1.34k | ReadonlyBytes bytes { input.bytes(), input.byte_length() }; |
154 | 1.34k | for (auto byte : bytes) |
155 | 49.8M | TRY(on_byte(byte)); |
156 | 1.34k | return {}; |
157 | 1.34k | } |
158 | | |
159 | | ErrorOr<void> UTF16BEEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>) |
160 | 0 | { |
161 | 0 | auto utf16 = TRY(utf8_to_utf16(input)); |
162 | 0 | for (auto utf16_codepoint : utf16) { |
163 | 0 | u8 high_byte = static_cast<u8>((utf16_codepoint >> 8) & 0xFF); |
164 | 0 | u8 low_byte = static_cast<u8>(utf16_codepoint & 0xFF); |
165 | 0 | TRY(on_byte(high_byte)); |
166 | 0 | TRY(on_byte(low_byte)); |
167 | 0 | } |
168 | 0 | return {}; |
169 | 0 | } |
170 | | |
171 | | ErrorOr<void> UTF16LEEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)>) |
172 | 0 | { |
173 | 0 | auto utf16 = TRY(utf8_to_utf16(input)); |
174 | 0 | for (auto utf16_codepoint : utf16) { |
175 | 0 | u8 high_byte = static_cast<u8>((utf16_codepoint >> 8) & 0xFF); |
176 | 0 | u8 low_byte = static_cast<u8>(utf16_codepoint & 0xFF); |
177 | 0 | TRY(on_byte(low_byte)); |
178 | 0 | TRY(on_byte(high_byte)); |
179 | 0 | } |
180 | 0 | return {}; |
181 | 0 | } |
182 | | |
183 | | ErrorOr<void> Latin1Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
184 | 0 | { |
185 | 0 | for (auto item : input) { |
186 | | // Latin1 is the same as the first 256 Unicode code_points. |
187 | 0 | if (item <= 255) |
188 | 0 | TRY(on_byte(static_cast<u8>(item))); |
189 | 0 | else |
190 | 0 | TRY(on_error(item)); |
191 | 0 | } |
192 | |
|
193 | 0 | return {}; |
194 | 0 | } |
195 | | |
196 | | // https://encoding.spec.whatwg.org/#euc-jp-encoder |
197 | | ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
198 | 0 | { |
199 | 0 | for (auto item : input) { |
200 | | // 1. If code point is end-of-queue, return finished. |
201 | | |
202 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
203 | 0 | if (is_ascii(item)) { |
204 | 0 | TRY(on_byte(static_cast<u8>(item))); |
205 | 0 | continue; |
206 | 0 | } |
207 | | |
208 | | // 3. If code point is U+00A5, return byte 0x5C. |
209 | 0 | if (item == 0x00A5) { |
210 | 0 | TRY(on_byte(static_cast<u8>(0x5C))); |
211 | 0 | continue; |
212 | 0 | } |
213 | | |
214 | | // 4. If code point is U+203E, return byte 0x7E. |
215 | 0 | if (item == 0x203E) { |
216 | 0 | TRY(on_byte(static_cast<u8>(0x7E))); |
217 | 0 | continue; |
218 | 0 | } |
219 | | |
220 | | // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes whose values are 0x8E and code point − 0xFF61 + 0xA1. |
221 | 0 | if (item >= 0xFF61 && item <= 0xFF9F) { |
222 | 0 | TRY(on_byte(0x8E)); |
223 | 0 | TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1))); |
224 | 0 | continue; |
225 | 0 | } |
226 | | |
227 | | // 6. If code point is U+2212, set it to U+FF0D. |
228 | 0 | if (item == 0x2212) |
229 | 0 | item = 0xFF0D; |
230 | | |
231 | | // 7. Let pointer be the index pointer for code point in index jis0208. |
232 | 0 | auto pointer = code_point_jis0208_index(item); |
233 | | |
234 | | // 8. If pointer is null, return error with code point. |
235 | 0 | if (!pointer.has_value()) { |
236 | 0 | TRY(on_error(item)); |
237 | 0 | continue; |
238 | 0 | } |
239 | | |
240 | | // 9. Let lead be pointer / 94 + 0xA1. |
241 | 0 | auto lead = *pointer / 94 + 0xA1; |
242 | | |
243 | | // 10. Let trail be pointer % 94 + 0xA1. |
244 | 0 | auto trail = *pointer % 94 + 0xA1; |
245 | | |
246 | | // 11. Return two bytes whose values are lead and trail. |
247 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
248 | 0 | TRY(on_byte(static_cast<u8>(trail))); |
249 | 0 | } |
250 | | |
251 | 0 | return {}; |
252 | 0 | } |
253 | | |
254 | | // https://encoding.spec.whatwg.org/#iso-2022-jp-encoder |
255 | | ErrorOr<ISO2022JPEncoder::State> ISO2022JPEncoder::process_item(u32 item, State state, Function<ErrorOr<void>(u8)>& on_byte, Function<ErrorOr<void>(u32)>& on_error) |
256 | 0 | { |
257 | | // 3. If ISO-2022-JP encoder state is ASCII or Roman, and code point is U+000E, U+000F, or U+001B, return error with U+FFFD. |
258 | 0 | if (state == State::ASCII || state == State::Roman) { |
259 | 0 | if (item == 0x000E || item == 0x000F || item == 0x001B) { |
260 | 0 | TRY(on_error(0xFFFD)); |
261 | 0 | return state; |
262 | 0 | } |
263 | 0 | } |
264 | | |
265 | | // 4. If ISO-2022-JP encoder state is ASCII and code point is an ASCII code point, return a byte whose value is code point. |
266 | 0 | if (state == State::ASCII && is_ascii(item)) { |
267 | 0 | TRY(on_byte(static_cast<u8>(item))); |
268 | 0 | return state; |
269 | 0 | } |
270 | | |
271 | | // 5. If ISO-2022-JP encoder state is Roman and code point is an ASCII code point, excluding U+005C and U+007E, or is U+00A5 or U+203E, then: |
272 | 0 | if (state == State::Roman && ((is_ascii(item) && item != 0x005C && item != 0x007E) || (item == 0x00A5 || item == 0x203E))) { |
273 | | // 1. If code point is an ASCII code point, return a byte whose value is code point. |
274 | 0 | if (is_ascii(item)) { |
275 | 0 | TRY(on_byte(static_cast<u8>(item))); |
276 | 0 | return state; |
277 | 0 | } |
278 | | |
279 | | // 2. If code point is U+00A5, return byte 0x5C. |
280 | 0 | if (item == 0x00A5) { |
281 | 0 | TRY(on_byte(0x5C)); |
282 | 0 | return state; |
283 | 0 | } |
284 | | |
285 | | // 3. If code point is U+203E, return byte 0x7E. |
286 | 0 | if (item == 0x203E) { |
287 | 0 | TRY(on_byte(0x7E)); |
288 | 0 | return state; |
289 | 0 | } |
290 | 0 | } |
291 | | |
292 | | // 6. If code point is an ASCII code point, and ISO-2022-JP encoder state is not ASCII, restore code point to ioQueue, set |
293 | | // ISO-2022-JP encoder state to ASCII, and return three bytes 0x1B 0x28 0x42. |
294 | 0 | if (is_ascii(item) && state != State::ASCII) { |
295 | 0 | TRY(on_byte(0x1B)); |
296 | 0 | TRY(on_byte(0x28)); |
297 | 0 | TRY(on_byte(0x42)); |
298 | 0 | return process_item(item, State::ASCII, on_byte, on_error); |
299 | 0 | } |
300 | | |
301 | | // 7. If code point is either U+00A5 or U+203E, and ISO-2022-JP encoder state is not Roman, restore code point to ioQueue, |
302 | | // set ISO-2022-JP encoder state to Roman, and return three bytes 0x1B 0x28 0x4A. |
303 | 0 | if ((item == 0x00A5 || item == 0x203E) && state != State::Roman) { |
304 | 0 | TRY(on_byte(0x1B)); |
305 | 0 | TRY(on_byte(0x28)); |
306 | 0 | TRY(on_byte(0x4A)); |
307 | 0 | return process_item(item, State::Roman, on_byte, on_error); |
308 | 0 | } |
309 | | |
310 | | // 8. If code point is U+2212, set it to U+FF0D. |
311 | 0 | if (item == 0x2212) |
312 | 0 | item = 0xFF0D; |
313 | | |
314 | | // 9. If code point is in the range U+FF61 to U+FF9F, inclusive, set it to the index code point for code point − 0xFF61 |
315 | | // in index ISO-2022-JP katakana. |
316 | 0 | if (item >= 0xFF61 && item <= 0xFF9F) { |
317 | 0 | item = *index_iso_2022_jp_katakana_code_point(item - 0xFF61); |
318 | 0 | } |
319 | | |
320 | | // 10. Let pointer be the index pointer for code point in index jis0208. |
321 | 0 | auto pointer = code_point_jis0208_index(item); |
322 | | |
323 | | // 11. If pointer is null, then: |
324 | 0 | if (!pointer.has_value()) { |
325 | | // 1. If ISO-2022-JP encoder state is jis0208, then restore code point to ioQueue, set ISO-2022-JP encoder state to |
326 | | // ASCII, and return three bytes 0x1B 0x28 0x42. |
327 | 0 | if (state == State::jis0208) { |
328 | 0 | TRY(on_byte(0x1B)); |
329 | 0 | TRY(on_byte(0x28)); |
330 | 0 | TRY(on_byte(0x4A)); |
331 | 0 | return process_item(item, State::ASCII, on_byte, on_error); |
332 | 0 | } |
333 | | |
334 | | // 2. Return error with code point. |
335 | 0 | TRY(on_error(item)); |
336 | 0 | return state; |
337 | 0 | } |
338 | | |
339 | | // 12. If ISO-2022-JP encoder state is not jis0208, restore code point to ioQueue, set ISO-2022-JP encoder state to |
340 | | // jis0208, and return three bytes 0x1B 0x24 0x42. |
341 | 0 | if (state != State::jis0208) { |
342 | 0 | TRY(on_byte(0x1B)); |
343 | 0 | TRY(on_byte(0x24)); |
344 | 0 | TRY(on_byte(0x42)); |
345 | 0 | return process_item(item, State::jis0208, on_byte, on_error); |
346 | 0 | } |
347 | | |
348 | | // 13. Let lead be pointer / 94 + 0x21. |
349 | 0 | auto lead = *pointer / 94 + 0x21; |
350 | | |
351 | | // 14. Let trail be pointer % 94 + 0x21. |
352 | 0 | auto trail = *pointer % 94 + 0x21; |
353 | | |
354 | | // 15. Return two bytes whose values are lead and trail. |
355 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
356 | 0 | TRY(on_byte(static_cast<u8>(trail))); |
357 | 0 | return state; |
358 | 0 | } |
359 | | |
360 | | // https://encoding.spec.whatwg.org/#iso-2022-jp-encoder |
361 | | ErrorOr<void> ISO2022JPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
362 | 0 | { |
363 | | // ISO-2022-JP’s encoder has an associated ISO-2022-JP encoder state which is ASCII, Roman, or jis0208 (initially ASCII). |
364 | 0 | auto state = State::ASCII; |
365 | |
|
366 | 0 | for (u32 item : input) { |
367 | 0 | state = TRY(process_item(item, state, on_byte, on_error)); |
368 | 0 | } |
369 | | |
370 | | // 1. If code point is end-of-queue and ISO-2022-JP encoder state is not ASCII, set ISO-2022-JP |
371 | | // encoder state to ASCII, and return three bytes 0x1B 0x28 0x42. |
372 | 0 | if (state != State::ASCII) { |
373 | 0 | state = State::ASCII; |
374 | 0 | TRY(on_byte(0x1B)); |
375 | 0 | TRY(on_byte(0x28)); |
376 | 0 | TRY(on_byte(0x42)); |
377 | 0 | return {}; |
378 | 0 | } |
379 | | |
380 | | // 2. If code point is end-of-queue and ISO-2022-JP encoder state is ASCII, return finished. |
381 | 0 | return {}; |
382 | 0 | } |
383 | | |
384 | | static Optional<u32> code_point_jis0208_index_skipping_range(u32 code_point, u32 skip_from, u32 skip_to) |
385 | 0 | { |
386 | 0 | VERIFY(skip_to >= skip_from); |
387 | 0 | for (u32 i = 0; i < s_jis0208_index.size(); ++i) { |
388 | 0 | if (i >= skip_from && i <= skip_to) |
389 | 0 | continue; |
390 | 0 | if (s_jis0208_index[i] == code_point) |
391 | 0 | return i; |
392 | 0 | } |
393 | 0 | return {}; |
394 | 0 | } |
395 | | |
396 | | // https://encoding.spec.whatwg.org/#index-shift_jis-pointer |
397 | | static Optional<u32> index_shift_jis_pointer(u32 code_point) |
398 | 0 | { |
399 | | // 1. Let index be index jis0208 excluding all entries whose pointer is in the range 8272 to 8835, inclusive. |
400 | 0 | auto pointer = code_point_jis0208_index_skipping_range(code_point, 8272, 8835); |
401 | 0 | if (!pointer.has_value()) |
402 | 0 | return {}; |
403 | | |
404 | | // 2. Return the index pointer for code point in index. |
405 | 0 | return *pointer; |
406 | 0 | } |
407 | | |
408 | | // https://encoding.spec.whatwg.org/#shift_jis-encoder |
409 | | ErrorOr<void> ShiftJISEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
410 | 0 | { |
411 | 0 | for (u32 item : input) { |
412 | | // 1. If code point is end-of-queue, return finished. |
413 | | |
414 | | // 2. If code point is an ASCII code point or U+0080, return a byte whose value is code point. |
415 | 0 | if (is_ascii(item) || item == 0x0080) { |
416 | 0 | TRY(on_byte(static_cast<u8>(item))); |
417 | 0 | continue; |
418 | 0 | } |
419 | | |
420 | | // 3. If code point is U+00A5, return byte 0x5C. |
421 | 0 | if (item == 0x00A5) { |
422 | 0 | TRY(on_byte(0x5C)); |
423 | 0 | continue; |
424 | 0 | } |
425 | | |
426 | | // 4. If code point is U+203E, return byte 0x7E. |
427 | 0 | if (item == 0x203E) { |
428 | 0 | TRY(on_byte(0x7E)); |
429 | 0 | continue; |
430 | 0 | } |
431 | | |
432 | | // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return a byte whose value is code point − 0xFF61 + 0xA1. |
433 | 0 | if (item >= 0xFF61 && item <= 0xFF9F) { |
434 | 0 | TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1))); |
435 | 0 | continue; |
436 | 0 | } |
437 | | |
438 | | // 6. If code point is U+2212, set it to U+FF0D. |
439 | 0 | if (item == 0x2212) |
440 | 0 | item = 0xFF0D; |
441 | | |
442 | | // 7. Let pointer be the index Shift_JIS pointer for code point. |
443 | 0 | auto pointer = index_shift_jis_pointer(item); |
444 | | |
445 | | // 8. If pointer is null, return error with code point. |
446 | 0 | if (!pointer.has_value()) { |
447 | 0 | TRY(on_error(item)); |
448 | 0 | continue; |
449 | 0 | } |
450 | | |
451 | | // 9. Let lead be pointer / 188. |
452 | 0 | auto lead = *pointer / 188; |
453 | | |
454 | | // 10. Let lead offset be 0x81 if lead is less than 0x1F, otherwise 0xC1. |
455 | 0 | auto lead_offset = 0xC1; |
456 | 0 | if (lead < 0x1F) |
457 | 0 | lead_offset = 0x81; |
458 | | |
459 | | // 11. Let trail be pointer % 188. |
460 | 0 | auto trail = *pointer % 188; |
461 | | |
462 | | // 12. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41. |
463 | 0 | auto offset = 0x41; |
464 | 0 | if (trail < 0x3F) |
465 | 0 | offset = 0x40; |
466 | | |
467 | | // 13. Return two bytes whose values are lead + lead offset and trail + offset. |
468 | 0 | TRY(on_byte(static_cast<u8>(lead + lead_offset))); |
469 | 0 | TRY(on_byte(static_cast<u8>(trail + offset))); |
470 | 0 | } |
471 | | |
472 | 0 | return {}; |
473 | 0 | } |
474 | | |
475 | | // https://encoding.spec.whatwg.org/#euc-kr-encoder |
476 | | ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
477 | 0 | { |
478 | 0 | for (u32 item : input) { |
479 | | // 1. If code point is end-of-queue, return finished. |
480 | | |
481 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
482 | 0 | if (is_ascii(item)) { |
483 | 0 | TRY(on_byte(static_cast<u8>(item))); |
484 | 0 | continue; |
485 | 0 | } |
486 | | |
487 | | // 3. Let pointer be the index pointer for code point in index EUC-KR. |
488 | 0 | auto pointer = code_point_euc_kr_index(item); |
489 | | |
490 | | // 4. If pointer is null, return error with code point. |
491 | 0 | if (!pointer.has_value()) { |
492 | 0 | TRY(on_error(item)); |
493 | 0 | continue; |
494 | 0 | } |
495 | | |
496 | | // 5. Let lead be pointer / 190 + 0x81. |
497 | 0 | auto lead = *pointer / 190 + 0x81; |
498 | | |
499 | | // 6. Let trail be pointer % 190 + 0x41. |
500 | 0 | auto trail = *pointer % 190 + 0x41; |
501 | | |
502 | | // 7. Return two bytes whose values are lead and trail. |
503 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
504 | 0 | TRY(on_byte(static_cast<u8>(trail))); |
505 | 0 | } |
506 | | |
507 | 0 | return {}; |
508 | 0 | } |
509 | | |
510 | | // https://encoding.spec.whatwg.org/#index-big5-pointer |
511 | | static Optional<u32> index_big5_pointer(u32 code_point) |
512 | 0 | { |
513 | | // 1. Let index be index Big5 excluding all entries whose pointer is less than (0xA1 - 0x81) × 157. |
514 | 0 | auto start_index = (0xA1 - 0x81) * 157 - s_big5_index_first_pointer; |
515 | | |
516 | | // 2. If code point is U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345, return the last pointer |
517 | | // corresponding to code point in index. |
518 | 0 | if (Array<u32, 6> { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 }.contains_slow(code_point)) { |
519 | 0 | for (u32 i = s_big5_index.size() - 1; i >= start_index; --i) { |
520 | 0 | if (s_big5_index[i] == code_point) { |
521 | 0 | return s_big5_index_first_pointer + i; |
522 | 0 | } |
523 | 0 | } |
524 | 0 | return {}; |
525 | 0 | } |
526 | | |
527 | | // 3. Return the index pointer for code point in index. |
528 | 0 | for (u32 i = start_index; i < s_big5_index.size(); ++i) { |
529 | 0 | if (s_big5_index[i] == code_point) { |
530 | 0 | return s_big5_index_first_pointer + i; |
531 | 0 | } |
532 | 0 | } |
533 | 0 | return {}; |
534 | 0 | } |
535 | | |
536 | | // https://encoding.spec.whatwg.org/#big5-encoder |
537 | | ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
538 | 0 | { |
539 | 0 | for (u32 item : input) { |
540 | | // 1. If code point is end-of-queue, return finished. |
541 | | |
542 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
543 | 0 | if (is_ascii(item)) { |
544 | 0 | TRY(on_byte(static_cast<u8>(item))); |
545 | 0 | continue; |
546 | 0 | } |
547 | | |
548 | | // 3. Let pointer be the index Big5 pointer for code point. |
549 | 0 | auto pointer = index_big5_pointer(item); |
550 | | |
551 | | // 4. If pointer is null, return error with code point. |
552 | 0 | if (!pointer.has_value()) { |
553 | 0 | TRY(on_error(item)); |
554 | 0 | continue; |
555 | 0 | } |
556 | | |
557 | | // 5. Let lead be pointer / 157 + 0x81. |
558 | 0 | auto lead = *pointer / 157 + 0x81; |
559 | | |
560 | | // 6. Let trail be pointer % 157. |
561 | 0 | auto trail = *pointer % 157; |
562 | | |
563 | | // 7. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x62. |
564 | 0 | auto offset = 0x62; |
565 | 0 | if (trail < 0x3f) |
566 | 0 | offset = 0x40; |
567 | | |
568 | | // 8. Return two bytes whose values are lead and trail + offset. |
569 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
570 | 0 | TRY(on_byte(static_cast<u8>(trail + offset))); |
571 | 0 | } |
572 | | |
573 | 0 | return {}; |
574 | 0 | } |
575 | | |
576 | | // https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer |
577 | | static u32 index_gb18030_ranges_pointer(u32 code_point) |
578 | 0 | { |
579 | | // 1. If code point is U+E7C7, return pointer 7457. |
580 | 0 | if (code_point == 0xe7c7) |
581 | 0 | return 7457; |
582 | | |
583 | | // 2. Let offset be the last code point in index gb18030 ranges that is less than |
584 | | // or equal to code point and let pointer offset be its corresponding pointer. |
585 | 0 | size_t last_index; |
586 | 0 | binary_search(s_gb18030_ranges, code_point, &last_index, [](auto const code_point, auto const& entry) { |
587 | 0 | return code_point - entry.code_point; |
588 | 0 | }); |
589 | 0 | auto offset = s_gb18030_ranges[last_index].code_point; |
590 | 0 | auto pointer_offset = s_gb18030_ranges[last_index].pointer; |
591 | | |
592 | | // 3. Return a pointer whose value is pointer offset + code point − offset. |
593 | 0 | return pointer_offset + code_point - offset; |
594 | 0 | } |
595 | | |
596 | | GB18030Encoder::GB18030Encoder(IsGBK is_gbk) |
597 | 124 | : m_is_gbk(is_gbk) |
598 | 124 | { |
599 | 124 | } |
600 | | |
601 | | // https://encoding.spec.whatwg.org/#gb18030-encoder |
602 | | ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
603 | 0 | { |
604 | 0 | bool gbk = (m_is_gbk == IsGBK::Yes); |
605 | |
|
606 | 0 | for (u32 item : input) { |
607 | | // 1. If code point is end-of-queue, return finished. |
608 | | |
609 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
610 | 0 | if (is_ascii(item)) { |
611 | 0 | TRY(on_byte(static_cast<u8>(item))); |
612 | 0 | continue; |
613 | 0 | } |
614 | | |
615 | | // 3. If code point is U+E5E5, return error with code point. |
616 | 0 | if (item == 0xE5E5) { |
617 | 0 | TRY(on_error(item)); |
618 | 0 | continue; |
619 | 0 | } |
620 | | |
621 | | // 4. If is GBK is true and code point is U+20AC, return byte 0x80. |
622 | 0 | if (gbk && item == 0x20AC) { |
623 | 0 | TRY(on_byte(0x80)); |
624 | 0 | continue; |
625 | 0 | } |
626 | | |
627 | | // 5. Let pointer be the index pointer for code point in index gb18030. |
628 | 0 | auto pointer = code_point_gb18030_index(item); |
629 | | |
630 | | // 6. If pointer is non-null, then: |
631 | 0 | if (pointer.has_value()) { |
632 | | // 1. Let lead be pointer / 190 + 0x81. |
633 | 0 | auto lead = *pointer / 190 + 0x81; |
634 | | |
635 | | // 2. Let trail be pointer % 190. |
636 | 0 | auto trail = *pointer % 190; |
637 | | |
638 | | // 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41. |
639 | 0 | auto offset = 0x41; |
640 | 0 | if (trail < 0x3f) |
641 | 0 | offset = 0x40; |
642 | | |
643 | | // 4. Return two bytes whose values are lead and trail + offset. |
644 | 0 | TRY(on_byte(static_cast<u8>(lead))); |
645 | 0 | TRY(on_byte(static_cast<u8>(trail + offset))); |
646 | 0 | continue; |
647 | 0 | } |
648 | | |
649 | | // 7. If is GBK is true, return error with code point. |
650 | 0 | if (gbk) { |
651 | 0 | TRY(on_error(item)); |
652 | 0 | continue; |
653 | 0 | } |
654 | | |
655 | | // 8. Set pointer to the index gb18030 ranges pointer for code point. |
656 | 0 | pointer = index_gb18030_ranges_pointer(item); |
657 | | |
658 | | // 9. Let byte1 be pointer / (10 × 126 × 10). |
659 | 0 | auto byte1 = *pointer / (10 * 126 * 10); |
660 | | |
661 | | // 10. Set pointer to pointer % (10 × 126 × 10). |
662 | 0 | pointer = *pointer % (10 * 126 * 10); |
663 | | |
664 | | // 11. Let byte2 be pointer / (10 × 126). |
665 | 0 | auto byte2 = *pointer / (10 * 126); |
666 | | |
667 | | // 12. Set pointer to pointer % (10 × 126). |
668 | 0 | pointer = *pointer % (10 * 126); |
669 | | |
670 | | // 13. Let byte3 be pointer / 10. |
671 | 0 | auto byte3 = *pointer / 10; |
672 | | |
673 | | // 14. Let byte4 be pointer % 10. |
674 | 0 | auto byte4 = *pointer % 10; |
675 | | |
676 | | // 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30. |
677 | 0 | TRY(on_byte(static_cast<u8>(byte1 + 0x81))); |
678 | 0 | TRY(on_byte(static_cast<u8>(byte2 + 0x30))); |
679 | 0 | TRY(on_byte(static_cast<u8>(byte3 + 0x81))); |
680 | 0 | TRY(on_byte(static_cast<u8>(byte4 + 0x30))); |
681 | 0 | } |
682 | | |
683 | 0 | return {}; |
684 | 0 | } |
685 | | |
686 | | // https://encoding.spec.whatwg.org/#single-byte-encoder |
687 | | template<Integral ArrayType> |
688 | | ErrorOr<void> SingleByteEncoder<ArrayType>::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) |
689 | 0 | { |
690 | 0 | for (u32 const code_point : input) { |
691 | 0 | if (code_point < 0x80) { |
692 | | // 2. If code point is an ASCII code point, return a byte whose value is code point. |
693 | 0 | TRY(on_byte(static_cast<u8>(code_point))); |
694 | 0 | } else { |
695 | 0 | Optional<u8> pointer = {}; |
696 | 0 | for (u8 i = 0; i < m_translation_table.size(); i++) { |
697 | 0 | if (m_translation_table[i] == code_point) { |
698 | | // 3. Let pointer be the index pointer for code point in index single-byte. |
699 | 0 | pointer = i; |
700 | 0 | break; |
701 | 0 | } |
702 | 0 | } |
703 | 0 | if (pointer.has_value()) { |
704 | | // 5. Return a byte whose value is pointer + 0x80. |
705 | 0 | TRY(on_byte(pointer.value() + 0x80)); |
706 | 0 | } else { |
707 | | // 4. If pointer is null, return error with code point. |
708 | 0 | TRY(on_error(code_point)); |
709 | 0 | } |
710 | 0 | } |
711 | 0 | } |
712 | | // 1. If code point is end-of-queue, return finished. |
713 | 0 | return {}; |
714 | 0 | } |
715 | | |
716 | | } |