/src/wireshark/epan/charsets.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* charsets.c |
2 | | * Routines for handling character sets |
3 | | * |
4 | | * Wireshark - Network traffic analyzer |
5 | | * By Gerald Combs <gerald@wireshark.org> |
6 | | * Copyright 1998 Gerald Combs |
7 | | * |
8 | | * SPDX-License-Identifier: GPL-2.0-or-later |
9 | | */ |
10 | | |
11 | | #include "config.h" |
12 | | |
13 | | #include <errno.h> |
14 | | #include <glib.h> |
15 | | |
16 | | #include <epan/proto.h> |
17 | | #include <epan/wmem_scopes.h> |
18 | | |
19 | | #include <wsutil/pint.h> |
20 | | #include <wsutil/unicode-utils.h> |
21 | | |
22 | | #include "charsets.h" |
23 | | |
/*
 * 6-character abbreviation for "Unicode REPLACEMENT CHARACTER", so it
 * takes up the same amount of space as the 6-character hex values for
 * Basic Multilingual Plane code points in the tables below.
 *
 * NOTE(review): UNICODE_REPLACEMENT_CHARACTER is not defined in this
 * file; presumably it comes from one of the included headers - confirm.
 */
#define UNREPL UNICODE_REPLACEMENT_CHARACTER

/*
 * ZERO WIDTH NON-BREAKING SPACE (code point 0xFEFF), also known
 * informally as BOM (byte order mark).
 */
#define BYTE_ORDER_MARK 0xFEFF
33 | | |
34 | | /* |
35 | | * Wikipedia's "Character encoding" template, giving a pile of character |
36 | | * encodings and Wikipedia pages for them: |
37 | | * |
38 | | * http://en.wikipedia.org/wiki/Template:Character_encoding |
39 | | * |
40 | | * Unicode character encoding model: |
41 | | * |
42 | | * https://www.unicode.org/reports/tr17/ |
43 | | * |
44 | | * International Components for Unicode character set mapping tables: |
45 | | * |
46 | | * http://site.icu-project.org/charts/charset |
47 | | * |
48 | | * MSDN information on code pages: |
49 | | * |
50 | | * https://docs.microsoft.com/en-us/windows/win32/intl/code-pages |
51 | | * |
52 | | * ASCII-based code pages, from IBM: |
53 | | * |
54 | | * http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html |
55 | | * |
56 | | * EBCDIC code pages, from IBM: |
57 | | * |
58 | | * http://www-03.ibm.com/systems/i/software/globalization/codepages.html |
59 | | * |
60 | | * The IBM pages are no longer available; the versions archived on the |
61 | | * Wayback Machine are, but the links to the PDF and text versions of |
62 | | * the code pages don't all work (do *any* work?). |
63 | | * |
64 | | * Mappings to Unicode at the Unicode Consortium: |
65 | | * |
66 | | * https://www.unicode.org/Public/MAPPINGS/ |
67 | | * |
68 | | * Of note, the VENDORS/MICSFT directory not only has various Windows |
69 | | * and DOS code pages, but also several of the common MAC and EBCDIC |
70 | | * code page mappings to Unicode. |
71 | | */ |
72 | | |
73 | | /* |
74 | | * Given a wmem scope, a pointer, and a length, treat the string of bytes |
75 | | * referred to by the pointer and length as an ASCII string, with all bytes |
76 | | * with the high-order bit set being invalid, and return a pointer to a |
77 | | * UTF-8 string, allocated using the wmem scope. |
78 | | * |
79 | | * Octets with the highest bit set will be converted to the Unicode |
80 | | * REPLACEMENT CHARACTER. |
81 | | */ |
82 | | uint8_t * |
83 | | get_ascii_string(wmem_allocator_t *scope, const uint8_t *ptr, int length) |
84 | 604k | { |
85 | 604k | wmem_strbuf_t *str; |
86 | 604k | const uint8_t *prev = ptr; |
87 | 604k | size_t valid_bytes = 0; |
88 | | |
89 | 604k | str = wmem_strbuf_new_sized(scope, length+1); |
90 | | |
91 | 16.2M | while (length > 0) { |
92 | 15.6M | uint8_t ch = *ptr++; |
93 | | |
94 | 15.6M | if (ch < 0x80) { |
95 | 13.0M | valid_bytes++; |
96 | 13.0M | } else { |
97 | 2.64M | if (valid_bytes) { |
98 | 992k | wmem_strbuf_append_len(str, prev, valid_bytes); |
99 | 992k | valid_bytes = 0; |
100 | 992k | } |
101 | 2.64M | prev = ptr; |
102 | 2.64M | wmem_strbuf_append_unichar_repl(str); |
103 | 2.64M | } |
104 | 15.6M | length--; |
105 | 15.6M | } |
106 | 604k | if (valid_bytes) { |
107 | 297k | wmem_strbuf_append_len(str, prev, valid_bytes); |
108 | 297k | } |
109 | | |
110 | 604k | return (uint8_t *) wmem_strbuf_finalize(str); |
111 | 604k | } |
112 | | |
113 | | uint8_t * |
114 | | get_utf_8_string(wmem_allocator_t *scope, const uint8_t *ptr, int length) |
115 | 36.9k | { |
116 | 36.9k | return ws_utf8_make_valid(scope, ptr, length); |
117 | 36.9k | } |
118 | | |
119 | | /* |
120 | | * ISO 646 "Basic code table". |
121 | | */ |
122 | | const gunichar2 charset_table_iso_646_basic[0x80] = { |
123 | | 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, /* 0x00 - */ |
124 | | 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* - 0x0F */ |
125 | | 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, /* 0x10 - */ |
126 | | 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* - 0x1F */ |
127 | | 0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027, /* 0x20 - */ |
128 | | 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* - 0x2F */ |
129 | | 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x30 - */ |
130 | | 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* - 0x3F */ |
131 | | UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x40 - */ |
132 | | 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* - 0x4F */ |
133 | | 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x50 - */ |
134 | | 0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f, /* - 0x5F */ |
135 | | UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x60 - */ |
136 | | 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* - 0x6F */ |
137 | | 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x70 - */ |
138 | | 0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f, /* - 0x7F */ |
139 | | }; |
140 | | |
141 | | /* |
142 | | * Given a wmem scope, a pointer, a length, and a translation table, |
143 | | * treat the string of bytes referred to by the pointer and length as a |
144 | | * string encoded using one octet per character, with octets with the |
145 | | * high-order bit clear being mapped by the translation table to 2-byte |
146 | | * Unicode Basic Multilingual Plane characters (including REPLACEMENT |
147 | | * CHARACTER) and octets with the high-order bit set being mapped to |
148 | | * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string, |
149 | | * allocated using the wmem scope. |
150 | | */ |
151 | | uint8_t * |
152 | | get_iso_646_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80]) |
153 | 18 | { |
154 | 18 | wmem_strbuf_t *str; |
155 | | |
156 | 18 | str = wmem_strbuf_new_sized(scope, length+1); |
157 | | |
158 | 306 | while (length > 0) { |
159 | 288 | uint8_t ch = *ptr; |
160 | | |
161 | 288 | if (ch < 0x80) |
162 | 138 | wmem_strbuf_append_unichar(str, table[ch]); |
163 | 150 | else |
164 | 150 | wmem_strbuf_append_unichar_repl(str); |
165 | 288 | ptr++; |
166 | 288 | length--; |
167 | 288 | } |
168 | | |
169 | 18 | return (uint8_t *) wmem_strbuf_finalize(str); |
170 | 18 | } |
171 | | |
172 | | /* |
173 | | * Given a wmem scope, a pointer, and a length, treat the string of bytes |
174 | | * referred to by the pointer and length as an ISO 8859/1 string, and |
175 | | * return a pointer to a UTF-8 string, allocated using the wmem scope. |
176 | | */ |
177 | | uint8_t * |
178 | | get_8859_1_string(wmem_allocator_t *scope, const uint8_t *ptr, int length) |
179 | 786 | { |
180 | 786 | wmem_strbuf_t *str; |
181 | | |
182 | 786 | str = wmem_strbuf_new_sized(scope, length+1); |
183 | | |
184 | 23.4k | while (length > 0) { |
185 | 22.6k | uint8_t ch = *ptr; |
186 | | |
187 | 22.6k | if (ch < 0x80) |
188 | 18.8k | wmem_strbuf_append_c(str, ch); |
189 | 3.81k | else { |
190 | | /* |
191 | | * Note: we assume here that the code points |
192 | | * 0x80-0x9F are used for C1 control characters, |
193 | | * and thus have the same value as the corresponding |
194 | | * Unicode code points. |
195 | | */ |
196 | 3.81k | wmem_strbuf_append_unichar(str, ch); |
197 | 3.81k | } |
198 | 22.6k | ptr++; |
199 | 22.6k | length--; |
200 | 22.6k | } |
201 | | |
202 | 786 | return (uint8_t *) wmem_strbuf_finalize(str); |
203 | 786 | } |
204 | | |
205 | | /* |
206 | | * Translation tables that map the upper 128 code points in single-byte |
207 | | * "extended ASCII" character encodings to Unicode code points in the |
208 | | * Basic Multilingual Plane. |
209 | | */ |
210 | | |
211 | | /* ISO-8859-2 (https://en.wikipedia.org/wiki/ISO/IEC_8859-2#Code_page_layout) */ |
212 | | const gunichar2 charset_table_iso_8859_2[0x80] = { |
213 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
214 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
215 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
216 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
217 | | 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, /* 0xA0 - */ |
218 | | 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, /* - 0xAF */ |
219 | | 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, /* 0xB0 - */ |
220 | | 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, /* - 0xBF */ |
221 | | 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */ |
222 | | 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */ |
223 | | 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */ |
224 | | 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */ |
225 | | 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */ |
226 | | 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */ |
227 | | 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */ |
228 | | 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 /* - 0xFF */ |
229 | | }; |
230 | | |
231 | | /* generated by ../tools/make_charset_ISO-8859-3 */ |
232 | | const gunichar2 charset_table_iso_8859_3[0x80] = { |
233 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
234 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
235 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
236 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
237 | | 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, UNREPL, 0x0124, 0x00a7, /* 0xA0 - */ |
238 | | 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, UNREPL, 0x017b, /* - 0xAF */ |
239 | | 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, /* 0xB0 - */ |
240 | | 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, UNREPL, 0x017c, /* - 0xBF */ |
241 | | 0x00c0, 0x00c1, 0x00c2, UNREPL, 0x00c4, 0x010a, 0x0108, 0x00c7, /* 0xC0 - */ |
242 | | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */ |
243 | | UNREPL, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, /* 0xD0 - */ |
244 | | 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, /* - 0xDF */ |
245 | | 0x00e0, 0x00e1, 0x00e2, UNREPL, 0x00e4, 0x010b, 0x0109, 0x00e7, /* 0xE0 - */ |
246 | | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */ |
247 | | UNREPL, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, /* 0xF0 - */ |
248 | | 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, /* - 0xFF */ |
249 | | }; |
250 | | |
251 | | /* generated by ../tools/make_charset_ISO-8859-4 */ |
252 | | const gunichar2 charset_table_iso_8859_4[0x80] = { |
253 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
254 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
255 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
256 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
257 | | 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, /* 0xA0 - */ |
258 | | 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, /* - 0xAF */ |
259 | | 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, /* 0xB0 - */ |
260 | | 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, /* - 0xBF */ |
261 | | 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */ |
262 | | 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, /* - 0xCF */ |
263 | | 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */ |
264 | | 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, /* - 0xDF */ |
265 | | 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */ |
266 | | 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, /* - 0xEF */ |
267 | | 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */ |
268 | | 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, /* - 0xFF */ |
269 | | }; |
270 | | |
271 | | /* ISO-8859-5 (https://en.wikipedia.org/wiki/ISO/IEC_8859-5#Code_page_layout) */ |
272 | | const gunichar2 charset_table_iso_8859_5[0x80] = { |
273 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
274 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
275 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
276 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
277 | | 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, /* 0xA0 - */ |
278 | | 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, /* - 0xAF */ |
279 | | 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0xB0 - */ |
280 | | 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0xBF */ |
281 | | 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0xC0 - */ |
282 | | 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0xCF */ |
283 | | 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xD0 - */ |
284 | | 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xDF */ |
285 | | 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xE0 - */ |
286 | | 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xEF */ |
287 | | 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, /* 0xF0 - */ |
288 | | 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f /* - 0xFF */ |
289 | | }; |
290 | | |
291 | | /* generated by ../tools/make_charset_ISO-8859-6 */ |
292 | | const gunichar2 charset_table_iso_8859_6[0x80] = { |
293 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
294 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
295 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
296 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
297 | | 0x00a0, UNREPL, UNREPL, UNREPL, 0x00a4, UNREPL, UNREPL, UNREPL, /* 0xA0 - */ |
298 | | UNREPL, UNREPL, UNREPL, UNREPL, 0x060c, 0x00ad, UNREPL, UNREPL, /* - 0xAF */ |
299 | | UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xB0 - */ |
300 | | UNREPL, UNREPL, UNREPL, 0x061b, UNREPL, UNREPL, UNREPL, 0x061f, /* - 0xBF */ |
301 | | UNREPL, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, /* 0xC0 - */ |
302 | | 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f, /* - 0xCF */ |
303 | | 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, /* 0xD0 - */ |
304 | | 0x0638, 0x0639, 0x063a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xDF */ |
305 | | 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, /* 0xE0 - */ |
306 | | 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, /* - 0xEF */ |
307 | | 0x0650, 0x0651, 0x0652, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xF0 - */ |
308 | | UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */ |
309 | | }; |
310 | | |
311 | | /* generated by ../tools/make_charset_ISO-8859-7 */ |
312 | | const gunichar2 charset_table_iso_8859_7[0x80] = { |
313 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
314 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
315 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
316 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
317 | | 0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, /* 0xA0 - */ |
318 | | 0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, UNREPL, 0x2015, /* - 0xAF */ |
319 | | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, /* 0xB0 - */ |
320 | | 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, /* - 0xBF */ |
321 | | 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, /* 0xC0 - */ |
322 | | 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, /* - 0xCF */ |
323 | | 0x03a0, 0x03a1, UNREPL, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, /* 0xD0 - */ |
324 | | 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, /* - 0xDF */ |
325 | | 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, /* 0xE0 - */ |
326 | | 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, /* - 0xEF */ |
327 | | 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, /* 0xF0 - */ |
328 | | 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, UNREPL, /* - 0xFF */ |
329 | | }; |
330 | | |
331 | | /* generated by ../tools/make_charset_ISO-8859-8 */ |
332 | | const gunichar2 charset_table_iso_8859_8[0x80] = { |
333 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
334 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
335 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
336 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
337 | | 0x00a0, UNREPL, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */ |
338 | | 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */ |
339 | | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ |
340 | | 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, UNREPL, /* - 0xBF */ |
341 | | UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xC0 - */ |
342 | | UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xCF */ |
343 | | UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xD0 - */ |
344 | | UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, 0x2017, /* - 0xDF */ |
345 | | 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7, /* 0xE0 - */ |
346 | | 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df, /* - 0xEF */ |
347 | | 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7, /* 0xF0 - */ |
348 | | 0x05e8, 0x05e9, 0x05ea, UNREPL, UNREPL, 0x200e, 0x200f, UNREPL, /* - 0xFF */ |
349 | | }; |
350 | | |
351 | | /* ISO-8859-9 (https://en.wikipedia.org/wiki/ISO/IEC_8859-9#Code_page_layout) */ |
352 | | const gunichar2 charset_table_iso_8859_9[0x80] = { |
353 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
354 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
355 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
356 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
357 | | 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */ |
358 | | 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */ |
359 | | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ |
360 | | 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* - 0xBF */ |
361 | | 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */ |
362 | | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */ |
363 | | 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */ |
364 | | 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, /* - 0xDF */ |
365 | | 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */ |
366 | | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */ |
367 | | 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */ |
368 | | 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff /* - 0xFF */ |
369 | | }; |
370 | | |
371 | | /* generated by ../tools/make_charset_ISO-8859-10 */ |
372 | | const gunichar2 charset_table_iso_8859_10[0x80] = { |
373 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
374 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
375 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
376 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
377 | | 0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7, /* 0xA0 - */ |
378 | | 0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a, /* - 0xAF */ |
379 | | 0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7, /* 0xB0 - */ |
380 | | 0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b, /* - 0xBF */ |
381 | | 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */ |
382 | | 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */ |
383 | | 0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168, /* 0xD0 - */ |
384 | | 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */ |
385 | | 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */ |
386 | | 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */ |
387 | | 0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169, /* 0xF0 - */ |
388 | | 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138, /* - 0xFF */ |
389 | | }; |
390 | | |
391 | | /* generated by ../tools/make_charset_ISO-8859-11 */ |
392 | | const gunichar2 charset_table_iso_8859_11[0x80] = { |
393 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
394 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
395 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
396 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
397 | | 0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, /* 0xA0 - */ |
398 | | 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, /* - 0xAF */ |
399 | | 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, /* 0xB0 - */ |
400 | | 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, /* - 0xBF */ |
401 | | 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, /* 0xC0 - */ |
402 | | 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, /* - 0xCF */ |
403 | | 0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37, /* 0xD0 - */ |
404 | | 0x0e38, 0x0e39, 0x0e3a, UNREPL, UNREPL, UNREPL, UNREPL, 0x0e3f, /* - 0xDF */ |
405 | | 0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47, /* 0xE0 - */ |
406 | | 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, /* - 0xEF */ |
407 | | 0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57, /* 0xF0 - */ |
408 | | 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */ |
409 | | }; |
410 | | |
411 | | /* generated by ../tools/make_charset_ISO-8859-13 */ |
412 | | const gunichar2 charset_table_iso_8859_13[0x80] = { |
413 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
414 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
415 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
416 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
417 | | 0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7, /* 0xA0 - */ |
418 | | 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6, /* - 0xAF */ |
419 | | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ |
420 | | 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6, /* - 0xBF */ |
421 | | 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112, /* 0xC0 - */ |
422 | | 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b, /* - 0xCF */ |
423 | | 0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */ |
424 | | 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df, /* - 0xDF */ |
425 | | 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113, /* 0xE0 - */ |
426 | | 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c, /* - 0xEF */ |
427 | | 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */ |
428 | | 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019, /* - 0xFF */ |
429 | | }; |
430 | | |
431 | | /* generated by ../tools/make_charset_ISO-8859-14 */ |
432 | | const gunichar2 charset_table_iso_8859_14[0x80] = { |
433 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
434 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
435 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
436 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
437 | | 0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, /* 0xA0 - */ |
438 | | 0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, /* - 0xAF */ |
439 | | 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, /* 0xB0 - */ |
440 | | 0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, /* - 0xBF */ |
441 | | 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */ |
442 | | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */ |
443 | | 0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, /* 0xD0 - */ |
444 | | 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, /* - 0xDF */ |
445 | | 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */ |
446 | | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */ |
447 | | 0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, /* 0xF0 - */ |
448 | | 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, /* - 0xFF */ |
449 | | }; |
450 | | |
451 | | /* generated by ../tools/make_charset_ISO-8859-15 */ |
452 | | const gunichar2 charset_table_iso_8859_15[0x80] = { |
453 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
454 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
455 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
456 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
457 | | 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7, /* 0xA0 - */ |
458 | | 0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */ |
459 | | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ |
460 | | 0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf, /* - 0xBF */ |
461 | | 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */ |
462 | | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */ |
463 | | 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */ |
464 | | 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */ |
465 | | 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */ |
466 | | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */ |
467 | | 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */ |
468 | | 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* - 0xFF */ |
469 | | }; |
470 | | |
471 | | /* generated by ../tools/make_charset_ISO-8859-16 */ |
472 | | const gunichar2 charset_table_iso_8859_16[0x80] = { |
473 | | 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */ |
474 | | 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */ |
475 | | 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */ |
476 | | 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */ |
477 | | 0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7, /* 0xA0 - */ |
478 | | 0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b, /* - 0xAF */ |
479 | | 0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7, /* 0xB0 - */ |
480 | | 0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c, /* - 0xBF */ |
481 | | 0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7, /* 0xC0 - */ |
482 | | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */ |
483 | | 0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a, /* 0xD0 - */ |
484 | | 0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df, /* - 0xDF */ |
485 | | 0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7, /* 0xE0 - */ |
486 | | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */ |
487 | | 0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b, /* 0xF0 - */ |
488 | | 0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff, /* - 0xFF */ |
489 | | }; |
490 | | |
491 | | /* |
492 | | * Windows-1250 |
493 | | * |
494 | | * See: |
495 | | * httpss://en.wikipedia.org/wiki/Windows-1250) |
496 | | * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT |
497 | | */ |
498 | | const gunichar2 charset_table_cp1250[0x80] = { |
499 | | 0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */ |
500 | | UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* - 0x8F */ |
501 | | UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */ |
502 | | UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* - 0x9F */ |
503 | | 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, /* 0xA0 - */ |
504 | | 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* - 0xAF */ |
505 | | 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ |
506 | | 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* - 0xBF */ |
507 | | 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */ |
508 | | 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */ |
509 | | 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */ |
510 | | 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */ |
511 | | 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */ |
512 | | 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */ |
513 | | 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */ |
514 | | 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, /* - 0xFF */ |
515 | | }; |
516 | | |
517 | | /* |
518 | | * Windows-1251 |
519 | | * |
520 | | * See: |
521 | | * https://en.wikipedia.org/wiki/Windows-1251 |
522 | | * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT |
523 | | */ |
524 | | const gunichar2 charset_table_cp1251[0x80] = { |
525 | | 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */ |
526 | | 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040B, 0x040f, /* - 0x8F */ |
527 | | 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */ |
528 | | UNREPL, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f, /* - 0x9F */ |
529 | | 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7, /* 0xA0 - */ |
530 | | 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407, /* - 0xAF */ |
531 | | 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ |
532 | | 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457, /* - 0xBF */ |
533 | | 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0xC0 - */ |
534 | | 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0xCF */ |
535 | | 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0xD0 - */ |
536 | | 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0xDF */ |
537 | | 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xE0 - */ |
538 | | 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xEF */ |
539 | | 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xF0 - */ |
540 | | 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xFF */ |
541 | | }; |
542 | | |
543 | | /* |
544 | | * Windows-1252 |
545 | | * |
546 | | * See: |
547 | | * https://en.wikipedia.org/wiki/Windows-1252 |
548 | | * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT |
549 | | */ |
550 | | const gunichar2 charset_table_cp1252[0x80] = { |
551 | | 0x20ac, UNREPL, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */ |
552 | | 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, UNREPL, 0x0172, UNREPL, /* - 0x8F */ |
553 | | UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */ |
554 | | 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, UNREPL, 0x0173, 0x0178, /* - 0x9F */ |
555 | | 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */ |
556 | | 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */ |
557 | | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ |
558 | | 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* - 0xBF */ |
559 | | 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */ |
560 | | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */ |
561 | | 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */ |
562 | | 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */ |
563 | | 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */ |
564 | | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */ |
565 | | 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */ |
566 | | 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* - 0xFF */ |
567 | | }; |
568 | | |
569 | | /* generated by ./make_charset_table MACROMAN */ |
570 | | /* That's "MacRoman", not "Macro Man" (faster than a speeding recursive expansion!) */ |
571 | | const gunichar2 charset_table_mac_roman[0x80] = { |
572 | | 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, /* 0x80 - */ |
573 | | 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* - 0x8F */ |
574 | | 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, /* 0x90 - */ |
575 | | 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* - 0x9F */ |
576 | | 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, /* 0xA0 - */ |
577 | | 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* - 0xAF */ |
578 | | 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, /* 0xB0 - */ |
579 | | 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, /* - 0xBF */ |
580 | | 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, /* 0xC0 - */ |
581 | | 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* - 0xCF */ |
582 | | 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, /* 0xD0 - */ |
583 | | 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02, /* - 0xDF */ |
584 | | 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, /* 0xE0 - */ |
585 | | 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* - 0xEF */ |
586 | | 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, /* 0xF0 - */ |
587 | | 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, /* - 0xFF */ |
588 | | }; |
589 | | |
590 | | /* generated by ./make_charset_table CP437 */ |
591 | | const gunichar2 charset_table_cp437[0x80] = { |
592 | | 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, /* 0x80 - */ |
593 | | 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, /* - 0x8F */ |
594 | | 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, /* 0x90 - */ |
595 | | 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, /* - 0x9F */ |
596 | | 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, /* 0xA0 - */ |
597 | | 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, /* - 0xAF */ |
598 | | 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, /* 0xB0 - */ |
599 | | 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* - 0xBF */ |
600 | | 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, /* 0xC0 - */ |
601 | | 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* - 0xCF */ |
602 | | 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, /* 0xD0 - */ |
603 | | 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* - 0xDF */ |
604 | | 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, /* 0xE0 - */ |
605 | | 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, /* - 0xEF */ |
606 | | 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, /* 0xF0 - */ |
607 | | 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0, /* - 0xFF */ |
608 | | }; |
609 | | |
610 | | /* |
611 | | * CP855 |
612 | | * |
613 | | * See |
614 | | * https://en.wikipedia.org/wiki/CP855 |
615 | | * https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT |
616 | | * |
617 | | * XXX - this doesn't have the graphics for 0x00 through 0x1F shown |
618 | | * on the Wikipedia page, but not in the Microsoft mapping file; |
619 | | * that would require a 256-code-point mapping table. (Are those |
620 | | * positions used for the same graphics on all code pages - the PC |
621 | | * graphics set, or whatever it's called?) |
622 | | */ |
623 | | const gunichar2 charset_table_cp855[0x80] = { |
624 | | 0x0452, 0x0402, 0x0453, 0x0403, 0x0451, 0x0401, 0x0454, 0x0404, /* 0x80 - */ |
625 | | 0x0455, 0x0405, 0x0456, 0x0406, 0x0457, 0x0407, 0x0458, 0x0408, /* - 0x8F */ |
626 | | 0x0459, 0x0409, 0x045a, 0x040a, 0x045b, 0x040b, 0x045c, 0x040c, /* 0x90 - */ |
627 | | 0x045e, 0x040e, 0x045f, 0x040f, 0x044e, 0x042e, 0x044a, 0x042a, /* - 0x9F */ |
628 | | 0x0430, 0x0410, 0x0431, 0x0411, 0x0446, 0x0426, 0x0434, 0x0414, /* 0xA0 - */ |
629 | | 0x0435, 0x0415, 0x0444, 0x0424, 0x0433, 0x0413, 0x00ab, 0x00bb, /* - 0xAF */ |
630 | | 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0445, 0x0425, 0x0438, /* 0xB0 - */ |
631 | | 0x0418, 0x2563, 0x2551, 0x2557, 0x2550, 0x0439, 0x0419, 0x2510, /* - 0xBF */ |
632 | | 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x043a, 0x041a, /* 0xC0 - */ |
633 | | 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, /* - 0xCF */ |
634 | | 0x043b, 0x041b, 0x043c, 0x041c, 0x043d, 0x041d, 0x043e, 0x041e, /* 0xD0 - */ |
635 | | 0x043f, 0x2518, 0x250c, 0x2588, 0x2584, 0x041f, 0x044f, 0x2580, /* - 0xDF */ |
636 | | 0x042f, 0x0440, 0x0420, 0x0441, 0x0421, 0x0442, 0x0422, 0x0443, /* 0xE0 - */ |
637 | | 0x0423, 0x0436, 0x0416, 0x0432, 0x0412, 0x044c, 0x042c, 0x2116, /* - 0xEF */ |
638 | | 0x00ad, 0x044b, 0x042b, 0x0437, 0x0417, 0x0448, 0x0428, 0x044d, /* 0xF0 - */ |
639 | | 0x042d, 0x0449, 0x0429, 0x0447, 0x0427, 0x00a7, 0x25a0, 0x00a0, /* - 0xFF */ |
640 | | }; |
641 | | |
642 | | /* |
643 | | * CP866 |
644 | | * |
645 | | * See: |
646 | | * https://en.wikipedia.org/wiki/CP866 |
647 | | * https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT |
648 | | */ |
649 | | const gunichar2 charset_table_cp866[0x80] = { |
650 | | 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0x80 - */ |
651 | | 0x0418, 0x0419, 0x041A, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0x8F */ |
652 | | 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0x90 - */ |
653 | | 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0x9F */ |
654 | | 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xA0 - */ |
655 | | 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xAF */ |
656 | | 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, /* 0xB0 - */ |
657 | | 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* - 0xBF */ |
658 | | 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, /* 0xC0 - */ |
659 | | 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* - 0xCF */ |
660 | | 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, /* 0xD0 - */ |
661 | | 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* - 0xDF */ |
662 | | 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xE0 - */ |
663 | | 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xEF */ |
664 | | 0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040e, 0x045e, /* 0xF0 - */ |
665 | | 0x00b0, 0x2219, 0x00b7, 0x221a, 0x2216, 0x00a4, 0x25a0, 0x00a0, /* - 0xFF */ |
666 | | }; |
667 | | |
668 | | /* |
669 | | * Given a wmem scope, a pointer, a length, and a translation table with |
670 | | * 128 entries, treat the string of bytes referred to by the pointer and |
671 | | * length as a string encoded using one octet per character, with octets |
672 | | * with the high-order bit clear being ASCII and octets with the high-order |
673 | | * bit set being mapped by the translation table to 2-byte Unicode Basic |
674 | | * Multilingual Plane characters (including REPLACEMENT CHARACTER), and |
675 | | * return a pointer to a UTF-8 string, allocated using the wmem scope. |
676 | | */ |
677 | | uint8_t * |
678 | | get_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80]) |
679 | 816 | { |
680 | 816 | wmem_strbuf_t *str; |
681 | | |
682 | 816 | str = wmem_strbuf_new_sized(scope, length+1); |
683 | | |
684 | 16.4k | while (length > 0) { |
685 | 15.6k | uint8_t ch = *ptr; |
686 | | |
687 | 15.6k | if (ch < 0x80) |
688 | 10.8k | wmem_strbuf_append_c(str, ch); |
689 | 4.79k | else |
690 | 4.79k | wmem_strbuf_append_unichar(str, table[ch-0x80]); |
691 | 15.6k | ptr++; |
692 | 15.6k | length--; |
693 | 15.6k | } |
694 | | |
695 | 816 | return (uint8_t *) wmem_strbuf_finalize(str); |
696 | 816 | } |
697 | | |
698 | | /* |
699 | | * Given a wmem scope, a pointer, and a length, treat the string of bytes |
700 | | * referred to by the pointer and length as a UCS-2 encoded string |
701 | | * containing characters from the Basic Multilingual Plane (plane 0) of |
702 | | * Unicode, and return a pointer to a UTF-8 string, allocated with the |
703 | | * wmem scope. |
704 | | * |
705 | | * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, |
706 | | * possibly ORed with ENC_BOM. |
707 | | * |
708 | | * Specify length in bytes. |
709 | | */ |
710 | | uint8_t * |
711 | | get_ucs_2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding) |
712 | 1.94k | { |
713 | 1.94k | gunichar2 uchar; |
714 | 1.94k | int i = 0; /* Byte counter for string */ |
715 | 1.94k | wmem_strbuf_t *strbuf; |
716 | | |
717 | 1.94k | strbuf = wmem_strbuf_new_sized(scope, length+1); |
718 | | |
719 | 1.94k | if (encoding & ENC_BOM && length >= 2) { |
720 | 0 | if (pletoh16(ptr) == BYTE_ORDER_MARK) { |
721 | 0 | encoding = ENC_LITTLE_ENDIAN; |
722 | 0 | i += 2; |
723 | 0 | } else if (pntoh16(ptr) == BYTE_ORDER_MARK) { |
724 | 0 | encoding = ENC_BIG_ENDIAN; |
725 | 0 | i += 2; |
726 | 0 | } |
727 | 0 | } |
728 | | |
729 | 1.94k | encoding = encoding & ENC_LITTLE_ENDIAN; |
730 | | |
731 | 52.4k | for(; i + 1 < length; i += 2) { |
732 | 50.5k | if (encoding == ENC_BIG_ENDIAN) { |
733 | 49.6k | uchar = pntoh16(ptr + i); |
734 | 49.6k | } else { |
735 | 882 | uchar = pletoh16(ptr + i); |
736 | 882 | } |
737 | 50.5k | wmem_strbuf_append_unichar_validated(strbuf, uchar); |
738 | 50.5k | } |
739 | | |
740 | | /* |
741 | | * If i < length, this means we were handed an odd number of bytes; |
742 | | * insert a REPLACEMENT CHARACTER to mark the error. |
743 | | */ |
744 | 1.94k | if (i < length) { |
745 | 118 | wmem_strbuf_append_unichar_repl(strbuf); |
746 | 118 | } |
747 | 1.94k | return (uint8_t *) wmem_strbuf_finalize(strbuf); |
748 | 1.94k | } |
749 | | |
750 | | /* |
751 | | * Given a wmem scope, a pointer, and a length, treat the string of bytes |
752 | | * referred to by the pointer and length as a UTF-16 encoded string, and |
753 | | * return a pointer to a UTF-8 string, allocated with the wmem scope. |
754 | | * |
755 | | * See RFC 2781 section 2.2. |
756 | | * |
757 | | * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, |
758 | | * possibly ORed with ENC_BOM. |
759 | | * |
760 | | * Specify length in bytes. |
761 | | */ |
762 | | uint8_t * |
763 | | get_utf_16_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding) |
764 | 8.19k | { |
765 | 8.19k | wmem_strbuf_t *strbuf; |
766 | 8.19k | gunichar2 uchar2, lead_surrogate; |
767 | 8.19k | gunichar uchar; |
768 | 8.19k | int i = 0; /* Byte counter for string */ |
769 | | |
770 | 8.19k | strbuf = wmem_strbuf_new_sized(scope, length+1); |
771 | | |
772 | 8.19k | if (encoding & ENC_BOM && length >= 2) { |
773 | 123 | if (pletoh16(ptr) == BYTE_ORDER_MARK) { |
774 | 0 | encoding = ENC_LITTLE_ENDIAN; |
775 | 0 | i += 2; |
776 | 123 | } else if (pntoh16(ptr) == BYTE_ORDER_MARK) { |
777 | 0 | encoding = ENC_BIG_ENDIAN; |
778 | 0 | i += 2; |
779 | 0 | } |
780 | 123 | } |
781 | | |
782 | 8.19k | encoding = encoding & ENC_LITTLE_ENDIAN; |
783 | | |
784 | 92.5k | for(; i + 1 < length; i += 2) { |
785 | 84.6k | if (encoding == ENC_BIG_ENDIAN) |
786 | 4.20k | uchar2 = pntoh16(ptr + i); |
787 | 80.4k | else |
788 | 80.4k | uchar2 = pletoh16(ptr + i); |
789 | | |
790 | 84.6k | if (IS_LEAD_SURROGATE(uchar2)) { |
791 | | /* |
792 | | * Lead surrogate. Must be followed by |
793 | | * a trail surrogate. |
794 | | */ |
795 | 4.09k | i += 2; |
796 | 4.09k | if (i + 1 >= length) { |
797 | | /* |
798 | | * Oops, string ends with a lead surrogate. |
799 | | * |
800 | | * Insert a REPLACEMENT CHARACTER to mark the error, |
801 | | * and quit. |
802 | | */ |
803 | 238 | wmem_strbuf_append_unichar(strbuf, UNREPL); |
804 | 238 | break; |
805 | 238 | } |
806 | 3.85k | lead_surrogate = uchar2; |
807 | 3.85k | if (encoding == ENC_BIG_ENDIAN) |
808 | 206 | uchar2 = pntoh16(ptr + i); |
809 | 3.65k | else |
810 | 3.65k | uchar2 = pletoh16(ptr + i); |
811 | 3.85k | if (IS_TRAIL_SURROGATE(uchar2)) { |
812 | | /* Trail surrogate. */ |
813 | 699 | uchar = SURROGATE_VALUE(lead_surrogate, uchar2); |
814 | 699 | wmem_strbuf_append_unichar(strbuf, uchar); |
815 | 3.16k | } else { |
816 | | /* |
817 | | * Not a trail surrogate. |
818 | | * |
819 | | * Insert a REPLACEMENT CHARACTER to mark the error, |
820 | | * and continue; |
821 | | */ |
822 | 3.16k | wmem_strbuf_append_unichar(strbuf, UNREPL); |
823 | 3.16k | } |
824 | 80.5k | } else { |
825 | 80.5k | if (IS_TRAIL_SURROGATE(uchar2)) { |
826 | | /* |
827 | | * Trail surrogate without a preceding |
828 | | * lead surrogate. |
829 | | * |
830 | | * Insert a REPLACEMENT CHARACTER to mark the error, |
831 | | * and continue; |
832 | | */ |
833 | 1.19k | wmem_strbuf_append_unichar(strbuf, UNREPL); |
834 | 79.3k | } else { |
835 | | /* |
836 | | * Non-surrogate; just append it. |
837 | | */ |
838 | 79.3k | wmem_strbuf_append_unichar(strbuf, uchar2); |
839 | 79.3k | } |
840 | 80.5k | } |
841 | 84.6k | } |
842 | | |
843 | | /* |
844 | | * If i < length, this means we were handed an odd number of bytes, |
845 | | * so we're not a valid UTF-16 string; insert a REPLACEMENT CHARACTER |
846 | | * to mark the error. |
847 | | */ |
848 | 8.19k | if (i < length) |
849 | 3.37k | wmem_strbuf_append_unichar(strbuf, UNREPL); |
850 | 8.19k | return (uint8_t *) wmem_strbuf_finalize(strbuf); |
851 | 8.19k | } |
852 | | |
853 | | /* |
854 | | * Given a wmem scope, a pointer, and a length, treat the string of bytes |
855 | | * referred to by the pointer and length as a UCS-4 encoded string, and |
856 | | * return a pointer to a UTF-8 string, allocated with the wmem scope. |
857 | | * |
858 | | * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN |
859 | | * |
860 | | * Specify length in bytes |
861 | | */ |
862 | | uint8_t * |
863 | | get_ucs_4_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding) |
864 | 55 | { |
865 | 55 | gunichar uchar; |
866 | 55 | int i = 0; /* Byte counter for string */ |
867 | 55 | wmem_strbuf_t *strbuf; |
868 | | |
869 | 55 | strbuf = wmem_strbuf_new_sized(scope, length+1); |
870 | | |
871 | 55 | if (encoding & ENC_BOM && length >= 4) { |
872 | 0 | if (pletoh32(ptr) == BYTE_ORDER_MARK) { |
873 | 0 | encoding = ENC_LITTLE_ENDIAN; |
874 | 0 | i += 4; |
875 | 0 | } else if (pntoh32(ptr) == BYTE_ORDER_MARK) { |
876 | 0 | encoding = ENC_BIG_ENDIAN; |
877 | 0 | i += 4; |
878 | 0 | } |
879 | 0 | } |
880 | | |
881 | 55 | encoding = encoding & ENC_LITTLE_ENDIAN; |
882 | | |
883 | 1.57k | for(; i + 3 < length; i += 4) { |
884 | 1.51k | if (encoding == ENC_BIG_ENDIAN) |
885 | 1.51k | uchar = pntoh32(ptr + i); |
886 | 0 | else |
887 | 0 | uchar = pletoh32(ptr + i); |
888 | | |
889 | 1.51k | wmem_strbuf_append_unichar_validated(strbuf, uchar); |
890 | 1.51k | } |
891 | | |
892 | | /* |
893 | | * if i < length, this means we were handed a number of bytes |
894 | | * that's not a multiple of 4, so not a valid UCS-4 string. |
895 | | * Insert a REPLACEMENT CHARACTER for the remaining bytes. |
896 | | */ |
897 | 55 | if (i < length) { |
898 | 25 | wmem_strbuf_append_unichar(strbuf, UNREPL); |
899 | 25 | } |
900 | 55 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
901 | 55 | } |
902 | | |
903 | | /* |
904 | | * FROM GNOKII |
905 | | * gsm-encoding.c |
906 | | * gsm-sms.c |
907 | | */ |
908 | | |
909 | | /* ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet */ |
910 | | static const gunichar2 gsm_default_alphabet[0x80] = { |
911 | | '@', 0xa3, '$', 0xa5, 0xe8, 0xe9, 0xf9, 0xec, |
912 | | 0xf2, 0xc7, '\n', 0xd8, 0xf8, '\r', 0xc5, 0xe5, |
913 | | 0x394, '_', 0x3a6, 0x393, 0x39b, 0x3a9, 0x3a0, 0x3a8, |
914 | | 0x3a3, 0x398, 0x39e, 0xa0, 0xc6, 0xe6, 0xdf, 0xc9, |
915 | | ' ', '!', '\"', '#', 0xa4, '%', '&', '\'', |
916 | | '(', ')', '*', '+', ',', '-', '.', '/', |
917 | | '0', '1', '2', '3', '4', '5', '6', '7', |
918 | | '8', '9', ':', ';', '<', '=', '>', '?', |
919 | | 0xa1, 'A', 'B', 'C', 'D', 'E', 'F', 'G', |
920 | | 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', |
921 | | 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', |
922 | | 'X', 'Y', 'Z', 0xc4, 0xd6, 0xd1, 0xdc, 0xa7, |
923 | | 0xbf, 'a', 'b', 'c', 'd', 'e', 'f', 'g', |
924 | | 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', |
925 | | 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', |
926 | | 'x', 'y', 'z', 0xe4, 0xf6, 0xf1, 0xfc, 0xe0 |
927 | | }; |
928 | | |
929 | | static gunichar |
930 | | GSM_to_UNICHAR(uint8_t c) |
931 | 3.35k | { |
932 | 3.35k | if (c < G_N_ELEMENTS(gsm_default_alphabet)) |
933 | 3.35k | return gsm_default_alphabet[c]; |
934 | | |
935 | 0 | return UNREPL; |
936 | 3.35k | } |
937 | | |
938 | | static gunichar |
939 | | GSMext_to_UNICHAR(uint8_t c) |
940 | 5 | { |
941 | 5 | switch (c) |
942 | 5 | { |
943 | 0 | case 0x0a: return 0x0c; /* form feed */ |
944 | 0 | case 0x14: return '^'; |
945 | 0 | case 0x28: return '{'; |
946 | 0 | case 0x29: return '}'; |
947 | 0 | case 0x2f: return '\\'; |
948 | 0 | case 0x3c: return '['; |
949 | 0 | case 0x3d: return '~'; |
950 | 0 | case 0x3e: return ']'; |
951 | 0 | case 0x40: return '|'; |
952 | 0 | case 0x65: return 0x20ac; /* euro */ |
953 | 5 | } |
954 | | |
955 | 5 | return UNREPL; /* invalid character */ |
956 | 5 | } |
957 | | |
/*
 * Mask covering the low-order "bits" bits of a byte.  This expands to an
 * expression that refers to a variable named "bits", which must be in
 * scope wherever the macro is used.
 */
#define GN_BYTE_MASK    ((1 << bits) - 1)

/* Code point that introduces an escape sequence. */
#define GN_CHAR_ESCAPE  0x1b

/* Report whether the given code point is the escape code point. */
static bool
char_is_escape(unsigned char value)
{
    if (value != GN_CHAR_ESCAPE) {
        return false;
    }
    return true;
}
967 | | |
968 | | static bool |
969 | | handle_ts_23_038_char(wmem_strbuf_t *strbuf, uint8_t code_point, |
970 | | bool saw_escape) |
971 | 3.45k | { |
972 | 3.45k | gunichar uchar; |
973 | | |
974 | 3.45k | if (char_is_escape(code_point)) { |
975 | | /* |
976 | | * XXX - if saw_escape is true here, then this is |
977 | | * the case where we escape to "another extension table", |
978 | | * but TS 128 038 V11.0 doesn't specify such an extension |
979 | | * table. |
980 | | */ |
981 | 5 | saw_escape = true; |
982 | 3.45k | } else { |
983 | 3.45k | if (!(code_point & 0x80)) { |
984 | | /* |
985 | | * Code point is valid (7-bit). |
986 | | * Have we seen an escape? |
987 | | */ |
988 | 3.36k | if (saw_escape) { |
989 | 5 | saw_escape = false; |
990 | 5 | uchar = GSMext_to_UNICHAR(code_point); |
991 | 3.35k | } else { |
992 | 3.35k | uchar = GSM_to_UNICHAR(code_point); |
993 | 3.35k | } |
994 | 3.36k | wmem_strbuf_append_unichar(strbuf, uchar); |
995 | 3.36k | } else { |
996 | | /* Invalid - put in a REPLACEMENT CHARACTER */ |
997 | 92 | wmem_strbuf_append_unichar(strbuf, UNREPL); |
998 | 92 | } |
999 | 3.45k | } |
1000 | 3.45k | return saw_escape; |
1001 | 3.45k | } |
1002 | | |
1003 | | uint8_t * |
1004 | | get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const uint8_t *ptr, |
1005 | | const int bit_offset, int no_of_chars) |
1006 | 38 | { |
1007 | 38 | wmem_strbuf_t *strbuf; |
1008 | 38 | int char_count; /* character counter for string */ |
1009 | 38 | uint8_t in_byte, out_byte, rest = 0x00; |
1010 | 38 | const uint8_t *start_ptr = ptr; |
1011 | 38 | bool saw_escape = false; |
1012 | 38 | int bits; |
1013 | | |
1014 | 38 | strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1); |
1015 | | |
1016 | 38 | bits = bit_offset & 0x07; |
1017 | 38 | if (!bits) { |
1018 | 38 | bits = 7; |
1019 | 38 | } |
1020 | | |
1021 | 2.07k | for(char_count = 0; char_count < no_of_chars; ptr++) { |
1022 | | /* Get the next byte from the string. */ |
1023 | 2.03k | in_byte = *ptr; |
1024 | | |
1025 | | /* |
1026 | | * Combine the bits we've accumulated with bits from |
1027 | | * that byte to make a 7-bit code point. |
1028 | | */ |
1029 | 2.03k | out_byte = ((in_byte & GN_BYTE_MASK) << (7 - bits)) | rest; |
1030 | | |
1031 | | /* |
1032 | | * Leftover bits used in that code point. |
1033 | | */ |
1034 | 2.03k | rest = in_byte >> bits; |
1035 | | |
1036 | | /* |
1037 | | * If we don't start from 0th bit, we shouldn't go to the |
1038 | | * next char. Under *out_num we have now 0 and under Rest - |
1039 | | * _first_ part of the char. |
1040 | | */ |
1041 | 2.03k | if ((start_ptr != ptr) || (bits == 7)) { |
1042 | 2.03k | saw_escape = handle_ts_23_038_char(strbuf, out_byte, |
1043 | 2.03k | saw_escape); |
1044 | 2.03k | char_count++; |
1045 | 2.03k | } |
1046 | | |
1047 | | /* |
1048 | | * After reading 7 octets we have read 7 full characters |
1049 | | * but we have 7 bits as well. This is the next character. |
1050 | | */ |
1051 | 2.03k | if ((bits == 1) && (char_count < no_of_chars)) { |
1052 | 271 | saw_escape = handle_ts_23_038_char(strbuf, rest, |
1053 | 271 | saw_escape); |
1054 | 271 | char_count++; |
1055 | 271 | bits = 7; |
1056 | 271 | rest = 0x00; |
1057 | 1.76k | } else { |
1058 | 1.76k | bits--; |
1059 | 1.76k | } |
1060 | 2.03k | } |
1061 | | |
1062 | 38 | if (saw_escape) { |
1063 | | /* |
1064 | | * Escape not followed by anything. |
1065 | | * |
1066 | | * XXX - for now, show the escape as a REPLACEMENT |
1067 | | * CHARACTER. |
1068 | | */ |
1069 | 0 | wmem_strbuf_append_unichar(strbuf, UNREPL); |
1070 | 0 | } |
1071 | | |
1072 | 38 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1073 | 38 | } |
1074 | | |
1075 | | uint8_t * |
1076 | | get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const uint8_t *ptr, |
1077 | | int length) |
1078 | 86 | { |
1079 | 86 | wmem_strbuf_t *strbuf; |
1080 | 86 | int i; /* Byte counter for string */ |
1081 | 86 | bool saw_escape = false; |
1082 | | |
1083 | 86 | strbuf = wmem_strbuf_new_sized(scope, length+1); |
1084 | | |
1085 | 1.23k | for (i = 0; i < length; i++) |
1086 | 1.14k | saw_escape = handle_ts_23_038_char(strbuf, *ptr++, saw_escape); |
1087 | | |
1088 | 86 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1089 | 86 | } |
1090 | | |
1091 | | /* |
1092 | | * ETSI TS 102 221 Annex A. |
1093 | | */ |
1094 | | uint8_t * |
1095 | | get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const uint8_t *ptr, |
1096 | | int length) |
1097 | 6 | { |
1098 | 6 | uint8_t string_type; |
1099 | 6 | uint8_t string_len; |
1100 | 6 | gunichar2 ucs2_base; |
1101 | 6 | wmem_strbuf_t *strbuf; |
1102 | 6 | unsigned i; /* Byte counter for string */ |
1103 | 6 | bool saw_escape = false; |
1104 | | |
1105 | | /* |
1106 | | * get the first octet. |
1107 | | */ |
1108 | 6 | if (length == 0) { |
1109 | | /* XXX - return error indication */ |
1110 | 0 | strbuf = wmem_strbuf_new(scope, ""); |
1111 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1112 | 0 | } |
1113 | 6 | string_type = *ptr; |
1114 | 6 | ptr++; |
1115 | 6 | length--; |
1116 | | |
1117 | 6 | if (string_type == 0x80) { |
1118 | | /* |
1119 | | * Annex A, coding scheme 1) - big-endian UCS-2. |
1120 | | */ |
1121 | 0 | return get_ucs_2_string(scope, ptr, length, ENC_BIG_ENDIAN); |
1122 | 0 | } |
1123 | | |
1124 | | /* |
1125 | | * Annex A, coding schemes 2) and 3): |
1126 | | * |
1127 | | * the second byte is the number of characters (characters, |
1128 | | * not octets) in the string; |
1129 | | * |
1130 | | * for coding scheme 2), the third byte defines bits 15 to 8 |
1131 | | * of all UCS-2 characters in the string (all bit numbers are |
1132 | | * 1-origin, so bit 1 is the low-order bit), with bit 16 being 0; |
1133 | | * |
1134 | | * for coding scheme 3), the third byte and fourth bytes, treated |
1135 | | * as a big-endian value, define the base value for all UCS-2 |
1136 | | * characters in the string; |
1137 | | * |
1138 | | * for all subsequent bytes, if bit 8 is 0, it's a character |
1139 | | * in the GSM Default Alphabet, otherwise, it is added to |
1140 | | * the UCS-2 base value to give a UCS-2 character. |
1141 | | * |
1142 | | * XXX - that doesn't seem to indicate that a byte of 0x1b is |
1143 | | * treated as an escape character, it just says that a single octet |
1144 | | * with the 8th bit not set is a GSM Default Alphabet character. |
1145 | | */ |
1146 | | |
1147 | | /* |
1148 | | * Get the string length, in characters. |
1149 | | */ |
1150 | 6 | if (length == 0) { |
1151 | | /* XXX - return error indication */ |
1152 | 0 | strbuf = wmem_strbuf_new(scope, ""); |
1153 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1154 | 0 | } |
1155 | 6 | string_len = *ptr; |
1156 | 6 | ptr++; |
1157 | 6 | length--; |
1158 | | |
1159 | 6 | strbuf = wmem_strbuf_new_sized(scope, 2*string_len+1); |
1160 | | |
1161 | | /* |
1162 | | * Get the UCS-2 base. |
1163 | | */ |
1164 | 6 | if (string_type == 0x81) { |
1165 | 0 | if (length == 0) { |
1166 | | /* XXX - return error indication */ |
1167 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1168 | 0 | } |
1169 | 0 | ucs2_base = (*ptr) << 7; |
1170 | 0 | ptr++; |
1171 | 0 | length--; |
1172 | 6 | } else if (string_type == 0x82) { |
1173 | 0 | if (length == 0) { |
1174 | | /* XXX - return error indication */ |
1175 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1176 | 0 | } |
1177 | 0 | ucs2_base = (*ptr) << 8; |
1178 | 0 | ptr++; |
1179 | 0 | length--; |
1180 | |
|
1181 | 0 | if (length == 0) { |
1182 | | /* XXX - return error indication */ |
1183 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1184 | 0 | } |
1185 | 0 | ucs2_base |= *ptr; |
1186 | 0 | ptr++; |
1187 | 0 | length--; |
1188 | 6 | } else { |
1189 | | /* Invalid string type. */ |
1190 | | /* XXX - return error indication */ |
1191 | 6 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1192 | 6 | } |
1193 | | |
1194 | 0 | for (i = 0; i < string_len; i++) { |
1195 | 0 | uint8_t byte; |
1196 | |
|
1197 | 0 | if (length == 0) { |
1198 | | /* XXX - return error indication */ |
1199 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1200 | 0 | } |
1201 | 0 | byte = *ptr; |
1202 | 0 | if ((byte & 0x80) == 0) { |
1203 | 0 | saw_escape = handle_ts_23_038_char(strbuf, byte, saw_escape); |
1204 | 0 | } else { |
1205 | 0 | gunichar2 uchar; |
1206 | | |
1207 | | /* |
1208 | | * XXX - if saw_escape is true, this is bogus. |
1209 | | * |
1210 | | * XXX - if there are an odd number of bytes, should put a |
1211 | | * REPLACEMENT CHARACTER at the end. |
1212 | | */ |
1213 | 0 | uchar = ucs2_base + (byte & 0x7f); |
1214 | 0 | wmem_strbuf_append_unichar_validated(strbuf, uchar); |
1215 | 0 | } |
1216 | 0 | } |
1217 | | |
1218 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1219 | 0 | } |
1220 | | |
1221 | | uint8_t * |
1222 | | get_ascii_7bits_string(wmem_allocator_t *scope, const uint8_t *ptr, |
1223 | | const int bit_offset, int no_of_chars) |
1224 | 0 | { |
1225 | 0 | wmem_strbuf_t *strbuf; |
1226 | 0 | int char_count; /* character counter for string */ |
1227 | 0 | uint8_t in_byte, out_byte, rest = 0x00; |
1228 | 0 | const uint8_t *start_ptr = ptr; |
1229 | 0 | int bits; |
1230 | |
|
1231 | 0 | bits = bit_offset & 0x07; |
1232 | 0 | if (!bits) { |
1233 | 0 | bits = 7; |
1234 | 0 | } |
1235 | |
|
1236 | 0 | strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1); |
1237 | 0 | for(char_count = 0; char_count < no_of_chars; ptr++) { |
1238 | | /* Get the next byte from the string. */ |
1239 | 0 | in_byte = *ptr; |
1240 | | |
1241 | | /* |
1242 | | * Combine the bits we've accumulated with bits from |
1243 | | * that byte to make a 7-bit code point. |
1244 | | */ |
1245 | 0 | out_byte = (in_byte >> (8 - bits)) | rest; |
1246 | | |
1247 | | /* |
1248 | | * Leftover bits used in that code point. |
1249 | | */ |
1250 | 0 | rest = (in_byte << (bits - 1)) & 0x7f; |
1251 | | |
1252 | | /* |
1253 | | * If we don't start from 0th bit, we shouldn't go to the |
1254 | | * next char. Under *out_num we have now 0 and under Rest - |
1255 | | * _first_ part of the char. |
1256 | | */ |
1257 | 0 | if ((start_ptr != ptr) || (bits == 7)) { |
1258 | 0 | wmem_strbuf_append_c(strbuf, out_byte); |
1259 | 0 | char_count++; |
1260 | 0 | } |
1261 | | |
1262 | | /* |
1263 | | * After reading 7 octets we have read 7 full characters |
1264 | | * but we have 7 bits as well. This is the next character. |
1265 | | */ |
1266 | 0 | if ((bits == 1) && (char_count < no_of_chars)) { |
1267 | 0 | wmem_strbuf_append_c(strbuf, rest); |
1268 | 0 | char_count++; |
1269 | 0 | bits = 7; |
1270 | 0 | rest = 0x00; |
1271 | 0 | } else { |
1272 | 0 | bits--; |
1273 | 0 | } |
1274 | 0 | } |
1275 | |
|
1276 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1277 | 0 | } |
1278 | | |
1279 | | /* Tables for EBCDIC code pages */ |
1280 | | |
1281 | | /* EBCDIC common; based on the table in appendix H of ESA/370 Principles |
1282 | | of Operation, but with some code points that don't correspond to |
1283 | | the same characters in code pages 037 and 1158 mapped to REPLACEMENT |
1284 | | CHARACTER - there may be more code points of that sort */ |
1285 | | |
1286 | | /* There are a few EBCDIC control codes that, strictly speaking, do not |
1287 | | * map to any control codes in ASCII or Unicode for that matter. The |
1288 | | * customary treatment is to map them in a particular way to ASCII C1 |
1289 | | * control codes that have no exact equivalent in EBCDIC, as below. */ |
/*
 * Indexed by the EBCDIC octet; each entry is the Unicode code point that
 * octet decodes to, or UNREPL when the common table has no mapping.
 * The trailing comment on each row is the octet of the row's first entry.
 */
const gunichar2 charset_table_ebcdic[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f, /* 0x00 */
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x08 */
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087, /* 0x10 */
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x18 */
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b, /* 0x20 */
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007, /* 0x28 */
    UNREPL, UNREPL, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, /* 0x30 */
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, UNREPL, 0x001a, /* 0x38 */
    0x0020, 0x00a0, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0x40 */
    UNREPL, UNREPL, UNREPL, 0x002e, 0x003c, 0x0028, 0x002b, UNREPL, /* 0x48 */
    0x0026, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0x50 */
    UNREPL, UNREPL, UNREPL, 0x0024, 0x002a, 0x0029, 0x003b, UNREPL, /* 0x58 */
    0x002d, 0x002f, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0x60 */
    UNREPL, UNREPL, UNREPL, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f, /* 0x68 */
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0x70 */
    UNREPL, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022, /* 0x78 */
    UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x80 */
    0x0068, 0x0069, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0x88 */
    UNREPL, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, /* 0x90 */
    0x0071, 0x0072, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0x98 */
    UNREPL, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, /* 0xa0 */
    0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xa8 */
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xb0 */
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xb8 */
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0xc0 */
    0x0048, 0x0049, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xc8 */
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, /* 0xd0 */
    0x0051, 0x0052, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xd8 */
    0x005c, UNREPL, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, /* 0xe0 */
    0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xe8 */
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0xf0 */
    0x0038, 0x0039, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xf8 */
};
1324 | | |
/* EBCDIC code page 037
 *
 * Indexed by the EBCDIC octet; every one of the 256 entries maps to a
 * code point in the range U+0000..U+00FF (no UNREPL entries).
 * The trailing comment on each row is the octet of the row's first entry.
 */
const gunichar2 charset_table_ebcdic_cp037[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f, /* 0x00 */
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x08 */
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087, /* 0x10 */
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x18 */
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b, /* 0x20 */
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007, /* 0x28 */
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, /* 0x30 */
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a, /* 0x38 */
    0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5, /* 0x40 */
    0x00e7, 0x00f1, 0x00a2, 0x002e, 0x003c, 0x0028, 0x002b, 0x007c, /* 0x48 */
    0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef, /* 0x50 */
    0x00ec, 0x00df, 0x0021, 0x0024, 0x002a, 0x0029, 0x003b, 0x00ac, /* 0x58 */
    0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5, /* 0x60 */
    0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f, /* 0x68 */
    0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, /* 0x70 */
    0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022, /* 0x78 */
    0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x80 */
    0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1, /* 0x88 */
    0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, /* 0x90 */
    0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4, /* 0x98 */
    0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, /* 0xa0 */
    0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae, /* 0xa8 */
    0x005e, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc, /* 0xb0 */
    0x00bd, 0x00be, 0x005b, 0x005d, 0x00af, 0x00a8, 0x00b4, 0x00d7, /* 0xb8 */
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0xc0 */
    0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5, /* 0xc8 */
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, /* 0xd0 */
    0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff, /* 0xd8 */
    0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, /* 0xe0 */
    0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5, /* 0xe8 */
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0xf0 */
    0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f, /* 0xf8 */
};
1360 | | |
/* EBCDIC code page 500
 * https://www.ibm.com/support/pages/conversion-character-differences-between-ccsid-037-and-ccsid-500
 * CCSID 500 ("International Latin-1") has exactly the same repertoire as 37,
 * covering all of ISO-8859-1, but with seven code points permuted.
 * It is notable because it is the default code page for DRDA:
 * https://www.ibm.com/support/pages/drda-user-id-and-password-not-being-transmitted-correctly-when-containing-characters-%C2%AC-%C2%A2?lnk=hm
 *
 * The entries that differ from charset_table_ebcdic_cp037 are the ones
 * for octets 0x4a, 0x4f, 0x5a, 0x5f, 0xb0, 0xba and 0xbb.
 * The trailing comment on each row is the octet of the row's first entry.
 */
const gunichar2 charset_table_ebcdic_cp500[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f, /* 0x00 */
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x08 */
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087, /* 0x10 */
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x18 */
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b, /* 0x20 */
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007, /* 0x28 */
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, /* 0x30 */
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a, /* 0x38 */
    0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5, /* 0x40 */
    0x00e7, 0x00f1, 0x005b, 0x002e, 0x003c, 0x0028, 0x002b, 0x0021, /* 0x48 */
    0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef, /* 0x50 */
    0x00ec, 0x00df, 0x005d, 0x0024, 0x002a, 0x0029, 0x003b, 0x005e, /* 0x58 */
    0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5, /* 0x60 */
    0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f, /* 0x68 */
    0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, /* 0x70 */
    0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022, /* 0x78 */
    0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x80 */
    0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1, /* 0x88 */
    0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, /* 0x90 */
    0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4, /* 0x98 */
    0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, /* 0xa0 */
    0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae, /* 0xa8 */
    0x00a2, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc, /* 0xb0 */
    0x00bd, 0x00be, 0x00ac, 0x007c, 0x00af, 0x00a8, 0x00b4, 0x00d7, /* 0xb8 */
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0xc0 */
    0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5, /* 0xc8 */
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, /* 0xd0 */
    0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff, /* 0xd8 */
    0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, /* 0xe0 */
    0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5, /* 0xe8 */
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0xf0 */
    0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f, /* 0xf8 */
};
1402 | | |
1403 | | /* |
1404 | | * Given a wmem scope, a pointer, a length, and a translation table with |
1405 | | * 256 entries, treat the string of bytes referred to by the pointer and |
1406 | | * length as a string encoded using one octet per character, with octets |
1407 | | * being mapped by the translation table to 2-byte Unicode Basic Multilingual |
1408 | | * Plane characters (including REPLACEMENT CHARACTER), and return a |
1409 | | * pointer to a UTF-8 string, allocated using the wmem scope. |
1410 | | */ |
1411 | | uint8_t * |
1412 | | get_nonascii_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[256]) |
1413 | 2.52k | { |
1414 | 2.52k | wmem_strbuf_t *str; |
1415 | | |
1416 | 2.52k | str = wmem_strbuf_new_sized(scope, length+1); |
1417 | | |
1418 | 37.6k | while (length > 0) { |
1419 | 35.1k | uint8_t ch = *ptr; |
1420 | | |
1421 | 35.1k | wmem_strbuf_append_unichar(str, table[ch]); |
1422 | 35.1k | ptr++; |
1423 | 35.1k | length--; |
1424 | 35.1k | } |
1425 | | |
1426 | 2.52k | return (uint8_t *) wmem_strbuf_finalize(str); |
1427 | 2.52k | } |
1428 | | |
1429 | | /* |
1430 | | * Given a wmem scope, a pointer, a length, and a string referring to an |
1431 | | * encoding (recognized by iconv), treat the bytes referred to by the pointer |
1432 | | * and length as a string in that encoding, and return a pointer to a UTF-8 |
1433 | | * string, allocated using the wmem scope, converted from the original |
1434 | | * encoding having substituted REPLACEMENT CHARACTER according to the |
1435 | | * Unicode Standard 5.22 U+FFFD Substitution for Conversion |
1436 | | * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) |
1437 | | */ |
1438 | | static uint8_t * |
1439 | | get_string_enc_iconv(wmem_allocator_t *scope, const uint8_t *ptr, int length, const char *encoding) |
1440 | 909 | { |
1441 | 909 | GIConv cd; |
1442 | 909 | size_t inbytes, outbytes; |
1443 | 909 | size_t tempstr_size, bytes_written; |
1444 | 909 | size_t err; |
1445 | 909 | size_t max_subpart, tempinbytes; |
1446 | 909 | char *outptr, *tempstr; |
1447 | | |
1448 | 909 | wmem_strbuf_t *str; |
1449 | | |
1450 | 909 | if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) { |
1451 | 0 | REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding); |
1452 | | /* Most likely to be a programming error passing in a bad encoding |
1453 | | * name. However, could be a issue with the iconv support on the |
1454 | | * system running WS. GLib requires iconv/libiconv, but is it possible |
1455 | | * that some versions don't support all common encodings? */ |
1456 | 0 | } |
1457 | | |
1458 | 909 | inbytes = length; |
1459 | 909 | str = wmem_strbuf_new_sized(scope, length+1); |
1460 | | /* XXX: If speed becomes an issue, the faster way to do this would |
1461 | | * involve passing the wmem_strbuf_t's string buffer directly into |
1462 | | * g_iconv to avoid a memcpy later, but that requires changes to the |
1463 | | * wmem_strbuf interface to have non const access to the string buffer, |
1464 | | * and to manipulate the used length directly. */ |
1465 | 909 | outbytes = tempstr_size = MAX(8, length); |
1466 | 909 | outptr = tempstr = (char *)g_malloc(outbytes); |
1467 | 3.63k | while (inbytes > 0) { |
1468 | 2.72k | err = g_iconv(cd, (char **)&ptr, &inbytes, &outptr, &outbytes); |
1469 | 2.72k | bytes_written = outptr - tempstr; |
1470 | 2.72k | wmem_strbuf_append_len(str, tempstr, bytes_written); |
1471 | 2.72k | outptr = tempstr; |
1472 | 2.72k | outbytes = tempstr_size; |
1473 | | |
1474 | 2.72k | if (err == (size_t) -1) { |
1475 | | /* Errors */ |
1476 | 1.97k | switch (errno) { |
1477 | 85 | case EINVAL: |
1478 | | /* Incomplete sequence at the end, not an error */ |
1479 | 85 | wmem_strbuf_append_unichar_repl(str); |
1480 | 85 | inbytes = 0; |
1481 | 85 | break; |
1482 | 22 | case E2BIG: |
1483 | | /* Not enough room (UTF-8 longer than the initial buffer), |
1484 | | * start back at the beginning of the buffer */ |
1485 | 22 | break; |
1486 | 1.86k | case EILSEQ: |
1487 | | /* Find the maximal subpart of the ill-formed sequence */ |
1488 | 1.86k | errno = EINVAL; |
1489 | 4.72k | for (max_subpart = 1; err == (size_t)-1 && errno == EINVAL; max_subpart++) { |
1490 | 2.85k | tempinbytes = max_subpart; |
1491 | 2.85k | err = g_iconv(cd, (char **)&ptr, &tempinbytes, |
1492 | 2.85k | &outptr, &outbytes); |
1493 | 2.85k | } |
1494 | 1.86k | max_subpart = MAX(1, max_subpart-1); |
1495 | 1.86k | ptr += max_subpart; |
1496 | 1.86k | inbytes -= max_subpart; |
1497 | 1.86k | wmem_strbuf_append_unichar_repl(str); |
1498 | 1.86k | outptr = tempstr; |
1499 | 1.86k | outbytes = tempstr_size; |
1500 | 1.86k | break; |
1501 | 0 | default: |
1502 | | /* Unexpected conversion error, unrecoverable */ |
1503 | 0 | g_free(tempstr); |
1504 | 0 | g_iconv_close(cd); |
1505 | 0 | REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding); |
1506 | 0 | break; |
1507 | 1.97k | } |
1508 | 1.97k | } else { |
1509 | | /* Otherwise err is the number of replacement characters used, |
1510 | | * but we don't care about that. */ |
1511 | | /* If we were converting to ISO-2022-JP or some other stateful |
1512 | | * decoder with shift sequences (e.g. EBCDIC mixed-byte), a |
1513 | | * final call with NULL input in order to output the shift |
1514 | | * sequence back to initial state might make sense, but not |
1515 | | * needed for UTF-8. */ |
1516 | 750 | } |
1517 | 2.72k | } |
1518 | | |
1519 | 909 | g_free(tempstr); |
1520 | 909 | g_iconv_close(cd); |
1521 | 909 | return (uint8_t *) wmem_strbuf_finalize(str); |
1522 | 909 | } |
1523 | | |
1524 | | /* |
1525 | | * Given a wmem scope, a pointer, and a length, treat the bytes referred to |
1526 | | * by the pointer and length as a GB18030 encoded string, and return a pointer |
1527 | | * to a UTF-8 string, allocated using the wmem scope, converted having |
1528 | | * substituted REPLACEMENT CHARACTER according to the Unicode Standard |
1529 | | * 5.22 U+FFFD Substitution for Conversion. |
1530 | | * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) |
1531 | | * |
1532 | | * As expected, this will also decode GBK and GB2312 strings. |
1533 | | */ |
1534 | | uint8_t * |
1535 | | get_gb18030_string(wmem_allocator_t *scope, const uint8_t *ptr, int length) |
1536 | 233 | { |
1537 | | /* iconv/libiconv support is guaranteed with GLib. Support this |
1538 | | * via iconv, at least for now. */ |
1539 | | /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since |
1540 | | * 2000-10-24 and version 1.4, is there is a system that compiles current |
1541 | | * Wireshark yet its iconv only supports GBK (~ Windows Code page 936)? */ |
1542 | 233 | const char *encoding = "GB18030"; |
1543 | 233 | GIConv cd; |
1544 | 233 | if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) { |
1545 | 0 | encoding = "GBK"; |
1546 | | /* GB18030 is backwards compatible, at worst this will mean a few |
1547 | | * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings |
1548 | | * from GB18030, which are all pairs of two byte sequences |
1549 | | * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK |
1550 | | * and thus the 4 byte characters will be replaced with two |
1551 | | * REPLACEMENT CHARACTERs. */ |
1552 | 233 | } else { |
1553 | 233 | g_iconv_close(cd); |
1554 | 233 | } |
1555 | 233 | return get_string_enc_iconv(scope, ptr, length, encoding); |
1556 | 233 | } |
1557 | | |
1558 | | /* |
1559 | | * Given a wmem scope, a pointer, and a length, treat the bytes referred to |
1560 | | * by the pointer and length as a EUC-KR encoded string, and return a pointer |
1561 | | * to a UTF-8 string, allocated using the wmem scope, converted having |
1562 | | * substituted REPLACEMENT CHARACTER according to the Unicode Standard |
1563 | | * 5.22 U+FFFD Substitution for Conversion. |
1564 | | * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) |
1565 | | */ |
1566 | | uint8_t * |
1567 | | get_euc_kr_string(wmem_allocator_t *scope, const uint8_t *ptr, int length) |
1568 | 676 | { |
1569 | | /* iconv/libiconv support is guaranteed with GLib. Support this |
1570 | | * via iconv, at least for now. */ |
1571 | 676 | return get_string_enc_iconv(scope, ptr, length, "EUC-KR"); |
1572 | 676 | } |
1573 | | |
/* T.61 to UTF-8 conversion table from OpenLDAP project
 * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD
 *
 * Indexed by the T.61 octet.  A zero entry means "no mapping";
 * get_t61_string() emits REPLACEMENT CHARACTER for those octets.
 * The entries for octets 0xc1-0xcf are the code points appended after
 * the base character when no precomposed form exists.
 * The trailing comment on each row is the octet of the row's first entry.
 */
static const gunichar2 t61_tab[] = {
    0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, /* 0x00 */
    0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, /* 0x08 */
    0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017, /* 0x10 */
    0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f, /* 0x18 */
    0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027, /* 0x20 */
    0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f, /* 0x28 */
    0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037, /* 0x30 */
    0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f, /* 0x38 */
    0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047, /* 0x40 */
    0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f, /* 0x48 */
    0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057, /* 0x50 */
    0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f, /* 0x58 */
    0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067, /* 0x60 */
    0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f, /* 0x68 */
    0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077, /* 0x70 */
    0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f, /* 0x78 */
    0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087, /* 0x80 */
    0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f, /* 0x88 */
    0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097, /* 0x90 */
    0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f, /* 0x98 */
    0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7, /* 0xa0 */
    0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000, /* 0xa8 */
    0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7, /* 0xb0 */
    0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf, /* 0xb8 */
    0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307, /* 0xc0 */
    0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c, /* 0xc8 */
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, /* 0xd0 */
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, /* 0xd8 */
    0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f, /* 0xe0 */
    0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149, /* 0xe8 */
    0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140, /* 0xf0 */
    0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000  /* 0xf8 */
};
1611 | | |
/* Fixed-size code point vectors used by the T.61 lookup tables. */
typedef gunichar2 wvec16[16];
typedef gunichar2 wvec32[32];

/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 * Indexed by the low nibble of the diacritic octet; a zero entry means
 * there is no stand-alone substitution for that octet. */
static const wvec16 accents = {
    0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
    0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
1619 | | |
1620 | | /* In the following tables, base characters commented in (parentheses) |
1621 | | * are not defined by T.61 but are mapped anyway since their Unicode |
1622 | | * composite exists. |
1623 | | */ |
1624 | | |
/* Grave accented chars AEIOU (NWY)
 *
 * Composites for diacritic octet 0xc1.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c1_vec1 = {
    /* Upper case */
    0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
    0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
static const wvec32 c1_vec2 = {
    /* Lower case */
    0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
    0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};

/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *c1_grave[] = {
    NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
};
1638 | | |
/* Acute accented chars AEIOUYCLNRSZ (GKMPW)
 *
 * Composites for diacritic octet 0xc2.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c2_vec1 = {
    /* Upper case */
    0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
    0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
    0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
    0, 0xdd, 0x179, 0, 0, 0, 0, 0};
static const wvec32 c2_vec2 = {
    /* Lower case */
    0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
    0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
    0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
    0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
static const wvec32 c2_vec3 = {
    /* (AE and ae) */
    0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f and
 * slot 7 covers 0xe0-0xff (the T.61 octets for AE and ae). */
static const wvec32 *c2_acute[] = {
    NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
};
1660 | | |
/* Circumflex AEIOUYCGHJSW (Z)
 *
 * Composites for diacritic octet 0xc3.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c3_vec1 = {
    /* Upper case */
    0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
    0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
    0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
    0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
static const wvec32 c3_vec2 = {
    /* Lower case */
    0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
    0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
    0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
    0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *c3_circumflex[] = {
    NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
};
1677 | | |
/* Tilde AIOUN (EVY)
 *
 * Composites for diacritic octet 0xc4.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c4_vec1 = {
    /* Upper case */
    0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
    0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
static const wvec32 c4_vec2 = {
    /* Lower case */
    0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
    0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *c4_tilde[] = {
    NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
};
1690 | | |
/* Macron AEIOU (YG)
 *
 * Composites for diacritic octet 0xc5.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c5_vec1 = {
    /* Upper case */
    0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
    0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec2 = {
    /* Lower case */
    0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
    0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec3 = {
    /* (AE and ae) */
    0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f and
 * slot 7 covers 0xe0-0xff (the T.61 octets for AE and ae). */
static const wvec32 *c5_macron[] = {
    NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
};
1707 | | |
/* Breve AUG (EIO)
 *
 * Composites for diacritic octet 0xc6.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c6_vec1 = {
    /* Upper case */
    0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
    0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 c6_vec2 = {
    /* Lower case */
    0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
    0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *c6_breve[] = {
    NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
};
1720 | | |
/* Dot Above CEGIZ (AOBDFHMNPRSTWXY)
 *
 * Composites for diacritic octet 0xc7.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c7_vec1 = {
    /* Upper case */
    0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
    0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
    0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
    0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
static const wvec32 c7_vec2 = {
    /* Lower case */
    0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
    0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
    0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
    0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *c7_dotabove[] = {
    NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
};
1737 | | |
/* Diaeresis AEIOUY (HWXt)
 *
 * Composites for diacritic octet 0xc8.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 c8_vec1 = {
    /* Upper case */
    0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
    0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
static const wvec32 c8_vec2 = {
    /* Lower case */
    0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
    0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *c8_diaeresis[] = {
    NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
};
1750 | | |
/* Ring Above AU (wy)
 *
 * Composites for diacritic octet 0xca.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 ca_vec1 = {
    /* Upper case */
    0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ca_vec2 = {
    /* Lower case */
    0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *ca_ringabove[] = {
    NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
};
1763 | | |
/* Cedilla CGKLNRST (EDH)
 *
 * Composites for diacritic octet 0xcb.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 cb_vec1 = {
    /* Upper case */
    0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
    0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
    0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cb_vec2 = {
    /* Lower case */
    0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
    0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
    0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *cb_cedilla[] = {
    NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
};
1778 | | |
/* Double Acute Accent OU
 *
 * Composites for diacritic octet 0xcd.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 cd_vec1 = {
    /* Upper case */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
    0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cd_vec2 = {
    /* Lower case */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
    0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *cd_doubleacute[] = {
    NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
};
1791 | | |
/* Ogonek AEIU (O)
 *
 * Composites for diacritic octet 0xce.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 ce_vec1 = {
    /* Upper case */
    0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
    0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ce_vec2 = {
    /* Lower case */
    0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
    0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *ce_ogonek[] = {
    NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
};
1804 | | |
/* Caron CDELNRSTZ (AIOUGKjH)
 *
 * Composites for diacritic octet 0xcf.  get_t61_string() picks a vector
 * with (base >> 5) and an entry with (base & 0x1f); a NULL vector or a
 * zero entry means no precomposed character exists for that base.
 */
static const wvec32 cf_vec1 = {
    /* Upper case */
    0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
    0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
    0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
    0, 0, 0x17d, 0, 0, 0, 0, 0};
static const wvec32 cf_vec2 = {
    /* Lower case */
    0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
    0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
    0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
    0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Slot 2 covers base octets 0x40-0x5f, slot 3 covers 0x60-0x7f. */
static const wvec32 *cf_caron[] = {
    NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
};
1821 | | |
/* Composite lookup tables, indexed by the low nibble of the diacritic
 * octet (0xc0-0xcf).  NULL entries (0xc0, 0xc9, 0xcc) have no composite
 * tables at all. */
static const wvec32 **cx_tab[] = {
    NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
    c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
    cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
1826 | | |
1827 | | uint8_t * |
1828 | | get_t61_string(wmem_allocator_t *scope, const uint8_t *ptr, int length) |
1829 | 13 | { |
1830 | 13 | int i; |
1831 | 13 | const uint8_t *c; |
1832 | 13 | wmem_strbuf_t *strbuf; |
1833 | | |
1834 | 13 | strbuf = wmem_strbuf_new_sized(scope, length+1); |
1835 | | |
1836 | 188 | for (i = 0, c = ptr; i < length; c++, i++) { |
1837 | 175 | if (!t61_tab[*c]) { |
1838 | 34 | wmem_strbuf_append_unichar(strbuf, UNREPL); |
1839 | 141 | } else if (i < length - 1 && (*c & 0xf0) == 0xc0) { |
1840 | 0 | int j = *c & 0x0f; |
1841 | | /* If this is the end of the string, or if the base |
1842 | | * character is just a space, treat this as a regular |
1843 | | * spacing character. |
1844 | | */ |
1845 | 0 | if ((!c[1] || c[1] == 0x20) && accents[j]) { |
1846 | 0 | wmem_strbuf_append_unichar(strbuf, accents[j]); |
1847 | 0 | } else if (cx_tab[j] && cx_tab[j][c[1]>>5] && |
1848 | | /* We have a composite mapping for this pair */ |
1849 | 0 | (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) { |
1850 | 0 | wmem_strbuf_append_unichar(strbuf, (*cx_tab[j][c[1]>>5])[c[1]&0x1f]); |
1851 | 0 | } else { |
1852 | | /* No mapping, just swap it around so the base |
1853 | | * character comes first. |
1854 | | */ |
1855 | 0 | wmem_strbuf_append_unichar(strbuf, c[1]); |
1856 | 0 | wmem_strbuf_append_unichar(strbuf, t61_tab[*c]); |
1857 | 0 | } |
1858 | 0 | c++; i++; |
1859 | 0 | continue; |
1860 | 141 | } else { |
1861 | 141 | wmem_strbuf_append_unichar(strbuf, t61_tab[*c]); |
1862 | 141 | } |
1863 | 175 | } |
1864 | | |
1865 | 13 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1866 | 13 | } |
1867 | | |
/* The DECT standard charset from ETSI EN 300 175-5 Annex D
 *
 * Indexed by the octet value; only 0x00-0x7f have entries (128 in all).
 * get_dect_standard_8bits_string() rejects octets with the high bit set
 * before indexing, and treats a zero entry as unmapped.
 */
static const gunichar2 dect_standard_8bits_code_table[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    ' ', '!', '\"', '#', '$', '%', '&', '\'',
    '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', ':', ';', '<', '=', '>', '?',
    '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
    'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
    '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '{', '|', '}', '~', 0x7f,
};
1888 | | |
1889 | | uint8_t * |
1890 | | get_dect_standard_8bits_string(wmem_allocator_t *scope, const uint8_t *ptr, int length) |
1891 | 0 | { |
1892 | 0 | int position; |
1893 | 0 | const uint8_t *current_byte_ptr; |
1894 | 0 | wmem_strbuf_t *strbuf; |
1895 | |
|
1896 | 0 | strbuf = wmem_strbuf_new_sized(scope, length+1); |
1897 | |
|
1898 | 0 | for (position = 0, current_byte_ptr = ptr; position < length; current_byte_ptr++, position++) { |
1899 | 0 | if (*current_byte_ptr & 0x80) { |
1900 | 0 | wmem_strbuf_append_unichar(strbuf, UNREPL); |
1901 | 0 | } else if (!dect_standard_8bits_code_table[*current_byte_ptr]) { |
1902 | 0 | wmem_strbuf_append_unichar(strbuf, UNREPL); |
1903 | 0 | } else { |
1904 | 0 | wmem_strbuf_append_unichar(strbuf, dect_standard_8bits_code_table[*current_byte_ptr]); |
1905 | 0 | } |
1906 | 0 | } |
1907 | |
|
1908 | 0 | return (uint8_t *)wmem_strbuf_finalize(strbuf); |
1909 | 0 | } |
1910 | | /* |
1911 | | * Editor modelines - https://www.wireshark.org/tools/modelines.html |
1912 | | * |
1913 | | * Local variables: |
1914 | | * c-basic-offset: 4 |
1915 | | * tab-width: 8 |
1916 | | * indent-tabs-mode: nil |
1917 | | * End: |
1918 | | * |
1919 | | * vi: set shiftwidth=4 tabstop=8 expandtab: |
1920 | | * :indentSize=4:tabSize=8:noTabs=true: |
1921 | | */ |