Coverage Report

Created: 2025-02-15 06:25

/src/wireshark/epan/charsets.c
Line
Count
Source (jump to first uncovered line)
1
/* charsets.c
2
 * Routines for handling character sets
3
 *
4
 * Wireshark - Network traffic analyzer
5
 * By Gerald Combs <gerald@wireshark.org>
6
 * Copyright 1998 Gerald Combs
7
 *
8
 * SPDX-License-Identifier: GPL-2.0-or-later
9
 */
10
11
#include "config.h"
12
13
#include <errno.h>
14
#include <glib.h>
15
16
#include <epan/proto.h>
17
#include <epan/wmem_scopes.h>
18
19
#include <wsutil/pint.h>
20
#include <wsutil/unicode-utils.h>
21
22
#include "charsets.h"
23
24
/*
25
 * 6-character abbreviation for "Unicode REPLACEMENT CHARACTER", so it
26
 * takes up the same amount of space as the 6-character hex values for
27
 * Basic Multilingual Plane code points in the tables below.
28
 */
29
8.12k
#define UNREPL UNICODE_REPLACEMENT_CHARACTER
30
31
/* ZERO WIDTH NON-BREAKING SPACE, also known informally as BOM */
32
246
#define BYTE_ORDER_MARK 0xFEFF
33
34
/*
35
 * Wikipedia's "Character encoding" template, giving a pile of character
36
 * encodings and Wikipedia pages for them:
37
 *
38
 *    http://en.wikipedia.org/wiki/Template:Character_encoding
39
 *
40
 * Unicode character encoding model:
41
 *
42
 *    https://www.unicode.org/reports/tr17/
43
 *
44
 * International Components for Unicode character set mapping tables:
45
 *
46
 *    http://site.icu-project.org/charts/charset
47
 *
48
 * MSDN information on code pages:
49
 *
50
 *    https://docs.microsoft.com/en-us/windows/win32/intl/code-pages
51
 *
52
 * ASCII-based code pages, from IBM:
53
 *
54
 *    http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html
55
 *
56
 * EBCDIC code pages, from IBM:
57
 *
58
 *    http://www-03.ibm.com/systems/i/software/globalization/codepages.html
59
 *
60
 * The IBM pages are no longer available; the versions archived on the
61
 * Wayback Machine are, but the links to the PDF and text versions of
62
 * the code pages don't all work (do *any* work?).
63
 *
64
 * Mappings to Unicode at the Unicode Consortium:
65
 *
66
 *    https://www.unicode.org/Public/MAPPINGS/
67
 *
68
 * Of note, the VENDORS/MICSFT directory not only has various Windows
69
 * and DOS code pages, but also several of the common MAC and EBCDIC
70
 * code page mappings to Unicode.
71
 */
72
73
/*
74
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
75
 * referred to by the pointer and length as an ASCII string, with all bytes
76
 * with the high-order bit set being invalid, and return a pointer to a
77
 * UTF-8 string, allocated using the wmem scope.
78
 *
79
 * Octets with the highest bit set will be converted to the Unicode
80
 * REPLACEMENT CHARACTER.
81
 */
82
/*
 * scope  - wmem scope passed to wmem_strbuf_new_sized() for the result.
 * ptr    - first input octet.
 * length - number of octets to convert; nothing is read when length <= 0.
 * Returns the finalized string buffer contents, cast to uint8_t *.
 */
uint8_t *
83
get_ascii_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
84
604k
{
85
604k
    wmem_strbuf_t *str;
86
604k
    /* Start of the current run of octets below 0x80 not yet appended. */
    const uint8_t *prev = ptr;
87
604k
    /* Number of octets in that pending run. */
    size_t valid_bytes = 0;
88
89
604k
    /* NOTE(review): length+1 presumably leaves room for a terminator - confirm against wmem_strbuf_new_sized(). */
    str = wmem_strbuf_new_sized(scope, length+1);
90
91
16.2M
    while (length > 0) {
92
15.6M
        uint8_t ch = *ptr++;
93
94
15.6M
        if (ch < 0x80) {
95
            /* 7-bit octet: just extend the pending run; it is appended in bulk later. */
13.0M
            valid_bytes++;
96
13.0M
        } else {
97
            /* Flush the pending run first so the output keeps the input order. */
2.64M
            if (valid_bytes) {
98
992k
                wmem_strbuf_append_len(str, prev, valid_bytes);
99
992k
                valid_bytes = 0;
100
992k
            }
101
            /* ptr was already advanced, so the next run starts right after this octet. */
2.64M
            prev = ptr;
102
2.64M
            wmem_strbuf_append_unichar_repl(str);
103
2.64M
        }
104
15.6M
        length--;
105
15.6M
    }
106
    /* Flush the run that extends to the end of the input, if any. */
604k
    if (valid_bytes) {
107
297k
        wmem_strbuf_append_len(str, prev, valid_bytes);
108
297k
    }
109
110
604k
    return (uint8_t *) wmem_strbuf_finalize(str);
111
604k
}
112
113
/*
 * Thin wrapper: hands scope, ptr and length unchanged to
 * ws_utf8_make_valid() and returns whatever it returns.
 * NOTE(review): the helper presumably replaces invalid UTF-8 sequences;
 * its behaviour is not visible here - confirm in wsutil/unicode-utils.h.
 */
uint8_t *
114
get_utf_8_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
115
36.9k
{
116
36.9k
    return ws_utf8_make_valid(scope, ptr, length);
117
36.9k
}
118
119
/*
120
 * ISO 646 "Basic code table".
121
 */
122
/*
 * Indexed by the 7-bit octet value (0x00-0x7F).  Every entry equals its
 * own index except 0x23, 0x24, 0x40, 0x5B-0x5E, 0x60 and 0x7B-0x7E, which
 * are UNREPL.
 * NOTE(review): those look like the ISO 646 positions that national
 * variants may redefine - confirm.
 */
const gunichar2 charset_table_iso_646_basic[0x80] = {
123
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,        /* 0x00 -      */
124
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,        /*      - 0x0F */
125
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,        /* 0x10 -      */
126
    0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,        /*      - 0x1F */
127
    0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027,        /* 0x20 -      */
128
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,        /*      - 0x2F */
129
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,        /* 0x30 -      */
130
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,        /*      - 0x3F */
131
    UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,        /* 0x40 -      */
132
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,        /*      - 0x4F */
133
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,        /* 0x50 -      */
134
    0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f,        /*      - 0x5F */
135
    UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,        /* 0x60 -      */
136
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,        /*      - 0x6F */
137
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,        /* 0x70 -      */
138
    0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f,        /*      - 0x7F */
139
};
140
141
/*
142
 * Given a wmem scope, a pointer, a length, and a translation table,
143
 * treat the string of bytes referred to by the pointer and length as a
144
 * string encoded using one octet per character, with octets with the
145
 * high-order bit clear being mapped by the translation table to 2-byte
146
 * Unicode Basic Multilingual Plane characters (including REPLACEMENT
147
 * CHARACTER) and octets with the high-order bit set being mapped to
148
 * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
149
 * allocated using the wmem scope.
150
 */
151
/*
 * scope  - wmem scope passed to wmem_strbuf_new_sized() for the result.
 * ptr    - first input octet.
 * length - number of octets to convert; nothing is read when length <= 0.
 * table  - 0x80-entry lookup indexed directly by octets below 0x80.
 * Returns the finalized string buffer contents, cast to uint8_t *.
 */
uint8_t *
152
get_iso_646_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80])
153
18
{
154
18
    wmem_strbuf_t *str;
155
156
18
    str = wmem_strbuf_new_sized(scope, length+1);
157
158
306
    /* One output character is appended per input octet. */
    while (length > 0) {
159
288
        uint8_t ch = *ptr;
160
161
288
        /* ch < 0x80 keeps the index inside the 0x80-entry table. */
        if (ch < 0x80)
162
138
            wmem_strbuf_append_unichar(str, table[ch]);
163
150
        else
164
150
            wmem_strbuf_append_unichar_repl(str);
165
288
        ptr++;
166
288
        length--;
167
288
    }
168
169
18
    return (uint8_t *) wmem_strbuf_finalize(str);
170
18
}
171
172
/*
173
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
174
 * referred to by the pointer and length as an ISO 8859/1 string, and
175
 * return a pointer to a UTF-8 string, allocated using the wmem scope.
176
 */
177
/*
 * scope  - wmem scope passed to wmem_strbuf_new_sized() for the result.
 * ptr    - first input octet.
 * length - number of octets to convert; nothing is read when length <= 0.
 * Returns the finalized string buffer contents, cast to uint8_t *.
 */
uint8_t *
178
get_8859_1_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
179
786
{
180
786
    wmem_strbuf_t *str;
181
182
786
    str = wmem_strbuf_new_sized(scope, length+1);
183
184
23.4k
    while (length > 0) {
185
22.6k
        uint8_t ch = *ptr;
186
187
22.6k
        /* Octets below 0x80 are appended as a single character, unchanged. */
        if (ch < 0x80)
188
18.8k
            wmem_strbuf_append_c(str, ch);
189
3.81k
        else {
190
            /*
191
             * Note: we assume here that the code points
192
             * 0x80-0x9F are used for C1 control characters,
193
             * and thus have the same value as the corresponding
194
             * Unicode code points.
195
             */
196
3.81k
            /* The octet value itself is used as the code point to append. */
            wmem_strbuf_append_unichar(str, ch);
197
3.81k
        }
198
22.6k
        ptr++;
199
22.6k
        length--;
200
22.6k
    }
201
202
786
    return (uint8_t *) wmem_strbuf_finalize(str);
203
786
}
204
205
/*
206
 * Translation tables that map the upper 128 code points in single-byte
207
 * "extended ASCII" character encodings to Unicode code points in the
208
 * Basic Multilingual Plane.
209
 */
210
211
/* ISO-8859-2 (https://en.wikipedia.org/wiki/ISO/IEC_8859-2#Code_page_layout) */
212
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F keep their own values; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_2[0x80] = {
213
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
214
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
215
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
216
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
217
    0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,        /* 0xA0 -      */
218
    0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,        /*      - 0xAF */
219
    0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,        /* 0xB0 -      */
220
    0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,        /*      - 0xBF */
221
    0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,        /* 0xC0 -      */
222
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,        /*      - 0xCF */
223
    0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,        /* 0xD0 -      */
224
    0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,        /*      - 0xDF */
225
    0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,        /* 0xE0 -      */
226
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,        /*      - 0xEF */
227
    0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,        /* 0xF0 -      */
228
    0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9         /*      - 0xFF */
229
};
230
231
/* generated by ../tools/make_charset_ISO-8859-3 */
232
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  UNREPL at 0xA5, 0xAE, 0xBE, 0xC3, 0xD0, 0xE3 and 0xF0.
 */
const gunichar2 charset_table_iso_8859_3[0x80] = {
233
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
234
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
235
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
236
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
237
    0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, UNREPL, 0x0124, 0x00a7,        /* 0xA0 -      */
238
    0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, UNREPL, 0x017b,        /*      - 0xAF */
239
    0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,        /* 0xB0 -      */
240
    0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, UNREPL, 0x017c,        /*      - 0xBF */
241
    0x00c0, 0x00c1, 0x00c2, UNREPL, 0x00c4, 0x010a, 0x0108, 0x00c7,        /* 0xC0 -      */
242
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
243
    UNREPL, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,        /* 0xD0 -      */
244
    0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,        /*      - 0xDF */
245
    0x00e0, 0x00e1, 0x00e2, UNREPL, 0x00e4, 0x010b, 0x0109, 0x00e7,        /* 0xE0 -      */
246
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
247
    UNREPL, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,        /* 0xF0 -      */
248
    0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,        /*      - 0xFF */
249
};
250
251
/* generated by ../tools/make_charset_ISO-8859-4 */
252
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F keep their own values; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_4[0x80] = {
253
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
254
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
255
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
256
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
257
    0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,        /* 0xA0 -      */
258
    0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,        /*      - 0xAF */
259
    0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,        /* 0xB0 -      */
260
    0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,        /*      - 0xBF */
261
    0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,        /* 0xC0 -      */
262
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,        /*      - 0xCF */
263
    0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
264
    0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,        /*      - 0xDF */
265
    0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,        /* 0xE0 -      */
266
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,        /*      - 0xEF */
267
    0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
268
    0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,        /*      - 0xFF */
269
};
270
271
/* ISO-8859-5 (https://en.wikipedia.org/wiki/ISO/IEC_8859-5#Code_page_layout) */
272
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F and 0xA0 keep their own values; the remaining
 * entries are in the Cyrillic block (U+04xx) except 0xAD (U+00AD),
 * 0xF0 (U+2116) and 0xFD (U+00A7).  No entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_5[0x80] = {
273
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
274
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
275
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
276
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
277
    0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,        /* 0xA0 -      */
278
    /*
     * 0xAD is SOFT HYPHEN (U+00AD), as in the other ISO-8859 tables
     * nearby; U+040D is not a member of ISO-8859-5.
     */
    0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f,        /*      - 0xAF */
279
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0xB0 -      */
280
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0xBF */
281
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0xC0 -      */
282
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0xCF */
283
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xD0 -      */
284
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xDF */
285
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xE0 -      */
286
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xEF */
287
    0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,        /* 0xF0 -      */
288
    0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f         /*      - 0xFF */
289
};
290
291
/* generated by ../tools/make_charset_ISO-8859-6 */
292
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F keep their own values; many positions from 0xA1
 * upward are UNREPL, including 0xC0, 0xDB-0xDF and everything from 0xF3.
 */
const gunichar2 charset_table_iso_8859_6[0x80] = {
293
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
294
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
295
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
296
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
297
    0x00a0, UNREPL, UNREPL, UNREPL, 0x00a4, UNREPL, UNREPL, UNREPL,        /* 0xA0 -      */
298
    UNREPL, UNREPL, UNREPL, UNREPL, 0x060c, 0x00ad, UNREPL, UNREPL,        /*      - 0xAF */
299
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xB0 -      */
300
    UNREPL, UNREPL, UNREPL, 0x061b, UNREPL, UNREPL, UNREPL, 0x061f,        /*      - 0xBF */
301
    UNREPL, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,        /* 0xC0 -      */
302
    0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,        /*      - 0xCF */
303
    0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,        /* 0xD0 -      */
304
    0x0638, 0x0639, 0x063a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xDF */
305
    0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,        /* 0xE0 -      */
306
    0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,        /*      - 0xEF */
307
    0x0650, 0x0651, 0x0652, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xF0 -      */
308
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xFF */
309
};
310
311
/* generated by ../tools/make_charset_ISO-8859-7 */
312
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  UNREPL at 0xAE, 0xD2 and 0xFF.
 */
const gunichar2 charset_table_iso_8859_7[0x80] = {
313
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
314
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
315
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
316
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
317
    0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7,        /* 0xA0 -      */
318
    0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, UNREPL, 0x2015,        /*      - 0xAF */
319
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,        /* 0xB0 -      */
320
    0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,        /*      - 0xBF */
321
    0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,        /* 0xC0 -      */
322
    0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,        /*      - 0xCF */
323
    0x03a0, 0x03a1, UNREPL, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,        /* 0xD0 -      */
324
    0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,        /*      - 0xDF */
325
    0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,        /* 0xE0 -      */
326
    0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,        /*      - 0xEF */
327
    0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,        /* 0xF0 -      */
328
    0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, UNREPL,        /*      - 0xFF */
329
};
330
331
/* generated by ../tools/make_charset_ISO-8859-8 */
332
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  UNREPL at 0xA1, 0xBF-0xDE, 0xFB, 0xFC and 0xFF.
 */
const gunichar2 charset_table_iso_8859_8[0x80] = {
333
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
334
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
335
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
336
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
337
    0x00a0, UNREPL, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
338
    0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
339
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
340
    0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, UNREPL,        /*      - 0xBF */
341
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xC0 -      */
342
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xCF */
343
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xD0 -      */
344
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, 0x2017,        /*      - 0xDF */
345
    0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,        /* 0xE0 -      */
346
    0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,        /*      - 0xEF */
347
    0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,        /* 0xF0 -      */
348
    0x05e8, 0x05e9, 0x05ea, UNREPL, UNREPL, 0x200e, 0x200f, UNREPL,        /*      - 0xFF */
349
};
350
351
/* ISO-8859-9 (https://en.wikipedia.org/wiki/ISO/IEC_8859-9#Code_page_layout) */
352
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  Every entry equals its octet value except 0xD0, 0xDD,
 * 0xDE, 0xF0, 0xFD and 0xFE; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_9[0x80] = {
353
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
354
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
355
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
356
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
357
    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
358
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
359
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
360
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,        /*      - 0xBF */
361
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
362
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
363
    0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
364
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,        /*      - 0xDF */
365
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
366
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
367
    0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
368
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff         /*      - 0xFF */
369
};
370
371
/* generated by ../tools/make_charset_ISO-8859-10 */
372
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F keep their own values; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_10[0x80] = {
373
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
374
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
375
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
376
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
377
    0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7,        /* 0xA0 -      */
378
    0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a,        /*      - 0xAF */
379
    0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7,        /* 0xB0 -      */
380
    0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b,        /*      - 0xBF */
381
    0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,        /* 0xC0 -      */
382
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
383
    0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168,        /* 0xD0 -      */
384
    0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
385
    0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,        /* 0xE0 -      */
386
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
387
    0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169,        /* 0xF0 -      */
388
    0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138,        /*      - 0xFF */
389
};
390
391
/* generated by ../tools/make_charset_ISO-8859-11 */
392
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  UNREPL at 0xDB-0xDE and 0xFC-0xFF.
 */
const gunichar2 charset_table_iso_8859_11[0x80] = {
393
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
394
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
395
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
396
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
397
    0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07,        /* 0xA0 -      */
398
    0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f,        /*      - 0xAF */
399
    0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17,        /* 0xB0 -      */
400
    0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f,        /*      - 0xBF */
401
    0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27,        /* 0xC0 -      */
402
    0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f,        /*      - 0xCF */
403
    0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37,        /* 0xD0 -      */
404
    0x0e38, 0x0e39, 0x0e3a, UNREPL, UNREPL, UNREPL, UNREPL, 0x0e3f,        /*      - 0xDF */
405
    0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47,        /* 0xE0 -      */
406
    0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f,        /*      - 0xEF */
407
    0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57,        /* 0xF0 -      */
408
    0x0e58, 0x0e59, 0x0e5a, 0x0e5b, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xFF */
409
};
410
411
/* generated by ../tools/make_charset_ISO-8859-13 */
412
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F keep their own values; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_13[0x80] = {
413
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
414
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
415
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
416
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
417
    0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7,        /* 0xA0 -      */
418
    0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6,        /*      - 0xAF */
419
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
420
    0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6,        /*      - 0xBF */
421
    0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112,        /* 0xC0 -      */
422
    0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b,        /*      - 0xCF */
423
    0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
424
    0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df,        /*      - 0xDF */
425
    0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113,        /* 0xE0 -      */
426
    0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c,        /*      - 0xEF */
427
    0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
428
    0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019,        /*      - 0xFF */
429
};
430
431
/* generated by ../tools/make_charset_ISO-8859-14 */
432
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F keep their own values; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_14[0x80] = {
433
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
434
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
435
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
436
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
437
    0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7,        /* 0xA0 -      */
438
    0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178,        /*      - 0xAF */
439
    0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56,        /* 0xB0 -      */
440
    0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61,        /*      - 0xBF */
441
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
442
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
443
    0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a,        /* 0xD0 -      */
444
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df,        /*      - 0xDF */
445
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
446
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
447
    0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b,        /* 0xF0 -      */
448
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff,        /*      - 0xFF */
449
};
450
451
/* generated by ../tools/make_charset_ISO-8859-15 */
452
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  Every entry equals its octet value except 0xA4, 0xA6,
 * 0xA8, 0xB4, 0xB8, 0xBC, 0xBD and 0xBE; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_15[0x80] = {
453
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
454
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
455
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
456
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
457
    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7,        /* 0xA0 -      */
458
    0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
459
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
460
    0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf,        /*      - 0xBF */
461
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
462
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
463
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
464
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
465
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
466
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
467
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
468
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,        /*      - 0xFF */
469
};
470
471
/* generated by ../tools/make_charset_ISO-8859-16 */
472
/*
 * 0x80 entries for octets 0x80-0xFF, in octet order (see the row
 * comments).  0x80-0x9F keep their own values; no entry is UNREPL.
 */
const gunichar2 charset_table_iso_8859_16[0x80] = {
473
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
474
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
475
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
476
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
477
    0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7,        /* 0xA0 -      */
478
    0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b,        /*      - 0xAF */
479
    0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7,        /* 0xB0 -      */
480
    0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c,        /*      - 0xBF */
481
    0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7,        /* 0xC0 -      */
482
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
483
    0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a,        /* 0xD0 -      */
484
    0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df,        /*      - 0xDF */
485
    0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7,        /* 0xE0 -      */
486
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
487
    0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b,        /* 0xF0 -      */
488
    0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff,        /*      - 0xFF */
489
};
490
491
/*
492
 * Windows-1250
493
 *
494
 * See:
495
 *     https://en.wikipedia.org/wiki/Windows-1250
496
 *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
497
 */
498
const gunichar2 charset_table_cp1250[0x80] = {
499
    0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
500
    UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,        /*      - 0x8F */
501
    UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
502
    UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,        /*      - 0x9F */
503
    0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,        /* 0xA0 -      */
504
    0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,        /*      - 0xAF */
505
    0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
506
    0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,        /*      - 0xBF */
507
    0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,        /* 0xC0 -      */
508
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,        /*      - 0xCF */
509
    0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,        /* 0xD0 -      */
510
    0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,        /*      - 0xDF */
511
    0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,        /* 0xE0 -      */
512
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,        /*      - 0xEF */
513
    0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,        /* 0xF0 -      */
514
    0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,        /*      - 0xFF */
515
};
516
517
/*
518
 * Windows-1251
519
 *
520
 * See:
521
 *     https://en.wikipedia.org/wiki/Windows-1251
522
 *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
523
 */
524
const gunichar2 charset_table_cp1251[0x80] = {
525
    0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
526
    0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040B, 0x040f,        /*      - 0x8F */
527
    0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
528
    UNREPL, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,        /*      - 0x9F */
529
    0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,        /* 0xA0 -      */
530
    0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,        /*      - 0xAF */
531
    0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
532
    0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,        /*      - 0xBF */
533
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0xC0 -      */
534
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0xCF */
535
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0xD0 -      */
536
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0xDF */
537
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xE0 -      */
538
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xEF */
539
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xF0 -      */
540
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xFF */
541
};
542
543
/*
544
 * Windows-1252
545
 *
546
 * See:
547
 *     https://en.wikipedia.org/wiki/Windows-1252
548
 *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
549
 */
550
const gunichar2 charset_table_cp1252[0x80] = {
551
    0x20ac, UNREPL, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
552
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, UNREPL, 0x0172, UNREPL,        /*      - 0x8F */
553
    UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
554
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, UNREPL, 0x0173, 0x0178,        /*      - 0x9F */
555
    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
556
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
557
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
558
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,        /*      - 0xBF */
559
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
560
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
561
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
562
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
563
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
564
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
565
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
566
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,        /*      - 0xFF */
567
};
568
569
/* generated by ./make_charset_table MACROMAN */
570
/* That's "MacRoman", not "Macro Man" (faster than a speeding recursive expansion!) */
571
const gunichar2 charset_table_mac_roman[0x80] = {
    /* 0x80-0x87 */ 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,
    /* 0x88-0x8F */ 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,
    /* 0x90-0x97 */ 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,
    /* 0x98-0x9F */ 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,
    /* 0xA0-0xA7 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,
    /* 0xA8-0xAF */ 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,
    /* 0xB0-0xB7 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,
    /* 0xB8-0xBF */ 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8,
    /* 0xC0-0xC7 */ 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,
    /* 0xC8-0xCF */ 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,
    /* 0xD0-0xD7 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,
    /* 0xD8-0xDF */ 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02,
    /* 0xE0-0xE7 */ 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,
    /* 0xE8-0xEF */ 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,
    /* 0xF0-0xF7 */ 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,
    /* 0xF8-0xFF */ 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
};
589
590
/* generated by ./make_charset_table CP437 */
591
const gunichar2 charset_table_cp437[0x80] = {
    /* 0x80-0x87 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
    /* 0x88-0x8F */ 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
    /* 0x90-0x97 */ 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
    /* 0x98-0x9F */ 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
    /* 0xA0-0xA7 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
    /* 0xA8-0xAF */ 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
    /* 0xB0-0xB7 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
    /* 0xB8-0xBF */ 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
    /* 0xC0-0xC7 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
    /* 0xC8-0xCF */ 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
    /* 0xD0-0xD7 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
    /* 0xD8-0xDF */ 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
    /* 0xE0-0xE7 */ 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,
    /* 0xE8-0xEF */ 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
    /* 0xF0-0xF7 */ 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,
    /* 0xF8-0xFF */ 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0,
};
609
610
/*
611
 * CP855
612
 *
613
 * See
614
 *     https://en.wikipedia.org/wiki/CP855
615
 *     https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT
616
 *
617
 * XXX - this doesn't have the graphics for 0x00 through 0x1F shown
618
 * on the Wikipedia page, but not in the Microsoft mapping file;
619
 * that would require a 256-code-point mapping table.  (Are those
620
 * positions used for the same graphics on all code pages - the PC
621
 * graphics set, or whatever it's called?)
622
 */
623
const gunichar2 charset_table_cp855[0x80] = {
    0x0452, 0x0402, 0x0453, 0x0403, 0x0451, 0x0401, 0x0454, 0x0404,        /* 0x80 -      */
    0x0455, 0x0405, 0x0456, 0x0406, 0x0457, 0x0407, 0x0458, 0x0408,        /*      - 0x8F */
    0x0459, 0x0409, 0x045a, 0x040a, 0x045b, 0x040b, 0x045c, 0x040c,        /* 0x90 -      */
    0x045e, 0x040e, 0x045f, 0x040f, 0x044e, 0x042e, 0x044a, 0x042a,        /*      - 0x9F */
    0x0430, 0x0410, 0x0431, 0x0411, 0x0446, 0x0426, 0x0434, 0x0414,        /* 0xA0 -      */
    0x0435, 0x0415, 0x0444, 0x0424, 0x0433, 0x0413, 0x00ab, 0x00bb,        /*      - 0xAF */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0445, 0x0425, 0x0438,        /* 0xB0 -      */
    /* 0xBC is U+255D BOX DRAWINGS DOUBLE UP AND LEFT per CP855.TXT (U+2550 belongs at 0xCD) */
    0x0418, 0x2563, 0x2551, 0x2557, 0x255d, 0x0439, 0x0419, 0x2510,        /*      - 0xBF */
    0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x043a, 0x041a,        /* 0xC0 -      */
    0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,        /*      - 0xCF */
    0x043b, 0x041b, 0x043c, 0x041c, 0x043d, 0x041d, 0x043e, 0x041e,        /* 0xD0 -      */
    0x043f, 0x2518, 0x250c, 0x2588, 0x2584, 0x041f, 0x044f, 0x2580,        /*      - 0xDF */
    0x042f, 0x0440, 0x0420, 0x0441, 0x0421, 0x0442, 0x0422, 0x0443,        /* 0xE0 -      */
    0x0423, 0x0436, 0x0416, 0x0432, 0x0412, 0x044c, 0x042c, 0x2116,        /*      - 0xEF */
    0x00ad, 0x044b, 0x042b, 0x0437, 0x0417, 0x0448, 0x0428, 0x044d,        /* 0xF0 -      */
    0x042d, 0x0449, 0x0429, 0x0447, 0x0427, 0x00a7, 0x25a0, 0x00a0,        /*      - 0xFF */
};
641
642
/*
643
 * CP866
644
 *
645
 * See:
646
 *     https://en.wikipedia.org/wiki/CP866
647
 *     https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT
648
 */
649
const gunichar2 charset_table_cp866[0x80] = {
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0x80 -      */
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0x8F */
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0x90 -      */
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0x9F */
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xA0 -      */
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xAF */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,        /* 0xB0 -      */
    0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,        /*      - 0xBF */
    0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,        /* 0xC0 -      */
    0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,        /*      - 0xCF */
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,        /* 0xD0 -      */
    0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,        /*      - 0xDF */
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xE0 -      */
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xEF */
    0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040e, 0x045e,        /* 0xF0 -      */
    /* 0xFC is U+2116 NUMERO SIGN per CP866.TXT (not U+2216 SET MINUS) */
    0x00b0, 0x2219, 0x00b7, 0x221a, 0x2116, 0x00a4, 0x25a0, 0x00a0,        /*      - 0xFF */
};
667
668
/*
669
 * Given a wmem scope, a pointer, a length, and a translation table with
670
 * 128 entries, treat the string of bytes referred to by the pointer and
671
 * length as a string encoded using one octet per character, with octets
672
 * with the high-order bit clear being ASCII and octets with the high-order
673
 * bit set being mapped by the translation table to 2-byte Unicode Basic
674
 * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
675
 * return a pointer to a UTF-8 string, allocated using the wmem scope.
676
 */
677
uint8_t *
678
get_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80])
679
816
{
680
816
    wmem_strbuf_t *str;
681
682
816
    str = wmem_strbuf_new_sized(scope, length+1);
683
684
16.4k
    while (length > 0) {
685
15.6k
        uint8_t ch = *ptr;
686
687
15.6k
        if (ch < 0x80)
688
10.8k
            wmem_strbuf_append_c(str, ch);
689
4.79k
        else
690
4.79k
            wmem_strbuf_append_unichar(str, table[ch-0x80]);
691
15.6k
        ptr++;
692
15.6k
        length--;
693
15.6k
    }
694
695
816
    return (uint8_t *) wmem_strbuf_finalize(str);
696
816
}
697
698
/*
699
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
700
 * referred to by the pointer and length as a UCS-2 encoded string
701
 * containing characters from the Basic Multilingual Plane (plane 0) of
702
 * Unicode, and return a pointer to a UTF-8 string, allocated with the
703
 * wmem scope.
704
 *
705
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
706
 * possibly ORed with ENC_BOM.
707
 *
708
 * Specify length in bytes.
709
 */
710
uint8_t *
711
get_ucs_2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
712
1.94k
{
713
1.94k
    gunichar2      uchar;
714
1.94k
    int            i = 0;       /* Byte counter for string */
715
1.94k
    wmem_strbuf_t *strbuf;
716
717
1.94k
    strbuf = wmem_strbuf_new_sized(scope, length+1);
718
719
1.94k
    if (encoding & ENC_BOM && length >= 2) {
720
0
        if (pletoh16(ptr) == BYTE_ORDER_MARK) {
721
0
            encoding = ENC_LITTLE_ENDIAN;
722
0
            i += 2;
723
0
        } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
724
0
            encoding = ENC_BIG_ENDIAN;
725
0
            i += 2;
726
0
        }
727
0
    }
728
729
1.94k
    encoding = encoding & ENC_LITTLE_ENDIAN;
730
731
52.4k
    for(; i + 1 < length; i += 2) {
732
50.5k
        if (encoding == ENC_BIG_ENDIAN) {
733
49.6k
            uchar = pntoh16(ptr + i);
734
49.6k
        } else {
735
882
            uchar = pletoh16(ptr + i);
736
882
        }
737
50.5k
        wmem_strbuf_append_unichar_validated(strbuf, uchar);
738
50.5k
    }
739
740
    /*
741
     * If i < length, this means we were handed an odd number of bytes;
742
     * insert a REPLACEMENT CHARACTER to mark the error.
743
     */
744
1.94k
    if (i < length) {
745
118
        wmem_strbuf_append_unichar_repl(strbuf);
746
118
    }
747
1.94k
    return (uint8_t *) wmem_strbuf_finalize(strbuf);
748
1.94k
}
749
750
/*
751
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
752
 * referred to by the pointer and length as a UTF-16 encoded string, and
753
 * return a pointer to a UTF-8 string, allocated with the wmem scope.
754
 *
755
 * See RFC 2781 section 2.2.
756
 *
757
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
758
 * possibly ORed with ENC_BOM.
759
 *
760
 * Specify length in bytes.
761
 */
762
uint8_t *
763
get_utf_16_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
764
8.19k
{
765
8.19k
    wmem_strbuf_t *strbuf;
766
8.19k
    gunichar2      uchar2, lead_surrogate;
767
8.19k
    gunichar       uchar;
768
8.19k
    int            i = 0;       /* Byte counter for string */
769
770
8.19k
    strbuf = wmem_strbuf_new_sized(scope, length+1);
771
772
8.19k
    if (encoding & ENC_BOM && length >= 2) {
773
123
        if (pletoh16(ptr) == BYTE_ORDER_MARK) {
774
0
            encoding = ENC_LITTLE_ENDIAN;
775
0
            i += 2;
776
123
        } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
777
0
            encoding = ENC_BIG_ENDIAN;
778
0
            i += 2;
779
0
        }
780
123
    }
781
782
8.19k
    encoding = encoding & ENC_LITTLE_ENDIAN;
783
784
92.5k
    for(; i + 1 < length; i += 2) {
785
84.6k
        if (encoding == ENC_BIG_ENDIAN)
786
4.20k
            uchar2 = pntoh16(ptr + i);
787
80.4k
        else
788
80.4k
            uchar2 = pletoh16(ptr + i);
789
790
84.6k
        if (IS_LEAD_SURROGATE(uchar2)) {
791
            /*
792
             * Lead surrogate.  Must be followed by
793
             * a trail surrogate.
794
             */
795
4.09k
            i += 2;
796
4.09k
            if (i + 1 >= length) {
797
                /*
798
                 * Oops, string ends with a lead surrogate.
799
                 *
800
                 * Insert a REPLACEMENT CHARACTER to mark the error,
801
                 * and quit.
802
                 */
803
238
                wmem_strbuf_append_unichar(strbuf, UNREPL);
804
238
                break;
805
238
            }
806
3.85k
            lead_surrogate = uchar2;
807
3.85k
            if (encoding == ENC_BIG_ENDIAN)
808
206
                uchar2 = pntoh16(ptr + i);
809
3.65k
            else
810
3.65k
                uchar2 = pletoh16(ptr + i);
811
3.85k
            if (IS_TRAIL_SURROGATE(uchar2)) {
812
                /* Trail surrogate. */
813
699
                uchar = SURROGATE_VALUE(lead_surrogate, uchar2);
814
699
                wmem_strbuf_append_unichar(strbuf, uchar);
815
3.16k
            } else {
816
                /*
817
                 * Not a trail surrogate.
818
                 *
819
                 * Insert a REPLACEMENT CHARACTER to mark the error,
820
                 * and continue;
821
                 */
822
3.16k
                wmem_strbuf_append_unichar(strbuf, UNREPL);
823
3.16k
            }
824
80.5k
        } else {
825
80.5k
            if (IS_TRAIL_SURROGATE(uchar2)) {
826
                /*
827
                 * Trail surrogate without a preceding
828
                 * lead surrogate.
829
                 *
830
                 * Insert a REPLACEMENT CHARACTER to mark the error,
831
                 * and continue;
832
                 */
833
1.19k
                wmem_strbuf_append_unichar(strbuf, UNREPL);
834
79.3k
            } else {
835
                /*
836
                 * Non-surrogate; just append it.
837
                 */
838
79.3k
                wmem_strbuf_append_unichar(strbuf, uchar2);
839
79.3k
            }
840
80.5k
        }
841
84.6k
    }
842
843
    /*
844
     * If i < length, this means we were handed an odd number of bytes,
845
     * so we're not a valid UTF-16 string; insert a REPLACEMENT CHARACTER
846
     * to mark the error.
847
     */
848
8.19k
    if (i < length)
849
3.37k
        wmem_strbuf_append_unichar(strbuf, UNREPL);
850
8.19k
    return (uint8_t *) wmem_strbuf_finalize(strbuf);
851
8.19k
}
852
853
/*
854
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
855
 * referred to by the pointer and length as a UCS-4 encoded string, and
856
 * return a pointer to a UTF-8 string, allocated with the wmem scope.
857
 *
858
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
859
 *
860
 * Specify length in bytes
861
 */
862
uint8_t *
863
get_ucs_4_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
864
55
{
865
55
    gunichar       uchar;
866
55
    int            i = 0;       /* Byte counter for string */
867
55
    wmem_strbuf_t *strbuf;
868
869
55
    strbuf = wmem_strbuf_new_sized(scope, length+1);
870
871
55
    if (encoding & ENC_BOM && length >= 4) {
872
0
        if (pletoh32(ptr) == BYTE_ORDER_MARK) {
873
0
            encoding = ENC_LITTLE_ENDIAN;
874
0
            i += 4;
875
0
        } else if (pntoh32(ptr) == BYTE_ORDER_MARK) {
876
0
            encoding = ENC_BIG_ENDIAN;
877
0
            i += 4;
878
0
        }
879
0
    }
880
881
55
    encoding = encoding & ENC_LITTLE_ENDIAN;
882
883
1.57k
    for(; i + 3 < length; i += 4) {
884
1.51k
        if (encoding == ENC_BIG_ENDIAN)
885
1.51k
            uchar = pntoh32(ptr + i);
886
0
        else
887
0
            uchar = pletoh32(ptr + i);
888
889
1.51k
        wmem_strbuf_append_unichar_validated(strbuf, uchar);
890
1.51k
    }
891
892
    /*
893
     * if i < length, this means we were handed a number of bytes
894
     * that's not a multiple of 4, so not a valid UCS-4 string.
895
     * Insert a REPLACEMENT CHARACTER for the remaining bytes.
896
     */
897
55
    if (i < length) {
898
25
        wmem_strbuf_append_unichar(strbuf, UNREPL);
899
25
    }
900
55
    return (uint8_t *)wmem_strbuf_finalize(strbuf);
901
55
}
902
903
/*
904
 * FROM GNOKII
905
 * gsm-encoding.c
906
 * gsm-sms.c
907
 */
908
909
/* ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet */
910
static const gunichar2 gsm_default_alphabet[0x80] = {
911
    '@',   0xa3,  '$',   0xa5,  0xe8,  0xe9,  0xf9,  0xec,
912
    0xf2,  0xc7,  '\n',  0xd8,  0xf8,  '\r',  0xc5,  0xe5,
913
    0x394, '_',   0x3a6, 0x393, 0x39b, 0x3a9, 0x3a0, 0x3a8,
914
    0x3a3, 0x398, 0x39e, 0xa0,  0xc6,  0xe6,  0xdf,  0xc9,
915
    ' ',   '!',   '\"',  '#',   0xa4,  '%',   '&',   '\'',
916
    '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',
917
    '0',   '1',   '2',   '3',   '4',   '5',   '6',   '7',
918
    '8',   '9',   ':',   ';',   '<',   '=',   '>',   '?',
919
    0xa1,  'A',   'B',   'C',   'D',   'E',   'F',   'G',
920
    'H',   'I',   'J',   'K',   'L',   'M',   'N',   'O',
921
    'P',   'Q',   'R',   'S',   'T',   'U',   'V',   'W',
922
    'X',   'Y',   'Z',   0xc4,  0xd6,  0xd1,  0xdc,  0xa7,
923
    0xbf,  'a',   'b',   'c',   'd',   'e',   'f',   'g',
924
    'h',   'i',   'j',   'k',   'l',   'm',   'n',   'o',
925
    'p',   'q',   'r',   's',   't',   'u',   'v',   'w',
926
    'x',   'y',   'z',   0xe4,  0xf6,  0xf1,  0xfc,  0xe0
927
};
928
929
static gunichar
930
GSM_to_UNICHAR(uint8_t c)
931
3.35k
{
932
3.35k
    if (c < G_N_ELEMENTS(gsm_default_alphabet))
933
3.35k
        return gsm_default_alphabet[c];
934
935
0
    return UNREPL;
936
3.35k
}
937
938
static gunichar
939
GSMext_to_UNICHAR(uint8_t c)
940
5
{
941
5
    switch (c)
942
5
    {
943
0
        case 0x0a: return 0x0c; /* form feed */
944
0
        case 0x14: return '^';
945
0
        case 0x28: return '{';
946
0
        case 0x29: return '}';
947
0
        case 0x2f: return '\\';
948
0
        case 0x3c: return '[';
949
0
        case 0x3d: return '~';
950
0
        case 0x3e: return ']';
951
0
        case 0x40: return '|';
952
0
        case 0x65: return 0x20ac; /* euro */
953
5
    }
954
955
5
    return UNREPL; /* invalid character */
956
5
}
957
958
2.03k
/*
 * Mask covering the low-order "bits" bits of a value.  Note that this
 * expands to an expression using a variable named "bits", which must be
 * in scope wherever the macro is used.
 */
#define GN_BYTE_MASK ((1 << bits) - 1)

/* Code point that switches the next character to the extension table. */
#define GN_CHAR_ESCAPE 0x1b

/* Return true if the given code point is the escape code point. */
static bool
char_is_escape(unsigned char value)
{
    return GN_CHAR_ESCAPE == value;
}
967
968
static bool
969
handle_ts_23_038_char(wmem_strbuf_t *strbuf, uint8_t code_point,
970
                      bool saw_escape)
971
3.45k
{
972
3.45k
    gunichar       uchar;
973
974
3.45k
    if (char_is_escape(code_point)) {
975
        /*
976
         * XXX - if saw_escape is true here, then this is
977
         * the case where we escape to "another extension table",
978
         * but TS 128 038 V11.0 doesn't specify such an extension
979
         * table.
980
         */
981
5
        saw_escape = true;
982
3.45k
    } else {
983
3.45k
        if (!(code_point & 0x80)) {
984
            /*
985
             * Code point is valid (7-bit).
986
             * Have we seen an escape?
987
             */
988
3.36k
            if (saw_escape) {
989
5
                saw_escape = false;
990
5
                uchar = GSMext_to_UNICHAR(code_point);
991
3.35k
            } else {
992
3.35k
                uchar = GSM_to_UNICHAR(code_point);
993
3.35k
            }
994
3.36k
            wmem_strbuf_append_unichar(strbuf, uchar);
995
3.36k
        } else {
996
            /* Invalid - put in a REPLACEMENT CHARACTER */
997
92
            wmem_strbuf_append_unichar(strbuf, UNREPL);
998
92
        }
999
3.45k
    }
1000
3.45k
    return saw_escape;
1001
3.45k
}
1002
1003
uint8_t *
1004
get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const uint8_t *ptr,
1005
                                  const int bit_offset, int no_of_chars)
1006
38
{
1007
38
    wmem_strbuf_t *strbuf;
1008
38
    int            char_count;                  /* character counter for string */
1009
38
    uint8_t        in_byte, out_byte, rest = 0x00;
1010
38
    const uint8_t *start_ptr = ptr;
1011
38
    bool           saw_escape = false;
1012
38
    int            bits;
1013
1014
38
    strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1);
1015
1016
38
    bits = bit_offset & 0x07;
1017
38
    if (!bits) {
1018
38
        bits = 7;
1019
38
    }
1020
1021
2.07k
    for(char_count = 0; char_count < no_of_chars; ptr++) {
1022
        /* Get the next byte from the string. */
1023
2.03k
        in_byte = *ptr;
1024
1025
        /*
1026
         * Combine the bits we've accumulated with bits from
1027
         * that byte to make a 7-bit code point.
1028
         */
1029
2.03k
        out_byte = ((in_byte & GN_BYTE_MASK) << (7 - bits)) | rest;
1030
1031
        /*
1032
         * Leftover bits used in that code point.
1033
         */
1034
2.03k
        rest = in_byte >> bits;
1035
1036
        /*
1037
         * If we don't start from 0th bit, we shouldn't go to the
1038
         * next char. Under *out_num we have now 0 and under Rest -
1039
         * _first_ part of the char.
1040
         */
1041
2.03k
        if ((start_ptr != ptr) || (bits == 7)) {
1042
2.03k
            saw_escape = handle_ts_23_038_char(strbuf, out_byte,
1043
2.03k
                saw_escape);
1044
2.03k
            char_count++;
1045
2.03k
        }
1046
1047
        /*
1048
         * After reading 7 octets we have read 7 full characters
1049
         * but we have 7 bits as well. This is the next character.
1050
         */
1051
2.03k
        if ((bits == 1) && (char_count < no_of_chars)) {
1052
271
            saw_escape = handle_ts_23_038_char(strbuf, rest,
1053
271
                saw_escape);
1054
271
            char_count++;
1055
271
            bits = 7;
1056
271
            rest = 0x00;
1057
1.76k
        } else {
1058
1.76k
            bits--;
1059
1.76k
        }
1060
2.03k
    }
1061
1062
38
    if (saw_escape) {
1063
        /*
1064
         * Escape not followed by anything.
1065
         *
1066
         * XXX - for now, show the escape as a REPLACEMENT
1067
         * CHARACTER.
1068
         */
1069
0
        wmem_strbuf_append_unichar(strbuf, UNREPL);
1070
0
    }
1071
1072
38
    return (uint8_t *)wmem_strbuf_finalize(strbuf);
1073
38
}
1074
1075
uint8_t *
1076
get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const uint8_t *ptr,
1077
                           int length)
1078
86
{
1079
86
    wmem_strbuf_t *strbuf;
1080
86
    int            i;       /* Byte counter for string */
1081
86
    bool           saw_escape = false;
1082
1083
86
    strbuf = wmem_strbuf_new_sized(scope, length+1);
1084
1085
1.23k
    for (i = 0; i < length; i++)
1086
1.14k
        saw_escape = handle_ts_23_038_char(strbuf, *ptr++, saw_escape);
1087
1088
86
    return (uint8_t *)wmem_strbuf_finalize(strbuf);
1089
86
}
1090
1091
/*
1092
 * ETSI TS 102 221 Annex A.
1093
 */
1094
uint8_t *
1095
get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const uint8_t *ptr,
1096
                                   int length)
1097
6
{
1098
6
    uint8_t        string_type;
1099
6
    uint8_t        string_len;
1100
6
    gunichar2      ucs2_base;
1101
6
    wmem_strbuf_t *strbuf;
1102
6
    unsigned       i;       /* Byte counter for string */
1103
6
    bool           saw_escape = false;
1104
1105
    /*
1106
     * get the first octet.
1107
     */
1108
6
    if (length == 0) {
1109
        /* XXX - return error indication */
1110
0
        strbuf = wmem_strbuf_new(scope, "");
1111
0
        return (uint8_t *)wmem_strbuf_finalize(strbuf);
1112
0
    }
1113
6
    string_type = *ptr;
1114
6
    ptr++;
1115
6
    length--;
1116
1117
6
    if (string_type == 0x80) {
1118
        /*
1119
         * Annex A, coding scheme 1) - big-endian UCS-2.
1120
         */
1121
0
        return get_ucs_2_string(scope, ptr, length, ENC_BIG_ENDIAN);
1122
0
    }
1123
1124
    /*
1125
     * Annex A, coding schemes 2) and 3):
1126
     *
1127
     *    the second byte is the number of characters (characters,
1128
     *    not octets) in the string;
1129
     *
1130
     *    for coding scheme 2), the third byte defines bits 15 to 8
1131
     *    of all UCS-2 characters in the string (all bit numbers are
1132
     *    1-origin, so bit 1 is the low-order bit), with bit 16 being 0;
1133
     *
1134
     *    for coding scheme 3), the third byte and fourth bytes, treated
1135
     *    as a big-endian value, define the base value for all UCS-2
1136
     *    characters in the string;
1137
     *
1138
     *    for all subsequent bytes, if bit 8 is 0, it's a character
1139
     *    in the GSM Default Alphabet, otherwise, it is added to
1140
     *    the UCS-2 base value to give a UCS-2 character.
1141
     *
1142
     * XXX - that doesn't seem to indicate that a byte of 0x1b is
1143
     * treated as an escape character, it just says that a single octet
1144
     * with the 8th bit not set is a GSM Default Alphabet character.
1145
     */
1146
1147
    /*
1148
     * Get the string length, in characters.
1149
     */
1150
6
    if (length == 0) {
1151
        /* XXX - return error indication */
1152
0
        strbuf = wmem_strbuf_new(scope, "");
1153
0
        return (uint8_t *)wmem_strbuf_finalize(strbuf);
1154
0
    }
1155
6
    string_len = *ptr;
1156
6
    ptr++;
1157
6
    length--;
1158
1159
6
    strbuf = wmem_strbuf_new_sized(scope, 2*string_len+1);
1160
1161
    /*
1162
     * Get the UCS-2 base.
1163
     */
1164
6
    if (string_type == 0x81) {
1165
0
        if (length == 0) {
1166
            /* XXX - return error indication */
1167
0
            return (uint8_t *)wmem_strbuf_finalize(strbuf);
1168
0
  }
1169
0
        ucs2_base = (*ptr) << 7;
1170
0
        ptr++;
1171
0
        length--;
1172
6
    } else if (string_type == 0x82) {
1173
0
        if (length == 0) {
1174
            /* XXX - return error indication */
1175
0
            return (uint8_t *)wmem_strbuf_finalize(strbuf);
1176
0
  }
1177
0
        ucs2_base = (*ptr) << 8;
1178
0
        ptr++;
1179
0
        length--;
1180
1181
0
        if (length == 0) {
1182
            /* XXX - return error indication */
1183
0
            return (uint8_t *)wmem_strbuf_finalize(strbuf);
1184
0
  }
1185
0
        ucs2_base |= *ptr;
1186
0
        ptr++;
1187
0
        length--;
1188
6
    } else {
1189
        /* Invalid string type. */
1190
        /* XXX - return error indication */
1191
6
        return (uint8_t *)wmem_strbuf_finalize(strbuf);
1192
6
    }
1193
1194
0
    for (i = 0; i < string_len; i++) {
1195
0
        uint8_t byte;
1196
1197
0
        if (length == 0) {
1198
            /* XXX - return error indication */
1199
0
            return (uint8_t *)wmem_strbuf_finalize(strbuf);
1200
0
  }
1201
0
        byte = *ptr;
1202
0
        if ((byte & 0x80) == 0) {
1203
0
            saw_escape = handle_ts_23_038_char(strbuf, byte, saw_escape);
1204
0
        } else {
1205
0
            gunichar2 uchar;
1206
1207
            /*
1208
             * XXX - if saw_escape is true, this is bogus.
1209
             *
1210
             * XXX - if there are an odd number of bytes, should put a
1211
             * REPLACEMENT CHARACTER at the end.
1212
             */
1213
0
            uchar = ucs2_base + (byte & 0x7f);
1214
0
            wmem_strbuf_append_unichar_validated(strbuf, uchar);
1215
0
        }
1216
0
    }
1217
1218
0
    return (uint8_t *)wmem_strbuf_finalize(strbuf);
1219
0
}
1220
1221
uint8_t *
1222
get_ascii_7bits_string(wmem_allocator_t *scope, const uint8_t *ptr,
1223
                       const int bit_offset, int no_of_chars)
1224
0
{
1225
0
    wmem_strbuf_t *strbuf;
1226
0
    int            char_count;                  /* character counter for string */
1227
0
    uint8_t        in_byte, out_byte, rest = 0x00;
1228
0
    const uint8_t *start_ptr = ptr;
1229
0
    int            bits;
1230
1231
0
    bits = bit_offset & 0x07;
1232
0
    if (!bits) {
1233
0
        bits = 7;
1234
0
    }
1235
1236
0
    strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1);
1237
0
    for(char_count = 0; char_count < no_of_chars; ptr++) {
1238
        /* Get the next byte from the string. */
1239
0
        in_byte = *ptr;
1240
1241
        /*
1242
         * Combine the bits we've accumulated with bits from
1243
         * that byte to make a 7-bit code point.
1244
         */
1245
0
        out_byte = (in_byte >> (8 - bits)) | rest;
1246
1247
        /*
1248
         * Leftover bits used in that code point.
1249
         */
1250
0
        rest = (in_byte << (bits - 1)) & 0x7f;
1251
1252
        /*
1253
         * If we don't start from 0th bit, we shouldn't go to the
1254
         * next char. Under *out_num we have now 0 and under Rest -
1255
         * _first_ part of the char.
1256
         */
1257
0
        if ((start_ptr != ptr) || (bits == 7)) {
1258
0
            wmem_strbuf_append_c(strbuf, out_byte);
1259
0
            char_count++;
1260
0
        }
1261
1262
        /*
1263
         * After reading 7 octets we have read 7 full characters
1264
         * but we have 7 bits as well. This is the next character.
1265
         */
1266
0
        if ((bits == 1) && (char_count < no_of_chars)) {
1267
0
            wmem_strbuf_append_c(strbuf, rest);
1268
0
            char_count++;
1269
0
            bits = 7;
1270
0
            rest = 0x00;
1271
0
        } else {
1272
0
            bits--;
1273
0
        }
1274
0
    }
1275
1276
0
    return (uint8_t *)wmem_strbuf_finalize(strbuf);
1277
0
}
1278
1279
/* Tables for EBCDIC code pages */
1280
1281
/* EBCDIC common; based on the table in appendix H of ESA/370 Principles
1282
   of Operation, but with some code points that don't correspond to
1283
   the same characters in code pages 037 and 1158 mapped to REPLACEMENT
1284
   CHARACTER - there may be more code points of that sort */
1285
1286
/* There are a few EBCDIC control codes that, strictly speaking, do not
1287
 * map to any control codes in ASCII or Unicode for that matter. The
1288
 * customary treatment is to map them in a particular way to ASCII C1
1289
 * control codes that have no exact equivalent in EBCDIC, as below. */
1290
const gunichar2 charset_table_ebcdic[256] = {
1291
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
1292
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
1293
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
1294
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
1295
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
1296
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
1297
    UNREPL, UNREPL, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
1298
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, UNREPL, 0x001a,
1299
    0x0020, 0x00a0, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1300
    UNREPL, UNREPL, UNREPL, 0x002e, 0x003c, 0x0028, 0x002b, UNREPL,
1301
    0x0026, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1302
    UNREPL, UNREPL, UNREPL, 0x0024, 0x002a, 0x0029, 0x003b, UNREPL,
1303
    0x002d, 0x002f, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1304
    UNREPL, UNREPL, UNREPL, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
1305
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1306
    UNREPL, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
1307
    UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
1308
    0x0068, 0x0069, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1309
    UNREPL, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
1310
    0x0071, 0x0072, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1311
    UNREPL, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
1312
    0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1313
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1314
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1315
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
1316
    0x0048, 0x0049, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1317
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
1318
    0x0051, 0x0052, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1319
    0x005c, UNREPL, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
1320
    0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1321
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1322
    0x0038, 0x0039, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1323
};
1324
1325
/* EBCDIC code page 037 */
1326
const gunichar2 charset_table_ebcdic_cp037[256] = {
    /* Each row holds eight entries; the label is the EBCDIC octet of the
     * first entry in the row. */
    /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    /* 0x08 */ 0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    /* 0x18 */ 0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    /* 0x20 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    /* 0x28 */ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    /* 0x30 */ 0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    /* 0x38 */ 0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
    /* 0x40 */ 0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
    /* 0x48 */ 0x00e7, 0x00f1, 0x00a2, 0x002e, 0x003c, 0x0028, 0x002b, 0x007c,
    /* 0x50 */ 0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
    /* 0x58 */ 0x00ec, 0x00df, 0x0021, 0x0024, 0x002a, 0x0029, 0x003b, 0x00ac,
    /* 0x60 */ 0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
    /* 0x68 */ 0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    /* 0x70 */ 0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
    /* 0x78 */ 0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    /* 0x80 */ 0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    /* 0x88 */ 0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
    /* 0x90 */ 0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    /* 0x98 */ 0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
    /* 0xa0 */ 0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    /* 0xa8 */ 0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
    /* 0xb0 */ 0x005e, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
    /* 0xb8 */ 0x00bd, 0x00be, 0x005b, 0x005d, 0x00af, 0x00a8, 0x00b4, 0x00d7,
    /* 0xc0 */ 0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    /* 0xc8 */ 0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
    /* 0xd0 */ 0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    /* 0xd8 */ 0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
    /* 0xe0 */ 0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    /* 0xe8 */ 0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
    /* 0xf0 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    /* 0xf8 */ 0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
};
1360
1361
/* EBCDIC code page 500
1362
 * https://www.ibm.com/support/pages/conversion-character-differences-between-ccsid-037-and-ccsid-500
1363
 * CCSID 500 ("International Latin-1") has exactly the same repertoire as 37,
1364
 * covering all of ISO-8859-1, but with seven code points permuted.
1365
 * It is notable because it is the default code page for DRDA:
1366
 * https://www.ibm.com/support/pages/drda-user-id-and-password-not-being-transmitted-correctly-when-containing-characters-%C2%AC-%C2%A2?lnk=hm
1367
 */
1368
const gunichar2 charset_table_ebcdic_cp500[256] = {
    /* Each row holds eight entries; the label is the EBCDIC octet of the
     * first entry in the row. */
    /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    /* 0x08 */ 0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    /* 0x18 */ 0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    /* 0x20 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    /* 0x28 */ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    /* 0x30 */ 0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    /* 0x38 */ 0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
    /* 0x40 */ 0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
    /* 0x48 */ 0x00e7, 0x00f1, 0x005b, 0x002e, 0x003c, 0x0028, 0x002b, 0x0021,
    /* 0x50 */ 0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
    /* 0x58 */ 0x00ec, 0x00df, 0x005d, 0x0024, 0x002a, 0x0029, 0x003b, 0x005e,
    /* 0x60 */ 0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
    /* 0x68 */ 0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    /* 0x70 */ 0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
    /* 0x78 */ 0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    /* 0x80 */ 0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    /* 0x88 */ 0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
    /* 0x90 */ 0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    /* 0x98 */ 0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
    /* 0xa0 */ 0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    /* 0xa8 */ 0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
    /* 0xb0 */ 0x00a2, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
    /* 0xb8 */ 0x00bd, 0x00be, 0x00ac, 0x007c, 0x00af, 0x00a8, 0x00b4, 0x00d7,
    /* 0xc0 */ 0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    /* 0xc8 */ 0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
    /* 0xd0 */ 0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    /* 0xd8 */ 0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
    /* 0xe0 */ 0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    /* 0xe8 */ 0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
    /* 0xf0 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    /* 0xf8 */ 0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
};
1402
1403
/*
1404
 * Given a wmem scope, a pointer, a length, and a translation table with
1405
 * 256 entries, treat the string of bytes referred to by the pointer and
1406
 * length as a string encoded using one octet per character, with octets
1407
 * being mapped by the translation table to 2-byte Unicode Basic Multilingual
1408
 * Plane characters (including REPLACEMENT CHARACTER), and return a
1409
 * pointer to a UTF-8 string, allocated using the wmem scope.
1410
 */
1411
uint8_t *
1412
get_nonascii_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[256])
1413
2.52k
{
1414
2.52k
    wmem_strbuf_t *str;
1415
1416
2.52k
    str = wmem_strbuf_new_sized(scope, length+1);
1417
1418
37.6k
    while (length > 0) {
1419
35.1k
        uint8_t ch = *ptr;
1420
1421
35.1k
        wmem_strbuf_append_unichar(str, table[ch]);
1422
35.1k
        ptr++;
1423
35.1k
        length--;
1424
35.1k
    }
1425
1426
2.52k
    return (uint8_t *) wmem_strbuf_finalize(str);
1427
2.52k
}
1428
1429
/*
1430
 * Given a wmem scope, a pointer, a length, and a string referring to an
1431
 * encoding (recognized by iconv), treat the bytes referred to by the pointer
1432
 * and length as a string in that encoding, and return a pointer to a UTF-8
1433
 * string, allocated using the wmem scope, converted from the original
1434
 * encoding having substituted REPLACEMENT CHARACTER according to the
1435
 * Unicode Standard 5.22 U+FFFD Substitution for Conversion
1436
 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1437
 */
1438
static uint8_t *
1439
get_string_enc_iconv(wmem_allocator_t *scope, const uint8_t *ptr, int length, const char *encoding)
1440
909
{
1441
909
    GIConv cd;
1442
909
    size_t inbytes, outbytes;
1443
909
    size_t tempstr_size, bytes_written;
1444
909
    size_t err;
1445
909
    size_t max_subpart, tempinbytes;
1446
909
    char *outptr, *tempstr;
1447
1448
909
    wmem_strbuf_t *str;
1449
1450
909
    if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1451
0
        REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding);
1452
        /* Most likely to be a programming error passing in a bad encoding
1453
         * name. However, could be a issue with the iconv support on the
1454
         * system running WS. GLib requires iconv/libiconv, but is it possible
1455
         * that some versions don't support all common encodings? */
1456
0
    }
1457
1458
909
    inbytes = length;
1459
909
    str = wmem_strbuf_new_sized(scope, length+1);
1460
    /* XXX: If speed becomes an issue, the faster way to do this would
1461
     * involve passing the wmem_strbuf_t's string buffer directly into
1462
     * g_iconv to avoid a memcpy later, but that requires changes to the
1463
     * wmem_strbuf interface to have non const access to the string buffer,
1464
     * and to manipulate the used length directly. */
1465
909
    outbytes = tempstr_size = MAX(8, length);
1466
909
    outptr = tempstr = (char *)g_malloc(outbytes);
1467
3.63k
    while (inbytes > 0) {
1468
2.72k
        err = g_iconv(cd, (char **)&ptr, &inbytes, &outptr, &outbytes);
1469
2.72k
        bytes_written = outptr - tempstr;
1470
2.72k
        wmem_strbuf_append_len(str, tempstr, bytes_written);
1471
2.72k
        outptr = tempstr;
1472
2.72k
        outbytes = tempstr_size;
1473
1474
2.72k
        if (err == (size_t) -1) {
1475
            /* Errors */
1476
1.97k
            switch (errno) {
1477
85
                case EINVAL:
1478
                    /* Incomplete sequence at the end, not an error */
1479
85
                    wmem_strbuf_append_unichar_repl(str);
1480
85
                    inbytes = 0;
1481
85
                    break;
1482
22
                case E2BIG:
1483
                    /* Not enough room (UTF-8 longer than the initial buffer),
1484
                     * start back at the beginning of the buffer */
1485
22
                    break;
1486
1.86k
                case EILSEQ:
1487
                    /* Find the maximal subpart of the ill-formed sequence */
1488
1.86k
                    errno = EINVAL;
1489
4.72k
                    for (max_subpart = 1; err == (size_t)-1 && errno == EINVAL; max_subpart++) {
1490
2.85k
                        tempinbytes = max_subpart;
1491
2.85k
                        err = g_iconv(cd, (char **)&ptr, &tempinbytes,
1492
2.85k
                                &outptr, &outbytes);
1493
2.85k
                    }
1494
1.86k
                    max_subpart = MAX(1, max_subpart-1);
1495
1.86k
                    ptr += max_subpart;
1496
1.86k
                    inbytes -= max_subpart;
1497
1.86k
                    wmem_strbuf_append_unichar_repl(str);
1498
1.86k
                    outptr = tempstr;
1499
1.86k
                    outbytes = tempstr_size;
1500
1.86k
                    break;
1501
0
                default:
1502
                    /* Unexpected conversion error, unrecoverable */
1503
0
                    g_free(tempstr);
1504
0
                    g_iconv_close(cd);
1505
0
                    REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding);
1506
0
                    break;
1507
1.97k
            }
1508
1.97k
        } else {
1509
            /* Otherwise err is the number of replacement characters used,
1510
             * but we don't care about that. */
1511
            /* If we were converting to ISO-2022-JP or some other stateful
1512
             * decoder with shift sequences (e.g. EBCDIC mixed-byte), a
1513
             * final call with NULL input in order to output the shift
1514
             * sequence back to initial state might make sense, but not
1515
             * needed for UTF-8. */
1516
750
        }
1517
2.72k
    }
1518
1519
909
    g_free(tempstr);
1520
909
    g_iconv_close(cd);
1521
909
    return (uint8_t *) wmem_strbuf_finalize(str);
1522
909
}
1523
1524
/*
1525
 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1526
 * by the pointer and length as a GB18030 encoded string, and return a pointer
1527
 * to a UTF-8 string, allocated using the wmem scope, converted having
1528
 * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1529
 * 5.22 U+FFFD Substitution for Conversion.
1530
 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1531
 *
1532
 * As expected, this will also decode GBK and GB2312 strings.
1533
 */
1534
uint8_t *
1535
get_gb18030_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1536
233
{
1537
    /* iconv/libiconv support is guaranteed with GLib. Support this
1538
     * via iconv, at least for now. */
1539
    /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since
1540
     * 2000-10-24 and version 1.4, is there is a system that compiles current
1541
     * Wireshark yet its iconv only supports GBK (~ Windows Code page 936)? */
1542
233
    const char *encoding = "GB18030";
1543
233
    GIConv cd;
1544
233
    if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1545
0
        encoding = "GBK";
1546
        /* GB18030 is backwards compatible, at worst this will mean a few
1547
         * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings
1548
         * from GB18030, which are all pairs of two byte sequences
1549
         * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK
1550
         * and thus the 4 byte characters will be replaced with two
1551
         * REPLACEMENT CHARACTERs. */
1552
233
    } else {
1553
233
        g_iconv_close(cd);
1554
233
    }
1555
233
    return get_string_enc_iconv(scope, ptr, length, encoding);
1556
233
}
1557
1558
/*
1559
 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1560
 * by the pointer and length as a EUC-KR encoded string, and return a pointer
1561
 * to a UTF-8 string, allocated using the wmem scope, converted having
1562
 * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1563
 * 5.22 U+FFFD Substitution for Conversion.
1564
 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1565
 */
1566
uint8_t *
1567
get_euc_kr_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1568
676
{
1569
    /* iconv/libiconv support is guaranteed with GLib. Support this
1570
     * via iconv, at least for now. */
1571
676
    return get_string_enc_iconv(scope, ptr, length, "EUC-KR");
1572
676
}
1573
1574
/* T.61 to UTF-8 conversion table from OpenLDAP project
1575
 * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD
1576
 */
1577
static const gunichar2 t61_tab[] = {
    /* Each row holds eight entries; the label is the T.61 octet of the
     * first entry in the row. A zero entry is treated by get_t61_string()
     * as "no mapping". */
    /* 0x00 */ 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
    /* 0x08 */ 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
    /* 0x10 */ 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
    /* 0x18 */ 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
    /* 0x20 */ 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
    /* 0x28 */ 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
    /* 0x30 */ 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
    /* 0x38 */ 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
    /* 0x40 */ 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
    /* 0x48 */ 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
    /* 0x50 */ 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
    /* 0x58 */ 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
    /* 0x60 */ 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
    /* 0x68 */ 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
    /* 0x70 */ 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
    /* 0x78 */ 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
    /* 0x80 */ 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
    /* 0x88 */ 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
    /* 0x90 */ 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
    /* 0x98 */ 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
    /* 0xa0 */ 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
    /* 0xa8 */ 0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
    /* 0xb0 */ 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
    /* 0xb8 */ 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
    /* 0xc0 */ 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
    /* 0xc8 */ 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
    /* 0xd0 */ 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    /* 0xd8 */ 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    /* 0xe0 */ 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
    /* 0xe8 */ 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
    /* 0xf0 */ 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
    /* 0xf8 */ 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};
1611
1612
/* Fixed-size vectors of UTF-16 code units used by the T.61 tables below:
 * 16 entries (one per low nibble of a 0xc0-0xcf octet) and 32 entries
 * (one per value of the low five bits of a base character). */
typedef gunichar2 wvec16[16];
typedef gunichar2 wvec32[32];
1614
1615
/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
1616
static const wvec16 accents = {
    /* Indexed by the low nibble of the 0xc0-0xcf octet; zero means no
     * stand-alone substitution is defined. */
    /* 0x0 */ 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
    /* 0x8 */ 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7
};
1619
1620
/* In the following tables, base characters commented in (parentheses)
1621
 * are not defined by T.61 but are mapped anyway since their Unicode
1622
 * composite exists.
1623
 */
1624
1625
/* Grave accented chars AEIOU (NWY) */
1626
static const wvec32 c1_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xc0, 0, 0, 0, 0xc8, 0, 0,
    /* 0x08 */ 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
    /* 0x10 */ 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80,
    /* 0x18 */ 0, 0x1ef2, 0, 0, 0, 0, 0, 0
};
static const wvec32 c1_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xe0, 0, 0, 0, 0xe8, 0, 0,
    /* 0x08 */ 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
    /* 0x10 */ 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81,
    /* 0x18 */ 0, 0x1ef3, 0, 0, 0, 0, 0, 0
};

/* Indexed by the top three bits of the base character */
static const wvec32 *c1_grave[] = {
    NULL,     NULL,     &c1_vec1, &c1_vec2,
    NULL,     NULL,     NULL,     NULL
};
1638
1639
/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
1640
static const wvec32 c2_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
    /* 0x08 */ 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
    /* 0x10 */ 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
    /* 0x18 */ 0, 0xdd, 0x179, 0, 0, 0, 0, 0
};
static const wvec32 c2_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
    /* 0x08 */ 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
    /* 0x10 */ 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
    /* 0x18 */ 0, 0xfd, 0x17a, 0, 0, 0, 0, 0
};
static const wvec32 c2_vec3 = {
    /* (AE and ae) */
    /* 0x00 */ 0, 0x1fc, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0x1fd, 0, 0, 0, 0, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};

/* Indexed by the top three bits of the base character */
static const wvec32 *c2_acute[] = {
    NULL,     NULL,     &c2_vec1, &c2_vec2,
    NULL,     NULL,     NULL,     &c2_vec3
};
1660
1661
/* Circumflex AEIOUYCGHJSW (Z) */
1662
static const wvec32 c3_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
    /* 0x08 */ 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
    /* 0x10 */ 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
    /* 0x18 */ 0, 0x176, 0x1e90, 0, 0, 0, 0, 0
};
static const wvec32 c3_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
    /* 0x08 */ 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
    /* 0x10 */ 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
    /* 0x18 */ 0, 0x177, 0x1e91, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *c3_circumflex[] = {
    NULL,     NULL,     &c3_vec1, &c3_vec2,
    NULL,     NULL,     NULL,     NULL
};
1677
1678
/* Tilde AIOUN (EVY) */
1679
static const wvec32 c4_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0,
    /* 0x08 */ 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0,
    /* 0x18 */ 0, 0x1ef8, 0, 0, 0, 0, 0, 0
};
static const wvec32 c4_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0,
    /* 0x08 */ 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0,
    /* 0x18 */ 0, 0x1ef9, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *c4_tilde[] = {
    NULL,     NULL,     &c4_vec1, &c4_vec2,
    NULL,     NULL,     NULL,     NULL
};
1690
1691
/* Macron AEIOU (YG) */
1692
static const wvec32 c5_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20,
    /* 0x08 */ 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x16a, 0, 0,
    /* 0x18 */ 0, 0x232, 0, 0, 0, 0, 0, 0
};
static const wvec32 c5_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21,
    /* 0x08 */ 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x16b, 0, 0,
    /* 0x18 */ 0, 0x233, 0, 0, 0, 0, 0, 0
};
static const wvec32 c5_vec3 = {
    /* (AE and ae) */
    /* 0x00 */ 0, 0x1e2, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0x1e3, 0, 0, 0, 0, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *c5_macron[] = {
    NULL,     NULL,     &c5_vec1, &c5_vec2,
    NULL,     NULL,     NULL,     &c5_vec3
};
1707
1708
/* Breve AUG (EIO) */
1709
static const wvec32 c6_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e,
    /* 0x08 */ 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x16c, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
static const wvec32 c6_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f,
    /* 0x08 */ 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x16d, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *c6_breve[] = {
    NULL,     NULL,     &c6_vec1, &c6_vec2,
    NULL,     NULL,     NULL,     NULL
};
1720
1721
/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
1722
static const wvec32 c7_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
    /* 0x08 */ 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
    /* 0x10 */ 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
    /* 0x18 */ 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0
};
static const wvec32 c7_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
    /* 0x08 */ 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
    /* 0x10 */ 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
    /* 0x18 */ 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *c7_dotabove[] = {
    NULL,     NULL,     &c7_vec1, &c7_vec2,
    NULL,     NULL,     NULL,     NULL
};
1737
1738
/* Diaeresis AEIOUY (HWXt) */
1739
static const wvec32 c8_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xc4, 0, 0, 0, 0xcb, 0, 0,
    /* 0x08 */ 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
    /* 0x10 */ 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84,
    /* 0x18 */ 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0
};
static const wvec32 c8_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xe4, 0, 0, 0, 0xeb, 0, 0,
    /* 0x08 */ 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
    /* 0x10 */ 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85,
    /* 0x18 */ 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *c8_diaeresis[] = {
    NULL,     NULL,     &c8_vec1, &c8_vec2,
    NULL,     NULL,     NULL,     NULL
};
1750
1751
/* Ring Above AU (wy) */
1752
static const wvec32 ca_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xc5, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x16e, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
static const wvec32 ca_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0xe5, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98,
    /* 0x18 */ 0, 0x1e99, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *ca_ringabove[] = {
    NULL,     NULL,     &ca_vec1, &ca_vec2,
    NULL,     NULL,     NULL,     NULL
};
1763
1764
/* Cedilla CGKLNRST (EDH) */
1765
static const wvec32 cb_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
    /* 0x08 */ 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
    /* 0x10 */ 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
static const wvec32 cb_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
    /* 0x08 */ 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
    /* 0x10 */ 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *cb_cedilla[] = {
    NULL,     NULL,     &cb_vec1, &cb_vec2,
    NULL,     NULL,     NULL,     NULL
};
1778
1779
/* Double Acute Accent OU */
1780
static const wvec32 cd_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 0, 0, 0, 0, 0, 0x150,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x170, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
static const wvec32 cd_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 0, 0, 0, 0, 0, 0x151,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x171, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *cd_doubleacute[] = {
    NULL,     NULL,     &cd_vec1, &cd_vec2,
    NULL,     NULL,     NULL,     NULL
};
1791
1792
/* Ogonek AEIU (O) */
1793
static const wvec32 ce_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x104, 0, 0, 0, 0x118, 0, 0,
    /* 0x08 */ 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x172, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
static const wvec32 ce_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x105, 0, 0, 0, 0x119, 0, 0,
    /* 0x08 */ 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
    /* 0x10 */ 0, 0, 0, 0, 0, 0x173, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *ce_ogonek[] = {
    NULL,     NULL,     &ce_vec1, &ce_vec2,
    NULL,     NULL,     NULL,     NULL
};
1804
1805
/* Caron CDELNRSTZ (AIOUGKjH) */
1806
static const wvec32 cf_vec1 = {
    /* Upper case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
    /* 0x08 */ 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
    /* 0x10 */ 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
    /* 0x18 */ 0, 0, 0x17d, 0, 0, 0, 0, 0
};
static const wvec32 cf_vec2 = {
    /* Lower case; index is the low five bits of the base character */
    /* 0x00 */ 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
    /* 0x08 */ 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
    /* 0x10 */ 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
    /* 0x18 */ 0, 0, 0x17e, 0, 0, 0, 0, 0
};
/* Indexed by the top three bits of the base character */
static const wvec32 *cf_caron[] = {
    NULL,     NULL,     &cf_vec1, &cf_vec2,
    NULL,     NULL,     NULL,     NULL
};
1821
1822
static const wvec32 **cx_tab[] = {
1823
    NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
1824
    c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
1825
    cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
1826
1827
uint8_t *
1828
get_t61_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1829
13
{
1830
13
    int            i;
1831
13
    const uint8_t *c;
1832
13
    wmem_strbuf_t *strbuf;
1833
1834
13
    strbuf = wmem_strbuf_new_sized(scope, length+1);
1835
1836
188
    for (i = 0, c = ptr; i < length; c++, i++) {
1837
175
        if (!t61_tab[*c]) {
1838
34
            wmem_strbuf_append_unichar(strbuf, UNREPL);
1839
141
        } else if (i < length - 1 && (*c & 0xf0) == 0xc0) {
1840
0
            int j = *c & 0x0f;
1841
            /* If this is the end of the string, or if the base
1842
             * character is just a space, treat this as a regular
1843
             * spacing character.
1844
             */
1845
0
            if ((!c[1] || c[1] == 0x20) && accents[j]) {
1846
0
                wmem_strbuf_append_unichar(strbuf, accents[j]);
1847
0
            } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
1848
                /* We have a composite mapping for this pair */
1849
0
                       (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
1850
0
                wmem_strbuf_append_unichar(strbuf, (*cx_tab[j][c[1]>>5])[c[1]&0x1f]);
1851
0
            } else {
1852
                /* No mapping, just swap it around so the base
1853
                 * character comes first.
1854
                 */
1855
0
                wmem_strbuf_append_unichar(strbuf, c[1]);
1856
0
                wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1857
0
            }
1858
0
            c++; i++;
1859
0
            continue;
1860
141
        } else {
1861
141
            wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1862
141
        }
1863
175
    }
1864
1865
13
    return (uint8_t *)wmem_strbuf_finalize(strbuf);
1866
13
}
1867
1868
/* The DECT standard charset from ETSI EN 300 175-5 Annex D
1869
 */
1870
static const gunichar2 dect_standard_8bits_code_table[] = {
    /* Each row holds eight entries; the label is the input octet of the
     * first entry in the row. Only octets 0x00-0x7f have entries. */
    /* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    /* 0x08 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    /* 0x18 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    /* 0x20 */ ' ',  '!',  '\"', '#',  '$',  '%',  '&',  '\'',
    /* 0x28 */ '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
    /* 0x30 */ '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
    /* 0x38 */ '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
    /* 0x40 */ '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
    /* 0x48 */ 'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
    /* 0x50 */ 'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
    /* 0x58 */ 'X',  'Y',  'Z',  '[', '\\',  ']',  '^',  '_',
    /* 0x60 */ '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
    /* 0x68 */ 'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    /* 0x70 */ 'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    /* 0x78 */ 'x',  'y',  'z',  '{',  '|',  '}',  '~', 0x7f,
};
1888
1889
uint8_t *
1890
get_dect_standard_8bits_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1891
0
{
1892
0
    int            position;
1893
0
    const uint8_t *current_byte_ptr;
1894
0
    wmem_strbuf_t *strbuf;
1895
1896
0
    strbuf = wmem_strbuf_new_sized(scope, length+1);
1897
1898
0
    for (position = 0, current_byte_ptr = ptr; position < length; current_byte_ptr++, position++) {
1899
0
        if (*current_byte_ptr & 0x80) {
1900
0
            wmem_strbuf_append_unichar(strbuf, UNREPL);
1901
0
        } else if (!dect_standard_8bits_code_table[*current_byte_ptr]) {
1902
0
            wmem_strbuf_append_unichar(strbuf, UNREPL);
1903
0
        } else {
1904
0
            wmem_strbuf_append_unichar(strbuf, dect_standard_8bits_code_table[*current_byte_ptr]);
1905
0
        }
1906
0
    }
1907
1908
0
    return (uint8_t *)wmem_strbuf_finalize(strbuf);
1909
0
}
1910
/*
1911
 * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
1912
 *
1913
 * Local variables:
1914
 * c-basic-offset: 4
1915
 * tab-width: 8
1916
 * indent-tabs-mode: nil
1917
 * End:
1918
 *
1919
 * vi: set shiftwidth=4 tabstop=8 expandtab:
1920
 * :indentSize=4:tabSize=8:noTabs=true:
1921
 */