/src/irssi/subprojects/glib-2.74.3/glib/gtranslit.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright © 2014 Canonical Limited |
3 | | * |
4 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
5 | | * |
6 | | * This library is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * This library is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | | * |
19 | | * Author: Ryan Lortie <desrt@desrt.ca> |
20 | | */ |
21 | | |
22 | | #include <config.h> |
23 | | |
24 | | #include "gstrfuncs.h" |
25 | | |
26 | | #include <glib.h> |
27 | | #include <locale.h> |
28 | | #include <stdlib.h> |
29 | | #include <string.h> |
30 | | |
31 | | struct mapping_entry |
32 | | { |
33 | | guint16 src; |
34 | | guint16 ascii; |
35 | | }; |
36 | | |
37 | | struct mapping_range |
38 | | { |
39 | | guint16 start; |
40 | | guint16 length; |
41 | | }; |
42 | | |
43 | | struct locale_entry |
44 | | { |
45 | | guint8 name_offset; |
46 | | guint8 item_id; |
47 | | }; |
48 | | |
49 | | #include "gtranslit-data.h" |
50 | | |
/* Decoders for the packed 16-bit fields of struct mapping_entry.
 *
 * If bit 15 of @encoded is set, the low 12 bits are an index into @array
 * and bits 12-14 are the number of items stored there.  If it is clear,
 * @encoded is itself the one and only item.
 *
 * Every use of a macro argument is parenthesised so that compound
 * expressions (e.g. `a | b`, `cond ? x : y`) expand with the intended
 * precedence.
 *
 * get_src_char():   the @index'th character of the encoded source key.
 * get_length():     the number of items the encoded value stands for.
 * get_ascii_item(): pointer to the first byte of the encoded replacement.
 *                   @encoded must be an lvalue, since in the inline case
 *                   the returned pointer aims at the byte of @encoded
 *                   that holds the character (hence the byte-order split).
 */
#define get_src_char(array, encoded, index) (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
#define get_length(encoded)                 (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)

#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif
59 | | |
60 | | static const gchar * lookup_in_item (guint item_id, |
61 | | const gunichar *key, |
62 | | gint *result_len, |
63 | | gint *key_consumed); |
64 | | |
65 | | static gint |
66 | | compare_mapping_entry (gconstpointer user_data, |
67 | | gconstpointer data) |
68 | 0 | { |
69 | 0 | const struct mapping_entry *entry = data; |
70 | 0 | const gunichar *key = user_data; |
71 | 0 | gunichar src_0; |
72 | |
|
73 | 0 | G_STATIC_ASSERT(MAX_KEY_SIZE == 2); |
74 | |
|
75 | 0 | src_0 = get_src_char (src_table, entry->src, 0); |
76 | |
|
77 | 0 | if (key[0] > src_0) |
78 | 0 | return 1; |
79 | 0 | else if (key[0] < src_0) |
80 | 0 | return -1; |
81 | | |
82 | 0 | if (get_length (entry->src) > 1) |
83 | 0 | { |
84 | 0 | gunichar src_1; |
85 | |
|
86 | 0 | src_1 = get_src_char (src_table, entry->src, 1); |
87 | |
|
88 | 0 | if (key[1] > src_1) |
89 | 0 | return 1; |
90 | 0 | else if (key[1] < src_1) |
91 | 0 | return -1; |
92 | 0 | } |
93 | 0 | else if (key[1]) |
94 | 0 | return 1; |
95 | | |
96 | 0 | return 0; |
97 | 0 | } |
98 | | |
99 | | static const gchar * |
100 | | lookup_in_mapping (const struct mapping_entry *mapping, |
101 | | gint mapping_size, |
102 | | const gunichar *key, |
103 | | gint *result_len, |
104 | | gint *key_consumed) |
105 | 0 | { |
106 | 0 | const struct mapping_entry *hit; |
107 | |
|
108 | 0 | hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry); |
109 | |
|
110 | 0 | if (hit == NULL) |
111 | 0 | return NULL; |
112 | | |
113 | 0 | *key_consumed = get_length (hit->src); |
114 | 0 | *result_len = get_length (hit->ascii); |
115 | |
|
116 | 0 | return get_ascii_item(ascii_table, hit->ascii); |
117 | 0 | } |
118 | | |
119 | | static const gchar * |
120 | | lookup_in_chain (const guint8 *chain, |
121 | | const gunichar *key, |
122 | | gint *result_len, |
123 | | gint *key_consumed) |
124 | 0 | { |
125 | 0 | const gchar *result; |
126 | |
|
127 | 0 | while (*chain != 0xff) |
128 | 0 | { |
129 | 0 | result = lookup_in_item (*chain, key, result_len, key_consumed); |
130 | |
|
131 | 0 | if (result) |
132 | 0 | return result; |
133 | | |
134 | 0 | chain++; |
135 | 0 | } |
136 | | |
137 | 0 | return NULL; |
138 | 0 | } |
139 | | |
140 | | static const gchar * |
141 | | lookup_in_item (guint item_id, |
142 | | const gunichar *key, |
143 | | gint *result_len, |
144 | | gint *key_consumed) |
145 | 0 | { |
146 | 0 | if (item_id & 0x80) |
147 | 0 | { |
148 | 0 | const guint8 *chain = chains_table + chain_starts[item_id & 0x7f]; |
149 | |
|
150 | 0 | return lookup_in_chain (chain, key, result_len, key_consumed); |
151 | 0 | } |
152 | 0 | else |
153 | 0 | { |
154 | 0 | const struct mapping_range *range = &mapping_ranges[item_id]; |
155 | |
|
156 | 0 | return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed); |
157 | 0 | } |
158 | 0 | } |
159 | | |
160 | | static gint |
161 | | compare_locale_entry (gconstpointer user_data, |
162 | | gconstpointer data) |
163 | 0 | { |
164 | 0 | const struct locale_entry *entry = data; |
165 | 0 | const gchar *key = user_data; |
166 | |
|
167 | 0 | return strcmp (key, &locale_names[entry->name_offset]); |
168 | 0 | } |
169 | | |
170 | | static gboolean |
171 | | lookup_item_id_for_one_locale (const gchar *key, |
172 | | guint *item_id) |
173 | 0 | { |
174 | 0 | const struct locale_entry *hit; |
175 | |
|
176 | 0 | hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry); |
177 | |
|
178 | 0 | if (hit == NULL) |
179 | 0 | return FALSE; |
180 | | |
181 | 0 | *item_id = hit->item_id; |
182 | 0 | return TRUE; |
183 | 0 | } |
184 | | |
185 | | static guint |
186 | | lookup_item_id_for_locale (const gchar *locale) |
187 | 0 | { |
188 | 0 | gchar key[MAX_LOCALE_NAME + 1]; |
189 | 0 | const gchar *language; |
190 | 0 | guint language_len; |
191 | 0 | const gchar *territory = NULL; |
192 | 0 | guint territory_len = 0; |
193 | 0 | const gchar *modifier = NULL; |
194 | 0 | guint modifier_len = 0; |
195 | 0 | const gchar *next_char; |
196 | 0 | guint id; |
197 | | |
198 | | /* As per POSIX, a valid locale looks like: |
199 | | * |
200 | | * language[_territory][.codeset][@modifier] |
201 | | */ |
202 | 0 | language = locale; |
203 | 0 | language_len = strcspn (language, "_.@"); |
204 | 0 | next_char = language + language_len; |
205 | |
|
206 | 0 | if (*next_char == '_') |
207 | 0 | { |
208 | 0 | territory = next_char; |
209 | 0 | territory_len = strcspn (territory + 1, "_.@") + 1; |
210 | 0 | next_char = territory + territory_len; |
211 | 0 | } |
212 | |
|
213 | 0 | if (*next_char == '.') |
214 | 0 | { |
215 | 0 | const gchar *codeset; |
216 | 0 | guint codeset_len; |
217 | |
|
218 | 0 | codeset = next_char; |
219 | 0 | codeset_len = strcspn (codeset + 1, "_.@") + 1; |
220 | 0 | next_char = codeset + codeset_len; |
221 | 0 | } |
222 | |
|
223 | 0 | if (*next_char == '@') |
224 | 0 | { |
225 | 0 | modifier = next_char; |
226 | 0 | modifier_len = strcspn (modifier + 1, "_.@") + 1; |
227 | 0 | next_char = modifier + modifier_len; |
228 | 0 | } |
229 | | |
230 | | /* What madness is this? */ |
231 | 0 | if (language_len == 0 || *next_char) |
232 | 0 | return default_item_id; |
233 | | |
234 | | /* We are not interested in codeset. |
235 | | * |
236 | | * For this locale: |
237 | | * |
238 | | * aa_BB@cc |
239 | | * |
240 | | * try in this order: |
241 | | * |
242 | | * Note: we have no locales of the form aa_BB@cc in the database. |
243 | | * |
244 | | * 1. aa@cc |
245 | | * 2. aa_BB |
246 | | * 3. aa |
247 | | */ |
248 | | |
249 | | /* 1. */ |
250 | 0 | if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME) |
251 | 0 | { |
252 | 0 | memcpy (key, language, language_len); |
253 | 0 | memcpy (key + language_len, modifier, modifier_len); |
254 | 0 | key[language_len + modifier_len] = '\0'; |
255 | |
|
256 | 0 | if (lookup_item_id_for_one_locale (key, &id)) |
257 | 0 | return id; |
258 | 0 | } |
259 | | |
260 | | /* 2. */ |
261 | 0 | if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME) |
262 | 0 | { |
263 | 0 | memcpy (key, language, language_len); |
264 | 0 | memcpy (key + language_len, territory, territory_len); |
265 | 0 | key[language_len + territory_len] = '\0'; |
266 | |
|
267 | 0 | if (lookup_item_id_for_one_locale (key, &id)) |
268 | 0 | return id; |
269 | 0 | } |
270 | | |
271 | | /* 3. */ |
272 | 0 | if (language_len <= MAX_LOCALE_NAME) |
273 | 0 | { |
274 | 0 | memcpy (key, language, language_len); |
275 | 0 | key[language_len] = '\0'; |
276 | |
|
277 | 0 | if (lookup_item_id_for_one_locale (key, &id)) |
278 | 0 | return id; |
279 | 0 | } |
280 | | |
281 | 0 | return default_item_id; |
282 | 0 | } |
283 | | |
284 | | static guint |
285 | | get_default_item_id (void) |
286 | 0 | { |
287 | 0 | static guint item_id; |
288 | 0 | static gboolean done; |
289 | | |
290 | | /* Doesn't need to be locked -- no harm in doing it twice. */ |
291 | 0 | if (!done) |
292 | 0 | { |
293 | 0 | const gchar *locale; |
294 | |
|
295 | 0 | locale = setlocale (LC_CTYPE, NULL); |
296 | 0 | item_id = lookup_item_id_for_locale (locale); |
297 | 0 | done = TRUE; |
298 | 0 | } |
299 | |
|
300 | 0 | return item_id; |
301 | 0 | } |
302 | | |
303 | | /** |
304 | | * g_str_to_ascii: |
305 | | * @str: a string, in UTF-8 |
306 | | * @from_locale: (nullable): the source locale, if known |
307 | | * |
308 | | * Transliterate @str to plain ASCII. |
309 | | * |
310 | | * For best results, @str should be in composed normalised form. |
311 | | * |
312 | | * This function performs a reasonably good set of character |
313 | | * replacements. The particular set of replacements that is done may |
314 | | * change by version or even by runtime environment. |
315 | | * |
316 | | * If the source language of @str is known, it can used to improve the |
317 | | * accuracy of the translation by passing it as @from_locale. It should |
318 | | * be a valid POSIX locale string (of the form |
319 | | * `language[_territory][.codeset][@modifier]`). |
320 | | * |
321 | | * If @from_locale is %NULL then the current locale is used. |
322 | | * |
323 | | * If you want to do translation for no specific locale, and you want it |
324 | | * to be done independently of the currently locale, specify `"C"` for |
325 | | * @from_locale. |
326 | | * |
327 | | * Returns: a string in plain ASCII |
328 | | * |
329 | | * Since: 2.40 |
330 | | **/ |
331 | | gchar * |
332 | | g_str_to_ascii (const gchar *str, |
333 | | const gchar *from_locale) |
334 | 0 | { |
335 | 0 | GString *result; |
336 | 0 | guint item_id; |
337 | |
|
338 | 0 | g_return_val_if_fail (str != NULL, NULL); |
339 | | |
340 | 0 | if (g_str_is_ascii (str)) |
341 | 0 | return g_strdup (str); |
342 | | |
343 | 0 | if (from_locale) |
344 | 0 | item_id = lookup_item_id_for_locale (from_locale); |
345 | 0 | else |
346 | 0 | item_id = get_default_item_id (); |
347 | |
|
348 | 0 | result = g_string_sized_new (strlen (str)); |
349 | |
|
350 | 0 | while (*str) |
351 | 0 | { |
352 | | /* We only need to transliterate non-ASCII values... */ |
353 | 0 | if (*str & 0x80) |
354 | 0 | { |
355 | 0 | gunichar key[MAX_KEY_SIZE]; |
356 | 0 | const gchar *r; |
357 | 0 | gint consumed; |
358 | 0 | gint r_len; |
359 | 0 | gunichar c; |
360 | |
|
361 | 0 | G_STATIC_ASSERT(MAX_KEY_SIZE == 2); |
362 | |
|
363 | 0 | c = g_utf8_get_char (str); |
364 | | |
365 | | /* This is where it gets evil... |
366 | | * |
367 | | * We know that MAX_KEY_SIZE is 2. We also know that we |
368 | | * only want to try another character if it's non-ascii. |
369 | | */ |
370 | 0 | str = g_utf8_next_char (str); |
371 | |
|
372 | 0 | key[0] = c; |
373 | 0 | if (*str & 0x80) |
374 | 0 | key[1] = g_utf8_get_char (str); |
375 | 0 | else |
376 | 0 | key[1] = 0; |
377 | |
|
378 | 0 | r = lookup_in_item (item_id, key, &r_len, &consumed); |
379 | | |
380 | | /* If we failed to map two characters, try again with one. |
381 | | * |
382 | | * gconv behaviour is a bit weird here -- it seems to |
383 | | * depend in the randomness of the binary search and the |
384 | | * size of the input buffer as to what result we get here. |
385 | | * |
386 | | * Doing it this way is more work, but should be |
387 | | * more-correct. |
388 | | */ |
389 | 0 | if (r == NULL && key[1]) |
390 | 0 | { |
391 | 0 | key[1] = 0; |
392 | 0 | r = lookup_in_item (item_id, key, &r_len, &consumed); |
393 | 0 | } |
394 | |
|
395 | 0 | if (r != NULL) |
396 | 0 | { |
397 | 0 | g_string_append_len (result, r, r_len); |
398 | 0 | if (consumed == 2) |
399 | | /* If it took both then skip again */ |
400 | 0 | str = g_utf8_next_char (str); |
401 | 0 | } |
402 | 0 | else /* no match found */ |
403 | 0 | g_string_append_c (result, '?'); |
404 | 0 | } |
405 | 0 | else /* ASCII case */ |
406 | 0 | g_string_append_c (result, *str++); |
407 | 0 | } |
408 | |
|
409 | 0 | return g_string_free (result, FALSE); |
410 | 0 | } |