Coverage Report

Created: 2025-10-10 07:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/glib/glib/gtranslit.c
Line
Count
Source
1
/*
2
 * Copyright © 2014 Canonical Limited
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2.1 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
16
 *
17
 * Author: Ryan Lortie <desrt@desrt.ca>
18
 */
19
20
#include <config.h>
21
22
#include "gstrfuncs.h"
23
24
#include <glib.h>
25
#include <locale.h>
26
#include <stdlib.h>
27
#include <string.h>
28
29
struct mapping_entry
30
{
31
  guint16 src;
32
  guint16 ascii;
33
};
34
35
struct mapping_range
36
{
37
  guint16 start;
38
  guint16 length;
39
};
40
41
struct locale_entry
42
{
43
  guint8 name_offset;
44
  guint8 item_id;
45
};
46
47
#include "gtranslit-data.h"
48
49
0
/* Decoding helpers for the 16-bit "encoded" values in struct mapping_entry.
 *
 * If bit 15 (0x8000) is clear, the value stands for itself and has
 * length 1.  If it is set, the low 12 bits are an offset into a side
 * table and bits 12-14 hold the length of the sequence found there.
 *
 * Every argument is parenthesised in the expansion so that compound
 * expressions can be passed safely.  Arguments may still be evaluated
 * more than once, so they must be free of side effects.
 */
#define get_src_char(array, encoded, index) (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
#define get_length(encoded)                 (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)

/* Yields a pointer to the replacement bytes for @encoded.
 *
 * @encoded must be an lvalue: for an inline value the result points at
 * the low-order byte of @encoded itself, which is the second byte on
 * big-endian machines and the first byte otherwise.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif
57
58
static const gchar * lookup_in_item (guint           item_id,
59
                                     const gunichar *key,
60
                                     gint           *result_len,
61
                                     gint           *key_consumed);
62
63
static gint
64
compare_mapping_entry (gconstpointer user_data,
65
                       gconstpointer data)
66
0
{
67
0
  const struct mapping_entry *entry = data;
68
0
  const gunichar *key = user_data;
69
0
  gunichar src_0;
70
71
0
  G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
72
73
0
  src_0 = get_src_char (src_table, entry->src, 0);
74
75
0
  if (key[0] > src_0)
76
0
    return 1;
77
0
  else if (key[0] < src_0)
78
0
    return -1;
79
80
0
  if (get_length (entry->src) > 1)
81
0
    {
82
0
      gunichar src_1;
83
84
0
      src_1 = get_src_char (src_table, entry->src, 1);
85
86
0
      if (key[1] > src_1)
87
0
        return 1;
88
0
      else if (key[1] < src_1)
89
0
        return -1;
90
0
    }
91
0
  else if (key[1])
92
0
    return 1;
93
94
0
  return 0;
95
0
}
96
97
static const gchar *
98
lookup_in_mapping (const struct mapping_entry *mapping,
99
                   gint                        mapping_size,
100
                   const gunichar             *key,
101
                   gint                       *result_len,
102
                   gint                       *key_consumed)
103
0
{
104
0
  const struct mapping_entry *hit;
105
106
0
  hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry);
107
108
0
  if (hit == NULL)
109
0
    return NULL;
110
111
0
  *key_consumed = get_length (hit->src);
112
0
  *result_len = get_length (hit->ascii);
113
114
0
  return get_ascii_item(ascii_table, hit->ascii);
115
0
}
116
117
static const gchar *
118
lookup_in_chain (const guint8   *chain,
119
                 const gunichar *key,
120
                 gint           *result_len,
121
                 gint           *key_consumed)
122
0
{
123
0
  const gchar *result;
124
125
0
  while (*chain != 0xff)
126
0
    {
127
0
      result = lookup_in_item (*chain, key, result_len, key_consumed);
128
129
0
      if (result)
130
0
        return result;
131
132
0
      chain++;
133
0
    }
134
135
0
  return NULL;
136
0
}
137
138
static const gchar *
139
lookup_in_item (guint           item_id,
140
                const gunichar *key,
141
                gint           *result_len,
142
                gint           *key_consumed)
143
0
{
144
0
  if (item_id & 0x80)
145
0
    {
146
0
      const guint8 *chain = chains_table + chain_starts[item_id & 0x7f];
147
148
0
      return lookup_in_chain (chain, key, result_len, key_consumed);
149
0
    }
150
0
  else
151
0
    {
152
0
      const struct mapping_range *range = &mapping_ranges[item_id];
153
154
0
      return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed);
155
0
    }
156
0
}
157
158
static gint
159
compare_locale_entry (gconstpointer user_data,
160
                      gconstpointer data)
161
0
{
162
0
  const struct locale_entry *entry = data;
163
0
  const gchar *key = user_data;
164
165
0
  return strcmp (key, &locale_names[entry->name_offset]);
166
0
}
167
168
static gboolean
169
lookup_item_id_for_one_locale (const gchar *key,
170
                               guint       *item_id)
171
0
{
172
0
  const struct locale_entry *hit;
173
174
0
  hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry);
175
176
0
  if (hit == NULL)
177
0
    return FALSE;
178
179
0
  *item_id = hit->item_id;
180
0
  return TRUE;
181
0
}
182
183
static guint
184
lookup_item_id_for_locale (const gchar *locale)
185
0
{
186
0
  gchar key[MAX_LOCALE_NAME + 1];
187
0
  const gchar *language;
188
0
  guint language_len;
189
0
  const gchar *territory = NULL;
190
0
  guint territory_len = 0;
191
0
  const gchar *modifier = NULL;
192
0
  guint modifier_len = 0;
193
0
  const gchar *next_char;
194
0
  guint id;
195
196
  /* As per POSIX, a valid locale looks like:
197
   *
198
   *   language[_territory][.codeset][@modifier]
199
   */
200
0
  language = locale;
201
0
  language_len = strcspn (language, "_.@");
202
0
  next_char = language + language_len;
203
204
0
  if (*next_char == '_')
205
0
    {
206
0
      territory = next_char;
207
0
      territory_len = strcspn (territory + 1, "_.@") + 1;
208
0
      next_char = territory + territory_len;
209
0
    }
210
211
0
  if (*next_char == '.')
212
0
    {
213
0
      const gchar *codeset;
214
0
      guint codeset_len;
215
216
0
      codeset = next_char;
217
0
      codeset_len = strcspn (codeset + 1, "_.@") + 1;
218
0
      next_char = codeset + codeset_len;
219
0
    }
220
221
0
  if (*next_char == '@')
222
0
    {
223
0
      modifier = next_char;
224
0
      modifier_len = strcspn (modifier + 1, "_.@") + 1;
225
0
      next_char = modifier + modifier_len;
226
0
    }
227
228
  /* What madness is this? */
229
0
  if (language_len == 0 || *next_char)
230
0
    return default_item_id;
231
232
  /* We are not interested in codeset.
233
   *
234
   * For this locale:
235
   *
236
   *  aa_BB@cc
237
   *
238
   * try in this order:
239
   *
240
   * Note: we have no locales of the form aa_BB@cc in the database.
241
   *
242
   *  1. aa@cc
243
   *  2. aa_BB
244
   *  3. aa
245
   */
246
247
  /* 1. */
248
0
  if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
249
0
    {
250
0
      memcpy (key, language, language_len);
251
0
      memcpy (key + language_len, modifier, modifier_len);
252
0
      key[language_len + modifier_len] = '\0';
253
254
0
      if (lookup_item_id_for_one_locale (key, &id))
255
0
        return id;
256
0
    }
257
258
  /* 2. */
259
0
  if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
260
0
    {
261
0
      memcpy (key, language, language_len);
262
0
      memcpy (key + language_len, territory, territory_len);
263
0
      key[language_len + territory_len] = '\0';
264
265
0
      if (lookup_item_id_for_one_locale (key, &id))
266
0
        return id;
267
0
    }
268
269
  /* 3. */
270
0
  if (language_len <= MAX_LOCALE_NAME)
271
0
    {
272
0
      memcpy (key, language, language_len);
273
0
      key[language_len] = '\0';
274
275
0
      if (lookup_item_id_for_one_locale (key, &id))
276
0
        return id;
277
0
    }
278
279
0
  return default_item_id;
280
0
}
281
282
static guint
283
get_default_item_id (void)
284
0
{
285
0
  static guint item_id;
286
0
  static gboolean done;
287
288
  /* Doesn't need to be locked -- no harm in doing it twice. */
289
0
  if (!done)
290
0
    {
291
0
      const gchar *locale;
292
293
0
      locale = setlocale (LC_CTYPE, NULL);
294
0
      item_id = lookup_item_id_for_locale (locale);
295
0
      done = TRUE;
296
0
    }
297
298
0
  return item_id;
299
0
}
300
301
/**
302
 * g_str_to_ascii:
303
 * @str: a string, in UTF-8
304
 * @from_locale: (nullable): the source locale, if known
305
 *
306
 * Transliterate @str to plain ASCII.
307
 *
308
 * For best results, @str should be in composed normalised form.
309
 *
310
 * This function performs a reasonably good set of character
311
 * replacements.  The particular set of replacements that is done may
312
 * change by version or even by runtime environment.
313
 *
314
 * If the source language of @str is known, it can be used to improve the
315
 * accuracy of the translation by passing it as @from_locale.  It should
316
 * be a valid POSIX locale string (of the form
317
 * `language[_territory][.codeset][@modifier]`).
318
 *
319
 * If @from_locale is %NULL then the current locale is used.
320
 *
321
 * If you want to do translation for no specific locale, and you want it
322
 * to be done independently of the current locale, specify `"C"` for
323
 * @from_locale.
324
 *
325
 * Returns: a string in plain ASCII
326
 *
327
 * Since: 2.40
328
 **/
329
gchar *
330
g_str_to_ascii (const gchar *str,
331
                const gchar *from_locale)
332
0
{
333
0
  GString *result;
334
0
  guint item_id;
335
336
0
  g_return_val_if_fail (str != NULL, NULL);
337
338
0
  if (g_str_is_ascii (str))
339
0
    return g_strdup (str);
340
341
0
  if (from_locale)
342
0
    item_id = lookup_item_id_for_locale (from_locale);
343
0
  else
344
0
    item_id = get_default_item_id ();
345
346
0
  result = g_string_sized_new (strlen (str));
347
348
0
  while (*str)
349
0
    {
350
      /* We only need to transliterate non-ASCII values... */
351
0
      if (*str & 0x80)
352
0
        {
353
0
          gunichar key[MAX_KEY_SIZE];
354
0
          const gchar *r;
355
0
          gint consumed;
356
0
          gint r_len;
357
0
          gunichar c;
358
359
0
          G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
360
361
0
          c = g_utf8_get_char (str);
362
363
          /* This is where it gets evil...
364
           *
365
           * We know that MAX_KEY_SIZE is 2.  We also know that we
366
           * only want to try another character if it's non-ascii.
367
           */
368
0
          str = g_utf8_next_char (str);
369
370
0
          key[0] = c;
371
0
          if (*str & 0x80)
372
0
            key[1] = g_utf8_get_char (str);
373
0
          else
374
0
            key[1] = 0;
375
376
0
          r = lookup_in_item (item_id, key, &r_len, &consumed);
377
378
          /* If we failed to map two characters, try again with one.
379
           *
380
           * gconv behaviour is a bit weird here -- it seems to
381
           * depend in the randomness of the binary search and the
382
           * size of the input buffer as to what result we get here.
383
           *
384
           * Doing it this way is more work, but should be
385
           * more-correct.
386
           */
387
0
          if (r == NULL && key[1])
388
0
            {
389
0
              key[1] = 0;
390
0
              r = lookup_in_item (item_id, key, &r_len, &consumed);
391
0
            }
392
393
0
          if (r != NULL)
394
0
            {
395
0
              g_string_append_len (result, r, r_len);
396
0
              if (consumed == 2)
397
                /* If it took both then skip again */
398
0
                str = g_utf8_next_char (str);
399
0
            }
400
0
          else /* no match found */
401
0
            g_string_append_c (result, '?');
402
0
        }
403
0
      else /* ASCII case */
404
0
        g_string_append_c (result, *str++);
405
0
    }
406
407
0
  return g_string_free (result, FALSE);
408
0
}