/src/irssi/subprojects/glib-2.74.3/glib/gtranslit.c

Source (jump to first uncovered line)
/*
 * Copyright © 2014 Canonical Limited
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Ryan Lortie <desrt@desrt.ca>
 */

#include <config.h>

#include "gstrfuncs.h"

#include <glib.h>
#include <locale.h>
#include <stdlib.h>
#include <string.h>

/* One transliteration rule: a source key and its ASCII replacement.
 *
 * Both fields are 16-bit encoded values that are decoded with
 * get_src_char(), get_length() and get_ascii_item().
 */
struct mapping_entry
{
  guint16 src;    /* encoded source key, decoded against src_table */
  guint16 ascii;  /* encoded replacement, decoded against ascii_table */
};

/* Describes one mapping as a slice of mappings_table; the slice is what
 * lookup_in_item() hands to lookup_in_mapping() for binary search.
 */
struct mapping_range
{
  guint16 start;   /* offset of the first entry within mappings_table */
  guint16 length;  /* number of struct mapping_entry elements in the slice */
};

/* One row of locale_index: associates a locale name with the item to
 * use when transliterating for that locale.
 */
struct locale_entry
{
  guint8 name_offset;  /* offset of the NUL-terminated name within locale_names */
  guint8 item_id;      /* item id, as accepted by lookup_in_item() */
};

#include "gtranslit-data.h"

/* Decoders for the 16-bit `src` / `ascii` fields of struct mapping_entry.
 *
 *  - bit 15 clear: the value is stored inline in the field itself and
 *    its length is 1;
 *  - bit 15 set: bits 0-11 are an index into the companion table
 *    (@array) and bits 12-14 hold the length.
 *
 * Every macro argument is parenthesized at each use so that callers can
 * pass arbitrary expressions (e.g. `a | b`) without the expansion being
 * re-associated by operator precedence.
 *
 * get_ascii_item() yields a pointer to the replacement bytes.  For the
 * inline case it takes the address of @encoded, which therefore has to
 * be an lvalue; on big-endian hosts the pointer is advanced by one byte,
 * i.e. to the low-order byte of the 16-bit field.
 */
#define get_src_char(array, encoded, index) (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
#define get_length(encoded)                 (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)

#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif

/* Forward declaration: lookup_in_chain() calls lookup_in_item() before
 * the latter is defined, and lookup_in_item() in turn calls
 * lookup_in_chain().
 */
static const gchar * lookup_in_item (guint           item_id,
                                     const gunichar *key,
                                     gint           *result_len,
                                     gint           *key_consumed);

/* bsearch() comparator: orders a lookup key against one mapping entry.
 *
 * @user_data: the key, an array of two gunichars; key[1] is 0 when only
 *   a single character is being looked up
 * @data: the struct mapping_entry being probed
 *
 * The first character is compared first.  On a tie, a two-character
 * entry is compared on its second character, while a one-character
 * entry sorts before any key that carries a second character.
 *
 * Returns: -1, 0 or 1 as the key sorts before, equal to, or after the
 *   entry
 */
static gint
compare_mapping_entry (gconstpointer user_data,
                       gconstpointer data)
{
  const gunichar *wanted = user_data;
  const struct mapping_entry *candidate = data;
  gunichar first;
  gunichar second;

  G_STATIC_ASSERT(MAX_KEY_SIZE == 2);

  first = get_src_char (src_table, candidate->src, 0);
  if (wanted[0] != first)
    return (wanted[0] > first) ? 1 : -1;

  /* Single-character entry: equal only if the key has no second char. */
  if (get_length (candidate->src) <= 1)
    return wanted[1] ? 1 : 0;

  second = get_src_char (src_table, candidate->src, 1);
  if (wanted[1] != second)
    return (wanted[1] > second) ? 1 : -1;

  return 0;
}

/* Binary-search one mapping for @key.
 *
 * @mapping: first entry of the mapping to search
 * @mapping_size: number of entries in @mapping
 * @key: the key to look up (see compare_mapping_entry())
 * @result_len: (out): length of the replacement; written only on a hit
 * @key_consumed: (out): decoded length of the matching entry's source
 *   key; written only on a hit
 *
 * Returns: pointer to the replacement bytes (not NUL-terminated; use
 *   @result_len), or NULL when @key is not in the mapping
 */
static const gchar *
lookup_in_mapping (const struct mapping_entry *mapping,
                   gint                        mapping_size,
                   const gunichar             *key,
                   gint                       *result_len,
                   gint                       *key_consumed)
{
  const struct mapping_entry *match = bsearch (key, mapping, mapping_size,
                                               sizeof *mapping,
                                               compare_mapping_entry);

  if (match != NULL)
    {
      *key_consumed = get_length (match->src);
      *result_len = get_length (match->ascii);

      return get_ascii_item(ascii_table, match->ascii);
    }

  return NULL;
}

/* Try each item of a chain in order and return the first hit.
 *
 * @chain: sequence of item ids terminated by the byte 0xff
 * @key, @result_len, @key_consumed: passed through unchanged to
 *   lookup_in_item()
 *
 * Returns: the first non-NULL result produced by an item of the chain,
 *   or NULL if no item matched
 */
static const gchar *
lookup_in_chain (const guint8   *chain,
                 const gunichar *key,
                 gint           *result_len,
                 gint           *key_consumed)
{
  const guint8 *link;

  for (link = chain; *link != 0xff; link++)
    {
      const gchar *found = lookup_in_item (*link, key, result_len, key_consumed);

      if (found != NULL)
        return found;
    }

  return NULL;
}

/* Resolve @key against an item, which is either a chain or a mapping.
 *
 * @item_id: if bit 7 is set, the low 7 bits index chain_starts and the
 *   lookup is delegated to lookup_in_chain(); otherwise the id indexes
 *   mapping_ranges and the lookup is delegated to lookup_in_mapping()
 * @key, @result_len, @key_consumed: passed through to the delegate
 *
 * Returns: the delegate's result (replacement bytes or NULL)
 */
static const gchar *
lookup_in_item (guint           item_id,
                const gunichar *key,
                gint           *result_len,
                gint           *key_consumed)
{
  const struct mapping_range *range;

  if ((item_id & 0x80) != 0)
    return lookup_in_chain (chains_table + chain_starts[item_id & 0x7f],
                            key, result_len, key_consumed);

  range = &mapping_ranges[item_id];

  return lookup_in_mapping (mappings_table + range->start, range->length,
                            key, result_len, key_consumed);
}

/* bsearch() comparator: orders a locale name against one locale_index row.
 *
 * @user_data: the NUL-terminated locale name being looked up
 * @data: the struct locale_entry being probed
 *
 * Returns: the strcmp() ordering of the key against the entry's name
 */
static gint
compare_locale_entry (gconstpointer user_data,
                      gconstpointer data)
{
  const gchar *wanted = user_data;
  const struct locale_entry *candidate = data;

  return strcmp (wanted, locale_names + candidate->name_offset);
}

/* Look up one exact locale name in locale_index.
 *
 * @key: NUL-terminated locale name
 * @item_id: (out): item id of the matching row; written only on a hit
 *
 * Returns: TRUE if @key was found, FALSE otherwise
 */
static gboolean
lookup_item_id_for_one_locale (const gchar *key,
                               guint       *item_id)
{
  const struct locale_entry *match;

  match = bsearch (key, locale_index, G_N_ELEMENTS (locale_index),
                   sizeof (struct locale_entry), compare_locale_entry);

  if (match != NULL)
    {
      *item_id = match->item_id;
      return TRUE;
    }

  return FALSE;
}

/* Map a POSIX locale string to the item id used for transliteration.
 *
 * @locale: locale string of the form
 *   language[_territory][.codeset][@modifier]
 *
 * The string is split into its components; the codeset is skipped.
 * Candidate names are then tried in order -- language@modifier,
 * language_territory, language -- and the first one found in the locale
 * index wins.
 *
 * Component lengths are kept as size_t (the type strcspn() returns) and
 * the bounds checks are written as subtractions, so that neither a
 * narrowing conversion nor a wrapped sum can let an over-long component
 * reach the memcpy() calls into the fixed-size key buffer.
 *
 * Returns: the matching item id, or default_item_id when the string is
 *   malformed, has an empty or over-long language, or matches nothing
 */
static guint
lookup_item_id_for_locale (const gchar *locale)
{
  gchar key[MAX_LOCALE_NAME + 1];
  const gchar *language;
  size_t language_len;
  const gchar *territory = NULL;
  size_t territory_len = 0;
  const gchar *modifier = NULL;
  size_t modifier_len = 0;
  const gchar *next_char;
  guint id;

  /* As per POSIX, a valid locale looks like:
   *
   *   language[_territory][.codeset][@modifier]
   *
   * Each optional component's length includes its leading separator.
   */
  language = locale;
  language_len = strcspn (language, "_.@");
  next_char = language + language_len;

  if (*next_char == '_')
    {
      territory = next_char;
      territory_len = strcspn (territory + 1, "_.@") + 1;
      next_char = territory + territory_len;
    }

  /* The codeset is irrelevant here: just step over it. */
  if (*next_char == '.')
    next_char += strcspn (next_char + 1, "_.@") + 1;

  if (*next_char == '@')
    {
      modifier = next_char;
      modifier_len = strcspn (modifier + 1, "_.@") + 1;
      next_char = modifier + modifier_len;
    }

  /* Empty language, or trailing text that fits no component. */
  if (language_len == 0 || *next_char)
    return default_item_id;

  /* No candidate name can fit if the language alone is too long. */
  if (language_len > MAX_LOCALE_NAME)
    return default_item_id;

  /* For the locale aa_BB@cc, try in this order:
   *
   *  1. aa@cc
   *  2. aa_BB
   *  3. aa
   *
   * Note: we have no locales of the form aa_BB@cc in the database.
   */

  /* 1. */
  if (modifier_len && modifier_len <= MAX_LOCALE_NAME - language_len)
    {
      memcpy (key, language, language_len);
      memcpy (key + language_len, modifier, modifier_len);
      key[language_len + modifier_len] = '\0';

      if (lookup_item_id_for_one_locale (key, &id))
        return id;
    }

  /* 2. */
  if (territory_len && territory_len <= MAX_LOCALE_NAME - language_len)
    {
      memcpy (key, language, language_len);
      memcpy (key + language_len, territory, territory_len);
      key[language_len + territory_len] = '\0';

      if (lookup_item_id_for_one_locale (key, &id))
        return id;
    }

  /* 3. (language_len <= MAX_LOCALE_NAME was established above) */
  memcpy (key, language, language_len);
  key[language_len] = '\0';

  if (lookup_item_id_for_one_locale (key, &id))
    return id;

  return default_item_id;
}

/* Return the item id for the process's current LC_CTYPE locale.
 *
 * The id is computed on the first call, from the name reported by
 * setlocale (LC_CTYPE, NULL), and cached in function-local statics for
 * later calls.
 */
static guint
get_default_item_id (void)
{
  static guint cached_item_id;
  static gboolean initialised;

  /* Doesn't need to be locked -- no harm in doing it twice. */
  if (initialised)
    return cached_item_id;

  cached_item_id = lookup_item_id_for_locale (setlocale (LC_CTYPE, NULL));
  initialised = TRUE;

  return cached_item_id;
}

/**
 * g_str_to_ascii:
 * @str: a string, in UTF-8
 * @from_locale: (nullable): the source locale, if known
 *
 * Transliterate @str to plain ASCII.
 *
 * For best results, @str should be in composed normalised form.
 *
 * This function performs a reasonably good set of character
 * replacements.  The particular set of replacements that is done may
 * change by version or even by runtime environment.
 *
 * If the source language of @str is known, it can be used to improve
 * the accuracy of the translation by passing it as @from_locale.  It
 * should be a valid POSIX locale string (of the form
 * `language[_territory][.codeset][@modifier]`).
 *
 * If @from_locale is %NULL then the current locale is used.
 *
 * If you want to do translation for no specific locale, and you want it
 * to be done independently of the current locale, specify `"C"` for
 * @from_locale.
 *
 * Characters for which no replacement is known are emitted as `?`.
 *
 * Returns: a newly allocated string in plain ASCII; free it with
 *   g_free() when no longer needed
 *
 * Since: 2.40
 **/
gchar *
g_str_to_ascii (const gchar *str,
                const gchar *from_locale)
{
  GString *out;
  guint item_id;

  g_return_val_if_fail (str != NULL, NULL);

  /* Already plain ASCII: nothing to transliterate. */
  if (g_str_is_ascii (str))
    return g_strdup (str);

  item_id = from_locale ? lookup_item_id_for_locale (from_locale)
                        : get_default_item_id ();

  out = g_string_sized_new (strlen (str));

  while (*str != '\0')
    {
      gunichar key[MAX_KEY_SIZE];
      const gchar *replacement;
      gint replacement_len;
      gint consumed;

      G_STATIC_ASSERT(MAX_KEY_SIZE == 2);

      /* ASCII bytes are copied through untouched. */
      if ((*str & 0x80) == 0)
        {
          g_string_append_c (out, *str++);
          continue;
        }

      /* Build a key of up to two characters.  We know MAX_KEY_SIZE is
       * 2, and a second character is only worth trying when it is
       * itself non-ASCII.  Note that str is already advanced past the
       * first character here.
       */
      key[0] = g_utf8_get_char (str);
      str = g_utf8_next_char (str);
      key[1] = (*str & 0x80) ? g_utf8_get_char (str) : 0;

      replacement = lookup_in_item (item_id, key, &replacement_len, &consumed);

      /* If the two-character key did not map, retry with just the
       * first character.
       *
       * gconv behaviour is a bit weird here -- its result seems to
       * depend on the randomness of the binary search and the size of
       * the input buffer.  Retrying explicitly is more work, but
       * should be more correct.
       */
      if (replacement == NULL && key[1] != 0)
        {
          key[1] = 0;
          replacement = lookup_in_item (item_id, key, &replacement_len, &consumed);
        }

      if (replacement == NULL)
        {
          /* no match found */
          g_string_append_c (out, '?');
          continue;
        }

      g_string_append_len (out, replacement, replacement_len);

      /* A two-character match also swallows the second character. */
      if (consumed == 2)
        str = g_utf8_next_char (str);
    }

  return g_string_free (out, FALSE);
}

Coverage Report

Created: 2025-07-11 07:30

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright © 2014 Canonical Limited
3		*
4		* SPDX-License-Identifier: LGPL-2.1-or-later
5		*
6		* This library is free software; you can redistribute it and/or
7		* modify it under the terms of the GNU Lesser General Public
8		* License as published by the Free Software Foundation; either
9		* version 2.1 of the License, or (at your option) any later version.
10		*
11		* This library is distributed in the hope that it will be useful,
12		* but WITHOUT ANY WARRANTY; without even the implied warranty of
13		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14		* Lesser General Public License for more details.
15		*
16		* You should have received a copy of the GNU Lesser General Public
17		* License along with this library; if not, see <http://www.gnu.org/licenses/>.
18		*
19		* Author: Ryan Lortie <desrt@desrt.ca>
20		*/
21
22		#include <config.h>
23
24		#include "gstrfuncs.h"
25
26		#include <glib.h>
27		#include <locale.h>
28		#include <stdlib.h>
29		#include <string.h>
30
31		struct mapping_entry
32		{
33		guint16 src;
34		guint16 ascii;
35		};
36
37		struct mapping_range
38		{
39		guint16 start;
40		guint16 length;
41		};
42
43		struct locale_entry
44		{
45		guint8 name_offset;
46		guint8 item_id;
47		};
48
49		#include "gtranslit-data.h"
50
51	0	#define get_src_char(array, encoded, index) ((encoded & 0x8000) ? (array)[((encoded) & 0xfff) + index] : encoded)
52	0	#define get_length(encoded) ((encoded & 0x8000) ? ((encoded & 0x7000) >> 12) : 1)
53
54		#if G_BYTE_ORDER == G_BIG_ENDIAN
55		#define get_ascii_item(array, encoded) ((encoded & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
56		#else
57	0	#define get_ascii_item(array, encoded) ((encoded & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
58		#endif
59
60		static const gchar * lookup_in_item (guint item_id,
61		const gunichar *key,
62		gint *result_len,
63		gint *key_consumed);
64
65		static gint
66		compare_mapping_entry (gconstpointer user_data,
67		gconstpointer data)
68	0	{
69	0	const struct mapping_entry *entry = data;
70	0	const gunichar *key = user_data;
71	0	gunichar src_0;
72
73	0	G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
74
75	0	src_0 = get_src_char (src_table, entry->src, 0);
76
77	0	if (key[0] > src_0)
78	0	return 1;
79	0	else if (key[0] < src_0)
80	0	return -1;
81
82	0	if (get_length (entry->src) > 1)
83	0	{
84	0	gunichar src_1;
85
86	0	src_1 = get_src_char (src_table, entry->src, 1);
87
88	0	if (key[1] > src_1)
89	0	return 1;
90	0	else if (key[1] < src_1)
91	0	return -1;
92	0	}
93	0	else if (key[1])
94	0	return 1;
95
96	0	return 0;
97	0	}
98
99		static const gchar *
100		lookup_in_mapping (const struct mapping_entry *mapping,
101		gint mapping_size,
102		const gunichar *key,
103		gint *result_len,
104		gint *key_consumed)
105	0	{
106	0	const struct mapping_entry *hit;
107
108	0	hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry);
109
110	0	if (hit == NULL)
111	0	return NULL;
112
113	0	*key_consumed = get_length (hit->src);
114	0	*result_len = get_length (hit->ascii);
115
116	0	return get_ascii_item(ascii_table, hit->ascii);
117	0	}
118
119		static const gchar *
120		lookup_in_chain (const guint8 *chain,
121		const gunichar *key,
122		gint *result_len,
123		gint *key_consumed)
124	0	{
125	0	const gchar *result;
126
127	0	while (*chain != 0xff)
128	0	{
129	0	result = lookup_in_item (*chain, key, result_len, key_consumed);
130
131	0	if (result)
132	0	return result;
133
134	0	chain++;
135	0	}
136
137	0	return NULL;
138	0	}
139
140		static const gchar *
141		lookup_in_item (guint item_id,
142		const gunichar *key,
143		gint *result_len,
144		gint *key_consumed)
145	0	{
146	0	if (item_id & 0x80)
147	0	{
148	0	const guint8 *chain = chains_table + chain_starts[item_id & 0x7f];
149
150	0	return lookup_in_chain (chain, key, result_len, key_consumed);
151	0	}
152	0	else
153	0	{
154	0	const struct mapping_range *range = &mapping_ranges[item_id];
155
156	0	return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed);
157	0	}
158	0	}
159
160		static gint
161		compare_locale_entry (gconstpointer user_data,
162		gconstpointer data)
163	0	{
164	0	const struct locale_entry *entry = data;
165	0	const gchar *key = user_data;
166
167	0	return strcmp (key, &locale_names[entry->name_offset]);
168	0	}
169
170		static gboolean
171		lookup_item_id_for_one_locale (const gchar *key,
172		guint *item_id)
173	0	{
174	0	const struct locale_entry *hit;
175
176	0	hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry);
177
178	0	if (hit == NULL)
179	0	return FALSE;
180
181	0	*item_id = hit->item_id;
182	0	return TRUE;
183	0	}
184
185		static guint
186		lookup_item_id_for_locale (const gchar *locale)
187	0	{
188	0	gchar key[MAX_LOCALE_NAME + 1];
189	0	const gchar *language;
190	0	guint language_len;
191	0	const gchar *territory = NULL;
192	0	guint territory_len = 0;
193	0	const gchar *modifier = NULL;
194	0	guint modifier_len = 0;
195	0	const gchar *next_char;
196	0	guint id;
197
198		/* As per POSIX, a valid locale looks like:
199		*
200		* language[_territory][.codeset][@modifier]
201		*/
202	0	language = locale;
203	0	language_len = strcspn (language, "_.@");
204	0	next_char = language + language_len;
205
206	0	if (*next_char == '_')
207	0	{
208	0	territory = next_char;
209	0	territory_len = strcspn (territory + 1, "_.@") + 1;
210	0	next_char = territory + territory_len;
211	0	}
212
213	0	if (*next_char == '.')
214	0	{
215	0	const gchar *codeset;
216	0	guint codeset_len;
217
218	0	codeset = next_char;
219	0	codeset_len = strcspn (codeset + 1, "_.@") + 1;
220	0	next_char = codeset + codeset_len;
221	0	}
222
223	0	if (*next_char == '@')
224	0	{
225	0	modifier = next_char;
226	0	modifier_len = strcspn (modifier + 1, "_.@") + 1;
227	0	next_char = modifier + modifier_len;
228	0	}
229
230		/* What madness is this? */
231	0	if (language_len == 0 || *next_char)
232	0	return default_item_id;
233
234		/* We are not interested in codeset.
235		*
236		* For this locale:
237		*
238		* aa_BB@cc
239		*
240		* try in this order:
241		*
242		* Note: we have no locales of the form aa_BB@cc in the database.
243		*
244		* 1. aa@cc
245		* 2. aa_BB
246		* 3. aa
247		*/
248
249		/* 1. */
250	0	if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
251	0	{
252	0	memcpy (key, language, language_len);
253	0	memcpy (key + language_len, modifier, modifier_len);
254	0	key[language_len + modifier_len] = '\0';
255
256	0	if (lookup_item_id_for_one_locale (key, &id))
257	0	return id;
258	0	}
259
260		/* 2. */
261	0	if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
262	0	{
263	0	memcpy (key, language, language_len);
264	0	memcpy (key + language_len, territory, territory_len);
265	0	key[language_len + territory_len] = '\0';
266
267	0	if (lookup_item_id_for_one_locale (key, &id))
268	0	return id;
269	0	}
270
271		/* 3. */
272	0	if (language_len <= MAX_LOCALE_NAME)
273	0	{
274	0	memcpy (key, language, language_len);
275	0	key[language_len] = '\0';
276
277	0	if (lookup_item_id_for_one_locale (key, &id))
278	0	return id;
279	0	}
280
281	0	return default_item_id;
282	0	}
283
284		static guint
285		get_default_item_id (void)
286	0	{
287	0	static guint item_id;
288	0	static gboolean done;
289
290		/* Doesn't need to be locked -- no harm in doing it twice. */
291	0	if (!done)
292	0	{
293	0	const gchar *locale;
294
295	0	locale = setlocale (LC_CTYPE, NULL);
296	0	item_id = lookup_item_id_for_locale (locale);
297	0	done = TRUE;
298	0	}
299
300	0	return item_id;
301	0	}
302
303		/**
304		* g_str_to_ascii:
305		* @str: a string, in UTF-8
306		* @from_locale: (nullable): the source locale, if known
307		*
308		* Transliterate @str to plain ASCII.
309		*
310		* For best results, @str should be in composed normalised form.
311		*
312		* This function performs a reasonably good set of character
313		* replacements. The particular set of replacements that is done may
314		* change by version or even by runtime environment.
315		*
316		* If the source language of @str is known, it can used to improve the
317		* accuracy of the translation by passing it as @from_locale. It should
318		* be a valid POSIX locale string (of the form
319		* `language[_territory][.codeset][@modifier]`).
320		*
321		* If @from_locale is %NULL then the current locale is used.
322		*
323		* If you want to do translation for no specific locale, and you want it
324		* to be done independently of the currently locale, specify `"C"` for
325		* @from_locale.
326		*
327		* Returns: a string in plain ASCII
328		*
329		* Since: 2.40
330		**/
331		gchar *
332		g_str_to_ascii (const gchar *str,
333		const gchar *from_locale)
334	0	{
335	0	GString *result;
336	0	guint item_id;
337
338	0	g_return_val_if_fail (str != NULL, NULL);
339
340	0	if (g_str_is_ascii (str))
341	0	return g_strdup (str);
342
343	0	if (from_locale)
344	0	item_id = lookup_item_id_for_locale (from_locale);
345	0	else
346	0	item_id = get_default_item_id ();
347
348	0	result = g_string_sized_new (strlen (str));
349
350	0	while (*str)
351	0	{
352		/* We only need to transliterate non-ASCII values... */
353	0	if (*str & 0x80)
354	0	{
355	0	gunichar key[MAX_KEY_SIZE];
356	0	const gchar *r;
357	0	gint consumed;
358	0	gint r_len;
359	0	gunichar c;
360
361	0	G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
362
363	0	c = g_utf8_get_char (str);
364
365		/* This is where it gets evil...
366		*
367		* We know that MAX_KEY_SIZE is 2. We also know that we
368		* only want to try another character if it's non-ascii.
369		*/
370	0	str = g_utf8_next_char (str);
371
372	0	key[0] = c;
373	0	if (*str & 0x80)
374	0	key[1] = g_utf8_get_char (str);
375	0	else
376	0	key[1] = 0;
377
378	0	r = lookup_in_item (item_id, key, &r_len, &consumed);
379
380		/* If we failed to map two characters, try again with one.
381		*
382		* gconv behaviour is a bit weird here -- it seems to
383		* depend in the randomness of the binary search and the
384		* size of the input buffer as to what result we get here.
385		*
386		* Doing it this way is more work, but should be
387		* more-correct.
388		*/
389	0	if (r == NULL && key[1])
390	0	{
391	0	key[1] = 0;
392	0	r = lookup_in_item (item_id, key, &r_len, &consumed);
393	0	}
394
395	0	if (r != NULL)
396	0	{
397	0	g_string_append_len (result, r, r_len);
398	0	if (consumed == 2)
399		/* If it took both then skip again */
400	0	str = g_utf8_next_char (str);
401	0	}
402	0	else /* no match found */
403	0	g_string_append_c (result, '?');
404	0	}
405	0	else /* ASCII case */
406	0	g_string_append_c (result, *str++);
407	0	}
408
409	0	return g_string_free (result, FALSE);
410	0	}