/src/gettext-0.26/gettext-tools/libgettextpo/unilbrk/ulc-width-linebreaks.c

Source
/* Line breaking of strings.
   Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation, either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include "unilbrk.h"

#include <stdlib.h>
#include <string.h>

#include "c-ctype.h"
#include "uniconv.h"
#include "unilbrk/internal.h"
#include "unilbrk/lbrktables.h"
#include "unilbrk/ulc-common.h"

/* Line breaking of a string in an arbitrary encoding.

   We convert the input string to Unicode.

   The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
   UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
   \U0000FFFF.  UTF-16 and variants support only characters up to
   \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
   UCS-4 specification leaves doubts about endianness and byte order mark.
   glibc currently interprets it as big endian without byte order mark,
   but this is not backed by an RFC.  So we use UTF-8. It supports
   characters up to \U7FFFFFFF and is unambiguously defined.  */

/* Determines the line breaks of the N bytes at S, which are in ENCODING,
   and stores one UC_BREAK_* value per input byte in P[0..N-1].

   WIDTH, START_COLUMN, AT_END_COLUMNS and CR are handed unchanged to
   u8_width_linebreaks_internal.  O, if not NULL, is an array of N override
   values, one per input byte.

   Strategy:
     - UTF-8 input is passed directly to u8_width_linebreaks_internal.
     - Other input is converted to UTF-8 with u8_conv_from_encoding; the
       overrides are mapped onto the converted string, the breaks are
       computed there, and the result is mapped back through the offsets
       table.
     - If an allocation or the conversion fails and C_CTYPE_ASCII is true,
       an all-ASCII input is processed as if it were UTF-8.
     - Otherwise only the breaks already present in the input (newlines,
       CR before LF when CR >= 0, and overrides from O) are reported.

   Returns the value of u8_width_linebreaks_internal where it was used,
   and START_COLUMN otherwise.  When N is 0, P is not touched.  */
static int
ulc_width_linebreaks_internal (const char *s, size_t n,
                               int width, int start_column, int at_end_columns,
                               const char *o, const char *encoding, int cr,
                               char *p)
{
  if (n > 0)
    {
      if (is_utf8_encoding (encoding))
        return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.
             A table size that does not fit in size_t is treated like an
             allocation failure.  */
          size_t *offsets =
            (n <= (size_t)(-1) / sizeof (size_t)
             ? (size_t *) malloc (n * sizeof (size_t))
             : NULL);

          if (offsets != NULL)
            {
              uint8_t *t;
              size_t m;

              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
                                         s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  /* Scratch area: m result bytes, followed by m override
                     bytes if overrides were supplied.  */
                  char *memory =
                    (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);

                  if (m == 0 || memory != NULL)
                    {
                      char *q = memory;
                      /* No pointer arithmetic on q when it is NULL
                         (which is the case when m == 0).  */
                      char *o8 = (o != NULL && m > 0 ? q + m : NULL);
                      int res_column;
                      size_t i;

                      /* Translate the overrides to the UTF-8 string.
                         The test offsets[i] < m rejects the marker
                         (size_t)(-1) as well as any offset that does not
                         designate a byte of the converted string.  */
                      if (o8 != NULL)
                        {
                          memset (o8, UC_BREAK_UNDEFINED, m);
                          for (i = 0; i < n; i++)
                            if (offsets[i] < m)
                              o8[offsets[i]] = o[i];
                        }

                      /* Determine the line breaks of the UTF-8 string.  */
                      res_column =
                        u8_width_linebreaks_internal (t, m, width, start_column, at_end_columns, o8, encoding, cr, q);

                      /* Translate the result back to the original string.
                         Bytes without a corresponding byte in the
                         converted string keep UC_BREAK_PROHIBITED.  */
                      memset (p, UC_BREAK_PROHIBITED, n);
                      for (i = 0; i < n; i++)
                        if (offsets[i] < m)
                          p[i] = q[offsets[i]];

                      free (memory);
                      free (t);
                      free (offsets);
                      return res_column;
                    }
                  free (t);
                }
              free (offsets);
            }
          /* Impossible to convert.  */
#if C_CTYPE_ASCII
          if (is_all_ascii (s, n))
            {
              /* ASCII is a subset of UTF-8.  */
              return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
            }
#endif
          /* We have a non-ASCII string and cannot convert it.
             Don't produce line breaks except those already present in the
             input string.  All we assume here is that the encoding is
             minimally ASCII compatible.  */
          {
            const char *s_end = s + n;
            while (s < s_end)
              {
                *p = ((o != NULL && *o == UC_BREAK_MANDATORY)
                      || *s == '\n'
                      ? UC_BREAK_MANDATORY
                      : ((o != NULL && *o == UC_BREAK_CR_BEFORE_LF)
                         || (cr >= 0
                             && *s == '\r'
                             && s + 1 < s_end
                             && *(s + 1) == '\n')
                         ? UC_BREAK_CR_BEFORE_LF
                         : UC_BREAK_PROHIBITED));
                s++;
                p++;
                if (o != NULL)
                  o++;
              }
            /* We cannot compute widths in this case.  */
          }
        }
    }
  return start_column;
}

#if defined IN_LIBUNISTRING
/* For backward compatibility with older versions of libunistring.  */

# undef ulc_width_linebreaks

/* Older entry point.  It forwards all arguments to
   ulc_width_linebreaks_internal, passing -1 for the CR argument, and
   returns that function's result.  */
int
ulc_width_linebreaks (const char *s, size_t n,
                      int width, int start_column, int at_end_columns,
                      const char *o, const char *encoding,
                      char *p)
{
  int result_column;

  result_column =
    ulc_width_linebreaks_internal (s, n, width, start_column, at_end_columns,
                                   o, encoding, -1, p);
  return result_column;
}

#endif

/* Forwards all arguments to ulc_width_linebreaks_internal, passing LBP_CR
   for the CR argument, and returns that function's result.  */
int
ulc_width_linebreaks_v2 (const char *s, size_t n,
                         int width, int start_column, int at_end_columns,
                         const char *o, const char *encoding,
                         char *p)
{
  int result_column;

  result_column =
    ulc_width_linebreaks_internal (s, n, width, start_column, at_end_columns,
                                   o, encoding, LBP_CR, p);
  return result_column;
}


#ifdef TEST

#include <stdio.h>
#include <locale.h>

/* Read the contents of an input stream, and return it, terminated with a NUL
   byte. */
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is allocated with realloc; the caller may free it.
   On a read error or when memory is exhausted, a message is printed to
   stderr and the program exits with status 1.
   All sizes are kept in size_t, so that the result of fread is not
   narrowed and the buffer arithmetic is not done in signed integers.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk of BUFSIZE bytes.
         Grow by a factor of 1.5, but at least as much as needed.  */
      if (size + BUFSIZE > alloc)
        {
          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          buf = realloc (buf, alloc);
          if (buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
        }
      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          /* Either end of file (the loop condition then terminates the
             loop) or a read error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }
  /* Resize to the exact size, including room for the terminating NUL.  */
  buf = realloc (buf, size + 1);
  if (buf == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}

/* Test driver.
   Usage: program WIDTH < input
   Reads standard input completely, computes its line breaks with
   ulc_width_linebreaks_v2 for the given width, using locale_charset () as
   the encoding, and copies the input to standard output, emitting a
   newline before each byte marked UC_BREAK_POSSIBLE.
   Returns 0 on success, 1 if the number of arguments is wrong.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_CTYPE, "");
  if (argc == 2)
    {
      /* Insert line breaks for a given width.  */
      int width = atoi (argv[1]);
      char *input = read_file (stdin);
      size_t length = strlen (input);
      char *breaks = malloc (length);
      size_t i;

      /* malloc (0) is allowed to return NULL; only a failure for a
         nonzero size means that memory is exhausted.  */
      if (breaks == NULL && length > 0)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }

      ulc_width_linebreaks_v2 (input, length, width, 0, 0, NULL, locale_charset (), breaks);

      for (i = 0; i < length; i++)
        {
          switch (breaks[i])
            {
            case UC_BREAK_POSSIBLE:
              /* Break the line before this byte.  */
              putc ('\n', stdout);
              break;
            case UC_BREAK_MANDATORY:
              break;
            case UC_BREAK_CR_BEFORE_LF:
              break;
            case UC_BREAK_PROHIBITED:
              break;
            default:
              /* Any other value is unexpected here.  */
              abort ();
            }
          putc (input[i], stdout);
        }

      free (breaks);
      free (input);

      return 0;
    }
  else
    return 1;
}

#endif /* TEST */

Coverage Report

Created: 2026-01-25 07:18

Line	Count	Source
1		/* Line breaking of strings.
2		Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
3		Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5		This file is free software.
6		It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7		You can redistribute it and/or modify it under either
8		- the terms of the GNU Lesser General Public License as published
9		by the Free Software Foundation, either version 3, or (at your
10		option) any later version, or
11		- the terms of the GNU General Public License as published by the
12		Free Software Foundation; either version 2, or (at your option)
13		any later version, or
14		- the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16		This file is distributed in the hope that it will be useful,
17		but WITHOUT ANY WARRANTY; without even the implied warranty of
18		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19		Lesser General Public License and the GNU General Public License
20		for more details.
21
22		You should have received a copy of the GNU Lesser General Public
23		License and of the GNU General Public License along with this
24		program. If not, see <https://www.gnu.org/licenses/>. */
25
26		#include <config.h>
27
28		/* Specification. */
29		#include "unilbrk.h"
30
31		#include <stdlib.h>
32		#include <string.h>
33
34		#include "c-ctype.h"
35		#include "uniconv.h"
36		#include "unilbrk/internal.h"
37		#include "unilbrk/lbrktables.h"
38		#include "unilbrk/ulc-common.h"
39
40		/* Line breaking of a string in an arbitrary encoding.
41
42		We convert the input string to Unicode.
43
44		The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
45		UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
46		\U0000FFFF. UTF-16 and variants support only characters up to
47		\U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
48		UCS-4 specification leaves doubts about endianness and byte order mark.
49		glibc currently interprets it as big endian without byte order mark,
50		but this is not backed by an RFC. So we use UTF-8. It supports
51		characters up to \U7FFFFFFF and is unambiguously defined. */
52
53		static int
54		ulc_width_linebreaks_internal (const char *s, size_t n,
55		int width, int start_column, int at_end_columns,
56		const char *o, const char *encoding, int cr,
57		char *p)
58	0	{
59	0	if (n > 0)
60	0	{
61	0	if (is_utf8_encoding (encoding))
62	0	return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
63	0	else
64	0	{
65		/* Convert the string to UTF-8 and build a translation table
66		from offsets into s to offsets into the translated string. */
67	0	size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
68
69	0	if (offsets != NULL)
70	0	{
71	0	uint8_t *t;
72	0	size_t m;
73
74	0	t = u8_conv_from_encoding (encoding, iconveh_question_mark,
75	0	s, n, offsets, NULL, &m);
76	0	if (t != NULL)
77	0	{
78	0	char *memory =
79	0	(char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
80
81	0	if (m == 0 || memory != NULL)
82	0	{
83	0	char *q = (char *) memory;
84	0	char *o8 = (o != NULL ? (char *) (q + m) : NULL);
85	0	int res_column;
86	0	size_t i;
87
88		/* Translate the overrides to the UTF-8 string. */
89	0	if (o != NULL)
90	0	{
91	0	memset (o8, UC_BREAK_UNDEFINED, m);
92	0	for (i = 0; i < n; i++)
93	0	if (offsets[i] != (size_t)(-1))
94	0	o8[offsets[i]] = o[i];
95	0	}
96
97		/* Determine the line breaks of the UTF-8 string. */
98	0	res_column =
99	0	u8_width_linebreaks_internal (t, m, width, start_column, at_end_columns, o8, encoding, cr, q);
100
101		/* Translate the result back to the original string. */
102	0	memset (p, UC_BREAK_PROHIBITED, n);
103	0	for (i = 0; i < n; i++)
104	0	if (offsets[i] != (size_t)(-1))
105	0	p[i] = q[offsets[i]];
106
107	0	free (memory);
108	0	free (t);
109	0	free (offsets);
110	0	return res_column;
111	0	}
112	0	free (t);
113	0	}
114	0	free (offsets);
115	0	}
116		/* Impossible to convert. */
117	0	#if C_CTYPE_ASCII
118	0	if (is_all_ascii (s, n))
119	0	{
120		/* ASCII is a subset of UTF-8. */
121	0	return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
122	0	}
123	0	#endif
124		/* We have a non-ASCII string and cannot convert it.
125		Don't produce line breaks except those already present in the
126		input string. All we assume here is that the encoding is
127		minimally ASCII compatible. */
128	0	{
129	0	const char *s_end = s + n;
130	0	while (s < s_end)
131	0	{
132	0	*p = ((o != NULL && *o == UC_BREAK_MANDATORY)
133	0	|| *s == '\n'
134	0	? UC_BREAK_MANDATORY
135	0	: ((o != NULL && *o == UC_BREAK_CR_BEFORE_LF)
136	0	|| (cr >= 0
137	0	&& *s == '\r'
138	0	&& s + 1 < s_end
139	0	&& *(s + 1) == '\n')
140	0	? UC_BREAK_CR_BEFORE_LF
141	0	: UC_BREAK_PROHIBITED));
142	0	s++;
143	0	p++;
144	0	if (o != NULL)
145	0	o++;
146	0	}
147		/* We cannot compute widths in this case. */
148	0	}
149	0	}
150	0	}
151	0	return start_column;
152	0	}
153
154		#if defined IN_LIBUNISTRING
155		/* For backward compatibility with older versions of libunistring. */
156
157		# undef ulc_width_linebreaks
158
159		int
160		ulc_width_linebreaks (const char *s, size_t n,
161		int width, int start_column, int at_end_columns,
162		const char *o, const char *encoding,
163		char *p)
164		{
165		return ulc_width_linebreaks_internal (s, n,
166		width, start_column, at_end_columns,
167		o, encoding, -1, p);
168		}
169
170		#endif
171
172		int
173		ulc_width_linebreaks_v2 (const char *s, size_t n,
174		int width, int start_column, int at_end_columns,
175		const char *o, const char *encoding,
176		char *p)
177	0	{
178	0	return ulc_width_linebreaks_internal (s, n,
179	0	width, start_column, at_end_columns,
180	0	o, encoding, LBP_CR, p);
181	0	}
182
183
184		#ifdef TEST
185
186		#include <stdio.h>
187		#include <locale.h>
188
189		/* Read the contents of an input stream, and return it, terminated with a NUL
190		byte. */
191		char *
192		read_file (FILE *stream)
193		{
194		#define BUFSIZE 4096
195		char *buf = NULL;
196		int alloc = 0;
197		int size = 0;
198		int count;
199
200		while (! feof (stream))
201		{
202		if (size + BUFSIZE > alloc)
203		{
204		alloc = alloc + alloc / 2;
205		if (alloc < size + BUFSIZE)
206		alloc = size + BUFSIZE;
207		buf = realloc (buf, alloc);
208		if (buf == NULL)
209		{
210		fprintf (stderr, "out of memory\n");
211		exit (1);
212		}
213		}
214		count = fread (buf + size, 1, BUFSIZE, stream);
215		if (count == 0)
216		{
217		if (ferror (stream))
218		{
219		perror ("fread");
220		exit (1);
221		}
222		}
223		else
224		size += count;
225		}
226		buf = realloc (buf, size + 1);
227		if (buf == NULL)
228		{
229		fprintf (stderr, "out of memory\n");
230		exit (1);
231		}
232		buf[size] = '\0';
233		return buf;
234		#undef BUFSIZE
235		}
236
237		int
238		main (int argc, char * argv[])
239		{
240		setlocale (LC_CTYPE, "");
241		if (argc == 2)
242		{
243		/* Insert line breaks for a given width. */
244		int width = atoi (argv[1]);
245		char *input = read_file (stdin);
246		int length = strlen (input);
247		char *breaks = malloc (length);
248		int i;
249
250		ulc_width_linebreaks_v2 (input, length, width, 0, 0, NULL, locale_charset (), breaks);
251
252		for (i = 0; i < length; i++)
253		{
254		switch (breaks[i])
255		{
256		case UC_BREAK_POSSIBLE:
257		putc ('\n', stdout);
258		break;
259		case UC_BREAK_MANDATORY:
260		break;
261		case UC_BREAK_CR_BEFORE_LF:
262		break;
263		case UC_BREAK_PROHIBITED:
264		break;
265		default:
266		abort ();
267		}
268		putc (input[i], stdout);
269		}
270
271		free (breaks);
272
273		return 0;
274		}
275		else
276		return 1;
277		}
278
279		#endif /* TEST */