/src/gettext/gettext-tools/libgettextpo/unilbrk/ulc-width-linebreaks.c

Source
/* Line breaking of strings.
   Copyright (C) 2001-2003, 2006-2026 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation, either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include "unilbrk.h"

#include <stdlib.h>
#include <string.h>

#include "c-ctype.h"
#include "uniconv.h"
#include "unilbrk/internal.h"
#include "unilbrk/lbrktables.h"
#include "unilbrk/ulc-common.h"

/* Line breaking of a string in an arbitrary encoding.

   We convert the input string to Unicode.

   The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
   UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
   \U0000FFFF.  UTF-16 and variants support only characters up to
   \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
   UCS-4 specification leaves doubts about endianness and byte order mark.
   glibc currently interprets it as big endian without byte order mark,
   but this is not backed by an RFC.  So we use UTF-8. It supports
   characters up to \U7FFFFFFF and is unambiguously defined.  */

/* Determine line breaks for the N bytes at S, which are in ENCODING, and
   store one UC_BREAK_* value per input byte into P[0..N-1].

   O, if non-NULL, points to N override values, one per input byte.
   WIDTH, START_COLUMN, AT_END_COLUMNS and CR are handed through unchanged
   to u8_width_linebreaks_internal, which does the actual work.

   Strategies, tried in this order:
     1. ENCODING is UTF-8: S is passed to the UTF-8 routine directly.
     2. Otherwise S is converted to UTF-8, the overrides are mapped onto
        the converted string, the UTF-8 routine is run, and its result is
        mapped back onto the bytes of S.
     3. If step 2 fails (out of memory or conversion failure) and S is
        pure ASCII (only checked when C_CTYPE_ASCII), S is passed to the
        UTF-8 routine as is.
     4. Otherwise only breaks already present in the input ('\n', CR
        before LF when CR >= 0) or demanded by O are reported.

   Returns the value returned by u8_width_linebreaks_internal when it was
   used, and START_COLUMN when N is 0 or strategy 4 was taken.  */
static int
ulc_width_linebreaks_internal (const char *s, size_t n,
                               int width, int start_column, int at_end_columns,
                               const char *o, const char *encoding, int cr,
                               char *p)
{
  if (n > 0)
    {
      if (is_utf8_encoding (encoding))
        return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.
             Do not even try if the byte count of the table would wrap
             around: malloc would then hand out a too small block.  */
          size_t *offsets =
            (n <= (size_t)(-1) / sizeof (size_t)
             ? (size_t *) malloc (n * sizeof (size_t))
             : NULL);

          if (offsets != NULL)
            {
              size_t m;
              uint8_t *t =
                u8_conv_from_encoding (encoding, iconveh_question_mark,
                                       s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  /* One result byte per UTF-8 byte, followed - if there
                     are overrides - by one override byte per UTF-8 byte.
                     An m for which 2 * m would wrap is treated like an
                     allocation failure.  */
                  char *memory =
                    (char *) (m > 0 && m <= (size_t)(-1) / 2
                              ? malloc (m + (o != NULL ? m : 0))
                              : NULL);

                  if (m == 0 || memory != NULL)
                    {
                      char *q = memory;
                      char *o8 =
                        (o != NULL && memory != NULL ? memory + m : NULL);

                      /* Translate the overrides to the UTF-8 string.
                         The test 'offsets[i] < m' rejects the marker
                         (size_t)(-1) and, defensively, any offset that
                         does not designate a byte of the converted
                         string.  NOTE(review): an offset equal to m looks
                         possible when an input unit produces no output
                         (e.g. a shift sequence at the end) - confirm
                         against u8_conv_from_encoding.  */
                      if (o8 != NULL)
                        {
                          memset (o8, UC_BREAK_UNDEFINED, m);
                          for (size_t i = 0; i < n; i++)
                            if (offsets[i] < m)
                              o8[offsets[i]] = o[i];
                        }

                      /* Determine the line breaks of the UTF-8 string.  */
                      int res_column =
                        u8_width_linebreaks_internal (t, m, width, start_column, at_end_columns, o8, encoding, cr, q);

                      /* Translate the result back to the original string.
                         Bytes without a usable offset keep
                         UC_BREAK_PROHIBITED.  */
                      memset (p, UC_BREAK_PROHIBITED, n);
                      for (size_t i = 0; i < n; i++)
                        if (offsets[i] < m)
                          p[i] = q[offsets[i]];

                      free (memory);
                      free (t);
                      free (offsets);
                      return res_column;
                    }
                  free (t);
                }
              free (offsets);
            }
          /* Impossible to convert.  */
#if C_CTYPE_ASCII
          if (is_all_ascii (s, n))
            {
              /* ASCII is a subset of UTF-8.  */
              return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
            }
#endif
          /* We have a non-ASCII string and cannot convert it.
             Don't produce line breaks except those already present in the
             input string.  All we assume here is that the encoding is
             minimally ASCII compatible.  */
          {
            const char *s_end = s + n;
            while (s < s_end)
              {
                *p = ((o != NULL && *o == UC_BREAK_MANDATORY)
                      || *s == '\n'
                      ? UC_BREAK_MANDATORY
                      : ((o != NULL && *o == UC_BREAK_CR_BEFORE_LF)
                         || (cr >= 0
                             && *s == '\r'
                             && s + 1 < s_end
                             && *(s + 1) == '\n')
                         ? UC_BREAK_CR_BEFORE_LF
                         : UC_BREAK_PROHIBITED));
                s++;
                p++;
                if (o != NULL)
                  o++;
              }
            /* We cannot compute widths in this case.  */
          }
        }
    }
  return start_column;
}

#if defined IN_LIBUNISTRING
/* For backward compatibility with older versions of libunistring.  */

# undef ulc_width_linebreaks

/* Legacy entry point.  Forwards all arguments to
   ulc_width_linebreaks_internal and returns its result.  */
int
ulc_width_linebreaks (const char *s, size_t n,
                      int width, int start_column, int at_end_columns,
                      const char *o, const char *encoding,
                      char *p)
{
  /* A negative value: the internal function's fallback for unconvertible
     input only looks for CR before LF when this argument is >= 0.  */
  const int cr = -1;
  int result;

  result = ulc_width_linebreaks_internal (s, n,
                                          width, start_column,
                                          at_end_columns,
                                          o, encoding, cr, p);
  return result;
}

#endif

/* Current entry point.  Forwards all arguments to
   ulc_width_linebreaks_internal, passing LBP_CR as its CR argument, and
   returns its result.  */
int
ulc_width_linebreaks_v2 (const char *s, size_t n,
                         int width, int start_column, int at_end_columns,
                         const char *o, const char *encoding,
                         char *p)
{
  const int cr = LBP_CR;
  int result;

  result = ulc_width_linebreaks_internal (s, n,
                                          width, start_column,
                                          at_end_columns,
                                          o, encoding, cr, p);
  return result;
}


#ifdef TEST

#include <stdio.h>
#include <locale.h>

/* Read the remaining contents of STREAM and return them in a freshly
   allocated buffer, terminated with a NUL byte.  The caller owns the
   buffer and releases it with free ().
   If reading fails or memory runs out, a message is printed to stderr and
   the process exits with status 1; NULL is never returned.  */
char *
read_file (FILE *stream)
{
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;   /* allocated size of buf; always >= size */
  size_t size = 0;    /* number of bytes read so far */

  for (;;)
    {
      /* Ensure there is room for one more chunk, growing by 50% or by
         one chunk, whichever is larger.  */
      if (alloc - size < (size_t) CHUNK_SIZE)
        {
          size_t new_alloc = alloc + alloc / 2;
          if (new_alloc < alloc || new_alloc - size < (size_t) CHUNK_SIZE)
            new_alloc = size + CHUNK_SIZE;

          /* new_alloc <= size can only result from size_t wrap-around;
             treat it like an allocation failure.  */
          char *new_buf = (new_alloc > size ? realloc (buf, new_alloc) : NULL);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      size_t count = fread (buf + size, 1, CHUNK_SIZE, stream);
      size += count;
      if (count < (size_t) CHUNK_SIZE)
        {
          /* A short read means either a read error or end of file.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          break;
        }
    }

  /* Trim the buffer to the data actually read, plus the NUL byte.  */
  char *result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}

/* Test driver.  Usage: PROGRAM WIDTH
   Reads all of stdin, computes line breaks for the given width in the
   locale's character set, and writes the text to stdout with a newline
   inserted before every byte marked UC_BREAK_POSSIBLE.
   Returns 0 on success and 1 when the argument count is wrong; exits with
   status 1 when memory runs out.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_CTYPE, "");
  if (argc != 2)
    return 1;

  /* Insert line breaks for a given width.  */
  int width = atoi (argv[1]);
  char *input = read_file (stdin);
  size_t length = strlen (input);

  /* malloc (0) may return NULL without being out of memory; ask for at
     least one byte so that NULL always denotes a failure.  */
  char *breaks = malloc (length > 0 ? length : 1);
  if (breaks == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }

  ulc_width_linebreaks_v2 (input, length, width, 0, 0, NULL, locale_charset (), breaks);

  for (size_t i = 0; i < length; i++)
    {
      switch (breaks[i])
        {
        case UC_BREAK_POSSIBLE:
          putc ('\n', stdout);
          break;
        case UC_BREAK_MANDATORY:
          break;
        case UC_BREAK_CR_BEFORE_LF:
          break;
        case UC_BREAK_PROHIBITED:
          break;
        default:
          /* Any other value is not expected in the result.  */
          abort ();
        }
      putc (input[i], stdout);
    }

  free (breaks);
  free (input);

  return 0;
}

#endif /* TEST */

Coverage Report

Created: 2026-03-12 07:14

Line	Count	Source
1		/* Line breaking of strings.
2		Copyright (C) 2001-2003, 2006-2026 Free Software Foundation, Inc.
3		Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5		This file is free software.
6		It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7		You can redistribute it and/or modify it under either
8		- the terms of the GNU Lesser General Public License as published
9		by the Free Software Foundation, either version 3, or (at your
10		option) any later version, or
11		- the terms of the GNU General Public License as published by the
12		Free Software Foundation; either version 2, or (at your option)
13		any later version, or
14		- the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16		This file is distributed in the hope that it will be useful,
17		but WITHOUT ANY WARRANTY; without even the implied warranty of
18		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19		Lesser General Public License and the GNU General Public License
20		for more details.
21
22		You should have received a copy of the GNU Lesser General Public
23		License and of the GNU General Public License along with this
24		program. If not, see <https://www.gnu.org/licenses/>. */
25
26		#include <config.h>
27
28		/* Specification. */
29		#include "unilbrk.h"
30
31		#include <stdlib.h>
32		#include <string.h>
33
34		#include "c-ctype.h"
35		#include "uniconv.h"
36		#include "unilbrk/internal.h"
37		#include "unilbrk/lbrktables.h"
38		#include "unilbrk/ulc-common.h"
39
40		/* Line breaking of a string in an arbitrary encoding.
41
42		We convert the input string to Unicode.
43
44		The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
45		UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
46		\U0000FFFF. UTF-16 and variants support only characters up to
47		\U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
48		UCS-4 specification leaves doubts about endianness and byte order mark.
49		glibc currently interprets it as big endian without byte order mark,
50		but this is not backed by an RFC. So we use UTF-8. It supports
51		characters up to \U7FFFFFFF and is unambiguously defined. */
52
53		static int
54		ulc_width_linebreaks_internal (const char *s, size_t n,
55		int width, int start_column, int at_end_columns,
56		const char *o, const char *encoding, int cr,
57		char *p)
58	0	{
59	0	if (n > 0)
60	0	{
61	0	if (is_utf8_encoding (encoding))
62	0	return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
63	0	else
64	0	{
65		/* Convert the string to UTF-8 and build a translation table
66		from offsets into s to offsets into the translated string. */
67	0	size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
68
69	0	if (offsets != NULL)
70	0	{
71	0	size_t m;
72	0	uint8_t *t =
73	0	u8_conv_from_encoding (encoding, iconveh_question_mark,
74	0	s, n, offsets, NULL, &m);
75	0	if (t != NULL)
76	0	{
77	0	char *memory =
78	0	(char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
79
80	0	if (m == 0 || memory != NULL)
81	0	{
82	0	char *q = (char *) memory;
83	0	char *o8 = (o != NULL ? (char *) (q + m) : NULL);
84
85		/* Translate the overrides to the UTF-8 string. */
86	0	if (o != NULL)
87	0	{
88	0	memset (o8, UC_BREAK_UNDEFINED, m);
89	0	for (size_t i = 0; i < n; i++)
90	0	if (offsets[i] != (size_t)(-1))
91	0	o8[offsets[i]] = o[i];
92	0	}
93
94		/* Determine the line breaks of the UTF-8 string. */
95	0	int res_column =
96	0	u8_width_linebreaks_internal (t, m, width, start_column, at_end_columns, o8, encoding, cr, q);
97
98		/* Translate the result back to the original string. */
99	0	memset (p, UC_BREAK_PROHIBITED, n);
100	0	for (size_t i = 0; i < n; i++)
101	0	if (offsets[i] != (size_t)(-1))
102	0	p[i] = q[offsets[i]];
103
104	0	free (memory);
105	0	free (t);
106	0	free (offsets);
107	0	return res_column;
108	0	}
109	0	free (t);
110	0	}
111	0	free (offsets);
112	0	}
113		/* Impossible to convert. */
114	0	#if C_CTYPE_ASCII
115	0	if (is_all_ascii (s, n))
116	0	{
117		/* ASCII is a subset of UTF-8. */
118	0	return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
119	0	}
120	0	#endif
121		/* We have a non-ASCII string and cannot convert it.
122		Don't produce line breaks except those already present in the
123		input string. All we assume here is that the encoding is
124		minimally ASCII compatible. */
125	0	{
126	0	const char *s_end = s + n;
127	0	while (s < s_end)
128	0	{
129	0	*p = ((o != NULL && *o == UC_BREAK_MANDATORY)
130	0	|| *s == '\n'
131	0	? UC_BREAK_MANDATORY
132	0	: ((o != NULL && *o == UC_BREAK_CR_BEFORE_LF)
133	0	|| (cr >= 0
134	0	&& *s == '\r'
135	0	&& s + 1 < s_end
136	0	&& *(s + 1) == '\n')
137	0	? UC_BREAK_CR_BEFORE_LF
138	0	: UC_BREAK_PROHIBITED));
139	0	s++;
140	0	p++;
141	0	if (o != NULL)
142	0	o++;
143	0	}
144		/* We cannot compute widths in this case. */
145	0	}
146	0	}
147	0	}
148	0	return start_column;
149	0	}
150
151		#if defined IN_LIBUNISTRING
152		/* For backward compatibility with older versions of libunistring. */
153
154		# undef ulc_width_linebreaks
155
156		int
157		ulc_width_linebreaks (const char *s, size_t n,
158		int width, int start_column, int at_end_columns,
159		const char *o, const char *encoding,
160		char *p)
161		{
162		return ulc_width_linebreaks_internal (s, n,
163		width, start_column, at_end_columns,
164		o, encoding, -1, p);
165		}
166
167		#endif
168
169		int
170		ulc_width_linebreaks_v2 (const char *s, size_t n,
171		int width, int start_column, int at_end_columns,
172		const char *o, const char *encoding,
173		char *p)
174	0	{
175	0	return ulc_width_linebreaks_internal (s, n,
176	0	width, start_column, at_end_columns,
177	0	o, encoding, LBP_CR, p);
178	0	}
179
180
181		#ifdef TEST
182
183		#include <stdio.h>
184		#include <locale.h>
185
186		/* Read the contents of an input stream, and return it, terminated with a NUL
187		byte. */
188		char *
189		read_file (FILE *stream)
190		{
191		#define BUFSIZE 4096
192		char *buf = NULL;
193		int alloc = 0;
194		int size = 0;
195
196		while (! feof (stream))
197		{
198		if (size + BUFSIZE > alloc)
199		{
200		alloc = alloc + alloc / 2;
201		if (alloc < size + BUFSIZE)
202		alloc = size + BUFSIZE;
203		buf = realloc (buf, alloc);
204		if (buf == NULL)
205		{
206		fprintf (stderr, "out of memory\n");
207		exit (1);
208		}
209		}
210		int count = fread (buf + size, 1, BUFSIZE, stream);
211		if (count == 0)
212		{
213		if (ferror (stream))
214		{
215		perror ("fread");
216		exit (1);
217		}
218		}
219		else
220		size += count;
221		}
222		buf = realloc (buf, size + 1);
223		if (buf == NULL)
224		{
225		fprintf (stderr, "out of memory\n");
226		exit (1);
227		}
228		buf[size] = '\0';
229		return buf;
230		#undef BUFSIZE
231		}
232
233		int
234		main (int argc, char * argv[])
235		{
236		setlocale (LC_CTYPE, "");
237		if (argc == 2)
238		{
239		/* Insert line breaks for a given width. */
240		int width = atoi (argv[1]);
241		char *input = read_file (stdin);
242		int length = strlen (input);
243		char *breaks = malloc (length);
244
245		ulc_width_linebreaks_v2 (input, length, width, 0, 0, NULL, locale_charset (), breaks);
246
247		for (int i = 0; i < length; i++)
248		{
249		switch (breaks[i])
250		{
251		case UC_BREAK_POSSIBLE:
252		putc ('\n', stdout);
253		break;
254		case UC_BREAK_MANDATORY:
255		break;
256		case UC_BREAK_CR_BEFORE_LF:
257		break;
258		case UC_BREAK_PROHIBITED:
259		break;
260		default:
261		abort ();
262		}
263		putc (input[i], stdout);
264		}
265
266		free (breaks);
267
268		return 0;
269		}
270		else
271		return 1;
272		}
273
274		#endif /* TEST */