/src/gettext-0.26/gettext-tools/libgettextpo/unilbrk/ulc-width-linebreaks.c
Line | Count | Source |
1 | | /* Line breaking of strings. |
2 | | Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2001. |
4 | | |
5 | | This file is free software. |
6 | | It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". |
7 | | You can redistribute it and/or modify it under either |
8 | | - the terms of the GNU Lesser General Public License as published |
9 | | by the Free Software Foundation, either version 3, or (at your |
10 | | option) any later version, or |
11 | | - the terms of the GNU General Public License as published by the |
12 | | Free Software Foundation; either version 2, or (at your option) |
13 | | any later version, or |
14 | | - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". |
15 | | |
16 | | This file is distributed in the hope that it will be useful, |
17 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | | Lesser General Public License and the GNU General Public License |
20 | | for more details. |
21 | | |
22 | | You should have received a copy of the GNU Lesser General Public |
23 | | License and of the GNU General Public License along with this |
24 | | program. If not, see <https://www.gnu.org/licenses/>. */ |
25 | | |
26 | | #include <config.h> |
27 | | |
28 | | /* Specification. */ |
29 | | #include "unilbrk.h" |
30 | | |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | |
34 | | #include "c-ctype.h" |
35 | | #include "uniconv.h" |
36 | | #include "unilbrk/internal.h" |
37 | | #include "unilbrk/lbrktables.h" |
38 | | #include "unilbrk/ulc-common.h" |
39 | | |
40 | | /* Line breaking of a string in an arbitrary encoding. |
41 | | |
42 | | We convert the input string to Unicode. |
43 | | |
44 | | The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16, |
45 | | UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to |
46 | | \U0000FFFF. UTF-16 and variants support only characters up to |
47 | | \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1. |
48 | | UCS-4 specification leaves doubts about endianness and byte order mark. |
49 | | glibc currently interprets it as big endian without byte order mark, |
50 | | but this is not backed by an RFC. So we use UTF-8. It supports |
51 | | characters up to \U7FFFFFFF and is unambiguously defined. */ |
52 | | |
53 | | static int |
54 | | ulc_width_linebreaks_internal (const char *s, size_t n, |
55 | | int width, int start_column, int at_end_columns, |
56 | | const char *o, const char *encoding, int cr, |
57 | | char *p) |
58 | 0 | { |
59 | 0 | if (n > 0) |
60 | 0 | { |
61 | 0 | if (is_utf8_encoding (encoding)) |
62 | 0 | return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p); |
63 | 0 | else |
64 | 0 | { |
65 | | /* Convert the string to UTF-8 and build a translation table |
66 | | from offsets into s to offsets into the translated string. */ |
67 | 0 | size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); |
68 | |
|
69 | 0 | if (offsets != NULL) |
70 | 0 | { |
71 | 0 | uint8_t *t; |
72 | 0 | size_t m; |
73 | |
|
74 | 0 | t = u8_conv_from_encoding (encoding, iconveh_question_mark, |
75 | 0 | s, n, offsets, NULL, &m); |
76 | 0 | if (t != NULL) |
77 | 0 | { |
78 | 0 | char *memory = |
79 | 0 | (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL); |
80 | |
|
81 | 0 | if (m == 0 || memory != NULL) |
82 | 0 | { |
83 | 0 | char *q = (char *) memory; |
84 | 0 | char *o8 = (o != NULL ? (char *) (q + m) : NULL); |
85 | 0 | int res_column; |
86 | 0 | size_t i; |
87 | | |
88 | | /* Translate the overrides to the UTF-8 string. */ |
89 | 0 | if (o != NULL) |
90 | 0 | { |
91 | 0 | memset (o8, UC_BREAK_UNDEFINED, m); |
92 | 0 | for (i = 0; i < n; i++) |
93 | 0 | if (offsets[i] != (size_t)(-1)) |
94 | 0 | o8[offsets[i]] = o[i]; |
95 | 0 | } |
96 | | |
97 | | /* Determine the line breaks of the UTF-8 string. */ |
98 | 0 | res_column = |
99 | 0 | u8_width_linebreaks_internal (t, m, width, start_column, at_end_columns, o8, encoding, cr, q); |
100 | | |
101 | | /* Translate the result back to the original string. */ |
102 | 0 | memset (p, UC_BREAK_PROHIBITED, n); |
103 | 0 | for (i = 0; i < n; i++) |
104 | 0 | if (offsets[i] != (size_t)(-1)) |
105 | 0 | p[i] = q[offsets[i]]; |
106 | |
|
107 | 0 | free (memory); |
108 | 0 | free (t); |
109 | 0 | free (offsets); |
110 | 0 | return res_column; |
111 | 0 | } |
112 | 0 | free (t); |
113 | 0 | } |
114 | 0 | free (offsets); |
115 | 0 | } |
116 | | /* Impossible to convert. */ |
117 | 0 | #if C_CTYPE_ASCII |
118 | 0 | if (is_all_ascii (s, n)) |
119 | 0 | { |
120 | | /* ASCII is a subset of UTF-8. */ |
121 | 0 | return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p); |
122 | 0 | } |
123 | 0 | #endif |
124 | | /* We have a non-ASCII string and cannot convert it. |
125 | | Don't produce line breaks except those already present in the |
126 | | input string. All we assume here is that the encoding is |
127 | | minimally ASCII compatible. */ |
128 | 0 | { |
129 | 0 | const char *s_end = s + n; |
130 | 0 | while (s < s_end) |
131 | 0 | { |
132 | 0 | *p = ((o != NULL && *o == UC_BREAK_MANDATORY) |
133 | 0 | || *s == '\n' |
134 | 0 | ? UC_BREAK_MANDATORY |
135 | 0 | : ((o != NULL && *o == UC_BREAK_CR_BEFORE_LF) |
136 | 0 | || (cr >= 0 |
137 | 0 | && *s == '\r' |
138 | 0 | && s + 1 < s_end |
139 | 0 | && *(s + 1) == '\n') |
140 | 0 | ? UC_BREAK_CR_BEFORE_LF |
141 | 0 | : UC_BREAK_PROHIBITED)); |
142 | 0 | s++; |
143 | 0 | p++; |
144 | 0 | if (o != NULL) |
145 | 0 | o++; |
146 | 0 | } |
147 | | /* We cannot compute widths in this case. */ |
148 | 0 | } |
149 | 0 | } |
150 | 0 | } |
151 | 0 | return start_column; |
152 | 0 | } |
153 | | |
154 | | #if defined IN_LIBUNISTRING |
155 | | /* For backward compatibility with older versions of libunistring. */ |
156 | | |
157 | | # undef ulc_width_linebreaks |
158 | | |
/* Legacy entry point.  Forwards every argument unchanged to
   ulc_width_linebreaks_internal, supplying -1 for its CR argument, and
   returns whatever that function returns.  */
int
ulc_width_linebreaks (const char *s, size_t n,
                      int width, int start_column, int at_end_columns,
                      const char *o, const char *encoding,
                      char *p)
{
  /* A negative value: the internal function only looks for "\r\n" in its
     last-resort path when this argument is >= 0.  */
  const int legacy_cr = -1;
  int result;

  result = ulc_width_linebreaks_internal (s, n,
                                          width, start_column, at_end_columns,
                                          o, encoding, legacy_cr, p);
  return result;
}
169 | | |
170 | | #endif |
171 | | |
172 | | int |
173 | | ulc_width_linebreaks_v2 (const char *s, size_t n, |
174 | | int width, int start_column, int at_end_columns, |
175 | | const char *o, const char *encoding, |
176 | | char *p) |
177 | 0 | { |
178 | 0 | return ulc_width_linebreaks_internal (s, n, |
179 | 0 | width, start_column, at_end_columns, |
180 | 0 | o, encoding, LBP_CR, p); |
181 | 0 | } |
182 | | |
183 | | |
184 | | #ifdef TEST |
185 | | |
186 | | #include <stdio.h> |
187 | | #include <locale.h> |
188 | | |
189 | | /* Read the contents of an input stream, and return it, terminated with a NUL |
190 | | byte. */ |
/* Read the entire contents of STREAM and return them in a freshly
   allocated buffer, terminated with a NUL byte.  The caller owns the
   buffer and releases it with free().

   On a read error or when memory runs out, a message is printed to stderr
   and the process exits with status 1; the function never returns NULL.

   Sizes are tracked as size_t (the type fread returns), and every size
   computation is checked for wrap-around, so very large inputs end in the
   "out of memory" exit instead of undefined behaviour.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      /* Make sure there is room for one more chunk.  size <= alloc always
         holds, so the subtraction cannot wrap.  */
      if (alloc - size < BUFSIZE)
        {
          size_t grown;
          char *bigger;

          /* size + BUFSIZE (and the final size + 1) must be representable.  */
          if (size > (size_t)(-1) - BUFSIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          /* Grow by 50%, but at least enough for one more chunk.  */
          grown = alloc + alloc / 2;
          if (grown < alloc || grown < size + BUFSIZE)
            grown = size + BUFSIZE;

          bigger = realloc (buf, grown);
          if (bigger == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = bigger;
          alloc = grown;
        }

      {
        size_t count = fread (buf + size, 1, BUFSIZE, stream);

        if (count == 0)
          {
            /* Zero bytes: either end of file (the loop condition will
               notice) or a read error.  */
            if (ferror (stream))
              {
                perror ("fread");
                exit (1);
              }
          }
        else
          size += count;
      }
    }

  /* Trim (or, for an empty stream, create) the buffer so that it holds
     exactly the data plus the terminating NUL.  */
  {
    char *exact = realloc (buf, size + 1);

    if (exact == NULL)
      {
        fprintf (stderr, "out of memory\n");
        exit (1);
      }
    buf = exact;
  }
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
236 | | |
237 | | int |
238 | | main (int argc, char * argv[]) |
239 | | { |
240 | | setlocale (LC_CTYPE, ""); |
241 | | if (argc == 2) |
242 | | { |
243 | | /* Insert line breaks for a given width. */ |
244 | | int width = atoi (argv[1]); |
245 | | char *input = read_file (stdin); |
246 | | int length = strlen (input); |
247 | | char *breaks = malloc (length); |
248 | | int i; |
249 | | |
250 | | ulc_width_linebreaks_v2 (input, length, width, 0, 0, NULL, locale_charset (), breaks); |
251 | | |
252 | | for (i = 0; i < length; i++) |
253 | | { |
254 | | switch (breaks[i]) |
255 | | { |
256 | | case UC_BREAK_POSSIBLE: |
257 | | putc ('\n', stdout); |
258 | | break; |
259 | | case UC_BREAK_MANDATORY: |
260 | | break; |
261 | | case UC_BREAK_CR_BEFORE_LF: |
262 | | break; |
263 | | case UC_BREAK_PROHIBITED: |
264 | | break; |
265 | | default: |
266 | | abort (); |
267 | | } |
268 | | putc (input[i], stdout); |
269 | | } |
270 | | |
271 | | free (breaks); |
272 | | |
273 | | return 0; |
274 | | } |
275 | | else |
276 | | return 1; |
277 | | } |
278 | | |
279 | | #endif /* TEST */ |