/src/gettext-0.26/gettext-tools/libgettextpo/unilbrk/u8-possible-linebreaks.c
Line | Count | Source |
1 | | /* Line breaking of UTF-8 strings. |
2 | | Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2001. |
4 | | |
5 | | This file is free software. |
6 | | It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". |
7 | | You can redistribute it and/or modify it under either |
8 | | - the terms of the GNU Lesser General Public License as published |
9 | | by the Free Software Foundation, either version 3, or (at your |
10 | | option) any later version, or |
11 | | - the terms of the GNU General Public License as published by the |
12 | | Free Software Foundation; either version 2, or (at your option) |
13 | | any later version, or |
14 | | - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". |
15 | | |
16 | | This file is distributed in the hope that it will be useful, |
17 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | | Lesser General Public License and the GNU General Public License |
20 | | for more details. |
21 | | |
22 | | You should have received a copy of the GNU Lesser General Public |
23 | | License and of the GNU General Public License along with this |
24 | | program. If not, see <https://www.gnu.org/licenses/>. */ |
25 | | |
26 | | #include <config.h> |
27 | | |
28 | | /* Specification. */ |
29 | | #include "unilbrk.h" |
30 | | #include "unilbrk/internal.h" |
31 | | |
32 | | #include <stdlib.h> |
33 | | #include <string.h> |
34 | | |
35 | | #include "unilbrk/lbrktables.h" |
36 | | #include "uniwidth/cjk.h" |
37 | | #include "unistr.h" |
38 | | |
39 | | /* This file implements |
40 | | Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>. */ |
41 | | |
/* Determine the line break opportunities in the UTF-8 string S, which
   consists of N bytes, following UAX #14, and store them in P[0..N-1].

   ENCODING is passed to is_cjk_encoding (); when that returns true,
   characters of the ambiguous class LBP_AI are resolved to LBP_ID1,
   otherwise to LBP_AL1.
   CR is either LBP_CR or -1.  With LBP_CR, an LF whose predecessor has the
   property LBP_CR causes the entry just before the LF's entry to be set to
   UC_BREAK_CR_BEFORE_LF.

   All N entries of P are first set to UC_BREAK_PROHIBITED; afterwards only
   the entry at the first byte of each character is written.  It becomes
   UC_BREAK_MANDATORY for a character of class BK, LF or CR, and otherwise
   UC_BREAK_POSSIBLE or UC_BREAK_PROHIBITED, describing a break just before
   that character.  Nothing is done when N is 0.  */
42 | | void |
43 | | u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, |
44 | | int cr, char *p) |
45 | 0 | { |
46 | 0 | if (n > 0) |
47 | 0 | { |
48 | 0 | int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1); |
49 | | |
50 | | /* Don't break inside multibyte characters. */ |
51 | 0 | memset (p, UC_BREAK_PROHIBITED, n); |
52 |
|
53 | 0 | const uint8_t *s_end = s + n; |
54 | | |
55 | | /* We need 2 characters of lookahead: |
56 | | - 1 character of lookahead for (LB15c,LB19a,LB28a), |
57 | | - 2 characters of lookahead for (LB25). */ |
58 | 0 | const uint8_t *lookahead1_end; |
59 | 0 | ucs4_t lookahead1_uc; |
60 | 0 | int lookahead1_prop_ea; |
61 | 0 | const uint8_t *lookahead2_end; |
62 | 0 | ucs4_t lookahead2_uc; |
63 | 0 | int lookahead2_prop_ea; |
64 | | /* Get the first lookahead character. */ |
65 | 0 | lookahead1_end = s; |
66 | 0 | lookahead1_end += u8_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end); |
67 | 0 | lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc); |
68 | | /* Get the second lookahead character. */ |
69 | 0 | lookahead2_end = lookahead1_end; |
70 | 0 | if (lookahead2_end < s_end) |
71 | 0 | { |
72 | 0 | lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end); |
73 | 0 | lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc); |
74 | 0 | } |
75 | 0 | else |
76 | 0 | { |
/* The input has no second character: use a sentinel of class LBP_BK.  */
77 | 0 | lookahead2_uc = 0xFFFD; |
78 | 0 | lookahead2_prop_ea = PROP_EA (LBP_BK, 0); |
79 | 0 | } |
80 |
|
81 | 0 | int preceding_prop = LBP_BK; /* line break property of preceding character */ |
82 | 0 | int prev_prop = LBP_BK; /* line break property of previous character |
83 | | (= last character, ignoring intervening characters of class CM or ZWJ) */ |
84 | 0 | int prev_ea = 0; /* EastAsian property of previous character |
85 | | (= last character, ignoring intervening characters of class CM or ZWJ) */ |
86 | 0 | int prev2_ea = 0; /* EastAsian property of character before the previous character */ |
87 | 0 | bool prev_initial_hyphen = false; /* the previous character was a |
88 | | word-initial hyphen or U+2010 */ |
89 | 0 | bool prev_nus = false; /* before the previous character, there was a character |
90 | | with line break property LBP_NU and since then |
91 | | only characters with line break property LBP_SY |
92 | | or LBP_IS */ |
93 | 0 | int last_prop = LBP_BK; /* line break property of last non-space character |
94 | | (= last character, ignoring intervening characters of class SP or CM or ZWJ) */ |
95 | 0 | char *seen_space = NULL; /* Was a space seen after the last non-space character? */ |
96 | | |
97 | | /* Number of consecutive regional indicator (RI) characters seen |
98 | | immediately before the current point. */ |
99 | 0 | size_t ri_count = 0; |
100 |
|
101 | 0 | do |
102 | 0 | { |
103 | | /* Read the next character. */ |
/* count is the number of bytes occupied by the character being consumed;
   it is used at the end of the iteration to advance p.  */
104 | 0 | size_t count = lookahead1_end - s; |
105 | 0 | s = lookahead1_end; |
106 | 0 | ucs4_t uc = lookahead1_uc; |
107 | 0 | int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */ |
108 | 0 | int prop = PROP (prop_ea); /* line break property of uc */ |
109 | 0 | int ea = EA (prop_ea); /* EastAsian property of uc */ |
110 | | /* Refill the pipeline of 2 lookahead characters. */ |
111 | 0 | lookahead1_end = lookahead2_end; |
112 | 0 | lookahead1_uc = lookahead2_uc; |
113 | 0 | lookahead1_prop_ea = lookahead2_prop_ea; |
114 | 0 | if (lookahead2_end < s_end) |
115 | 0 | { |
116 | 0 | lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end); |
117 | 0 | lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc); |
118 | 0 | } |
119 | 0 | else |
120 | 0 | { |
121 | 0 | lookahead2_uc = 0xFFFD; |
122 | 0 | lookahead2_prop_ea = PROP_EA (LBP_BK, 0); |
123 | 0 | } |
124 |
|
125 | 0 | bool nus = /* ending at the previous character, there was a character |
126 | | with line break property LBP_NU and since then only |
127 | | characters with line break property LBP_SY or LBP_IS */ |
128 | 0 | (prev_prop == LBP_NU |
129 | 0 | || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS))); |
130 |
|
131 | 0 | if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR) |
132 | 0 | { |
133 | | /* (LB4,LB5,LB6) Mandatory break. */ |
134 | 0 | *p = UC_BREAK_MANDATORY; |
135 | | /* cr is either LBP_CR or -1. In the first case, recognize |
136 | | a CR-LF sequence. */ |
137 | 0 | if (prev_prop == cr && prop == LBP_LF) |
138 | 0 | p[-1] = UC_BREAK_CR_BEFORE_LF; |
139 | 0 | last_prop = LBP_BK; |
140 | 0 | seen_space = NULL; |
141 | 0 | } |
142 | 0 | else |
143 | 0 | { |
144 | | /* Resolve property values whose behaviour is not fixed. */ |
145 | 0 | switch (prop) |
146 | 0 | { |
147 | 0 | case LBP_AI: |
148 | | /* Resolve ambiguous. */ |
149 | 0 | prop = LBP_AI_REPLACEMENT; |
150 | 0 | break; |
151 | 0 | case LBP_CB: |
152 | | /* This is arbitrary. */ |
153 | 0 | prop = LBP_ID1; |
154 | 0 | break; |
155 | 0 | case LBP_SA: |
156 | | /* We don't handle complex scripts yet. |
157 | | Treat LBP_SA like LBP_XX. */ |
158 | 0 | case LBP_XX: |
159 | | /* This is arbitrary. */ |
160 | 0 | prop = LBP_AL1; |
161 | 0 | break; |
162 | 0 | } |
163 | | |
164 | | /* Deal with spaces and combining characters. */ |
165 | 0 | if (prop == LBP_SP) |
166 | 0 | { |
167 | | /* (LB7) Don't break just before a space. */ |
168 | 0 | *p = UC_BREAK_PROHIBITED; |
169 | 0 | seen_space = p; |
170 | 0 | } |
171 | 0 | else if (prop == LBP_ZW) |
172 | 0 | { |
173 | | /* (LB7) Don't break just before a zero-width space. */ |
174 | 0 | *p = UC_BREAK_PROHIBITED; |
175 | 0 | last_prop = LBP_ZW; |
176 | 0 | seen_space = NULL; |
177 | 0 | } |
178 | 0 | else if (prop == LBP_CM || prop == LBP_ZWJ) |
179 | 0 | { |
180 | | /* (LB9) Don't break just before a combining character or |
181 | | zero-width joiner, except immediately after a mandatory |
182 | | break character, space, or zero-width space. */ |
183 | 0 | if (last_prop == LBP_BK) |
184 | 0 | { |
185 | | /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ |
186 | 0 | *p = UC_BREAK_PROHIBITED; |
187 | | /* (LB10) Treat CM or ZWJ as AL. */ |
188 | 0 | last_prop = LBP_AL1; |
189 | 0 | seen_space = NULL; |
190 | 0 | } |
191 | 0 | else if (last_prop == LBP_ZW |
192 | 0 | || (seen_space != NULL |
193 | | /* (LB14) has higher priority than (LB18). */ |
194 | 0 | && !(last_prop == LBP_OP1 || last_prop == LBP_OP2) |
195 | | /* (LB15a) has higher priority than (LB18). */ |
196 | 0 | && !(last_prop == LBP_QU2))) |
197 | 0 | { |
198 | | /* (LB8) Break after zero-width space. */ |
199 | | /* (LB18) Break after spaces. |
200 | | We do *not* implement the "legacy support for space |
201 | | character as base for combining marks" because now the |
202 | | NBSP CM sequence is recommended instead of SP CM. */ |
203 | 0 | *p = UC_BREAK_POSSIBLE; |
204 | | /* (LB10) Treat CM or ZWJ as AL. */ |
205 | 0 | last_prop = LBP_AL1; |
206 | 0 | seen_space = NULL; |
207 | 0 | } |
208 | 0 | else |
209 | 0 | { |
210 | | /* Treat X CM as if it were X. */ |
211 | 0 | *p = UC_BREAK_PROHIBITED; |
212 | 0 | } |
213 | 0 | } |
214 | 0 | else |
215 | 0 | { |
216 | | /* prop must be usable as an index for table 7.3 of UTR #14. */ |
217 | 0 | if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) |
218 | 0 | abort (); |
219 | | |
/* The special cases below are tested in this order; the first one that
   matches decides *p, and only if none matches is unilbrk_table
   consulted.  */
220 | 0 | if (last_prop == LBP_BK) |
221 | 0 | { |
222 | | /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ |
223 | 0 | *p = UC_BREAK_PROHIBITED; |
224 | 0 | } |
225 | 0 | else if (last_prop == LBP_ZW) |
226 | 0 | { |
227 | | /* (LB8) Break after zero-width space. */ |
228 | 0 | *p = UC_BREAK_POSSIBLE; |
229 | 0 | } |
230 | 0 | else if (preceding_prop == LBP_ZWJ) |
231 | 0 | { |
232 | | /* (LB8a) Don't break right after a zero-width joiner. */ |
233 | 0 | *p = UC_BREAK_PROHIBITED; |
234 | 0 | } |
235 | 0 | else if (prop == LBP_IS && prev_prop == LBP_SP |
236 | 0 | && PROP (lookahead1_prop_ea) == LBP_NU) |
237 | 0 | { |
238 | | /* (LB15c) Break before a decimal mark that follows a space. */ |
239 | 0 | *p = UC_BREAK_POSSIBLE; |
240 | 0 | } |
241 | 0 | else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3) |
242 | 0 | && (! prev_ea || ! EA (lookahead1_prop_ea)) |
243 | | /* (LB18) has higher priority than (LB19a). */ |
244 | 0 | && prev_prop != LBP_SP) |
245 | 0 | || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3) |
246 | 0 | && (! prev2_ea || ! ea))) |
247 | 0 | { |
248 | | /* (LB19a) Don't break on either side of ambiguous |
249 | | quotation marks, except next to an EastAsian character. */ |
250 | 0 | *p = UC_BREAK_PROHIBITED; |
251 | 0 | } |
252 | 0 | else if (prev_initial_hyphen |
253 | 0 | && (prop == LBP_AL1 || prop == LBP_AL2)) |
254 | 0 | { |
255 | | /* (LB20a) Don't break after a word-initial hyphen. */ |
256 | 0 | *p = UC_BREAK_PROHIBITED; |
257 | 0 | } |
258 | 0 | else if (prev_prop == LBP_HL_BA && prop != LBP_HL) |
259 | 0 | { |
260 | | /* (LB21a) Don't break after Hebrew + Hyphen/Break-After, |
261 | | before non-Hebrew. */ |
262 | 0 | *p = UC_BREAK_PROHIBITED; |
263 | 0 | } |
264 | 0 | else if ((prev_nus |
265 | 0 | && (prev_prop == LBP_CL |
266 | 0 | || prev_prop == LBP_CP1 || prev_prop == LBP_CP2) |
267 | 0 | && (prop == LBP_PO || prop == LBP_PR)) |
268 | 0 | || (nus && (prop == LBP_PO || prop == LBP_PR |
269 | 0 | || prop == LBP_NU))) |
270 | 0 | { |
271 | | /* (LB25) Don't break numbers. */ |
272 | 0 | *p = UC_BREAK_PROHIBITED; |
273 | 0 | } |
274 | 0 | else if ((prev_prop == LBP_PO || prev_prop == LBP_PR) |
275 | 0 | && (prop == LBP_OP1 || prop == LBP_OP2) |
276 | 0 | && (PROP (lookahead1_prop_ea) == LBP_NU |
277 | 0 | || (PROP (lookahead1_prop_ea) == LBP_IS |
278 | 0 | && PROP (lookahead2_prop_ea) == LBP_NU))) |
279 | 0 | { |
280 | | /* (LB25) Don't break numbers. */ |
281 | 0 | *p = UC_BREAK_PROHIBITED; |
282 | 0 | } |
283 | 0 | else if (prev_prop == LBP_AKLS_VI |
284 | 0 | && (prop == LBP_AK || prop == LBP_AL2)) |
285 | 0 | { |
286 | | /* (LB28a) Don't break inside orthographic syllables of |
287 | | Brahmic scripts, line 3. */ |
288 | 0 | *p = UC_BREAK_PROHIBITED; |
289 | 0 | } |
290 | 0 | else if (PROP (lookahead1_prop_ea) == LBP_VF |
291 | 0 | && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS) |
292 | 0 | && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS)) |
293 | 0 | { |
294 | | /* (LB28a) Don't break inside orthographic syllables of |
295 | | Brahmic scripts, line 4. */ |
296 | 0 | *p = UC_BREAK_PROHIBITED; |
297 | 0 | } |
298 | 0 | else if (last_prop == LBP_IS && uc == 0x003C) |
299 | 0 | { |
300 | | /* Partially disable (LB29) Do not break between numeric |
301 | | punctuation and alphabetics ("e.g."). We find it |
302 | | desirable to break before the HTML tag "</P>" in |
303 | | strings like "<P>Some sentence.</P>". */ |
304 | 0 | *p = UC_BREAK_POSSIBLE; |
305 | 0 | } |
306 | 0 | else if (last_prop == LBP_RI && prop == LBP_RI) |
307 | 0 | { |
308 | | /* (LB30a) Break between two regional indicator symbols |
309 | | if and only if there are an even number of regional |
310 | | indicators preceding the position of the break. */ |
311 | 0 | *p = (seen_space != NULL || (ri_count % 2) == 0 |
312 | 0 | ? UC_BREAK_POSSIBLE |
313 | 0 | : UC_BREAK_PROHIBITED); |
314 | 0 | } |
315 | 0 | else |
316 | 0 | { |
317 | 0 | int this_prop = prop; |
318 | 0 | if (prop == LBP_QU3) |
319 | 0 | { |
320 | | /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the |
321 | | next character's line break property is not one of |
322 | | BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ |
323 | 0 | switch (PROP (lookahead1_prop_ea)) |
324 | 0 | { |
325 | 0 | case LBP_BK: |
326 | 0 | case LBP_CR: |
327 | 0 | case LBP_LF: |
328 | 0 | case LBP_SP: |
329 | 0 | case LBP_GL: |
330 | 0 | case LBP_WJ: |
331 | 0 | case LBP_CL: |
332 | 0 | case LBP_QU1: case LBP_QU2: case LBP_QU3: |
333 | 0 | case LBP_CP1: case LBP_CP2: |
334 | 0 | case LBP_EX: |
335 | 0 | case LBP_IS: |
336 | 0 | case LBP_SY: |
337 | 0 | case LBP_ZW: |
338 | 0 | break; |
339 | 0 | default: |
340 | 0 | this_prop = LBP_QU1; |
341 | 0 | break; |
342 | 0 | } |
343 | 0 | } |
344 | | |
/* Table lookup: D yields a possible break, P a prohibited one, and I
   yields a possible break only when a space was seen since the last
   non-space character.  */
345 | 0 | switch (unilbrk_table [last_prop] [this_prop]) |
346 | 0 | { |
347 | 0 | case D: |
348 | 0 | *p = UC_BREAK_POSSIBLE; |
349 | 0 | break; |
350 | 0 | case I: |
351 | 0 | *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); |
352 | 0 | break; |
353 | 0 | case P: |
354 | 0 | *p = UC_BREAK_PROHIBITED; |
355 | 0 | break; |
356 | 0 | default: |
357 | 0 | abort (); |
358 | 0 | } |
359 | 0 | } |
360 | | |
361 | 0 | if (prop == LBP_QU2) |
362 | 0 | { |
363 | | /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the |
364 | | previous character's line break property was not one of |
365 | | BK, CR, LF, OP, QU, GL, SP, ZW. */ |
366 | 0 | switch (prev_prop) |
367 | 0 | { |
368 | 0 | case LBP_BK: |
369 | 0 | case LBP_CR: |
370 | 0 | case LBP_LF: |
371 | 0 | case LBP_OP1: case LBP_OP2: |
372 | 0 | case LBP_QU1: case LBP_QU2: case LBP_QU3: |
373 | 0 | case LBP_GL: |
374 | 0 | case LBP_SP: |
375 | 0 | case LBP_ZW: |
376 | 0 | break; |
377 | 0 | default: |
378 | 0 | prop = LBP_QU1; |
379 | 0 | break; |
380 | 0 | } |
381 | 0 | } |
382 | | |
383 | 0 | last_prop = prop; |
384 | 0 | seen_space = NULL; |
385 | 0 | } |
386 | 0 | } |
387 | | |
388 | | /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line |
389 | | break class except BK, CR, LF, NL, SP, or ZW. */ |
390 | 0 | if (!((prop == LBP_CM || prop == LBP_ZWJ) |
391 | 0 | && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR |
392 | 0 | || prev_prop == LBP_SP || prev_prop == LBP_ZW))) |
393 | 0 | { |
394 | 0 | prev_initial_hyphen = |
395 | 0 | (prop == LBP_HY || uc == 0x2010) |
396 | 0 | && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF |
397 | 0 | || prev_prop == LBP_SP || prev_prop == LBP_ZW |
398 | 0 | || prev_prop == LBP_CB || prev_prop == LBP_GL); |
399 | 0 | prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK |
400 | 0 | || prev_prop == LBP_AL2 |
401 | 0 | || prev_prop == LBP_AS) |
402 | 0 | ? LBP_AKLS_VI : |
403 | 0 | prev_prop == LBP_HL && (prop == LBP_HY |
404 | 0 | || (prop == LBP_BA && !ea)) |
405 | 0 | ? LBP_HL_BA : |
406 | 0 | prop); |
407 | 0 | prev2_ea = prev_ea; |
408 | 0 | prev_ea = ea; |
409 | 0 | prev_nus = nus; |
410 | 0 | } |
411 |
|
412 | 0 | preceding_prop = prop; |
413 |
|
414 | 0 | if (prop == LBP_RI) |
415 | 0 | ri_count++; |
416 | 0 | else |
417 | 0 | ri_count = 0; |
418 |
|
/* Move p to the entry for the first byte of the next character.  */
419 | 0 | p += count; |
420 | 0 | } |
421 | 0 | while (s < s_end); |
422 | 0 | } |
423 | 0 | } |
424 | | |
425 | | #if defined IN_LIBUNISTRING |
426 | | /* For backward compatibility with older versions of libunistring. */ |
427 | | |
428 | | # undef u8_possible_linebreaks |
429 | | |
/* Entry point kept for older callers: forwards S, N, ENCODING and P to
   u8_possible_linebreaks_loop with cr = -1 instead of LBP_CR, i.e. without
   the value that makes the loop recognize CR-LF sequences.  */
430 | | void |
431 | | u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, |
432 | | char *p) |
433 | | { |
434 | | u8_possible_linebreaks_loop (s, n, encoding, -1, p); |
435 | | } |
436 | | |
437 | | #endif |
438 | | |
/* Forwards S, N, ENCODING and P to u8_possible_linebreaks_loop with
   cr = LBP_CR, so that an LF preceded by a character of class LBP_CR gets
   the entry before it marked as UC_BREAK_CR_BEFORE_LF.  */
439 | | void |
440 | | u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding, |
441 | | char *p) |
442 | 0 | { |
443 | 0 | u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p); |
444 | 0 | } |
445 | | |
446 | | |
447 | | #ifdef TEST |
448 | | |
449 | | #include <stdio.h> |
450 | | #include <string.h> |
451 | | |
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is heap-allocated; the caller owns it and should free () it.
   On a read error or when memory is exhausted, a message is written to
   stderr and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
  enum { BUFSIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;   /* number of bytes allocated for buf */
  size_t size = 0;    /* number of bytes of buf filled so far */

  while (! feof (stream))
    {
      /* Make sure there is room for another BUFSIZE bytes.
         Invariant: size <= alloc, so the subtraction cannot wrap.  */
      if (alloc - size < BUFSIZE)
        {
          /* Grow by 50%, but at least enough for one more chunk.  */
          size_t new_alloc = alloc + alloc / 2;
          if (new_alloc < alloc || new_alloc - size < BUFSIZE)
            new_alloc = size + BUFSIZE;

          /* new_alloc < size means that size + BUFSIZE wrapped around.
             Keep buf intact until realloc is known to have succeeded.  */
          char *new_buf = (new_alloc < size ? NULL : realloc (buf, new_alloc));
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      size_t count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;
      /* A short read is either end-of-file (handled by the loop condition)
         or an error; report the error right away instead of retrying.  */
      if (count < BUFSIZE && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Resize to the exact length plus room for the terminating NUL.  */
  char *result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}
499 | | |
500 | | int |
501 | | main (int argc, char * argv[]) |
502 | | { |
503 | | if (argc == 1) |
504 | | { |
505 | | /* Display all the break opportunities in the input string. */ |
506 | | char *input = read_file (stdin); |
507 | | int length = strlen (input); |
508 | | char *breaks = malloc (length); |
509 | | int i; |
510 | | |
511 | | u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks); |
512 | | |
513 | | for (i = 0; i < length; i++) |
514 | | { |
515 | | switch (breaks[i]) |
516 | | { |
517 | | case UC_BREAK_POSSIBLE: |
518 | | /* U+2027 in UTF-8 encoding */ |
519 | | putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); |
520 | | break; |
521 | | case UC_BREAK_MANDATORY: |
522 | | /* U+21B2 (or U+21B5) in UTF-8 encoding */ |
523 | | putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout); |
524 | | break; |
525 | | case UC_BREAK_CR_BEFORE_LF: |
526 | | /* U+21E4 in UTF-8 encoding */ |
527 | | putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout); |
528 | | break; |
529 | | case UC_BREAK_PROHIBITED: |
530 | | break; |
531 | | default: |
532 | | abort (); |
533 | | } |
534 | | putc (input[i], stdout); |
535 | | } |
536 | | |
537 | | free (breaks); |
538 | | |
539 | | return 0; |
540 | | } |
541 | | else |
542 | | return 1; |
543 | | } |
544 | | |
545 | | #endif /* TEST */ |