Coverage Report

Created: 2026-01-25 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gettext-0.26/gettext-tools/libgettextpo/unilbrk/u8-possible-linebreaks.c
Line
Count
Source
1
/* Line breaking of UTF-8 strings.
2
   Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5
   This file is free software.
6
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7
   You can redistribute it and/or modify it under either
8
     - the terms of the GNU Lesser General Public License as published
9
       by the Free Software Foundation, either version 3, or (at your
10
       option) any later version, or
11
     - the terms of the GNU General Public License as published by the
12
       Free Software Foundation; either version 2, or (at your option)
13
       any later version, or
14
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16
   This file is distributed in the hope that it will be useful,
17
   but WITHOUT ANY WARRANTY; without even the implied warranty of
18
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
   Lesser General Public License and the GNU General Public License
20
   for more details.
21
22
   You should have received a copy of the GNU Lesser General Public
23
   License and of the GNU General Public License along with this
24
   program.  If not, see <https://www.gnu.org/licenses/>.  */
25
26
#include <config.h>
27
28
/* Specification.  */
29
#include "unilbrk.h"
30
#include "unilbrk/internal.h"
31
32
#include <stdlib.h>
33
#include <string.h>
34
35
#include "unilbrk/lbrktables.h"
36
#include "uniwidth/cjk.h"
37
#include "unistr.h"
38
39
/* This file implements
40
   Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
41
42
/* Determine the line break points in the UTF-8 string S of N bytes and
   store the result in P[0..N-1].
   ENCODING is passed to is_cjk_encoding (); it selects whether characters
   with the ambiguous property LBP_AI are resolved as LBP_ID1 (CJK encoding)
   or as LBP_AL1 (otherwise).
   CR is either LBP_CR or -1.  When it is LBP_CR, an LF whose previous
   character has the property LBP_CR causes the entry immediately before
   the LF's entry (p[-1]) to be set to UC_BREAK_CR_BEFORE_LF.
   All of P is first filled with UC_BREAK_PROHIBITED; then the entry at the
   first byte of each character is set to UC_BREAK_MANDATORY,
   UC_BREAK_POSSIBLE or UC_BREAK_PROHIBITED.  If N is 0, nothing is
   written.  */
void
43
u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
44
                             int cr, char *p)
45
0
{
46
0
  if (n > 0)
47
0
    {
48
0
      int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);
49
50
      /* Don't break inside multibyte characters.  */
51
0
      memset (p, UC_BREAK_PROHIBITED, n);
52
53
0
      const uint8_t *s_end = s + n;
54
55
      /* We need 2 characters of lookahead:
56
           - 1 character of lookahead for (LB15c,LB19a,LB28a),
57
           - 2 characters of lookahead for (LB25).  */
58
0
      const uint8_t *lookahead1_end;
59
0
      ucs4_t lookahead1_uc;
60
0
      int lookahead1_prop_ea;
61
0
      const uint8_t *lookahead2_end;
62
0
      ucs4_t lookahead2_uc;
63
0
      int lookahead2_prop_ea;
64
      /* Get the first lookahead character.  */
65
0
      lookahead1_end = s;
66
0
      lookahead1_end += u8_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end);
67
0
      lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
68
      /* Get the second lookahead character.  */
69
0
      lookahead2_end = lookahead1_end;
70
0
      if (lookahead2_end < s_end)
71
0
        {
72
0
          lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
73
0
          lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
74
0
        }
75
0
      else
76
0
        {
77
0
          lookahead2_uc = 0xFFFD;
78
0
          lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
79
0
        }
80
81
0
      int preceding_prop = LBP_BK; /* line break property of preceding character */
82
0
      int prev_prop = LBP_BK; /* line break property of previous character
83
                                 (= last character, ignoring intervening characters of class CM or ZWJ) */
84
0
      int prev_ea = 0;        /* EastAsian property of previous character
85
                                 (= last character, ignoring intervening characters of class CM or ZWJ) */
86
0
      int prev2_ea = 0;       /* EastAsian property of character before the previous character */
87
0
      bool prev_initial_hyphen = false; /* the previous character was a
88
                                           word-initial hyphen or U+2010 */
89
0
      bool prev_nus = false; /* before the previous character, there was a character
90
                                with line break property LBP_NU and since then
91
                                only characters with line break property LBP_SY
92
                                or LBP_IS */
93
0
      int last_prop = LBP_BK; /* line break property of last non-space character
94
                                 (= last character, ignoring intervening characters of class SP or CM or ZWJ) */
95
0
      char *seen_space = NULL; /* Was a space seen after the last non-space character? */
96
97
      /* Number of consecutive regional indicator (RI) characters seen
98
         immediately before the current point.  */
99
0
      size_t ri_count = 0;
100
101
0
      do
102
0
        {
103
          /* Read the next character.  */
104
0
          size_t count = lookahead1_end - s;
105
0
          s = lookahead1_end;
106
0
          ucs4_t uc = lookahead1_uc;
107
0
          int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
108
0
          int prop = PROP (prop_ea); /* line break property of uc */
109
0
          int ea = EA (prop_ea);     /* EastAsian property of uc */
110
          /*  Refill the pipeline of 2 lookahead characters.  */
111
0
          lookahead1_end = lookahead2_end;
112
0
          lookahead1_uc = lookahead2_uc;
113
0
          lookahead1_prop_ea = lookahead2_prop_ea;
114
0
          if (lookahead2_end < s_end)
115
0
            {
116
0
              lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
117
0
              lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
118
0
            }
119
0
          else
120
0
            {
121
0
              lookahead2_uc = 0xFFFD;
122
0
              lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
123
0
            }
124
125
0
          bool nus = /* ending at the previous character, there was a character
126
                        with line break property LBP_NU and since then only
127
                        characters with line break property LBP_SY or LBP_IS */
128
0
            (prev_prop == LBP_NU
129
0
             || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));
130
131
0
          if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
132
0
            {
133
              /* (LB4,LB5,LB6) Mandatory break.  */
134
0
              *p = UC_BREAK_MANDATORY;
135
              /* cr is either LBP_CR or -1.  In the first case, recognize
136
                 a CR-LF sequence.  */
137
0
              if (prev_prop == cr && prop == LBP_LF)
138
0
                p[-1] = UC_BREAK_CR_BEFORE_LF;
139
0
              last_prop = LBP_BK;
140
0
              seen_space = NULL;
141
0
            }
142
0
          else
143
0
            {
144
              /* Resolve property values whose behaviour is not fixed.  */
145
0
              switch (prop)
146
0
                {
147
0
                case LBP_AI:
148
                  /* Resolve ambiguous.  */
149
0
                  prop = LBP_AI_REPLACEMENT;
150
0
                  break;
151
0
                case LBP_CB:
152
                  /* This is arbitrary.  */
153
0
                  prop = LBP_ID1;
154
0
                  break;
155
0
                case LBP_SA:
156
                  /* We don't handle complex scripts yet.
157
                     Treat LBP_SA like LBP_XX.  */
158
0
                case LBP_XX:
159
                  /* This is arbitrary.  */
160
0
                  prop = LBP_AL1;
161
0
                  break;
162
0
                }
163
164
              /* Deal with spaces and combining characters.  */
165
0
              if (prop == LBP_SP)
166
0
                {
167
                  /* (LB7) Don't break just before a space.  */
168
0
                  *p = UC_BREAK_PROHIBITED;
169
0
                  seen_space = p;
170
0
                }
171
0
              else if (prop == LBP_ZW)
172
0
                {
173
                  /* (LB7) Don't break just before a zero-width space.  */
174
0
                  *p = UC_BREAK_PROHIBITED;
175
0
                  last_prop = LBP_ZW;
176
0
                  seen_space = NULL;
177
0
                }
178
0
              else if (prop == LBP_CM || prop == LBP_ZWJ)
179
0
                {
180
                  /* (LB9) Don't break just before a combining character or
181
                     zero-width joiner, except immediately after a mandatory
182
                     break character, space, or zero-width space.  */
183
0
                  if (last_prop == LBP_BK)
184
0
                    {
185
                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
186
0
                      *p = UC_BREAK_PROHIBITED;
187
                      /* (LB10) Treat CM or ZWJ as AL.  */
188
0
                      last_prop = LBP_AL1;
189
0
                      seen_space = NULL;
190
0
                    }
191
0
                  else if (last_prop == LBP_ZW
192
0
                           || (seen_space != NULL
193
                               /* (LB14) has higher priority than (LB18).  */
194
0
                               && !(last_prop == LBP_OP1 || last_prop == LBP_OP2)
195
                               /* (LB15a) has higher priority than (LB18).  */
196
0
                               && !(last_prop == LBP_QU2)))
197
0
                    {
198
                      /* (LB8) Break after zero-width space.  */
199
                      /* (LB18) Break after spaces.
200
                         We do *not* implement the "legacy support for space
201
                         character as base for combining marks" because now the
202
                         NBSP CM sequence is recommended instead of SP CM.  */
203
0
                      *p = UC_BREAK_POSSIBLE;
204
                      /* (LB10) Treat CM or ZWJ as AL.  */
205
0
                      last_prop = LBP_AL1;
206
0
                      seen_space = NULL;
207
0
                    }
208
0
                  else
209
0
                    {
210
                      /* Treat X CM as if it were X.  */
211
0
                      *p = UC_BREAK_PROHIBITED;
212
0
                    }
213
0
                }
214
0
              else
215
0
                {
216
                  /* prop must be usable as an index for table 7.3 of UTR #14.  */
217
0
                  if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
218
0
                    abort ();
219
220
0
                  if (last_prop == LBP_BK)
221
0
                    {
222
                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
223
0
                      *p = UC_BREAK_PROHIBITED;
224
0
                    }
225
0
                  else if (last_prop == LBP_ZW)
226
0
                    {
227
                      /* (LB8) Break after zero-width space.  */
228
0
                      *p = UC_BREAK_POSSIBLE;
229
0
                    }
230
0
                  else if (preceding_prop == LBP_ZWJ)
231
0
                    {
232
                      /* (LB8a) Don't break right after a zero-width joiner.  */
233
0
                      *p = UC_BREAK_PROHIBITED;
234
0
                    }
235
0
                  else if (prop == LBP_IS && prev_prop == LBP_SP
236
0
                           && PROP (lookahead1_prop_ea) == LBP_NU)
237
0
                    {
238
                      /* (LB15c) Break before a decimal mark that follows a space.  */
239
0
                      *p = UC_BREAK_POSSIBLE;
240
0
                    }
241
0
                  else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3)
242
0
                            && (! prev_ea || ! EA (lookahead1_prop_ea))
243
                            /* (LB18) has higher priority than (LB19a).  */
244
0
                            && prev_prop != LBP_SP)
245
0
                           || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3)
246
0
                               && (! prev2_ea || ! ea)))
247
0
                    {
248
                      /* (LB19a) Don't break on either side of ambiguous
249
                         quotation marks, except next to an EastAsian character.  */
250
0
                      *p = UC_BREAK_PROHIBITED;
251
0
                    }
252
0
                  else if (prev_initial_hyphen
253
0
                           && (prop == LBP_AL1 || prop == LBP_AL2))
254
0
                    {
255
                      /* (LB20a) Don't break after a word-initial hyphen.  */
256
0
                      *p = UC_BREAK_PROHIBITED;
257
0
                    }
258
0
                  else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
259
0
                    {
260
                      /* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
261
                         before non-Hebrew.  */
262
0
                      *p = UC_BREAK_PROHIBITED;
263
0
                    }
264
0
                  else if ((prev_nus
265
0
                            && (prev_prop == LBP_CL
266
0
                                || prev_prop == LBP_CP1 || prev_prop == LBP_CP2)
267
0
                            && (prop == LBP_PO || prop == LBP_PR))
268
0
                           || (nus && (prop == LBP_PO || prop == LBP_PR
269
0
                                       || prop == LBP_NU)))
270
0
                    {
271
                      /* (LB25) Don't break numbers.  */
272
0
                      *p = UC_BREAK_PROHIBITED;
273
0
                    }
274
0
                  else if ((prev_prop == LBP_PO || prev_prop == LBP_PR)
275
0
                           && (prop == LBP_OP1 || prop == LBP_OP2)
276
0
                           && (PROP (lookahead1_prop_ea) == LBP_NU
277
0
                               || (PROP (lookahead1_prop_ea) == LBP_IS
278
0
                                   && PROP (lookahead2_prop_ea) == LBP_NU)))
279
0
                    {
280
                      /* (LB25) Don't break numbers.  */
281
0
                      *p = UC_BREAK_PROHIBITED;
282
0
                    }
283
0
                  else if (prev_prop == LBP_AKLS_VI
284
0
                           && (prop == LBP_AK || prop == LBP_AL2))
285
0
                    {
286
                      /* (LB28a) Don't break inside orthographic syllables of
287
                         Brahmic scripts, line 3.  */
288
0
                      *p = UC_BREAK_PROHIBITED;
289
0
                    }
290
0
                  else if (PROP (lookahead1_prop_ea) == LBP_VF
291
0
                           && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS)
292
0
                           && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS))
293
0
                    {
294
                      /* (LB28a) Don't break inside orthographic syllables of
295
                         Brahmic scripts, line 4.  */
296
0
                      *p = UC_BREAK_PROHIBITED;
297
0
                    }
298
0
                  else if (last_prop == LBP_IS && uc == 0x003C)
299
0
                    {
300
                      /* Partially disable (LB29) Do not break between numeric
301
                         punctuation and alphabetics ("e.g.").  We find it
302
                         desirable to break before the HTML tag "</P>" in
303
                         strings like "<P>Some sentence.</P>".  */
304
0
                      *p = UC_BREAK_POSSIBLE;
305
0
                    }
306
0
                  else if (last_prop == LBP_RI && prop == LBP_RI)
307
0
                    {
308
                      /* (LB30a) Break between two regional indicator symbols
309
                         if and only if there are an even number of regional
310
                         indicators preceding the position of the break.  */
311
0
                      *p = (seen_space != NULL || (ri_count % 2) == 0
312
0
                            ? UC_BREAK_POSSIBLE
313
0
                            : UC_BREAK_PROHIBITED);
314
0
                    }
315
0
                  else
316
0
                    {
317
0
                      int this_prop = prop;
318
0
                      if (prop == LBP_QU3)
319
0
                        {
320
                          /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
321
                             next character's line break property is not one of
322
                             BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
323
0
                          switch (PROP (lookahead1_prop_ea))
324
0
                            {
325
0
                            case LBP_BK:
326
0
                            case LBP_CR:
327
0
                            case LBP_LF:
328
0
                            case LBP_SP:
329
0
                            case LBP_GL:
330
0
                            case LBP_WJ:
331
0
                            case LBP_CL:
332
0
                            case LBP_QU1: case LBP_QU2: case LBP_QU3:
333
0
                            case LBP_CP1: case LBP_CP2:
334
0
                            case LBP_EX:
335
0
                            case LBP_IS:
336
0
                            case LBP_SY:
337
0
                            case LBP_ZW:
338
0
                              break;
339
0
                            default:
340
0
                              this_prop = LBP_QU1;
341
0
                              break;
342
0
                            }
343
0
                        }
344
345
0
                      switch (unilbrk_table [last_prop] [this_prop])
346
0
                        {
347
0
                        case D:
348
0
                          *p = UC_BREAK_POSSIBLE;
349
0
                          break;
350
0
                        case I:
351
0
                          *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
352
0
                          break;
353
0
                        case P:
354
0
                          *p = UC_BREAK_PROHIBITED;
355
0
                          break;
356
0
                        default:
357
0
                          abort ();
358
0
                        }
359
0
                    }
360
361
0
                  if (prop == LBP_QU2)
362
0
                    {
363
                      /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
364
                         previous character's line break property was not one of
365
                         BK, CR, LF, OP, QU, GL, SP, ZW.  */
366
0
                      switch (prev_prop)
367
0
                        {
368
0
                        case LBP_BK:
369
0
                        case LBP_CR:
370
0
                        case LBP_LF:
371
0
                        case LBP_OP1: case LBP_OP2:
372
0
                        case LBP_QU1: case LBP_QU2: case LBP_QU3:
373
0
                        case LBP_GL:
374
0
                        case LBP_SP:
375
0
                        case LBP_ZW:
376
0
                          break;
377
0
                        default:
378
0
                          prop = LBP_QU1;
379
0
                          break;
380
0
                        }
381
0
                    }
382
383
0
                  last_prop = prop;
384
0
                  seen_space = NULL;
385
0
                }
386
0
            }
387
388
          /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
389
             break class except BK, CR, LF, NL, SP, or ZW.  */
390
0
          if (!((prop == LBP_CM || prop == LBP_ZWJ)
391
0
                && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
392
0
                     || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
393
0
            {
394
0
              prev_initial_hyphen =
395
0
                (prop == LBP_HY || uc == 0x2010)
396
0
                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
397
0
                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
398
0
                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
399
0
              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
400
0
                                              || prev_prop == LBP_AL2
401
0
                                              || prev_prop == LBP_AS)
402
0
                           ? LBP_AKLS_VI :
403
0
                           prev_prop == LBP_HL && (prop == LBP_HY
404
0
                                                   || (prop == LBP_BA && !ea))
405
0
                           ? LBP_HL_BA :
406
0
                           prop);
407
0
              prev2_ea = prev_ea;
408
0
              prev_ea = ea;
409
0
              prev_nus = nus;
410
0
            }
411
412
0
          preceding_prop = prop;
413
414
0
          if (prop == LBP_RI)
415
0
            ri_count++;
416
0
          else
417
0
            ri_count = 0;
418
419
0
          /* Advance the output pointer by the number of bytes of the
             character just processed (count was computed as
             lookahead1_end - s at the top of this iteration).  */
          p += count;
420
0
        }
421
0
      while (s < s_end);
422
0
    }
423
0
}
424
425
#if defined IN_LIBUNISTRING
426
/* For backward compatibility with older versions of libunistring.  */
427
428
# undef u8_possible_linebreaks
429
430
/* Legacy entry point.  Forwards to u8_possible_linebreaks_loop with -1 as
   the CR argument (rather than LBP_CR), so that the loop's CR-LF
   recognition is not enabled.  S, N, ENCODING and P are passed through
   unchanged.  */
void
u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
                        char *p)
{
  const int cr_disabled = -1;

  u8_possible_linebreaks_loop (s, n, encoding, cr_disabled, p);
}
436
437
#endif
438
439
void
440
u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
441
                           char *p)
442
0
{
443
0
  u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
444
0
}
445
446
447
#ifdef TEST
448
449
#include <stdio.h>
450
#include <string.h>
451
452
/* Read the entire contents of STREAM and return it in a freshly allocated
   buffer, terminated with a NUL byte.  The caller owns the buffer and is
   expected to free() it.
   On a read error, report it with perror ("fread") and exit with status 1.
   On memory exhaustion, print "out of memory" to stderr and exit with
   status 1.
   Sizes are tracked in size_t (not int), so inputs of 2 GiB or more neither
   overflow a signed counter nor truncate fread's return value.  */
char *
read_file (FILE *stream)
{
  const size_t chunk = 4096;
  char *buf = NULL;
  size_t alloc = 0;
  size_t size = 0;

  for (;;)
    {
      /* Make sure there is room for one more chunk plus the final NUL.  */
      if (alloc - size < chunk + 1)
        {
          size_t new_alloc = alloc + alloc / 2;
          if (new_alloc < size + chunk + 1)
            new_alloc = size + chunk + 1;

          /* Keep the old pointer until realloc is known to have worked.  */
          char *new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      size_t count = fread (buf + size, 1, chunk, stream);
      size += count;

      /* A short read means end-of-file or an error; drive the loop from
         fread's result rather than from a feof() pre-check.  */
      if (count < chunk)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          break;
        }
    }

  buf[size] = '\0';

  /* Give back the unused tail.  If shrinking fails, the larger buffer is
     still valid, so keep using it.  */
  {
    char *shrunk = realloc (buf, size + 1);
    if (shrunk != NULL)
      buf = shrunk;
  }

  return buf;
}
499
500
int
501
main (int argc, char * argv[])
502
{
503
  if (argc == 1)
504
    {
505
      /* Display all the break opportunities in the input string.  */
506
      char *input = read_file (stdin);
507
      int length = strlen (input);
508
      char *breaks = malloc (length);
509
      int i;
510
511
      u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
512
513
      for (i = 0; i < length; i++)
514
        {
515
          switch (breaks[i])
516
            {
517
            case UC_BREAK_POSSIBLE:
518
              /* U+2027 in UTF-8 encoding */
519
              putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
520
              break;
521
            case UC_BREAK_MANDATORY:
522
              /* U+21B2 (or U+21B5) in UTF-8 encoding */
523
              putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
524
              break;
525
            case UC_BREAK_CR_BEFORE_LF:
526
              /* U+21E4 in UTF-8 encoding */
527
              putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
528
              break;
529
            case UC_BREAK_PROHIBITED:
530
              break;
531
            default:
532
              abort ();
533
            }
534
          putc (input[i], stdout);
535
        }
536
537
      free (breaks);
538
539
      return 0;
540
    }
541
  else
542
    return 1;
543
}
544
545
#endif /* TEST */