/src/gettext-0.26/gettext-tools/libgettextpo/unilbrk/u8-possible-linebreaks.c

Source
/* Line breaking of UTF-8 strings.
   Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation, either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include "unilbrk.h"
#include "unilbrk/internal.h"

#include <stdlib.h>
#include <string.h>

#include "unilbrk/lbrktables.h"
#include "uniwidth/cjk.h"
#include "unistr.h"

/* This file implements
   Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */

/* Determine the line break opportunities in the UTF-8 string S of N bytes,
   and store the result in the array P, which must have room for N bytes.

   For each character, one of UC_BREAK_PROHIBITED, UC_BREAK_POSSIBLE,
   UC_BREAK_MANDATORY, UC_BREAK_CR_BEFORE_LF is stored in P at the byte
   offset of the character's first byte.  All other bytes of P (the
   continuation bytes of multibyte characters) are set to
   UC_BREAK_PROHIBITED.

   ENCODING is only used to resolve characters with the ambiguous line break
   property LBP_AI: they are treated as LBP_ID1 if is_cjk_encoding (ENCODING)
   is true, as LBP_AL1 otherwise.

   CR is either LBP_CR or -1.  With LBP_CR, an LF that directly follows a CR
   causes the byte before the LF to be marked UC_BREAK_CR_BEFORE_LF.

   If N is 0, nothing is read or written.  */
void
u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                             int cr, char *p)
{
  if (n > 0)
    {
      /* Replacement class for LBP_AI; depends only on ENCODING, therefore
         computed once, outside the loop.  */
      int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);

      /* Don't break inside multibyte characters.
         The loop below only writes to the first byte of each character, so
         this pre-fill is what the continuation bytes end up with.  */
      memset (p, UC_BREAK_PROHIBITED, n);

      const uint8_t *s_end = s + n;

      /* We need 2 characters of lookahead:
           - 1 character of lookahead for (LB15c,LB19a,LB28a),
           - 2 characters of lookahead for (LB25).
         lookaheadK_end points past the K-th character after the current
         position; lookaheadK_uc and lookaheadK_prop_ea are that character
         and its unilbrkprop_lookup() result.  */
      const uint8_t *lookahead1_end;
      ucs4_t lookahead1_uc;
      int lookahead1_prop_ea;
      const uint8_t *lookahead2_end;
      ucs4_t lookahead2_uc;
      int lookahead2_prop_ea;
      /* Get the first lookahead character.  It exists because n > 0.  */
      lookahead1_end = s;
      lookahead1_end += u8_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end);
      lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
      /* Get the second lookahead character.  */
      lookahead2_end = lookahead1_end;
      if (lookahead2_end < s_end)
        {
          lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
          lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
        }
      else
        {
          /* Past the end of the string: fill the slot with U+FFFD, carrying
             line break property BK and EastAsian flag 0.  */
          lookahead2_uc = 0xFFFD;
          lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
        }

      /* State carried from one character to the next.  Everything starts out
         as if the string were preceded by a mandatory break.  */
      int preceding_prop = LBP_BK; /* line break property of preceding character */
      int prev_prop = LBP_BK; /* line break property of previous character
                                 (= last character, ignoring intervening characters of class CM or ZWJ) */
      int prev_ea = 0;        /* EastAsian property of previous character
                                 (= last character, ignoring intervening characters of class CM or ZWJ) */
      int prev2_ea = 0;       /* EastAsian property of character before the previous character */
      bool prev_initial_hyphen = false; /* the previous character was a
                                           word-initial hyphen or U+2010 */
      bool prev_nus = false; /* before the previous character, there was a character
                                with line break property LBP_NU and since then
                                only characters with line break property LBP_SY
                                or LBP_IS */
      int last_prop = LBP_BK; /* line break property of last non-space character
                                 (= last character, ignoring intervening characters of class SP or CM or ZWJ) */
      char *seen_space = NULL; /* Was a space seen after the last non-space character?
                                  Non-NULL means yes; only ever compared against NULL below. */

      /* Number of consecutive regional indicator (RI) characters seen
         immediately before the current point.  */
      size_t ri_count = 0;

      do
        {
          /* Read the next character.
             It is taken from the first lookahead slot; count is its length
             in bytes, by which p is advanced at the end of the iteration.  */
          size_t count = lookahead1_end - s;
          s = lookahead1_end;
          ucs4_t uc = lookahead1_uc;
          int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
          int prop = PROP (prop_ea); /* line break property of uc */
          int ea = EA (prop_ea);     /* EastAsian property of uc */
          /* Refill the pipeline of 2 lookahead characters: slot 2 moves to
             slot 1, and slot 2 is decoded afresh or, at the end of the
             string, filled with the U+FFFD / BK placeholder.  */
          lookahead1_end = lookahead2_end;
          lookahead1_uc = lookahead2_uc;
          lookahead1_prop_ea = lookahead2_prop_ea;
          if (lookahead2_end < s_end)
            {
              lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
              lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
            }
          else
            {
              lookahead2_uc = 0xFFFD;
              lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
            }

          bool nus = /* ending at the previous character, there was a character
                        with line break property LBP_NU and since then only
                        characters with line break property LBP_SY or LBP_IS */
            (prev_prop == LBP_NU
             || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));

          if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
            {
              /* (LB4,LB5,LB6) Mandatory break.  */
              *p = UC_BREAK_MANDATORY;
              /* cr is either LBP_CR or -1.  In the first case, recognize
                 a CR-LF sequence.
                 prev_prop starts out as LBP_BK, so it can only be LBP_CR
                 after at least one character has been processed; hence p[-1]
                 lies within the array.  Presumably p[-1] is the CR's own
                 slot, U+000D being a single byte in UTF-8.  */
              if (prev_prop == cr && prop == LBP_LF)
                p[-1] = UC_BREAK_CR_BEFORE_LF;
              /* The next character starts a new line.  */
              last_prop = LBP_BK;
              seen_space = NULL;
            }
          else
            {
              /* Resolve property values whose behaviour is not fixed.  */
              switch (prop)
                {
                case LBP_AI:
                  /* Resolve ambiguous.  */
                  prop = LBP_AI_REPLACEMENT;
                  break;
                case LBP_CB:
                  /* This is arbitrary.  */
                  prop = LBP_ID1;
                  break;
                case LBP_SA:
                  /* We don't handle complex scripts yet.
                     Treat LBP_SA like LBP_XX.  (Intentional fall through.)  */
                case LBP_XX:
                  /* This is arbitrary.  */
                  prop = LBP_AL1;
                  break;
                }

              /* Deal with spaces and combining characters.  */
              if (prop == LBP_SP)
                {
                  /* (LB7) Don't break just before a space.
                     last_prop is left unchanged: spaces are skipped when
                     looking for the last non-space character.  */
                  *p = UC_BREAK_PROHIBITED;
                  seen_space = p;
                }
              else if (prop == LBP_ZW)
                {
                  /* (LB7) Don't break just before a zero-width space.  */
                  *p = UC_BREAK_PROHIBITED;
                  last_prop = LBP_ZW;
                  seen_space = NULL;
                }
              else if (prop == LBP_CM || prop == LBP_ZWJ)
                {
                  /* (LB9) Don't break just before a combining character or
                     zero-width joiner, except immediately after a mandatory
                     break character, space, or zero-width space.  */
                  if (last_prop == LBP_BK)
                    {
                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
                      *p = UC_BREAK_PROHIBITED;
                      /* (LB10) Treat CM or ZWJ as AL.  */
                      last_prop = LBP_AL1;
                      seen_space = NULL;
                    }
                  else if (last_prop == LBP_ZW
                           || (seen_space != NULL
                               /* (LB14) has higher priority than (LB18).  */
                               && !(last_prop == LBP_OP1 || last_prop == LBP_OP2)
                               /* (LB15a) has higher priority than (LB18).  */
                               && !(last_prop == LBP_QU2)))
                    {
                      /* (LB8) Break after zero-width space.  */
                      /* (LB18) Break after spaces.
                         We do *not* implement the "legacy support for space
                         character as base for combining marks" because now the
                         NBSP CM sequence is recommended instead of SP CM.  */
                      *p = UC_BREAK_POSSIBLE;
                      /* (LB10) Treat CM or ZWJ as AL.  */
                      last_prop = LBP_AL1;
                      seen_space = NULL;
                    }
                  else
                    {
                      /* Treat X CM as if it were X.
                         last_prop and seen_space stay as they are, so that
                         the next character is judged against X.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                }
              else
                {
                  /* prop must be usable as an index for table 7.3 of UTR #14.  */
                  if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
                    abort ();

                  /* The branches below are tried in order; the first one
                     whose condition holds decides.  Only when none of the
                     special rules applies is the pair table consulted.  */
                  if (last_prop == LBP_BK)
                    {
                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (last_prop == LBP_ZW)
                    {
                      /* (LB8) Break after zero-width space.  */
                      *p = UC_BREAK_POSSIBLE;
                    }
                  else if (preceding_prop == LBP_ZWJ)
                    {
                      /* (LB8a) Don't break right after a zero-width joiner.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (prop == LBP_IS && prev_prop == LBP_SP
                           && PROP (lookahead1_prop_ea) == LBP_NU)
                    {
                      /* (LB15c) Break before a decimal mark that follows a space.  */
                      *p = UC_BREAK_POSSIBLE;
                    }
                  else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3)
                            && (! prev_ea || ! EA (lookahead1_prop_ea))
                            /* (LB18) has higher priority than (LB19a).  */
                            && prev_prop != LBP_SP)
                           || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3)
                               && (! prev2_ea || ! ea)))
                    {
                      /* (LB19a) Don't break on either side of ambiguous
                         quotation marks, except next to an EastAsian character.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (prev_initial_hyphen
                           && (prop == LBP_AL1 || prop == LBP_AL2))
                    {
                      /* (LB20a) Don't break after a word-initial hyphen.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
                    {
                      /* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
                         before non-Hebrew.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if ((prev_nus
                            && (prev_prop == LBP_CL
                                || prev_prop == LBP_CP1 || prev_prop == LBP_CP2)
                            && (prop == LBP_PO || prop == LBP_PR))
                           || (nus && (prop == LBP_PO || prop == LBP_PR
                                       || prop == LBP_NU)))
                    {
                      /* (LB25) Don't break numbers.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if ((prev_prop == LBP_PO || prev_prop == LBP_PR)
                           && (prop == LBP_OP1 || prop == LBP_OP2)
                           && (PROP (lookahead1_prop_ea) == LBP_NU
                               || (PROP (lookahead1_prop_ea) == LBP_IS
                                   && PROP (lookahead2_prop_ea) == LBP_NU)))
                    {
                      /* (LB25) Don't break numbers.
                         This is the one place that needs the second
                         lookahead character.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (prev_prop == LBP_AKLS_VI
                           && (prop == LBP_AK || prop == LBP_AL2))
                    {
                      /* (LB28a) Don't break inside orthographic syllables of
                         Brahmic scripts, line 3.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (PROP (lookahead1_prop_ea) == LBP_VF
                           && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS)
                           && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS))
                    {
                      /* (LB28a) Don't break inside orthographic syllables of
                         Brahmic scripts, line 4.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (last_prop == LBP_IS && uc == 0x003C)
                    {
                      /* Partially disable (LB29) Do not break between numeric
                         punctuation and alphabetics ("e.g.").  We find it
                         desirable to break before the HTML tag "</P>" in
                         strings like "<P>Some sentence.</P>".  */
                      *p = UC_BREAK_POSSIBLE;
                    }
                  else if (last_prop == LBP_RI && prop == LBP_RI)
                    {
                      /* (LB30a) Break between two regional indicator symbols
                         if and only if there are an even number of regional
                         indicators preceding the position of the break.
                         A space between the two indicators also allows the
                         break.  */
                      *p = (seen_space != NULL || (ri_count % 2) == 0
                            ? UC_BREAK_POSSIBLE
                            : UC_BREAK_PROHIBITED);
                    }
                  else
                    {
                      /* No special rule applies: look up the pair
                         (last_prop, this_prop) in the table.  this_prop is
                         prop, except for the LBP_QU3 adjustment below, which
                         affects only this lookup and not the state.  */
                      int this_prop = prop;
                      if (prop == LBP_QU3)
                        {
                          /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
                             next character's line break property is not one of
                             BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
                          switch (PROP (lookahead1_prop_ea))
                            {
                            case LBP_BK:
                            case LBP_CR:
                            case LBP_LF:
                            case LBP_SP:
                            case LBP_GL:
                            case LBP_WJ:
                            case LBP_CL:
                            case LBP_QU1: case LBP_QU2: case LBP_QU3:
                            case LBP_CP1: case LBP_CP2:
                            case LBP_EX:
                            case LBP_IS:
                            case LBP_SY:
                            case LBP_ZW:
                              break;
                            default:
                              this_prop = LBP_QU1;
                              break;
                            }
                        }

                      /* D: break allowed.  I: break allowed only if a space
                         intervenes.  P: break prohibited.  Any other table
                         entry is treated as an internal error.  */
                      switch (unilbrk_table [last_prop] [this_prop])
                        {
                        case D:
                          *p = UC_BREAK_POSSIBLE;
                          break;
                        case I:
                          *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
                          break;
                        case P:
                          *p = UC_BREAK_PROHIBITED;
                          break;
                        default:
                          abort ();
                        }
                    }

                  /* Unlike the LBP_QU3 adjustment above, the following one
                     modifies prop itself and thereby the state that is
                     recorded for the following characters.  */
                  if (prop == LBP_QU2)
                    {
                      /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
                         previous character's line break property was not one of
                         BK, CR, LF, OP, QU, GL, SP, ZW.  */
                      switch (prev_prop)
                        {
                        case LBP_BK:
                        case LBP_CR:
                        case LBP_LF:
                        case LBP_OP1: case LBP_OP2:
                        case LBP_QU1: case LBP_QU2: case LBP_QU3:
                        case LBP_GL:
                        case LBP_SP:
                        case LBP_ZW:
                          break;
                        default:
                          prop = LBP_QU1;
                          break;
                        }
                    }

                  last_prop = prop;
                  seen_space = NULL;
                }
            }

          /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
             break class except BK, CR, LF, NL, SP, or ZW.
             That is: when the current character is a CM or ZWJ attached to
             such an X, the "previous character" state is left untouched.  */
          if (!((prop == LBP_CM || prop == LBP_ZWJ)
                && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
                     || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
            {
              /* Each right-hand side below still reads the old prev_prop;
                 prev_initial_hyphen must therefore be computed before
                 prev_prop is overwritten.  */
              prev_initial_hyphen =
                (prop == LBP_HY || uc == 0x2010)
                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
              /* Two two-character contexts are recorded as a single value:
                 LBP_AKLS_VI for (AK | AL2 | AS) followed by VI, and
                 LBP_HL_BA for HL followed by HY or a non-EastAsian BA.  */
              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
                                              || prev_prop == LBP_AL2
                                              || prev_prop == LBP_AS)
                           ? LBP_AKLS_VI :
                           prev_prop == LBP_HL && (prop == LBP_HY
                                                   || (prop == LBP_BA && !ea))
                           ? LBP_HL_BA :
                           prop);
              prev2_ea = prev_ea;
              prev_ea = ea;
              prev_nus = nus;
            }

          /* Unlike prev_prop, this is updated for every character, CM and
             ZWJ included.  */
          preceding_prop = prop;

          /* NOTE(review): a CM or ZWJ between two regional indicators resets
             the count as well, although last_prop stays LBP_RI in that case;
             confirm that this matches the intent of (LB9) + (LB30a).  */
          if (prop == LBP_RI)
            ri_count++;
          else
            ri_count = 0;

          /* Advance to the slot of the next character's first byte.  */
          p += count;
        }
      while (s < s_end);
    }
}

#if defined IN_LIBUNISTRING
/* For backward compatibility with older versions of libunistring.  */

# undef u8_possible_linebreaks

/* Legacy entry point: runs the line breaking loop over the N bytes at S with
   cr = -1, that is, without recognizing CR-LF sequences, and stores the
   result in P.  */
void
u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
                        char *p)
{
  /* The value -1 selects the mode without CR-LF recognition.  */
  int const cr_class = -1;

  u8_possible_linebreaks_loop (s, n, encoding, cr_class, p);
}

#endif

/* Current entry point: runs the line breaking loop over the N bytes at S
   with cr = LBP_CR, so that CR-LF sequences are recognized, and stores the
   result in P.  */
void
u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
                           char *p)
{
  /* Passing LBP_CR enables the UC_BREAK_CR_BEFORE_LF marking.  */
  int const cr_class = LBP_CR;

  u8_possible_linebreaks_loop (s, n, encoding, cr_class, p);
}


#ifdef TEST

#include <stdio.h>
#include <string.h>

/* Read the contents of an input stream, and return it, terminated with a NUL
   byte. */
/* Read the remaining contents of STREAM and return them in a freshly
   allocated buffer, terminated with a NUL byte.  The caller owns the buffer
   and releases it with free().

   On a read error or when memory runs out, a diagnostic is printed to stderr
   and the process is terminated with exit status 1.

   All sizes are kept in size_t: the growth computation cannot overflow a
   signed int on large inputs, and the result of fread() is not truncated.  */
char *
read_file (FILE *stream)
{
  /* Number of bytes requested from the stream per fread() call.  */
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      /* Make sure there is room for one more full chunk.  */
      if (alloc - size < CHUNK_SIZE)
        {
          size_t needed = size + CHUNK_SIZE;
          /* Grow by 50% to keep the number of reallocations logarithmic.
             If this addition wraps around, the result is smaller than
             NEEDED and gets replaced by it below.  */
          size_t grown = alloc + alloc / 2;
          if (grown < needed)
            grown = needed;

          /* Keep the old pointer until the reallocation has succeeded.  */
          char *new_buf = (needed < size ? NULL : realloc (buf, grown));
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = grown;
        }

      size_t count = fread (buf + size, 1, CHUNK_SIZE, stream);
      if (count == 0)
        {
          /* Nothing was read: either end of file, which ends the loop, or
             a read error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }

  /* Resize to the exact length plus the terminating NUL.  This also
     allocates the buffer when the loop body never ran.  */
  char *result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}

/* Test driver.  When invoked without arguments, reads standard input,
   computes its line break opportunities with u8_possible_linebreaks_v2 using
   the encoding name "UTF-8", and writes the input back to standard output
   with a marker character inserted before every byte whose break value is
   not UC_BREAK_PROHIBITED.  Returns 0 in that case.

   When invoked with any argument, does nothing and returns 1.  */
int
main (int argc, char * argv[])
{
  if (argc != 1)
    return 1;

  /* Display all the break opportunities in the input string.  */
  char *input = read_file (stdin);
  /* The input is treated as a C string, so it ends at its first NUL byte.  */
  size_t length = strlen (input);
  char *breaks = malloc (length);

  /* malloc (0) may legitimately return NULL; only a failed non-empty
     allocation is an error.  */
  if (breaks == NULL && length > 0)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }

  u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);

  for (size_t i = 0; i < length; i++)
    {
      switch (breaks[i])
        {
        case UC_BREAK_POSSIBLE:
          /* U+2027 in UTF-8 encoding */
          putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
          break;
        case UC_BREAK_MANDATORY:
          /* U+21B2 (or U+21B5) in UTF-8 encoding */
          putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
          break;
        case UC_BREAK_CR_BEFORE_LF:
          /* U+21E4 in UTF-8 encoding */
          putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
          break;
        case UC_BREAK_PROHIBITED:
          break;
        default:
          /* The line breaking function produced an unknown value.  */
          abort ();
        }
      putc (input[i], stdout);
    }

  free (breaks);
  free (input);

  return 0;
}

#endif /* TEST */

Coverage Report

Created: 2026-01-25 07:18

Line	Count	Source
1		/* Line breaking of UTF-8 strings.
2		Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
3		Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5		This file is free software.
6		It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7		You can redistribute it and/or modify it under either
8		- the terms of the GNU Lesser General Public License as published
9		by the Free Software Foundation, either version 3, or (at your
10		option) any later version, or
11		- the terms of the GNU General Public License as published by the
12		Free Software Foundation; either version 2, or (at your option)
13		any later version, or
14		- the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16		This file is distributed in the hope that it will be useful,
17		but WITHOUT ANY WARRANTY; without even the implied warranty of
18		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19		Lesser General Public License and the GNU General Public License
20		for more details.
21
22		You should have received a copy of the GNU Lesser General Public
23		License and of the GNU General Public License along with this
24		program. If not, see <https://www.gnu.org/licenses/>. */
25
26		#include <config.h>
27
28		/* Specification. */
29		#include "unilbrk.h"
30		#include "unilbrk/internal.h"
31
32		#include <stdlib.h>
33		#include <string.h>
34
35		#include "unilbrk/lbrktables.h"
36		#include "uniwidth/cjk.h"
37		#include "unistr.h"
38
39		/* This file implements
40		Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>. */
41
42		void
43		u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
44		int cr, char *p)
45	0	{
46	0	if (n > 0)
47	0	{
48	0	int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);
49
50		/* Don't break inside multibyte characters. */
51	0	memset (p, UC_BREAK_PROHIBITED, n);
52
53	0	const uint8_t *s_end = s + n;
54
55		/* We need 2 characters of lookahead:
56		- 1 character of lookahead for (LB15c,LB19a,LB28a),
57		- 2 characters of lookahead for (LB25). */
58	0	const uint8_t *lookahead1_end;
59	0	ucs4_t lookahead1_uc;
60	0	int lookahead1_prop_ea;
61	0	const uint8_t *lookahead2_end;
62	0	ucs4_t lookahead2_uc;
63	0	int lookahead2_prop_ea;
64		/* Get the first lookahead character. */
65	0	lookahead1_end = s;
66	0	lookahead1_end += u8_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end);
67	0	lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
68		/* Get the second lookahead character. */
69	0	lookahead2_end = lookahead1_end;
70	0	if (lookahead2_end < s_end)
71	0	{
72	0	lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
73	0	lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
74	0	}
75	0	else
76	0	{
77	0	lookahead2_uc = 0xFFFD;
78	0	lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
79	0	}
80
81	0	int preceding_prop = LBP_BK; /* line break property of preceding character */
82	0	int prev_prop = LBP_BK; /* line break property of previous character
83		(= last character, ignoring intervening characters of class CM or ZWJ) */
84	0	int prev_ea = 0; /* EastAsian property of previous character
85		(= last character, ignoring intervening characters of class CM or ZWJ) */
86	0	int prev2_ea = 0; /* EastAsian property of character before the previous character */
87	0	bool prev_initial_hyphen = false; /* the previous character was a
88		word-initial hyphen or U+2010 */
89	0	bool prev_nus = false; /* before the previous character, there was a character
90		with line break property LBP_NU and since then
91		only characters with line break property LBP_SY
92		or LBP_IS */
93	0	int last_prop = LBP_BK; /* line break property of last non-space character
94		(= last character, ignoring intervening characters of class SP or CM or ZWJ) */
95	0	char *seen_space = NULL; /* Was a space seen after the last non-space character? */
96
97		/* Number of consecutive regional indicator (RI) characters seen
98		immediately before the current point. */
99	0	size_t ri_count = 0;
100
101	0	do
102	0	{
103		/* Read the next character. */
104	0	size_t count = lookahead1_end - s;
105	0	s = lookahead1_end;
106	0	ucs4_t uc = lookahead1_uc;
107	0	int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
108	0	int prop = PROP (prop_ea); /* line break property of uc */
109	0	int ea = EA (prop_ea); /* EastAsian property of uc */
110		/* Refill the pipeline of 2 lookahead characters. */
111	0	lookahead1_end = lookahead2_end;
112	0	lookahead1_uc = lookahead2_uc;
113	0	lookahead1_prop_ea = lookahead2_prop_ea;
114	0	if (lookahead2_end < s_end)
115	0	{
116	0	lookahead2_end += u8_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
117	0	lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
118	0	}
119	0	else
120	0	{
121	0	lookahead2_uc = 0xFFFD;
122	0	lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
123	0	}
124
125	0	bool nus = /* ending at the previous character, there was a character
126		with line break property LBP_NU and since then only
127		characters with line break property LBP_SY or LBP_IS */
128	0	(prev_prop == LBP_NU
129	0	\|\| (prev_nus && (prev_prop == LBP_SY \|\| prev_prop == LBP_IS)));
130
131	0	if (prop == LBP_BK \|\| prop == LBP_LF \|\| prop == LBP_CR)
132	0	{
133		/* (LB4,LB5,LB6) Mandatory break. */
134	0	*p = UC_BREAK_MANDATORY;
135		/* cr is either LBP_CR or -1. In the first case, recognize
136		a CR-LF sequence. */
137	0	if (prev_prop == cr && prop == LBP_LF)
138	0	p[-1] = UC_BREAK_CR_BEFORE_LF;
139	0	last_prop = LBP_BK;
140	0	seen_space = NULL;
141	0	}
142	0	else
143	0	{
144		/* Resolve property values whose behaviour is not fixed. */
145	0	switch (prop)
146	0	{
147	0	case LBP_AI:
148		/* Resolve ambiguous. */
149	0	prop = LBP_AI_REPLACEMENT;
150	0	break;
151	0	case LBP_CB:
152		/* This is arbitrary. */
153	0	prop = LBP_ID1;
154	0	break;
155	0	case LBP_SA:
156		/* We don't handle complex scripts yet.
157		Treat LBP_SA like LBP_XX. */
158	0	case LBP_XX:
159		/* This is arbitrary. */
160	0	prop = LBP_AL1;
161	0	break;
162	0	}
163
164		/* Deal with spaces and combining characters. */
165	0	if (prop == LBP_SP)
166	0	{
167		/* (LB7) Don't break just before a space. */
168	0	*p = UC_BREAK_PROHIBITED;
169	0	seen_space = p;
170	0	}
171	0	else if (prop == LBP_ZW)
172	0	{
173		/* (LB7) Don't break just before a zero-width space. */
174	0	*p = UC_BREAK_PROHIBITED;
175	0	last_prop = LBP_ZW;
176	0	seen_space = NULL;
177	0	}
178	0	else if (prop == LBP_CM \|\| prop == LBP_ZWJ)
179	0	{
180		/* (LB9) Don't break just before a combining character or
181		zero-width joiner, except immediately after a mandatory
182		break character, space, or zero-width space. */
183	0	if (last_prop == LBP_BK)
184	0	{
185		/* (LB4,LB5,LB6) Don't break at the beginning of a line. */
186	0	*p = UC_BREAK_PROHIBITED;
187		/* (LB10) Treat CM or ZWJ as AL. */
188	0	last_prop = LBP_AL1;
189	0	seen_space = NULL;
190	0	}
191	0	else if (last_prop == LBP_ZW
192	0	\|\| (seen_space != NULL
193		/* (LB14) has higher priority than (LB18). */
194	0	&& !(last_prop == LBP_OP1 \|\| last_prop == LBP_OP2)
195		/* (LB15a) has higher priority than (LB18). */
196	0	&& !(last_prop == LBP_QU2)))
197	0	{
198		/* (LB8) Break after zero-width space. */
199		/* (LB18) Break after spaces.
200		We do not implement the "legacy support for space
201		character as base for combining marks" because now the
202		NBSP CM sequence is recommended instead of SP CM. */
203	0	*p = UC_BREAK_POSSIBLE;
204		/* (LB10) Treat CM or ZWJ as AL. */
205	0	last_prop = LBP_AL1;
206	0	seen_space = NULL;
207	0	}
208	0	else
209	0	{
210		/* Treat X CM as if it were X. */
211	0	*p = UC_BREAK_PROHIBITED;
212	0	}
213	0	}
214	0	else
215	0	{
216		/* prop must be usable as an index for table 7.3 of UTR #14. */
217	0	if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
218	0	abort ();
219
220	0	if (last_prop == LBP_BK)
221	0	{
222		/* (LB4,LB5,LB6) Don't break at the beginning of a line. */
223	0	*p = UC_BREAK_PROHIBITED;
224	0	}
225	0	else if (last_prop == LBP_ZW)
226	0	{
227		/* (LB8) Break after zero-width space. */
228	0	*p = UC_BREAK_POSSIBLE;
229	0	}
230	0	else if (preceding_prop == LBP_ZWJ)
231	0	{
232		/* (LB8a) Don't break right after a zero-width joiner. */
233	0	*p = UC_BREAK_PROHIBITED;
234	0	}
235	0	else if (prop == LBP_IS && prev_prop == LBP_SP
236	0	&& PROP (lookahead1_prop_ea) == LBP_NU)
237	0	{
238		/* (LB15c) Break before a decimal mark that follows a space. */
239	0	*p = UC_BREAK_POSSIBLE;
240	0	}
241	0	else if (((prop == LBP_QU1 \|\| prop == LBP_QU2 \|\| prop == LBP_QU3)
242	0	&& (! prev_ea \|\| ! EA (lookahead1_prop_ea))
243		/* (LB18) has higher priority than (LB19a). */
244	0	&& prev_prop != LBP_SP)
245	0	\|\| ((prev_prop == LBP_QU1 \|\| prev_prop == LBP_QU2 \|\| prev_prop == LBP_QU3)
246	0	&& (! prev2_ea \|\| ! ea)))
247	0	{
248		/* (LB19a) Don't break on either side of ambiguous
249		quotation marks, except next to an EastAsian character. */
250	0	*p = UC_BREAK_PROHIBITED;
251	0	}
252	0	else if (prev_initial_hyphen
253	0	&& (prop == LBP_AL1 \|\| prop == LBP_AL2))
254	0	{
255		/* (LB20a) Don't break after a word-initial hyphen. */
256	0	*p = UC_BREAK_PROHIBITED;
257	0	}
258	0	else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
259	0	{
260		/* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
261		before non-Hebrew. */
262	0	*p = UC_BREAK_PROHIBITED;
263	0	}
264	0	else if ((prev_nus
265	0	&& (prev_prop == LBP_CL
266	0	\|\| prev_prop == LBP_CP1 \|\| prev_prop == LBP_CP2)
267	0	&& (prop == LBP_PO \|\| prop == LBP_PR))
268	0	\|\| (nus && (prop == LBP_PO \|\| prop == LBP_PR
269	0	\|\| prop == LBP_NU)))
270	0	{
271		/* (LB25) Don't break numbers. */
272	0	*p = UC_BREAK_PROHIBITED;
273	0	}
274	0	else if ((prev_prop == LBP_PO \|\| prev_prop == LBP_PR)
275	0	&& (prop == LBP_OP1 \|\| prop == LBP_OP2)
276	0	&& (PROP (lookahead1_prop_ea) == LBP_NU
277	0	\|\| (PROP (lookahead1_prop_ea) == LBP_IS
278	0	&& PROP (lookahead2_prop_ea) == LBP_NU)))
279	0	{
280		/* (LB25) Don't break numbers. */
281	0	*p = UC_BREAK_PROHIBITED;
282	0	}
283	0	else if (prev_prop == LBP_AKLS_VI
284	0	&& (prop == LBP_AK \|\| prop == LBP_AL2))
285	0	{
286		/* (LB28a) Don't break inside orthographic syllables of
287		Brahmic scripts, line 3. */
288	0	*p = UC_BREAK_PROHIBITED;
289	0	}
290	0	else if (PROP (lookahead1_prop_ea) == LBP_VF
291	0	&& (prop == LBP_AK \|\| prop == LBP_AL2 \|\| prop == LBP_AS)
292	0	&& (prev_prop == LBP_AK \|\| prev_prop == LBP_AL2 \|\| prev_prop == LBP_AS))
293	0	{
294		/* (LB28a) Don't break inside orthographic syllables of
295		Brahmic scripts, line 4. */
296	0	*p = UC_BREAK_PROHIBITED;
297	0	}
298	0	else if (last_prop == LBP_IS && uc == 0x003C)
299	0	{
300		/* Partially disable (LB29) Do not break between numeric
301		punctuation and alphabetics ("e.g."). We find it
302		desirable to break before the HTML tag "</P>" in
303		strings like "<P>Some sentence.</P>". */
304	0	*p = UC_BREAK_POSSIBLE;
305	0	}
306	0	else if (last_prop == LBP_RI && prop == LBP_RI)
307	0	{
308		/* (LB30a) Break between two regional indicator symbols
309		if and only if there are an even number of regional
310		indicators preceding the position of the break. */
311	0	*p = (seen_space != NULL \|\| (ri_count % 2) == 0
312	0	? UC_BREAK_POSSIBLE
313	0	: UC_BREAK_PROHIBITED);
314	0	}
315	0	else
316	0	{
317	0	int this_prop = prop;
318	0	if (prop == LBP_QU3)
319	0	{
320		/* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
321		next character's line break property is not one of
322		BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
323	0	switch (PROP (lookahead1_prop_ea))
324	0	{
325	0	case LBP_BK:
326	0	case LBP_CR:
327	0	case LBP_LF:
328	0	case LBP_SP:
329	0	case LBP_GL:
330	0	case LBP_WJ:
331	0	case LBP_CL:
332	0	case LBP_QU1: case LBP_QU2: case LBP_QU3:
333	0	case LBP_CP1: case LBP_CP2:
334	0	case LBP_EX:
335	0	case LBP_IS:
336	0	case LBP_SY:
337	0	case LBP_ZW:
338	0	break;
339	0	default:
340	0	this_prop = LBP_QU1;
341	0	break;
342	0	}
343	0	}
344
345	0	switch (unilbrk_table [last_prop] [this_prop])
346	0	{
347	0	case D:
348	0	*p = UC_BREAK_POSSIBLE;
349	0	break;
350	0	case I:
351	0	*p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
352	0	break;
353	0	case P:
354	0	*p = UC_BREAK_PROHIBITED;
355	0	break;
356	0	default:
357	0	abort ();
358	0	}
359	0	}
360
361	0	if (prop == LBP_QU2)
362	0	{
363		/* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
364		previous character's line break property was not one of
365		BK, CR, LF, OP, QU, GL, SP, ZW. */
366	0	switch (prev_prop)
367	0	{
368	0	case LBP_BK:
369	0	case LBP_CR:
370	0	case LBP_LF:
371	0	case LBP_OP1: case LBP_OP2:
372	0	case LBP_QU1: case LBP_QU2: case LBP_QU3:
373	0	case LBP_GL:
374	0	case LBP_SP:
375	0	case LBP_ZW:
376	0	break;
377	0	default:
378	0	prop = LBP_QU1;
379	0	break;
380	0	}
381	0	}
382
383	0	last_prop = prop;
384	0	seen_space = NULL;
385	0	}
386	0	}
387
388		/* (LB9) Treat X (CM \| ZWJ)* as if it were X, where X is any line
389		break class except BK, CR, LF, NL, SP, or ZW. */
390	0	if (!((prop == LBP_CM \|\| prop == LBP_ZWJ)
391	0	&& !(prev_prop == LBP_BK \|\| prev_prop == LBP_LF \|\| prev_prop == LBP_CR
392	0	\|\| prev_prop == LBP_SP \|\| prev_prop == LBP_ZW)))
393	0	{
394	0	prev_initial_hyphen =
395	0	(prop == LBP_HY \|\| uc == 0x2010)
396	0	&& (prev_prop == LBP_BK \|\| prev_prop == LBP_CR \|\| prev_prop == LBP_LF
397	0	\|\| prev_prop == LBP_SP \|\| prev_prop == LBP_ZW
398	0	\|\| prev_prop == LBP_CB \|\| prev_prop == LBP_GL);
399	0	prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
400	0	\|\| prev_prop == LBP_AL2
401	0	\|\| prev_prop == LBP_AS)
402	0	? LBP_AKLS_VI :
403	0	prev_prop == LBP_HL && (prop == LBP_HY
404	0	\|\| (prop == LBP_BA && !ea))
405	0	? LBP_HL_BA :
406	0	prop);
407	0	prev2_ea = prev_ea;
408	0	prev_ea = ea;
409	0	prev_nus = nus;
410	0	}
411
412	0	preceding_prop = prop;
413
414	0	if (prop == LBP_RI)
415	0	ri_count++;
416	0	else
417	0	ri_count = 0;
418
419	0	p += count;
420	0	}
421	0	while (s < s_end);
422	0	}
423	0	}
424
425		#if defined IN_LIBUNISTRING
426		/* For backward compatibility with older versions of libunistring. */
427
428		# undef u8_possible_linebreaks
429
/* Determine the line break points in S, a UTF-8 string of N bytes, and
   store the result in P[0..N-1].  ENCODING names the encoding of the
   surrounding context and is passed through unchanged.
   This is the legacy entry point: it forwards to
   u8_possible_linebreaks_loop with -1 as the CR argument.
   NOTE(review): -1 presumably matches no line break property, so that CR
   receives no dedicated treatment here; confirm against the loop.  */
void
u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
                        char *p)
{
  u8_possible_linebreaks_loop (s, n, encoding, -1, p);
}
436
437		#endif
438
439		void
440		u8_possible_linebreaks_v2 (const uint8_t s, size_t n, const char encoding,
441		char *p)
442	0	{
443	0	u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
444	0	}
445
446
447		#ifdef TEST
448
449		#include <stdio.h>
450		#include <string.h>
451
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is allocated with realloc; the caller should free it.
   On a read error or when memory is exhausted, a message is printed to
   stderr and the process exits with status 1, so the result is never NULL.
   Sizes are tracked as size_t so that large inputs neither overflow a
   signed int nor truncate the value returned by fread.  */
char *
read_file (FILE *stream)
{
  /* Number of bytes requested from the stream per fread call.  */
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      /* Make sure there is room for one more full chunk.  */
      if (size + CHUNK_SIZE > alloc)
        {
          /* Grow by 50% to keep the number of reallocations logarithmic,
             but always by enough to hold the next chunk.  */
          alloc = alloc + alloc / 2;
          if (alloc < size + CHUNK_SIZE)
            alloc = size + CHUNK_SIZE;
          buf = realloc (buf, alloc);
          if (buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
        }

      size_t count = fread (buf + size, 1, CHUNK_SIZE, stream);
      if (count == 0)
        {
          /* Either end of file (the loop condition then terminates) or a
             read error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }

  /* Trim the buffer to the exact size, plus room for the terminating NUL.  */
  buf = realloc (buf, size + 1);
  if (buf == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf[size] = '\0';
  return buf;
}
499
500		int
501		main (int argc, char * argv[])
502		{
503		if (argc == 1)
504		{
505		/* Display all the break opportunities in the input string. */
506		char *input = read_file (stdin);
507		int length = strlen (input);
508		char *breaks = malloc (length);
509		int i;
510
511		u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
512
513		for (i = 0; i < length; i++)
514		{
515		switch (breaks[i])
516		{
517		case UC_BREAK_POSSIBLE:
518		/* U+2027 in UTF-8 encoding */
519		putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
520		break;
521		case UC_BREAK_MANDATORY:
522		/* U+21B2 (or U+21B5) in UTF-8 encoding */
523		putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
524		break;
525		case UC_BREAK_CR_BEFORE_LF:
526		/* U+21E4 in UTF-8 encoding */
527		putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
528		break;
529		case UC_BREAK_PROHIBITED:
530		break;
531		default:
532		abort ();
533		}
534		putc (input[i], stdout);
535		}
536
537		free (breaks);
538
539		return 0;
540		}
541		else
542		return 1;
543		}
544
545		#endif /* TEST */