/src/poco/Foundation/src/pcre2_script_run.c

Source (jump to first uncovered line)
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2021 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

/* This module contains the function for checking a script run. */

#include "pcre2_config.h"
#include "pcre2_internal.h"


/*************************************************
*                Check script run                *
*************************************************/

/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.

Arguments:
  ptr       point to the first character
  endptr    point after the last character
  utf       TRUE if in UTF mode

Returns:    TRUE if this is a valid script run
*/

/* These are states in the checking process. */

enum {
  SCRIPT_UNSET,         /* No script requirement has been established yet */
  SCRIPT_MAP,           /* A bitmap holds the set of acceptable scripts */
  SCRIPT_HANPENDING,    /* Only Han characters have been seen so far */
  SCRIPT_HANHIRAKATA,   /* Expect Han, Hiragana, or Katakana */
  SCRIPT_HANBOPOMOFO,   /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL      /* Expect Han or Hangul */
};

/* Sizes, in 32-bit words, of the two kinds of script bitmap used below.
UCD_MAPSIZE is the length of each bitmap copied out of ucd_script_sets, which
only has bits for scripts numbered below ucp_Unknown. FULL_MAPSIZE is the
length of a bitmap that has a bit for every script. */

#define UCD_MAPSIZE  ((ucp_Unknown) / 32 + 1)
#define FULL_MAPSIZE ((ucp_Script_Count) / 32 + 1)

/* Check whether the characters from ptr up to (but not including) endptr form
a valid script run.

Arguments:
  ptr       point to the first character
  endptr    point after the last character
  utf       TRUE if in UTF mode

Returns:    TRUE if this is a valid script run; always TRUE when PCRE2 is
            compiled without Unicode support
*/

BOOL
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
{
#ifdef SUPPORT_UNICODE
uint32_t require_state = SCRIPT_UNSET;
uint32_t require_map[FULL_MAPSIZE];
uint32_t map[FULL_MAPSIZE];
uint32_t require_digitset = 0;
uint32_t c;

#if PCRE2_CODE_UNIT_WIDTH == 32
(void)utf;    /* Avoid compiler warning */
#endif

/* Any string containing fewer than 2 characters is a valid script run. */

if (ptr >= endptr) return TRUE;
GETCHARINCTEST(c, ptr);
if (ptr >= endptr) return TRUE;

/* Initialize the require map. This is a full-size bitmap that has a bit for
every script, as opposed to the maps in ucd_script_sets, which only have bits
for scripts less than ucp_Unknown - those that appear in script extension
lists. */

for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;

/* Scan strings of two or more characters, checking the Unicode characteristics
of each code point. There is special code for scripts that can be combined with
characters from the Han Chinese script. This may be used in conjunction with
four other scripts in these combinations:

. Han with Hiragana and Katakana is allowed (for Japanese).
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
. Han with Hangul is allowed (for Korean).

If the first significant character's script is one of the four, the required
script type is immediately known. However, if the first significant
character's script is Han, we have to keep checking for a non-Han character.
Hence the SCRIPT_HANPENDING state. */

for (;;)
  {
  const ucd_record *ucd = GET_UCD(c);
  uint32_t script = ucd->script;

  /* If the script is Unknown, the string is not a valid script run. Such
  characters can only form script runs of length one (see test above). */

  if (script == ucp_Unknown) return FALSE;

  /* A character without any script extensions whose script is Inherited or
  Common is always accepted with any script. If there are extensions, the
  following processing happens for all scripts. */

  if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
    {
    BOOL OK;

    /* Set up a full-sized map for this character that can include bits for all
    scripts. Copy the scriptx map for this character (which covers those
    scripts that appear in script extension lists), set the remaining values to
    zero, and then, except for Common or Inherited, add this script's bit to
    the map. */

    memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
    memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
    if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);

    /* Handle the different checking states */

    switch(require_state)
      {
      /* First significant character - it might follow Common or Inherited
      characters that do not have any script extensions. */

      case SCRIPT_UNSET:
      switch(script)
        {
        case ucp_Han:
        require_state = SCRIPT_HANPENDING;
        break;

        case ucp_Hiragana:
        case ucp_Katakana:
        require_state = SCRIPT_HANHIRAKATA;
        break;

        case ucp_Bopomofo:
        require_state = SCRIPT_HANBOPOMOFO;
        break;

        case ucp_Hangul:
        require_state = SCRIPT_HANHANGUL;
        break;

        default:
        memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
        require_state = SCRIPT_MAP;
        break;
        }
      break;

      /* The first significant character was Han. An inspection of the Unicode
      11.0.0 files shows that there are the following types of Script Extension
      list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
      scripts:

      . Bopomofo + Han
      . Han + Hiragana + Katakana
      . Hiragana + Katakana
      . Bopomofo + Hangul + Han + Hiragana + Katakana

      In addition, the map of a character that has no script extensions
      contains just the bit for its own script, so a single Hiragana, Katakana,
      Bopomofo, or Hangul bit can also turn up here. The following code tries
      to make sense of this. */

#define FOUND_BOPOMOFO 1
#define FOUND_HIRAGANA 2
#define FOUND_KATAKANA 4
#define FOUND_HANGUL   8

      case SCRIPT_HANPENDING:
      if (script != ucp_Han)   /* Another Han does nothing */
        {
        uint32_t chspecial = 0;

        if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
        if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
        if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
        if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;

        if (chspecial == 0) return FALSE;   /* Not allowed with Han */

        /* When the character belongs to exactly one of the three permitted
        "with Han" groups, commit to that group so that characters from a
        different group are rejected later on. Hiragana and Katakana form one
        group, so any non-empty subset of those two bits selects it. */

        if (chspecial == FOUND_BOPOMOFO)
          require_state = SCRIPT_HANBOPOMOFO;
        else if ((chspecial & ~(uint32_t)(FOUND_HIRAGANA|FOUND_KATAKANA)) == 0)
          require_state = SCRIPT_HANHIRAKATA;
        else if (chspecial == FOUND_HANGUL)
          require_state = SCRIPT_HANHANGUL;

        /* Otherwise this character is allowed with more than one of the
        groups, so it does not settle the matter: remain in the pending
        state. */
        }
      break;

      /* Previously encountered one of the "with Han" scripts. Check that
      this character is appropriate. */

      case SCRIPT_HANHIRAKATA:
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
          MAPBIT(map, ucp_Katakana) == 0) return FALSE;
      break;

      case SCRIPT_HANBOPOMOFO:
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
      break;

      case SCRIPT_HANHANGUL:
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
      break;

      /* Previously encountered one or more characters that are allowed with a
      list of scripts. */

      case SCRIPT_MAP:
      OK = FALSE;

      for (int i = 0; i < FULL_MAPSIZE; i++)
        {
        if ((require_map[i] & map[i]) != 0)
          {
          OK = TRUE;
          break;
          }
        }

      if (!OK) return FALSE;

      /* The rest of the string must be in this script, but we have to
      allow for the Han complications. */

      switch(script)
        {
        case ucp_Han:
        require_state = SCRIPT_HANPENDING;
        break;

        case ucp_Hiragana:
        case ucp_Katakana:
        require_state = SCRIPT_HANHIRAKATA;
        break;

        case ucp_Bopomofo:
        require_state = SCRIPT_HANBOPOMOFO;
        break;

        case ucp_Hangul:
        require_state = SCRIPT_HANHANGUL;
        break;

        /* Compute the intersection of the required list of scripts and the
        allowed scripts for this character. */

        default:
        for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
        break;
        }

      break;
      }
    }   /* End checking character's script and extensions. */

  /* The character is in an acceptable script. We must now ensure that all
  decimal digits in the string come from the same set. Some scripts (e.g.
  Common, Arabic) have more than one set of decimal digits. This code does
  not allow mixing sets, even within the same script. The vector called
  PRIV(ucd_digit_sets)[] contains, in its first element, the number of
  following elements, and then, in ascending order, the code points of the
  '9' characters in every set of 10 digits. Each set is identified by the
  offset in the vector of its '9' character. An initial check of the first
  value picks up ASCII digits quickly. Otherwise, a binary chop is used. */

  if (ucd->chartype == ucp_Nd)
    {
    uint32_t digitset;

    if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
      {
      int mid;
      int bot = 1;
      int top = PRIV(ucd_digit_sets)[0];
      for (;;)
        {
        if (top <= bot + 1)    /* <= rather than == is paranoia */
          {
          digitset = top;
          break;
          }
        mid = (top + bot) / 2;
        if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
        }
      }

    /* A required value of 0 means "unset". */

    if (require_digitset == 0) require_digitset = digitset;
      else if (digitset != require_digitset) return FALSE;
    }   /* End digit handling */

  /* If we haven't yet got to the end, pick up the next character. */

  if (ptr >= endptr) return TRUE;
  GETCHARINCTEST(c, ptr);
  }  /* End checking loop */

#else   /* NOT SUPPORT_UNICODE */
(void)ptr;
(void)endptr;
(void)utf;
return TRUE;
#endif  /* SUPPORT_UNICODE */
}

/* End of pcre2_script_run.c */

Coverage Report

Created: 2024-01-03 06:08

Line	Count	Source (jump to first uncovered line)
1		/*************************************************
2		* Perl-Compatible Regular Expressions *
3		*************************************************/
4
5		/* PCRE is a library of functions to support regular expressions whose syntax
6		and semantics are as close as possible to those of the Perl 5 language.
7
8		Written by Philip Hazel
9		Original API code Copyright (c) 1997-2012 University of Cambridge
10		New API code Copyright (c) 2016-2021 University of Cambridge
11
12		-----------------------------------------------------------------------------
13		Redistribution and use in source and binary forms, with or without
14		modification, are permitted provided that the following conditions are met:
15
16		* Redistributions of source code must retain the above copyright notice,
17		this list of conditions and the following disclaimer.
18
19		* Redistributions in binary form must reproduce the above copyright
20		notice, this list of conditions and the following disclaimer in the
21		documentation and/or other materials provided with the distribution.
22
23		* Neither the name of the University of Cambridge nor the names of its
24		contributors may be used to endorse or promote products derived from
25		this software without specific prior written permission.
26
27		THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28		AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29		IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30		ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31		LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32		CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33		SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34		INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35		CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36		ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37		POSSIBILITY OF SUCH DAMAGE.
38		-----------------------------------------------------------------------------
39		*/
40
41		/* This module contains the function for checking a script run. */
42
43		#include "pcre2_config.h"
44		#include "pcre2_internal.h"
45
46
47		/*************************************************
48		* Check script run *
49		*************************************************/
50
51		/* A script run is conceptually a sequence of characters all in the same
52		Unicode script. However, it isn't quite that simple. There are special rules
53		for scripts that are commonly used together, and also special rules for digits.
54		This function implements the appropriate checks, which is possible only when
55		PCRE2 is compiled with Unicode support. The function returns TRUE if there is
56		no Unicode support; however, it should never be called in that circumstance
57		because an error is given by pcre2_compile() if a script run is called for in a
58		version of PCRE2 compiled without Unicode support.
59
60		Arguments:
61		ptr point to the first character
62		endptr point after the last character
63		utf TRUE if in UTF mode
64
65		Returns: TRUE if this is a valid script run
66		*/
67
68		/* These are states in the checking process. */
69
70		enum { SCRIPT_UNSET, /* Requirement as yet unknown */
71		SCRIPT_MAP, /* Bitmap contains acceptable scripts */
72		SCRIPT_HANPENDING, /* Have had only Han characters */
73		SCRIPT_HANHIRAKATA, /* Expect Han, Hiragana, or Katakana */
74		SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */
75		SCRIPT_HANHANGUL /* Expect Han or Hangul */
76		};
77
78	0	#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
79	0	#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
80
81		BOOL
82		PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
83	0	{
84	0	#ifdef SUPPORT_UNICODE
85	0	uint32_t require_state = SCRIPT_UNSET;
86	0	uint32_t require_map[FULL_MAPSIZE];
87	0	uint32_t map[FULL_MAPSIZE];
88	0	uint32_t require_digitset = 0;
89	0	uint32_t c;
90
91		#if PCRE2_CODE_UNIT_WIDTH == 32
92		(void)utf; /* Avoid compiler warning */
93		#endif
94
95		/* Any string containing fewer than 2 characters is a valid script run. */
96
97	0	if (ptr >= endptr) return TRUE;
98	0	GETCHARINCTEST(c, ptr);
99	0	if (ptr >= endptr) return TRUE;
100
101		/* Initialize the require map. This is a full-size bitmap that has a bit for
102		every script, as opposed to the maps in ucd_script_sets, which only have bits
103		for scripts less than ucp_Unknown - those that appear in script extension
104		lists. */
105
106	0	for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
107
108		/* Scan strings of two or more characters, checking the Unicode characteristics
109		of each code point. There is special code for scripts that can be combined with
110		characters from the Han Chinese script. This may be used in conjunction with
111		four other scripts in these combinations:
112
113		. Han with Hiragana and Katakana is allowed (for Japanese).
114		. Han with Bopomofo is allowed (for Taiwanese Mandarin).
115		. Han with Hangul is allowed (for Korean).
116
117		If the first significant character's script is one of the four, the required
118		script type is immediately known. However, if the first significant
119		character's script is Han, we have to keep checking for a non-Han character.
120		Hence the SCRIPT_HANPENDING state. */
121
122	0	for (;;)
123	0	{
124	0	const ucd_record *ucd = GET_UCD(c);
125	0	uint32_t script = ucd->script;
126
127		/* If the script is Unknown, the string is not a valid script run. Such
128		characters can only form script runs of length one (see test above). */
129
130	0	if (script == ucp_Unknown) return FALSE;
131
132		/* A character without any script extensions whose script is Inherited or
133		Common is always accepted with any script. If there are extensions, the
134		following processing happens for all scripts. */
135
136	0	if (UCD_SCRIPTX_PROP(ucd) != 0 \|\| (script != ucp_Inherited && script != ucp_Common))
137	0	{
138	0	BOOL OK;
139
140		/* Set up a full-sized map for this character that can include bits for all
141		scripts. Copy the scriptx map for this character (which covers those
142		scripts that appear in script extension lists), set the remaining values to
143		zero, and then, except for Common or Inherited, add this script's bit to
144		the map. */
145
146	0	memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
147	0	memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
148	0	if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
149
150		/* Handle the different checking states */
151
152	0	switch(require_state)
153	0	{
154		/* First significant character - it might follow Common or Inherited
155		characters that do not have any script extensions. */
156
157	0	case SCRIPT_UNSET:
158	0	switch(script)
159	0	{
160	0	case ucp_Han:
161	0	require_state = SCRIPT_HANPENDING;
162	0	break;
163
164	0	case ucp_Hiragana:
165	0	case ucp_Katakana:
166	0	require_state = SCRIPT_HANHIRAKATA;
167	0	break;
168
169	0	case ucp_Bopomofo:
170	0	require_state = SCRIPT_HANBOPOMOFO;
171	0	break;
172
173	0	case ucp_Hangul:
174	0	require_state = SCRIPT_HANHANGUL;
175	0	break;
176
177	0	default:
178	0	memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
179	0	require_state = SCRIPT_MAP;
180	0	break;
181	0	}
182	0	break;
183
184		/* The first significant character was Han. An inspection of the Unicode
185		11.0.0 files shows that there are the following types of Script Extension
186		list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
187		scripts:
188
189		. Bopomofo + Han
190		. Han + Hiragana + Katakana
191		. Hiragana + Katakana
192		. Bopomofo + Hangul + Han + Hiragana + Katakana
193
194		The following code tries to make sense of this. */
195
196	0	#define FOUND_BOPOMOFO 1
197	0	#define FOUND_HIRAGANA 2
198	0	#define FOUND_KATAKANA 4
199	0	#define FOUND_HANGUL 8
200
201	0	case SCRIPT_HANPENDING:
202	0	if (script != ucp_Han) /* Another Han does nothing */
203	0	{
204	0	uint32_t chspecial = 0;
205
206	0	if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial \|= FOUND_BOPOMOFO;
207	0	if (MAPBIT(map, ucp_Hiragana) != 0) chspecial \|= FOUND_HIRAGANA;
208	0	if (MAPBIT(map, ucp_Katakana) != 0) chspecial \|= FOUND_KATAKANA;
209	0	if (MAPBIT(map, ucp_Hangul) != 0) chspecial \|= FOUND_HANGUL;
210
211	0	if (chspecial == 0) return FALSE; /* Not allowed with Han */
212
213	0	if (chspecial == FOUND_BOPOMOFO)
214	0	require_state = SCRIPT_HANBOPOMOFO;
215	0	else if (chspecial == (FOUND_HIRAGANA\|FOUND_KATAKANA))
216	0	require_state = SCRIPT_HANHIRAKATA;
217
218		/* Otherwise this character must be allowed with all of them, so remain
219		in the pending state. */
220	0	}
221	0	break;
222
223		/* Previously encountered one of the "with Han" scripts. Check that
224		this character is appropriate. */
225
226	0	case SCRIPT_HANHIRAKATA:
227	0	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
228	0	MAPBIT(map, ucp_Katakana) == 0) return FALSE;
229	0	break;
230
231	0	case SCRIPT_HANBOPOMOFO:
232	0	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
233	0	break;
234
235	0	case SCRIPT_HANHANGUL:
236	0	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
237	0	break;
238
239		/* Previously encountered one or more characters that are allowed with a
240		list of scripts. */
241
242	0	case SCRIPT_MAP:
243	0	OK = FALSE;
244
245	0	for (int i = 0; i < FULL_MAPSIZE; i++)
246	0	{
247	0	if ((require_map[i] & map[i]) != 0)
248	0	{
249	0	OK = TRUE;
250	0	break;
251	0	}
252	0	}
253
254	0	if (!OK) return FALSE;
255
256		/* The rest of the string must be in this script, but we have to
257		allow for the Han complications. */
258
259	0	switch(script)
260	0	{
261	0	case ucp_Han:
262	0	require_state = SCRIPT_HANPENDING;
263	0	break;
264
265	0	case ucp_Hiragana:
266	0	case ucp_Katakana:
267	0	require_state = SCRIPT_HANHIRAKATA;
268	0	break;
269
270	0	case ucp_Bopomofo:
271	0	require_state = SCRIPT_HANBOPOMOFO;
272	0	break;
273
274	0	case ucp_Hangul:
275	0	require_state = SCRIPT_HANHANGUL;
276	0	break;
277
278		/* Compute the intersection of the required list of scripts and the
279		allowed scripts for this character. */
280
281	0	default:
282	0	for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
283	0	break;
284	0	}
285
286	0	break;
287	0	}
288	0	} /* End checking character's script and extensions. */
289
290		/* The character is in an acceptable script. We must now ensure that all
291		decimal digits in the string come from the same set. Some scripts (e.g.
292		Common, Arabic) have more than one set of decimal digits. This code does
293		not allow mixing sets, even within the same script. The vector called
294		PRIV(ucd_digit_sets)[] contains, in its first element, the number of
295		following elements, and then, in ascending order, the code points of the
296		'9' characters in every set of 10 digits. Each set is identified by the
297		offset in the vector of its '9' character. An initial check of the first
298		value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
299
300	0	if (ucd->chartype == ucp_Nd)
301	0	{
302	0	uint32_t digitset;
303
304	0	if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
305	0	{
306	0	int mid;
307	0	int bot = 1;
308	0	int top = PRIV(ucd_digit_sets)[0];
309	0	for (;;)
310	0	{
311	0	if (top <= bot + 1) /* <= rather than == is paranoia */
312	0	{
313	0	digitset = top;
314	0	break;
315	0	}
316	0	mid = (top + bot) / 2;
317	0	if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
318	0	}
319	0	}
320
321		/* A required value of 0 means "unset". */
322
323	0	if (require_digitset == 0) require_digitset = digitset;
324	0	else if (digitset != require_digitset) return FALSE;
325	0	} /* End digit handling */
326
327		/* If we haven't yet got to the end, pick up the next character. */
328
329	0	if (ptr >= endptr) return TRUE;
330	0	GETCHARINCTEST(c, ptr);
331	0	} /* End checking loop */
332
333		#else /* NOT SUPPORT_UNICODE */
334		(void)ptr;
335		(void)endptr;
336		(void)utf;
337		return TRUE;
338		#endif /* SUPPORT_UNICODE */
339	0	}
340
341		/* End of pcre2_script_run.c */