/src/glib/subprojects/pcre2-10.44/src/pcre2_extuni.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | /* This module contains an internal function that is used to match a Unicode |
42 | | extended grapheme sequence. It is used by both pcre2_match() and |
43 | | pcre2_dfa_match(). However, it is called only when Unicode support is being |
44 | | compiled. Nevertheless, we provide a dummy function when there is no Unicode |
45 | | support, because some compilers do not like functionless source files. */ |
46 | | |
47 | | |
48 | | #ifdef HAVE_CONFIG_H |
49 | | #include "config.h" |
50 | | #endif |
51 | | |
52 | | |
53 | | #include "pcre2_internal.h" |
54 | | |
55 | | |
56 | | /* Dummy function, compiled only when SUPPORT_UNICODE is not defined. It has |
| | the same signature as the real function further down, explicitly discards |
| | every one of its arguments, and always returns NULL. */ |
57 | | |
58 | | #ifndef SUPPORT_UNICODE |
59 | | PCRE2_SPTR |
60 | | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, |
61 | | PCRE2_SPTR end_subject, BOOL utf, int *xcount) |
62 | | { |
63 | | (void)c; |
64 | | (void)eptr; |
65 | | (void)start_subject; |
66 | | (void)end_subject; |
67 | | (void)utf; |
68 | | (void)xcount; |
69 | | return NULL; |
70 | | } |
71 | | #else |
72 | | |
73 | | |
74 | | /************************************************* |
75 | | * Match an extended grapheme sequence * |
76 | | *************************************************/ |
77 | | |
78 | | /* NOTE: The logic contained in this function is replicated in three special- |
79 | | purpose functions in the pcre2_jit_compile.c module. If the logic below is |
80 | | changed, they must be kept in step so that the interpreter and the JIT have the |
81 | | same behaviour. |
| | |
| | Overview: lgb holds the grapheme-break property of the left-hand character |
| | (initially that of c) and rgb that of the character read at eptr. The loop |
| | keeps advancing eptr while PRIV(ucp_gbtable)[lgb] has the bit for rgb set and |
| | neither of the two extra checks (ZWJ before Extended_Pictographic, and paired |
| | Regional Indicators) asks for a break. It ends at the first break, or when |
| | eptr reaches end_subject. |
82 | | |
83 | | Arguments: |
84 | | c the first character; the variable is then reused for every |
| | character read, both forwards and in the backward RI scan |
85 | | eptr pointer to next character |
86 | | start_subject pointer to start of subject; the backward Regional Indicator |
| | scan does not step back beyond it |
87 | | end_subject pointer to end of subject; the loop stops once eptr reaches it |
88 | | utf TRUE if in UTF mode; when FALSE each code unit is read as one |
| | character |
89 | | xcount pointer to count of additional characters, |
90 | | or NULL if count not needed; the count is incremented once per |
| | character accepted and is never reset here, so the caller |
| | supplies its starting value |
91 | | |
92 | | Returns: pointer after the end of the sequence |
93 | | */ |
94 | | |
95 | | PCRE2_SPTR |
96 | | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, |
97 | | PCRE2_SPTR end_subject, BOOL utf, int *xcount) |
98 | 0 | { |
99 | 0 | BOOL was_ep_ZWJ = FALSE; |
100 | 0 | int lgb = UCD_GRAPHBREAK(c); |
101 | |
|
102 | 0 | while (eptr < end_subject) |
103 | 0 | { |
104 | 0 | int rgb; |
105 | 0 | int len = 1; |
106 | 0 | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } |
107 | 0 | rgb = UCD_GRAPHBREAK(c); |
108 | 0 | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; |
109 | | |
110 | | /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was |
111 | | preceded by Extended Pictographic. That fact is carried in was_ep_ZWJ, which |
| | is assigned near the bottom of the previous pass through this loop and is |
| | FALSE on the first pass. */ |
112 | | |
113 | 0 | if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) |
114 | 0 | break; |
115 | | |
116 | | /* Not breaking between Regional Indicators is allowed only if there |
117 | | are an even number of preceding RIs. They are counted by scanning backwards |
| | from the left-hand character, going no further than start_subject. */ |
118 | | |
119 | 0 | if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) |
120 | 0 | { |
121 | 0 | int ricount = 0; |
122 | 0 | PCRE2_SPTR bptr = eptr - 1; |
123 | 0 | if (utf) BACKCHAR(bptr); |
124 | | |
125 | | /* bptr is pointing to the left-hand character. Each pass of the loop below |
| | first steps back one character and then tests it, so ricount counts only |
| | the RIs that come before the left-hand one. The scan overwrites c, which is |
| | harmless because c is not used again before it is reloaded at the top of |
| | the outer loop. */ |
126 | |
|
127 | 0 | while (bptr > start_subject) |
128 | 0 | { |
129 | 0 | bptr--; |
130 | 0 | if (utf) |
131 | 0 | { |
132 | 0 | BACKCHAR(bptr); |
133 | 0 | GETCHAR(c, bptr); |
134 | 0 | } |
135 | 0 | else |
136 | 0 | c = *bptr; |
137 | 0 | if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; |
138 | 0 | ricount++; |
139 | 0 | } |
140 | 0 | if ((ricount & 1) != 0) break; /* Odd count: grapheme break required */ |
141 | 0 | } |
142 | | |
143 | | /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in |
144 | | between; see next statement). The flag is reassigned on every pass, so it |
| | describes only the step that has just been taken. */ |
145 | | |
146 | 0 | was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); |
147 | | |
148 | | /* If Extend follows Extended_Pictographic, do not update lgb; this allows |
149 | | any number of them before a following ZWJ. In every other case the character |
| | just accepted becomes the new left-hand side. */ |
150 | |
|
151 | 0 | if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb; |
152 | |
|
153 | 0 | eptr += len; |
154 | 0 | if (xcount != NULL) *xcount += 1; |
155 | 0 | } |
156 | |
|
157 | 0 | return eptr; |
158 | 0 | } |
159 | | |
160 | | #endif /* SUPPORT_UNICODE */ |
161 | | |
162 | | /* End of pcre2_extuni.c */ |