/src/pcre2/src/pcre2_extuni.c
Line | Count | Source |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | /* This module contains an internal function that is used to match a Unicode |
43 | | extended grapheme sequence. It is used by both pcre2_match() and |
44 | | pcre2_dfa_match(), but it is called only when Unicode support is compiled in. |
45 | | A dummy function is nevertheless provided when there is no Unicode support, |
46 | | because some compilers do not like functionless source files. */ |
47 | | |
48 | | |
49 | | #include "pcre2_internal.h" |
50 | | |
51 | | |
52 | | |
53 | | /* Dummy function */ |
54 | | |
55 | | #ifndef SUPPORT_UNICODE |
56 | | PCRE2_SPTR |
57 | | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, |
58 | | PCRE2_SPTR end_subject, BOOL utf, int *xcount) |
59 | | { |
60 | | (void)c; |
61 | | (void)eptr; |
62 | | (void)start_subject; |
63 | | (void)end_subject; |
64 | | (void)utf; |
65 | | (void)xcount; |
66 | | return NULL; |
67 | | } |
68 | | #else |
69 | | |
70 | | |
71 | | /************************************************* |
72 | | * Match an extended grapheme sequence * |
73 | | *************************************************/ |
74 | | |
75 | | /* NOTE: The logic contained in this function is replicated in three special- |
76 | | purpose functions in the pcre2_jit_compile.c module. If the logic below is |
77 | | changed, they must be kept in step so that the interpreter and the JIT have the |
78 | | same behaviour. |
79 | | |
80 | | Arguments: |
81 | | c the first character |
82 | | eptr pointer to next character |
83 | | start_subject pointer to start of subject |
84 | | end_subject pointer to end of subject |
85 | | utf TRUE if in UTF mode |
86 | | xcount pointer to count of additional characters, |
87 | | or NULL if count not needed |
88 | | |
89 | | Returns: pointer after the end of the sequence |
90 | | */ |
91 | | |
92 | | PCRE2_SPTR |
93 | | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, |
94 | | PCRE2_SPTR end_subject, BOOL utf, int *xcount) |
95 | 13.9M | { |
96 | 13.9M | BOOL was_ep_ZWJ = FALSE; |
97 | 13.9M | int lgb = UCD_GRAPHBREAK(c); |
98 | | |
99 | 14.0M | while (eptr < end_subject) |
100 | 13.5M | { |
101 | 13.5M | int rgb; |
102 | 13.5M | int len = 1; |
103 | 13.5M | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } |
104 | 13.5M | rgb = UCD_GRAPHBREAK(c); |
105 | 13.5M | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; |
106 | | |
107 | | /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was |
108 | | preceded by Extended Pictographic. */ |
109 | | |
110 | 119k | if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) |
111 | 3.58k | break; |
112 | | |
113 | | /* Not breaking between Regional Indicators is allowed only if there |
114 | | are an even number of preceding RIs. */ |
115 | | |
116 | 115k | if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) |
117 | 4.96k | { |
118 | 4.96k | int ricount = 0; |
119 | 4.96k | PCRE2_SPTR bptr = eptr - 1; |
120 | 4.96k | if (utf) BACKCHAR(bptr); |
121 | | |
122 | | /* bptr is pointing to the left-hand character */ |
123 | | |
124 | 7.26k | while (bptr > start_subject) |
125 | 6.32k | { |
126 | 6.32k | bptr--; |
127 | 6.32k | if (utf) |
128 | 6.32k | { |
129 | 6.32k | BACKCHAR(bptr); |
130 | 6.32k | GETCHAR(c, bptr); |
131 | 6.32k | } |
132 | 0 | else |
133 | 0 | c = *bptr; |
134 | 6.32k | if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; |
135 | 2.29k | ricount++; |
136 | 2.29k | } |
137 | 4.96k | if ((ricount & 1) != 0) break; /* Grapheme break required */ |
138 | 4.96k | } |
139 | | |
140 | | /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in |
141 | | between; see next statement). */ |
142 | | |
143 | 113k | was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); |
144 | | |
145 | | /* If Extend follows Extended_Pictographic, do not update lgb; this allows |
146 | | any number of them before a following ZWJ. */ |
147 | | |
148 | 113k | if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb; |
149 | | |
150 | 113k | eptr += len; |
151 | 113k | if (xcount != NULL) *xcount += 1; |
152 | 113k | } |
153 | | |
154 | 13.9M | return eptr; |
155 | 13.9M | } Line | Count | Source | 95 | 6.48M | { | 96 | 6.48M | BOOL was_ep_ZWJ = FALSE; | 97 | 6.48M | int lgb = UCD_GRAPHBREAK(c); | 98 | | | 99 | 6.51M | while (eptr < end_subject) | 100 | 6.35M | { | 101 | 6.35M | int rgb; | 102 | 6.35M | int len = 1; | 103 | 6.35M | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } | 104 | 6.35M | rgb = UCD_GRAPHBREAK(c); | 105 | 6.35M | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; | 106 | | | 107 | | /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was | 108 | | preceded by Extended Pictographic. */ | 109 | | | 110 | 32.5k | if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) | 111 | 1.35k | break; | 112 | | | 113 | | /* Not breaking between Regional Indicators is allowed only if there | 114 | | are an even number of preceding RIs. */ | 115 | | | 116 | 31.1k | if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) | 117 | 4.96k | { | 118 | 4.96k | int ricount = 0; | 119 | 4.96k | PCRE2_SPTR bptr = eptr - 1; | 120 | 4.96k | if (utf) BACKCHAR(bptr); | 121 | | | 122 | | /* bptr is pointing to the left-hand character */ | 123 | | | 124 | 7.26k | while (bptr > start_subject) | 125 | 6.32k | { | 126 | 6.32k | bptr--; | 127 | 6.32k | if (utf) | 128 | 6.32k | { | 129 | 6.32k | BACKCHAR(bptr); | 130 | 6.32k | GETCHAR(c, bptr); | 131 | 6.32k | } | 132 | 0 | else | 133 | 0 | c = *bptr; | 134 | 6.32k | if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; | 135 | 2.29k | ricount++; | 136 | 2.29k | } | 137 | 4.96k | if ((ricount & 1) != 0) break; /* Grapheme break required */ | 138 | 4.96k | } | 139 | | | 140 | | /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in | 141 | | between; see next statement). 
*/ | 142 | | | 143 | 29.5k | was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); | 144 | | | 145 | | /* If Extend follows Extended_Pictographic, do not update lgb; this allows | 146 | | any number of them before a following ZWJ. */ | 147 | | | 148 | 29.5k | if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb; | 149 | | | 150 | 29.5k | eptr += len; | 151 | 29.5k | if (xcount != NULL) *xcount += 1; | 152 | 29.5k | } | 153 | | | 154 | 6.48M | return eptr; | 155 | 6.48M | } |
Line | Count | Source | 95 | 3.07M | { | 96 | 3.07M | BOOL was_ep_ZWJ = FALSE; | 97 | 3.07M | int lgb = UCD_GRAPHBREAK(c); | 98 | | | 99 | 3.09M | while (eptr < end_subject) | 100 | 2.97M | { | 101 | 2.97M | int rgb; | 102 | 2.97M | int len = 1; | 103 | 2.97M | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } | 104 | 2.97M | rgb = UCD_GRAPHBREAK(c); | 105 | 2.97M | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; | 106 | | | 107 | | /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was | 108 | | preceded by Extended Pictographic. */ | 109 | | | 110 | 24.6k | if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) | 111 | 1.37k | break; | 112 | | | 113 | | /* Not breaking between Regional Indicators is allowed only if there | 114 | | are an even number of preceding RIs. */ | 115 | | | 116 | 23.2k | if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) | 117 | 0 | { | 118 | 0 | int ricount = 0; | 119 | 0 | PCRE2_SPTR bptr = eptr - 1; | 120 | 0 | if (utf) BACKCHAR(bptr); | 121 | | | 122 | | /* bptr is pointing to the left-hand character */ | 123 | |
| 124 | 0 | while (bptr > start_subject) | 125 | 0 | { | 126 | 0 | bptr--; | 127 | 0 | if (utf) | 128 | 0 | { | 129 | 0 | BACKCHAR(bptr); | 130 | 0 | GETCHAR(c, bptr); | 131 | 0 | } | 132 | 0 | else | 133 | 0 | c = *bptr; | 134 | 0 | if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; | 135 | 0 | ricount++; | 136 | 0 | } | 137 | 0 | if ((ricount & 1) != 0) break; /* Grapheme break required */ | 138 | 0 | } | 139 | | | 140 | | /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in | 141 | | between; see next statement). */ | 142 | | | 143 | 23.2k | was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); | 144 | | | 145 | | /* If Extend follows Extended_Pictographic, do not update lgb; this allows | 146 | | any number of them before a following ZWJ. */ | 147 | | | 148 | 23.2k | if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb; | 149 | | | 150 | 23.2k | eptr += len; | 151 | 23.2k | if (xcount != NULL) *xcount += 1; | 152 | 23.2k | } | 153 | | | 154 | 3.07M | return eptr; | 155 | 3.07M | } |
Line | Count | Source | 95 | 4.36M | { | 96 | 4.36M | BOOL was_ep_ZWJ = FALSE; | 97 | 4.36M | int lgb = UCD_GRAPHBREAK(c); | 98 | | | 99 | 4.42M | while (eptr < end_subject) | 100 | 4.23M | { | 101 | 4.23M | int rgb; | 102 | 4.23M | int len = 1; | 103 | 4.23M | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } | 104 | 4.23M | rgb = UCD_GRAPHBREAK(c); | 105 | 4.23M | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; | 106 | | | 107 | | /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was | 108 | | preceded by Extended Pictographic. */ | 109 | | | 110 | 61.8k | if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) | 111 | 857 | break; | 112 | | | 113 | | /* Not breaking between Regional Indicators is allowed only if there | 114 | | are an even number of preceding RIs. */ | 115 | | | 116 | 61.0k | if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) | 117 | 0 | { | 118 | 0 | int ricount = 0; | 119 | 0 | PCRE2_SPTR bptr = eptr - 1; | 120 | 0 | if (utf) BACKCHAR(bptr); | 121 | | | 122 | | /* bptr is pointing to the left-hand character */ | 123 | |
| 124 | 0 | while (bptr > start_subject) | 125 | 0 | { | 126 | 0 | bptr--; | 127 | 0 | if (utf) | 128 | 0 | { | 129 | 0 | BACKCHAR(bptr); | 130 | 0 | GETCHAR(c, bptr); | 131 | 0 | } | 132 | 0 | else | 133 | 0 | c = *bptr; | 134 | 0 | if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; | 135 | 0 | ricount++; | 136 | 0 | } | 137 | 0 | if ((ricount & 1) != 0) break; /* Grapheme break required */ | 138 | 0 | } | 139 | | | 140 | | /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in | 141 | | between; see next statement). */ | 142 | | | 143 | 61.0k | was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); | 144 | | | 145 | | /* If Extend follows Extended_Pictographic, do not update lgb; this allows | 146 | | any number of them before a following ZWJ. */ | 147 | | | 148 | 61.0k | if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb; | 149 | | | 150 | 61.0k | eptr += len; | 151 | 61.0k | if (xcount != NULL) *xcount += 1; | 152 | 61.0k | } | 153 | | | 154 | 4.36M | return eptr; | 155 | 4.36M | } |
|
156 | | |
157 | | #endif /* SUPPORT_UNICODE */ |
158 | | |
159 | | /* End of pcre2_extuni.c */ |