Line | Count | Source |
1 | | /* ccl - routines for character classes */ |
2 | | |
3 | | /* SPDX-License-Identifier: BSD-3-Clause-flex */ |
4 | | |
5 | | /* Copyright (c) 1990 The Regents of the University of California. */ |
6 | | /* All rights reserved. */ |
7 | | |
8 | | /* This code is derived from software contributed to Berkeley by */ |
9 | | /* Vern Paxson. */ |
10 | | |
11 | | /* The United States Government has rights in this work pursuant */ |
12 | | /* to contract no. DE-AC03-76SF00098 between the United States */ |
13 | | /* Department of Energy and the University of California. */ |
14 | | |
15 | | /* This file is part of flex. */ |
16 | | |
17 | | /* Redistribution and use in source and binary forms, with or without */ |
18 | | /* modification, are permitted provided that the following conditions */ |
19 | | /* are met: */ |
20 | | |
21 | | /* 1. Redistributions of source code must retain the above copyright */ |
22 | | /* notice, this list of conditions and the following disclaimer. */ |
23 | | /* 2. Redistributions in binary form must reproduce the above copyright */ |
24 | | /* notice, this list of conditions and the following disclaimer in the */ |
25 | | /* documentation and/or other materials provided with the distribution. */ |
26 | | |
27 | | /* Neither the name of the University nor the names of its contributors */ |
28 | | /* may be used to endorse or promote products derived from this software */ |
29 | | /* without specific prior written permission. */ |
30 | | |
31 | | /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ |
32 | | /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ |
33 | | /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ |
34 | | /* PURPOSE. */ |
35 | | |
36 | | #include "flexdef.h" |
37 | | |
38 | | /* return true if the chr is in the ccl. Takes negation into account. */ |
39 | | static bool |
40 | | ccl_contains (const int cclp, const int ch) |
41 | 0 | { |
42 | 0 | int ind, len, i; |
43 | |
|
44 | 0 | len = ccllen[cclp]; |
45 | 0 | ind = cclmap[cclp]; |
46 | |
|
47 | 0 | for (i = 0; i < len; ++i) |
48 | 0 | if (ccltbl[ind + i] == ch) |
49 | 0 | return !cclng[cclp]; |
50 | | |
51 | 0 | return cclng[cclp]; |
52 | 0 | } |
53 | | |
54 | | |
55 | | /* ccladd - add a single character to a ccl */ |
56 | | |
57 | | void ccladd (int cclp, int ch) |
58 | 0 | { |
59 | 0 | int ind, len, newpos, i; |
60 | |
|
61 | 0 | check_char (ch); |
62 | |
|
63 | 0 | len = ccllen[cclp]; |
64 | 0 | ind = cclmap[cclp]; |
65 | | |
66 | | /* check to see if the character is already in the ccl */ |
67 | |
|
68 | 0 | for (i = 0; i < len; ++i) |
69 | 0 | if (ccltbl[ind + i] == ch) |
70 | 0 | return; |
71 | | |
72 | | /* mark newlines */ |
73 | 0 | if (ch == nlch) |
74 | 0 | ccl_has_nl[cclp] = true; |
75 | |
|
76 | 0 | newpos = ind + len; |
77 | | |
78 | | /* For a non-last cclp, expanding the set will overflow and overwrite a |
79 | | * char in the next cclp. |
80 | | * FIXME: Need another allocation scheme for ccl's. */ |
81 | 0 | if (cclp != lastccl) { |
82 | 0 | flexfatal(_("internal error: trying to add a char to a non-last ccl.\n")); |
83 | 0 | } |
84 | |
|
85 | 0 | if (newpos >= current_max_ccl_tbl_size) { |
86 | 0 | current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT; |
87 | |
|
88 | 0 | ++num_reallocs; |
89 | |
|
90 | 0 | ccltbl = reallocate_Character_array (ccltbl, |
91 | 0 | current_max_ccl_tbl_size); |
92 | 0 | } |
93 | |
|
94 | 0 | ccllen[cclp] = len + 1; |
95 | 0 | ccltbl[newpos] = (unsigned char) ch; |
96 | 0 | } |
97 | | |
98 | | /* dump_cclp - same thing as list_character_set, but for cclps. */ |
99 | | |
100 | | static void dump_cclp (FILE* file, int cclp) |
101 | 0 | { |
102 | 0 | int i; |
103 | 0 |
|
104 | 0 | putc ('[', file); |
105 | 0 |
|
106 | 0 | for (i = 0; i < ctrl.csize; ++i) { |
107 | 0 | if (ccl_contains(cclp, i)){ |
108 | 0 | int start_char = i; |
109 | 0 |
|
110 | 0 | putc (' ', file); |
111 | 0 |
|
112 | 0 | fputs (readable_form (i), file); |
113 | 0 |
|
114 | 0 | while (++i < ctrl.csize && ccl_contains(cclp,i)) ; |
115 | 0 |
|
116 | 0 | if (i - 1 > start_char) |
117 | 0 | /* this was a run */ |
118 | 0 | fprintf (file, "-%s", |
119 | 0 | readable_form (i - 1)); |
120 | 0 |
|
121 | 0 | putc (' ', file); |
122 | 0 | } |
123 | 0 | } |
124 | 0 |
|
125 | 0 | putc (']', file); |
126 | 0 | } |
127 | | |
128 | | |
129 | | |
130 | | /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */ |
131 | | int |
132 | | ccl_set_diff (int a, int b) |
133 | 0 | { |
134 | 0 | int d, ch; |
135 | | |
136 | | /* create new class */ |
137 | 0 | d = cclinit(); |
138 | | |
139 | | /* In order to handle negation, we spin through all possible chars, |
140 | | * adding each char in a that is not in b. |
141 | | * (This could be O(n^2), but n is small and bounded.) |
142 | | */ |
143 | 0 | for ( ch = 0; ch < ctrl.csize; ++ch ) |
144 | 0 | if (ccl_contains (a, ch) && !ccl_contains(b, ch)) |
145 | 0 | ccladd (d, ch); |
146 | | |
147 | | /* debug */ |
148 | 0 | if (0){ |
149 | 0 | fprintf(stderr, "ccl_set_diff ("); |
150 | 0 | fprintf(stderr, "\n "); |
151 | 0 | dump_cclp (stderr, a); |
152 | 0 | fprintf(stderr, "\n "); |
153 | 0 | dump_cclp (stderr, b); |
154 | 0 | fprintf(stderr, "\n "); |
155 | 0 | dump_cclp (stderr, d); |
156 | 0 | fprintf(stderr, "\n)\n"); |
157 | 0 | } |
158 | 0 | return d; |
159 | 0 | } |
160 | | |
161 | | /* ccl_set_union - create a new ccl as the set union of the two given ccls. */ |
162 | | int |
163 | | ccl_set_union (int a, int b) |
164 | 0 | { |
165 | 0 | int d, i; |
166 | | |
167 | | /* create new class */ |
168 | 0 | d = cclinit(); |
169 | | |
170 | | /* Add all of a */ |
171 | 0 | for (i = 0; i < ccllen[a]; ++i) |
172 | 0 | ccladd (d, ccltbl[cclmap[a] + i]); |
173 | | |
174 | | /* Add all of b */ |
175 | 0 | for (i = 0; i < ccllen[b]; ++i) |
176 | 0 | ccladd (d, ccltbl[cclmap[b] + i]); |
177 | | |
178 | | /* debug */ |
179 | 0 | if (0){ |
180 | 0 | fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d); |
181 | 0 | fprintf(stderr, "\n "); |
182 | 0 | dump_cclp (stderr, a); |
183 | 0 | fprintf(stderr, "\n "); |
184 | 0 | dump_cclp (stderr, b); |
185 | 0 | fprintf(stderr, "\n "); |
186 | 0 | dump_cclp (stderr, d); |
187 | 0 | fprintf(stderr, "\n)\n"); |
188 | 0 | } |
189 | 0 | return d; |
190 | 0 | } |
191 | | |
192 | | |
193 | | /* cclinit - return an empty ccl */ |
194 | | |
195 | | int cclinit (void) |
196 | 0 | { |
197 | 0 | if (++lastccl >= current_maxccls) { |
198 | 0 | current_maxccls += MAX_CCLS_INCREMENT; |
199 | |
|
200 | 0 | ++num_reallocs; |
201 | |
|
202 | 0 | cclmap = |
203 | 0 | reallocate_integer_array (cclmap, current_maxccls); |
204 | 0 | ccllen = |
205 | 0 | reallocate_integer_array (ccllen, current_maxccls); |
206 | 0 | cclng = reallocate_integer_array (cclng, current_maxccls); |
207 | 0 | ccl_has_nl = reallocate_array(ccl_has_nl, current_maxccls, sizeof(char)); |
208 | 0 | } |
209 | |
|
210 | 0 | if (lastccl == 1) |
211 | | /* we're making the first ccl */ |
212 | 0 | cclmap[lastccl] = 0; |
213 | | |
214 | 0 | else |
215 | | /* The new pointer is just past the end of the last ccl. |
216 | | * Since the cclmap points to the \first/ character of a |
217 | | * ccl, adding the length of the ccl to the cclmap pointer |
218 | | * will produce a cursor to the first free space. |
219 | | */ |
220 | 0 | cclmap[lastccl] = |
221 | 0 | cclmap[lastccl - 1] + ccllen[lastccl - 1]; |
222 | |
|
223 | 0 | ccllen[lastccl] = 0; |
224 | 0 | cclng[lastccl] = 0; /* ccl's start out life un-negated */ |
225 | 0 | ccl_has_nl[lastccl] = false; |
226 | |
|
227 | 0 | return lastccl; |
228 | 0 | } |
229 | | |
230 | | |
231 | | /* cclnegate - negate the given ccl */ |
232 | | |
233 | | void cclnegate (int cclp) |
234 | 0 | { |
235 | 0 | cclng[cclp] = 1; |
236 | 0 | ccl_has_nl[cclp] = !ccl_has_nl[cclp]; |
237 | 0 | } |
238 | | |
239 | | |
240 | | /* list_character_set - list the members of a set of characters in CCL form |
241 | | * |
242 | | * Writes to the given file a character-class representation of those |
243 | | * characters present in the given CCL. A character is present if it |
244 | | * has a non-zero value in the cset array. |
245 | | */ |
246 | | |
247 | | void list_character_set (FILE *file, int cset[]) |
248 | 0 | { |
249 | 0 | int i; |
250 | |
|
251 | 0 | putc ('[', file); |
252 | |
|
253 | 0 | for (i = 0; i < ctrl.csize; ++i) { |
254 | 0 | if (cset[i]) { |
255 | 0 | int start_char = i; |
256 | |
|
257 | 0 | putc (' ', file); |
258 | |
|
259 | 0 | fputs (readable_form (i), file); |
260 | |
|
261 | 0 | while (++i < ctrl.csize && cset[i]) ; |
262 | |
|
263 | 0 | if (i - 1 > start_char) |
264 | | /* this was a run */ |
265 | 0 | fprintf (file, "-%s", |
266 | 0 | readable_form (i - 1)); |
267 | |
|
268 | 0 | putc (' ', file); |
269 | 0 | } |
270 | 0 | } |
271 | |
|
272 | 0 | putc (']', file); |
273 | 0 | } |
274 | | |
275 | | /** Determines if the range [c1-c2] is unambiguous in a case-insensitive |
276 | | * scanner. Specifically, if a lowercase or uppercase character, x, is in the |
277 | | * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also |
278 | | * be in the range. If not, then this range is ambiguous, and the function |
279 | | * returns false. For example, [@-_] spans [a-z] but not [A-Z]. Beware that |
280 | | * [a-z] will be labeled ambiguous because it does not include [A-Z]. |
281 | | * |
282 | | * @param c1 the lower end of the range |
283 | | * @param c2 the upper end of the range |
284 | | * @return true if [c1-c2] is not ambiguous for a caseless scanner. |
285 | | */ |
286 | | bool range_covers_case (int c1, int c2) |
287 | 0 | { |
288 | 0 | int i, o; |
289 | |
|
290 | 0 | for (i = c1; i <= c2; i++) { |
291 | 0 | if (has_case (i)) { |
292 | 0 | o = reverse_case (i); |
293 | 0 | if (o < c1 || c2 < o) |
294 | 0 | return false; |
295 | 0 | } |
296 | 0 | } |
297 | 0 | return true; |
298 | 0 | } |
299 | | |
300 | | /** Reverse the case of a character, if possible. |
301 | | * @return c if case-reversal does not apply. |
302 | | */ |
303 | | int reverse_case (int c) |
304 | 0 | { |
305 | 0 | return isupper (c) ? tolower (c) : (islower (c) ? toupper (c) : c); |
306 | 0 | } |
307 | | |
308 | | /** Return true if c is uppercase or lowercase. */ |
309 | | bool has_case (int c) |
310 | 0 | { |
311 | 0 | return (isupper (c) || islower (c)) ? true : false; |
312 | 0 | } |