/src/oniguruma/src/sjis.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | sjis.c - Oniguruma (regular expression library) |
3 | | **********************************************************************/ |
4 | | /*- |
5 | | * Copyright (c) 2002-2020 K.Kosako |
6 | | * All rights reserved. |
7 | | * |
8 | | * Redistribution and use in source and binary forms, with or without |
9 | | * modification, are permitted provided that the following conditions |
10 | | * are met: |
11 | | * 1. Redistributions of source code must retain the above copyright |
12 | | * notice, this list of conditions and the following disclaimer. |
13 | | * 2. Redistributions in binary form must reproduce the above copyright |
14 | | * notice, this list of conditions and the following disclaimer in the |
15 | | * documentation and/or other materials provided with the distribution. |
16 | | * |
17 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
18 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
19 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
20 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
21 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
22 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
23 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
24 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
25 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
26 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
27 | | * SUCH DAMAGE. |
28 | | */ |
29 | | |
30 | | #include "regint.h" |
31 | | |
/*
 * Byte length of a Shift_JIS character, indexed by the value of its
 * first byte.  Bytes 0x81-0x9f and 0xe0-0xfc introduce a two-byte
 * character; every other byte value is given length 1.
 */
static const int EncLen_SJIS[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
50 | | |
/*
 * Flags, indexed by byte value, telling whether the byte lies in the
 * range a second (trail) byte of a two-byte character may occupy:
 * 1 for 0x40-0x7e and 0x80-0xfc, 0 for everything else.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
69 | | |
/* True when `byte` is the first byte of a two-byte character. */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] >= 2)
/* Nonzero when `byte` is flagged in the trail-byte table. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
72 | | |
73 | | static int |
74 | | mbc_enc_len(const UChar* p) |
75 | 696k | { |
76 | 696k | return EncLen_SJIS[*p]; |
77 | 696k | } |
78 | | |
79 | | static int |
80 | | is_valid_mbc_string(const UChar* p, const UChar* end) |
81 | 9.06k | { |
82 | 197k | while (p < end) { |
83 | 189k | if (*p < 0x80) { |
84 | 159k | p++; |
85 | 159k | } |
86 | 29.0k | else if (*p < 0xa1) { |
87 | 4.91k | if (*p == 0xa0 || *p == 0x80) |
88 | 8 | return FALSE; |
89 | 4.91k | p++; |
90 | 4.91k | if (p >= end) return FALSE; |
91 | 4.87k | if (*p < 0x40 || *p > 0xfc || *p == 0x7f) |
92 | 29 | return FALSE; |
93 | 4.84k | p++; |
94 | 4.84k | } |
95 | 24.1k | else if (*p < 0xe0) { |
96 | 21.3k | p++; |
97 | 21.3k | } |
98 | 2.86k | else if (*p < 0xfd) { |
99 | 2.84k | p++; |
100 | 2.84k | if (p >= end) return FALSE; |
101 | 2.80k | if (*p < 0x40 || *p > 0xfc || *p == 0x7f) |
102 | 12 | return FALSE; |
103 | 2.79k | p++; |
104 | 2.79k | } |
105 | 20 | else |
106 | 20 | return FALSE; |
107 | 189k | } |
108 | | |
109 | 8.92k | return TRUE; |
110 | 9.06k | } |
111 | | |
112 | | static int |
113 | | code_to_mbclen(OnigCodePoint code) |
114 | 319k | { |
115 | 319k | if (code < 256) { |
116 | 316k | if (EncLen_SJIS[(int )code] == 1) |
117 | 264k | return 1; |
118 | 316k | } |
119 | 2.98k | else if (code < 0x10000) { |
120 | 2.82k | if (EncLen_SJIS[(int )(code >> 8) & 0xff] == 2) |
121 | 2.75k | return 2; |
122 | 2.82k | } |
123 | | |
124 | 53.0k | return ONIGERR_INVALID_CODE_POINT_VALUE; |
125 | 319k | } |
126 | | |
127 | | static OnigCodePoint |
128 | | mbc_to_code(const UChar* p, const UChar* end) |
129 | 235k | { |
130 | 235k | int c, i, len; |
131 | 235k | OnigCodePoint n; |
132 | | |
133 | 235k | len = enclen(ONIG_ENCODING_SJIS, p); |
134 | 235k | c = *p++; |
135 | 235k | n = c; |
136 | 235k | if (len == 1) return n; |
137 | | |
138 | 39.4k | for (i = 1; i < len; i++) { |
139 | 19.7k | if (p >= end) break; |
140 | 19.7k | c = *p++; |
141 | 19.7k | n <<= 8; n += c; |
142 | 19.7k | } |
143 | 19.7k | return n; |
144 | 235k | } |
145 | | |
146 | | static int |
147 | | code_to_mbc(OnigCodePoint code, UChar *buf) |
148 | 610 | { |
149 | 610 | UChar *p = buf; |
150 | | |
151 | 610 | if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); |
152 | 610 | *p++ = (UChar )(code & 0xff); |
153 | | |
154 | 610 | return (int )(p - buf); |
155 | 610 | } |
156 | | |
157 | | static int |
158 | | mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, |
159 | | const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) |
160 | 3.03k | { |
161 | 3.03k | const UChar* p = *pp; |
162 | | |
163 | 3.03k | if (ONIGENC_IS_MBC_ASCII(p)) { |
164 | 2.34k | *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); |
165 | 2.34k | (*pp)++; |
166 | 2.34k | return 1; |
167 | 2.34k | } |
168 | 684 | else { |
169 | 684 | int i; |
170 | 684 | int len = enclen(ONIG_ENCODING_SJIS, p); |
171 | | |
172 | 1.75k | for (i = 0; i < len; i++) { |
173 | 1.07k | *lower++ = *p++; |
174 | 1.07k | } |
175 | 684 | (*pp) += len; |
176 | 684 | return len; /* return byte length of converted char to lower */ |
177 | 684 | } |
178 | 3.03k | } |
179 | | |
180 | | static UChar* |
181 | | left_adjust_char_head(const UChar* start, const UChar* s) |
182 | 133k | { |
183 | 133k | const UChar *p; |
184 | 133k | int len; |
185 | | |
186 | 133k | if (s <= start) return (UChar* )s; |
187 | 122k | p = s; |
188 | | |
189 | 122k | if (SJIS_ISMB_TRAIL(*p)) { |
190 | 101k | while (p > start) { |
191 | 100k | if (! SJIS_ISMB_FIRST(*--p)) { |
192 | 83.7k | p++; |
193 | 83.7k | break; |
194 | 83.7k | } |
195 | 100k | } |
196 | 84.3k | } |
197 | 122k | len = enclen(ONIG_ENCODING_SJIS, p); |
198 | 122k | if (p + len > s) return (UChar* )p; |
199 | 954 | p += len; |
200 | 954 | return (UChar* )(p + ((s - p) & ~1)); |
201 | 122k | } |
202 | | |
203 | | static int |
204 | | is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) |
205 | 515 | { |
206 | 515 | const UChar c = *s; |
207 | 515 | return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE); |
208 | 515 | } |
209 | | |
210 | | |
211 | | static const OnigCodePoint CR_Hiragana[] = { |
212 | | 1, |
213 | | 0x829f, 0x82f1 |
214 | | }; /* CR_Hiragana */ |
215 | | |
216 | | static const OnigCodePoint CR_Katakana[] = { |
217 | | 4, |
218 | | 0x00a6, 0x00af, |
219 | | 0x00b1, 0x00dd, |
220 | | 0x8340, 0x837e, |
221 | | 0x8380, 0x8396, |
222 | | }; /* CR_Katakana */ |
223 | | |
224 | | static const OnigCodePoint* PropertyList[] = { |
225 | | CR_Hiragana, |
226 | | CR_Katakana |
227 | | }; |
228 | | |
229 | | |
230 | | static int |
231 | | property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end) |
232 | 166 | { |
233 | 166 | struct PropertyNameCtype* pc; |
234 | 166 | int len = (int )(end - p); |
235 | 166 | char q[32]; |
236 | | |
237 | 166 | if (len < sizeof(q) - 1) { |
238 | 164 | xmemcpy(q, p, (size_t )len); |
239 | 164 | q[len] = '\0'; |
240 | 164 | pc = onigenc_sjis_lookup_property_name(q, len); |
241 | 164 | if (pc != 0) |
242 | 140 | return pc->ctype; |
243 | 164 | } |
244 | | |
245 | 26 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME; |
246 | 166 | } |
247 | | |
248 | | static int |
249 | | is_code_ctype(OnigCodePoint code, unsigned int ctype) |
250 | 242k | { |
251 | 242k | if (ctype <= ONIGENC_MAX_STD_CTYPE) { |
252 | 242k | if (code < 128) |
253 | 181k | return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); |
254 | 61.0k | else { |
255 | 61.0k | if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) { |
256 | 43.8k | return (code_to_mbclen(code) > 1 ? TRUE : FALSE); |
257 | 43.8k | } |
258 | 61.0k | } |
259 | 242k | } |
260 | 0 | else { |
261 | 0 | ctype -= (ONIGENC_MAX_STD_CTYPE + 1); |
262 | 0 | if (ctype >= (unsigned int )(sizeof(PropertyList)/sizeof(PropertyList[0]))) |
263 | 0 | return ONIGERR_TYPE_BUG; |
264 | | |
265 | 0 | return onig_is_in_code_range((UChar* )PropertyList[ctype], code); |
266 | 0 | } |
267 | | |
268 | 17.1k | return FALSE; |
269 | 242k | } |
270 | | |
271 | | static int |
272 | | get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, |
273 | | const OnigCodePoint* ranges[]) |
274 | 1.03k | { |
275 | 1.03k | if (ctype <= ONIGENC_MAX_STD_CTYPE) { |
276 | 998 | return ONIG_NO_SUPPORT_CONFIG; |
277 | 998 | } |
278 | 32 | else { |
279 | 32 | *sb_out = 0x80; |
280 | | |
281 | 32 | ctype -= (ONIGENC_MAX_STD_CTYPE + 1); |
282 | 32 | if (ctype >= (OnigCtype )(sizeof(PropertyList)/sizeof(PropertyList[0]))) |
283 | 0 | return ONIGERR_TYPE_BUG; |
284 | | |
285 | 32 | *ranges = PropertyList[ctype]; |
286 | 32 | return 0; |
287 | 32 | } |
288 | 1.03k | } |
289 | | |
290 | | OnigEncodingType OnigEncodingSJIS = { |
291 | | mbc_enc_len, |
292 | | "Shift_JIS", /* name */ |
293 | | 2, /* max enc length */ |
294 | | 1, /* min enc length */ |
295 | | onigenc_is_mbc_newline_0x0a, |
296 | | mbc_to_code, |
297 | | code_to_mbclen, |
298 | | code_to_mbc, |
299 | | mbc_case_fold, |
300 | | onigenc_ascii_apply_all_case_fold, |
301 | | onigenc_ascii_get_case_fold_codes_by_str, |
302 | | property_name_to_ctype, |
303 | | is_code_ctype, |
304 | | get_ctype_code_range, |
305 | | left_adjust_char_head, |
306 | | is_allowed_reverse_match, |
307 | | NULL, /* init */ |
308 | | NULL, /* is_initialized */ |
309 | | is_valid_mbc_string, |
310 | | ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1_OR_0, |
311 | | 0, 0 |
312 | | }; |