/src/php-src/oniguruma/src/big5.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | big5.c - Oniguruma (regular expression library) |
3 | | **********************************************************************/ |
4 | | /*- |
5 | | * Copyright (c) 2002-2020 K.Kosako |
6 | | * All rights reserved. |
7 | | * |
8 | | * Redistribution and use in source and binary forms, with or without |
9 | | * modification, are permitted provided that the following conditions |
10 | | * are met: |
11 | | * 1. Redistributions of source code must retain the above copyright |
12 | | * notice, this list of conditions and the following disclaimer. |
13 | | * 2. Redistributions in binary form must reproduce the above copyright |
14 | | * notice, this list of conditions and the following disclaimer in the |
15 | | * documentation and/or other materials provided with the distribution. |
16 | | * |
17 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
18 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
19 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
20 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
21 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
22 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
23 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
24 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
25 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
26 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
27 | | * SUCH DAMAGE. |
28 | | */ |
29 | | |
30 | | #include "regenc.h" |
31 | | |
/*
 * Encoded length of a character, indexed by its first byte.
 * Bytes 0x00-0xa0 and 0xff map to 1; bytes 0xa1-0xfe map to 2.
 */
static const int EncLen_BIG5[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
50 | | |
51 | | static int |
52 | | big5_mbc_enc_len(const UChar* p) |
53 | 0 | { |
54 | 0 | return EncLen_BIG5[*p]; |
55 | 0 | } |
56 | | |
57 | | static int |
58 | | big5_code_to_mbclen(OnigCodePoint code) |
59 | 0 | { |
60 | 0 | if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; |
61 | | |
62 | 0 | if ((code & 0xff00) != 0) { |
63 | 0 | if (EncLen_BIG5[(int )(code >> 8) & 0xff] == 2) |
64 | 0 | return 2; |
65 | 0 | } |
66 | 0 | else { |
67 | 0 | if (EncLen_BIG5[(int )(code & 0xff)] == 1) |
68 | 0 | return 1; |
69 | 0 | } |
70 | | |
71 | 0 | return ONIGERR_INVALID_CODE_POINT_VALUE; |
72 | 0 | } |
73 | | |
74 | | static int |
75 | | is_valid_mbc_string(const UChar* p, const UChar* end) |
76 | 0 | { |
77 | 0 | while (p < end) { |
78 | 0 | if (*p < 0x80) { |
79 | 0 | p++; |
80 | 0 | } |
81 | 0 | else if (*p < 0xa1) { |
82 | 0 | return FALSE; |
83 | 0 | } |
84 | 0 | else if (*p < 0xff) { |
85 | 0 | p++; |
86 | 0 | if (p >= end) return FALSE; |
87 | 0 | if (*p < 0x40) return FALSE; |
88 | 0 | if (*p > 0x7e && *p < 0xa1) return FALSE; |
89 | 0 | if (*p == 0xff) return FALSE; |
90 | 0 | p++; |
91 | 0 | } |
92 | 0 | else |
93 | 0 | return FALSE; |
94 | 0 | } |
95 | |
|
96 | 0 | return TRUE; |
97 | 0 | } |
98 | | |
99 | | static OnigCodePoint |
100 | | big5_mbc_to_code(const UChar* p, const UChar* end) |
101 | 0 | { |
102 | 0 | return onigenc_mbn_mbc_to_code(ONIG_ENCODING_BIG5, p, end); |
103 | 0 | } |
104 | | |
105 | | static int |
106 | | big5_code_to_mbc(OnigCodePoint code, UChar *buf) |
107 | 0 | { |
108 | 0 | return onigenc_mb2_code_to_mbc(ONIG_ENCODING_BIG5, code, buf); |
109 | 0 | } |
110 | | |
111 | | static int |
112 | | big5_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, |
113 | | UChar* lower) |
114 | 0 | { |
115 | 0 | return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_BIG5, flag, |
116 | 0 | pp, end, lower); |
117 | 0 | } |
118 | | |
119 | | static int |
120 | | big5_is_code_ctype(OnigCodePoint code, unsigned int ctype) |
121 | 0 | { |
122 | 0 | return onigenc_mb2_is_code_ctype(ONIG_ENCODING_BIG5, code, ctype); |
123 | 0 | } |
124 | | |
/*
 * Non-zero for byte values that may appear as the second byte of a
 * two-byte character.  Set for 0x40-0x7e and 0x80-0xfe; clear for
 * 0x00-0x3f, 0x7f and 0xff.
 */
static const char BIG5_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
143 | | |
/* True when EncLen_BIG5 marks `byte` as starting a multi-byte character. */
#define BIG5_ISMB_FIRST(byte)  (EncLen_BIG5[(byte)] >= 2)
/* Non-zero when BIG5_CAN_BE_TRAIL_TABLE allows `byte` as a second byte. */
#define BIG5_ISMB_TRAIL(byte)  (BIG5_CAN_BE_TRAIL_TABLE[(byte)])
146 | | |
147 | | static UChar* |
148 | | big5_left_adjust_char_head(const UChar* start, const UChar* s) |
149 | 0 | { |
150 | 0 | const UChar *p; |
151 | 0 | int len; |
152 | |
|
153 | 0 | if (s <= start) return (UChar* )s; |
154 | 0 | p = s; |
155 | |
|
156 | 0 | if (BIG5_ISMB_TRAIL(*p)) { |
157 | 0 | while (p > start) { |
158 | 0 | if (! BIG5_ISMB_FIRST(*--p)) { |
159 | 0 | p++; |
160 | 0 | break; |
161 | 0 | } |
162 | 0 | } |
163 | 0 | } |
164 | 0 | len = enclen(ONIG_ENCODING_BIG5, p); |
165 | 0 | if (p + len > s) return (UChar* )p; |
166 | 0 | p += len; |
167 | 0 | return (UChar* )(p + ((s - p) & ~1)); |
168 | 0 | } |
169 | | |
170 | | static int |
171 | | big5_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) |
172 | 0 | { |
173 | 0 | const UChar c = *s; |
174 | |
|
175 | 0 | return (BIG5_ISMB_TRAIL(c) ? FALSE : TRUE); |
176 | 0 | } |
177 | | |
/*
 * Encoding descriptor for Big5.  The initializer is positional, so the order
 * of entries must follow the OnigEncodingType declaration (not visible in
 * this file).  Entries named big5_* and is_valid_mbc_string are defined in
 * this file; the onigenc_* entries are helpers defined elsewhere.
 */
OnigEncodingType OnigEncodingBIG5 = {
  big5_mbc_enc_len,                          /* length from first byte (EncLen_BIG5) */
  "Big5",        /* name */
  2,             /* max enc length */
  1,             /* min enc length */
  onigenc_is_mbc_newline_0x0a,               /* newline test; presumably matches byte 0x0a -- confirm in helper */
  big5_mbc_to_code,                          /* bytes -> code point */
  big5_code_to_mbclen,                       /* code point -> byte length or error */
  big5_code_to_mbc,                          /* code point -> bytes */
  big5_mbc_case_fold,                        /* case folding of one character */
  onigenc_ascii_apply_all_case_fold,         /* external ASCII case-fold helper */
  onigenc_ascii_get_case_fold_codes_by_str,  /* external ASCII case-fold helper */
  onigenc_minimum_property_name_to_ctype,    /* external property-name lookup helper */
  big5_is_code_ctype,                        /* character type test */
  onigenc_not_support_get_ctype_code_range,  /* external helper; name suggests ranges are unsupported -- confirm */
  big5_left_adjust_char_head,                /* move back to a character start */
  big5_is_allowed_reverse_match,             /* FALSE when byte can be a trail byte */
  NULL, /* init */
  NULL, /* is_initialized */
  is_valid_mbc_string,                       /* whole-string validity check */
  ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,  /* encoding flags */
  0, 0                                       /* NOTE(review): last two fields zeroed; meanings come from OnigEncodingType -- confirm */
};