/src/fluent-bit/lib/onigmo/enc/utf_8.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | utf_8.c - Oniguruma (regular expression library) |
3 | | **********************************************************************/ |
4 | | /*- |
5 | | * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> |
6 | | * All rights reserved. |
7 | | * |
8 | | * Redistribution and use in source and binary forms, with or without |
9 | | * modification, are permitted provided that the following conditions |
10 | | * are met: |
11 | | * 1. Redistributions of source code must retain the above copyright |
12 | | * notice, this list of conditions and the following disclaimer. |
13 | | * 2. Redistributions in binary form must reproduce the above copyright |
14 | | * notice, this list of conditions and the following disclaimer in the |
15 | | * documentation and/or other materials provided with the distribution. |
16 | | * |
17 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
18 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
19 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
20 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
21 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
22 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
23 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
24 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
25 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
26 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
27 | | * SUCH DAMAGE. |
28 | | */ |
29 | | |
30 | | #include "regenc.h" |
31 | | #ifdef RUBY |
32 | | # include "encindex.h" |
33 | | #endif |
34 | | |
35 | | #ifndef ENCINDEX_UTF_8 |
36 | | # define ENCINDEX_UTF_8 0 |
37 | | #endif |
38 | | |
/*
 * Build-time switches for this encoding.
 * USE_INVALID_CODE_SCHEME: the bytes 0xfe and 0xff are mapped to the virtual
 * code points defined below instead of being rejected by the converters.
 * USE_UTF8_31BITS (disabled here): enables the 5- and 6-byte forms and raises
 * VALID_CODE_LIMIT accordingly.
 */
#define USE_INVALID_CODE_SCHEME
/* #define USE_UTF8_31BITS */

#ifdef USE_INVALID_CODE_SCHEME
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
# define INVALID_CODE_FE 0xfffffffe
# define INVALID_CODE_FF 0xffffffff
#endif

/* Largest code point the encoders in this file treat as a regular value. */
#ifndef USE_UTF8_31BITS
#define VALID_CODE_LIMIT 0x0010ffff
#else
#define VALID_CODE_LIMIT 0x7fffffff
#endif

/* Non-zero when c is not a continuation byte, i.e. its top two bits are not 10. */
#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80)
55 | | |
/*
 * Sequence length implied by a first byte, indexed by the byte value
 * (16 entries per row, rows 0x00..0xf0).  Bytes that cannot start a
 * multi-byte sequence are listed with length 1.  mbc_enc_len() uses this
 * table only to compute how many bytes are still missing when the input
 * ends in the middle of a sequence.
 */
static const int EncLen_UTF8[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
#ifndef USE_UTF8_31BITS
  /* 0xf0..0xf4 start 4-byte sequences; 0xf5..0xff are counted as 1 */
  4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
#else
  /* 31-bit mode: 0xf8..0xfb start 5-byte and 0xfc..0xfd 6-byte sequences */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
#endif
};
78 | | |
/*
 * States of the byte-validation automaton stored in trans[].
 * The two terminal outcomes are negative (FAILURE == -2, ACCEPT == -1) so
 * that "state < 0" means the scan has finished; S0..S11 number from 0 and
 * are used directly as the first index into trans[].
 */
typedef enum {
  FAILURE = -2,
  ACCEPT,
  S0, S1, S2, S3,
  S4, S5, S6, S7,
  S8, S9,S10,S11,
} state_t;
/*
 * Transition table of the validation automaton: trans[state][byte] is the
 * next state, or ACCEPT / FAILURE.  A and F are shorthands that exist only
 * for the duration of the initializer.
 *
 *   S0  start state, indexed by the first byte of a character
 *   S1  one byte left: 80..bf accepts
 *   S2  second byte after e0: only a0..bf continues (to S1)
 *   S3  two bytes left: 80..bf continues (to S1)
 *   S4  second byte after ed: only 80..9f continues (to S1)
 *   S5  second byte after f0: only 90..bf continues (to S3)
 *   S6  three bytes left: 80..bf continues (to S3)
 *   S7  second byte after f4: only 80..8f continues (to S3)
 *   S8..S11 exist only with USE_UTF8_31BITS and handle the second byte of
 *   5- and 6-byte sequences.
 */
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
#ifndef USE_UTF8_31BITS
    /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
#else
    /* f */ 5, 6, 6, 6, 6, 6, 6, 6, 8, 9, 9, 9,10,11, F, F
#endif
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S3   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S4   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S5   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S6   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S7   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
#ifdef USE_UTF8_31BITS
  { /* S8   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, 6, 6, 6, 6, 6, 6, 6, 6,
    /* 9 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    /* a */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    /* b */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S9   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    /* 9 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    /* a */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    /* b */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S10  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* 9 */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* a */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* b */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S11  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* 9 */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* a */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* b */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
#endif // USE_UTF8_31BITS
};
#undef A
#undef F
314 | | |
315 | | static int |
316 | | mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) |
317 | 243M | { |
318 | 243M | int firstbyte = *p++; |
319 | 243M | state_t s; |
320 | 243M | s = trans[0][firstbyte]; |
321 | 243M | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : |
322 | 227M | ONIGENC_CONSTRUCT_MBCLEN_INVALID(); |
323 | | |
324 | 15.7M | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1); |
325 | 15.2M | s = trans[s][*p++]; |
326 | 15.2M | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : |
327 | 5.33M | ONIGENC_CONSTRUCT_MBCLEN_INVALID(); |
328 | | |
329 | 9.88M | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2); |
330 | 9.86M | s = trans[s][*p++]; |
331 | 9.86M | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) : |
332 | 437k | ONIGENC_CONSTRUCT_MBCLEN_INVALID(); |
333 | | |
334 | 9.43M | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3); |
335 | 9.42M | s = trans[s][*p++]; |
336 | | |
337 | 9.42M | #ifndef USE_UTF8_31BITS |
338 | 9.42M | return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) : |
339 | 9.42M | ONIGENC_CONSTRUCT_MBCLEN_INVALID(); |
340 | | #else |
341 | | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) : |
342 | | ONIGENC_CONSTRUCT_MBCLEN_INVALID(); |
343 | | |
344 | | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-4); |
345 | | s = trans[s][*p++]; |
346 | | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(5) : |
347 | | ONIGENC_CONSTRUCT_MBCLEN_INVALID(); |
348 | | |
349 | | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-5); |
350 | | s = trans[s][*p++]; |
351 | | return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(6) : |
352 | | ONIGENC_CONSTRUCT_MBCLEN_INVALID(); |
353 | | #endif |
354 | 9.43M | } |
355 | | |
356 | | static int |
357 | | is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) |
358 | 52.0M | { |
359 | 52.0M | if (p < end) { |
360 | 52.0M | if (*p == 0x0a) return 1; |
361 | | |
362 | | #ifdef USE_UNICODE_ALL_LINE_TERMINATORS |
363 | | if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1; |
364 | | if (p + 1 < end) { |
365 | | if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ |
366 | | return 1; |
367 | | if (p + 2 < end) { |
368 | | if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) |
369 | | && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ |
370 | | return 1; |
371 | | } |
372 | | } |
373 | | #endif |
374 | 52.0M | } |
375 | | |
376 | 51.8M | return 0; |
377 | 52.0M | } |
378 | | |
379 | | static OnigCodePoint |
380 | | mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) |
381 | 38.2M | { |
382 | 38.2M | int c, len; |
383 | 38.2M | OnigCodePoint n; |
384 | | |
385 | 38.2M | len = mbc_enc_len(p, end, enc); |
386 | 38.2M | c = *p++; |
387 | 38.2M | if (len > 1) { |
388 | 192k | len--; |
389 | 192k | n = c & ((1 << (6 - len)) - 1); |
390 | 539k | while (len--) { |
391 | 346k | c = *p++; |
392 | 346k | n = (n << 6) | (c & ((1 << 6) - 1)); |
393 | 346k | } |
394 | 192k | return n; |
395 | 192k | } |
396 | 38.1M | else { |
397 | 38.1M | #ifdef USE_INVALID_CODE_SCHEME |
398 | 38.1M | if (c > 0xfd) { |
399 | 334k | return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); |
400 | 334k | } |
401 | 37.7M | #endif |
402 | 37.7M | return (OnigCodePoint )c; |
403 | 38.1M | } |
404 | 38.2M | } |
405 | | |
406 | | static int |
407 | | code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) |
408 | 35.9M | { |
409 | 35.9M | if ((code & 0xffffff80) == 0) return 1; |
410 | 33.2M | else if ((code & 0xfffff800) == 0) return 2; |
411 | 22.4M | else if ((code & 0xffff0000) == 0) return 3; |
412 | 5.64M | #ifndef USE_UTF8_31BITS |
413 | 5.64M | else if (code <= VALID_CODE_LIMIT) return 4; |
414 | | #else |
415 | | else if ((code & 0xffe00000) == 0) return 4; |
416 | | else if ((code & 0xfc000000) == 0) return 5; |
417 | | else if (code <= VALID_CODE_LIMIT) return 6; |
418 | | #endif |
419 | 1.06k | #ifdef USE_INVALID_CODE_SCHEME |
420 | 1.06k | else if (code == INVALID_CODE_FE) return 1; |
421 | 1.02k | else if (code == INVALID_CODE_FF) return 1; |
422 | 2 | #endif |
423 | 2 | else |
424 | 2 | return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
425 | 35.9M | } |
426 | | |
427 | | static int |
428 | | code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED) |
429 | 595k | { |
430 | 595k | #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80) |
431 | 595k | #define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80) |
432 | | |
433 | 595k | if ((code & 0xffffff80) == 0) { |
434 | 309k | *buf = (UChar )code; |
435 | 309k | return 1; |
436 | 309k | } |
437 | 286k | else { |
438 | 286k | UChar *p = buf; |
439 | | |
440 | 286k | if ((code & 0xfffff800) == 0) { |
441 | 133k | *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0); |
442 | 133k | } |
443 | 153k | else if ((code & 0xffff0000) == 0) { |
444 | 152k | *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0); |
445 | 152k | *p++ = UTF8_TRAILS(code, 6); |
446 | 152k | } |
447 | 382 | #ifndef USE_UTF8_31BITS |
448 | 382 | else if (code <= VALID_CODE_LIMIT) { |
449 | 316 | *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); |
450 | 316 | *p++ = UTF8_TRAILS(code, 12); |
451 | 316 | *p++ = UTF8_TRAILS(code, 6); |
452 | 316 | } |
453 | | #else |
454 | | else if ((code & 0xffe00000) == 0) { |
455 | | *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); |
456 | | *p++ = UTF8_TRAILS(code, 12); |
457 | | *p++ = UTF8_TRAILS(code, 6); |
458 | | } |
459 | | else if ((code & 0xfc000000) == 0) { |
460 | | *p++ = (UChar )(((code>>24) & 0x03) | 0xf8); |
461 | | *p++ = UTF8_TRAILS(code, 18); |
462 | | *p++ = UTF8_TRAILS(code, 12); |
463 | | *p++ = UTF8_TRAILS(code, 6); |
464 | | } |
465 | | else if (code <= VALID_CODE_LIMIT) { |
466 | | *p++ = (UChar )(((code>>30) & 0x01) | 0xfc); |
467 | | *p++ = UTF8_TRAILS(code, 24); |
468 | | *p++ = UTF8_TRAILS(code, 18); |
469 | | *p++ = UTF8_TRAILS(code, 12); |
470 | | *p++ = UTF8_TRAILS(code, 6); |
471 | | } |
472 | | #endif |
473 | | |
474 | 66 | #ifdef USE_INVALID_CODE_SCHEME |
475 | 66 | else if (code == INVALID_CODE_FE) { |
476 | 65 | *p = 0xfe; |
477 | 65 | return 1; |
478 | 65 | } |
479 | 1 | else if (code == INVALID_CODE_FF) { |
480 | 0 | *p = 0xff; |
481 | 0 | return 1; |
482 | 0 | } |
483 | 1 | #endif |
484 | 1 | else { |
485 | 1 | return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
486 | 1 | } |
487 | | |
488 | 286k | *p++ = UTF8_TRAIL0(code); |
489 | 286k | return (int )(p - buf); |
490 | 286k | } |
491 | 595k | } |
492 | | |
493 | | static int |
494 | | mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, |
495 | | const UChar* end, UChar* fold, OnigEncoding enc) |
496 | 300k | { |
497 | 300k | const UChar* p = *pp; |
498 | | |
499 | 300k | if (ONIGENC_IS_MBC_ASCII(p)) { |
500 | | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI |
501 | | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { |
502 | | if (*p == 0x49) { |
503 | | *fold++ = 0xc4; |
504 | | *fold = 0xb1; |
505 | | (*pp)++; |
506 | | return 2; |
507 | | } |
508 | | } |
509 | | #endif |
510 | | |
511 | 211k | *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); |
512 | 211k | (*pp)++; |
513 | 211k | return 1; /* return byte length of converted char to lower */ |
514 | 211k | } |
515 | 88.4k | else { |
516 | 88.4k | return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold); |
517 | 88.4k | } |
518 | 300k | } |
519 | | |
520 | | |
521 | | static int |
522 | | get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, |
523 | | const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) |
524 | 2.43M | { |
525 | 2.43M | *sb_out = 0x80; |
526 | 2.43M | return onigenc_unicode_ctype_code_range(ctype, ranges); |
527 | 2.43M | } |
528 | | |
529 | | |
530 | | static UChar* |
531 | | left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) |
532 | 120k | { |
533 | 120k | const UChar *p; |
534 | | |
535 | 120k | if (s <= start) return (UChar* )s; |
536 | 111k | p = s; |
537 | | |
538 | 143k | while (!utf8_islead(*p) && p > start) p--; |
539 | 111k | return (UChar* )p; |
540 | 120k | } |
541 | | |
542 | | static int |
543 | | get_case_fold_codes_by_str(OnigCaseFoldType flag, |
544 | | const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[], |
545 | | OnigEncoding enc) |
546 | 291k | { |
547 | 291k | return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items); |
548 | 291k | } |
549 | | |
/*
 * Encoding descriptor for UTF-8.  It wires the static helpers defined in
 * this file and the shared onigenc_unicode_* routines into the object
 * declared by OnigEncodingDefine.  The initializer is positional, so the
 * entries must stay in this order.
 * NOTE(review): the order presumably mirrors the encoding struct declared
 * in the project headers; confirm there before adding or moving entries.
 */
OnigEncodingDefine(utf_8, UTF_8) = {
  mbc_enc_len,
  "UTF-8", /* name */
#ifndef USE_UTF8_31BITS
  4, /* max byte length */
#else
  6, /* max byte length */
#endif
  1, /* min byte length */
  is_mbc_newline,
  mbc_to_code,
  code_to_mbclen,
  code_to_mbc,
  mbc_case_fold,
  onigenc_unicode_apply_all_case_fold,
  get_case_fold_codes_by_str,
  onigenc_unicode_property_name_to_ctype,
  onigenc_unicode_is_code_ctype,
  get_ctype_code_range,
  left_adjust_char_head,
  onigenc_always_true_is_allowed_reverse_match,
#ifdef USE_CASE_MAP_API
  onigenc_unicode_case_map,
#else
  /* no case-map routine when USE_CASE_MAP_API is not defined */
  NULL,
#endif
  ENCINDEX_UTF_8,
  ONIGENC_FLAG_UNICODE,
};
/* Additional name registered for the encoding above. */
ENC_ALIAS("CP65001", "UTF-8")

/*
 * Name: UTF8-MAC
 * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
 * Link: http://developer.apple.com/qa/qa2001/qa1235.html
 * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
 * Link: http://www.gnu.org/software/emacs/NEWS.23.2
 */
/* NOTE(review): ENC_REPLICATE / ENC_ALIAS are defined outside this file;
 * presumably they register UTF8-MAC as a copy of UTF-8 and then give that
 * copy two further names. */
ENC_REPLICATE("UTF8-MAC", "UTF-8")
ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")
ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */