Coverage Report

Created: 2023-03-26 07:01

/src/fluent-bit/lib/onigmo/enc/utf_8.c
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
  utf_8.c -  Oniguruma (regular expression library)
3
**********************************************************************/
4
/*-
5
 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6
 * All rights reserved.
7
 *
8
 * Redistribution and use in source and binary forms, with or without
9
 * modification, are permitted provided that the following conditions
10
 * are met:
11
 * 1. Redistributions of source code must retain the above copyright
12
 *    notice, this list of conditions and the following disclaimer.
13
 * 2. Redistributions in binary form must reproduce the above copyright
14
 *    notice, this list of conditions and the following disclaimer in the
15
 *    documentation and/or other materials provided with the distribution.
16
 *
17
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27
 * SUCH DAMAGE.
28
 */
29
30
#include "regenc.h"
31
#ifdef RUBY
32
# include "encindex.h"
33
#endif
34
35
#ifndef ENCINDEX_UTF_8
36
# define ENCINDEX_UTF_8 0
37
#endif
38
39
/*
 * Build-time switches for this encoding.
 *
 *   USE_INVALID_CODE_SCHEME  map the bytes 0xfe and 0xff, which never occur
 *                            in UTF-8, to dedicated virtual code points.
 *   USE_UTF8_31BITS          (disabled) accept the historical 5- and 6-byte
 *                            forms and code points up to 0x7fffffff.
 */
#define USE_INVALID_CODE_SCHEME
/* #define USE_UTF8_31BITS */

#if defined(USE_INVALID_CODE_SCHEME)
/* virtual code point values standing in for the invalid bytes 0xfe and 0xff */
# define INVALID_CODE_FE  0xfffffffe
# define INVALID_CODE_FF  0xffffffff
#endif

/* largest code point that code_to_mbclen()/code_to_mbc() will encode */
#if defined(USE_UTF8_31BITS)
# define VALID_CODE_LIMIT  0x7fffffff
#else
# define VALID_CODE_LIMIT  0x0010ffff
#endif
53
54
286k
/* True unless the top two bits of byte c are 10, i.e. c is not a UTF-8 trailing byte. */
#define utf8_islead(c)     (((UChar )((c) & 0xc0)) != 0x80)
55
56
/*
 * Nominal sequence length in bytes, indexed by the first byte of a sequence.
 * Bytes that cannot start a multi-byte sequence are listed with length 1.
 * mbc_enc_len() uses this table to report how many bytes are still missing
 * when the input ends in the middle of a sequence.
 */
#define ENCLEN_ROW(n)  n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
static const int EncLen_UTF8[] = {
  /* 0x00 - 0x7f */
  ENCLEN_ROW(1), ENCLEN_ROW(1), ENCLEN_ROW(1), ENCLEN_ROW(1),
  ENCLEN_ROW(1), ENCLEN_ROW(1), ENCLEN_ROW(1), ENCLEN_ROW(1),
  /* 0x80 - 0xbf */
  ENCLEN_ROW(1), ENCLEN_ROW(1), ENCLEN_ROW(1), ENCLEN_ROW(1),
  /* 0xc0 - 0xdf */
  ENCLEN_ROW(2), ENCLEN_ROW(2),
  /* 0xe0 - 0xef */
  ENCLEN_ROW(3),
  /* 0xf0 - 0xff */
#ifndef USE_UTF8_31BITS
  4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
#else
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
#endif
};
#undef ENCLEN_ROW
78
79
/*
 * States of the byte-wise UTF-8 validation automaton.  Negative values are
 * terminal; S0..S11 double as row indexes into trans[].
 */
typedef enum {
  FAILURE = -2,
  ACCEPT  = -1,
  S0 = 0, S1, S2, S3,
  S4, S5, S6, S7,
  S8, S9, S10, S11,
} state_t;

#define A ACCEPT
#define F FAILURE
/* one, four and eight rows (16 bytes each) that all hold the same entry */
#define TRANS_ROW(v)    v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
#define TRANS_ROWS4(v)  TRANS_ROW(v), TRANS_ROW(v), TRANS_ROW(v), TRANS_ROW(v)
#define TRANS_ROWS8(v)  TRANS_ROWS4(v), TRANS_ROWS4(v)

/*
 * trans[state][byte] is the state reached after consuming `byte` in `state`:
 * ACCEPT (sequence complete), FAILURE (ill-formed) or the number of the next
 * state.
 */
static const signed char trans[][0x100] = {
  { /* S0: first byte of a sequence */
    /* 00-7f */ TRANS_ROWS8(A),
    /* 80-bf */ TRANS_ROWS4(F),
    /* c0-cf */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d0-df */ TRANS_ROW(1),
    /* e0-ef */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
#ifndef USE_UTF8_31BITS
    /* f0-ff */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
#else
    /* f0-ff */ 5, 6, 6, 6, 6, 6, 6, 6, 8, 9, 9, 9,10,11, F, F
#endif
  },
  { /* S1: last trailing byte, 80-bf completes the sequence */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(A),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S2: second byte after e0, only a0-bf allowed */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-9f */ TRANS_ROW(F), TRANS_ROW(F),
    /* a0-bf */ TRANS_ROW(1), TRANS_ROW(1),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S3: any trailing byte 80-bf, one more trailing byte follows */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(1),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S4: second byte after ed, only 80-9f allowed */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-9f */ TRANS_ROW(1), TRANS_ROW(1),
    /* a0-bf */ TRANS_ROW(F), TRANS_ROW(F),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S5: second byte after f0, only 90-bf allowed */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-8f */ TRANS_ROW(F),
    /* 90-bf */ TRANS_ROW(3), TRANS_ROW(3), TRANS_ROW(3),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S6: any trailing byte 80-bf, two more trailing bytes follow */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(3),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S7: second byte after f4, only 80-8f allowed */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-8f */ TRANS_ROW(3),
    /* 90-bf */ TRANS_ROW(F), TRANS_ROW(F), TRANS_ROW(F),
    /* c0-ff */ TRANS_ROWS4(F)
  },
#ifdef USE_UTF8_31BITS
  { /* S8: second byte after f8, only 88-bf allowed */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-8f */ F, F, F, F, F, F, F, F, 6, 6, 6, 6, 6, 6, 6, 6,
    /* 90-bf */ TRANS_ROW(6), TRANS_ROW(6), TRANS_ROW(6),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S9: any trailing byte 80-bf, three more trailing bytes follow */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(6),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S10: second byte after fc, only 84-bf allowed */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-8f */ F, F, F, F, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* 90-bf */ TRANS_ROW(9), TRANS_ROW(9), TRANS_ROW(9),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S11: second byte after fd, any trailing byte 80-bf */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(9),
    /* c0-ff */ TRANS_ROWS4(F)
  },
#endif /* USE_UTF8_31BITS */
};
#undef TRANS_ROWS8
#undef TRANS_ROWS4
#undef TRANS_ROW
#undef A
#undef F
314
315
static int
316
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
317
243M
{
318
243M
  int firstbyte = *p++;
319
243M
  state_t s;
320
243M
  s = trans[0][firstbyte];
321
243M
  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
322
227M
                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
323
324
15.7M
  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
325
15.2M
  s = trans[s][*p++];
326
15.2M
  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
327
5.33M
                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
328
329
9.88M
  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
330
9.86M
  s = trans[s][*p++];
331
9.86M
  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
332
437k
                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
333
334
9.43M
  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
335
9.42M
  s = trans[s][*p++];
336
337
9.42M
#ifndef USE_UTF8_31BITS
338
9.42M
  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
339
9.42M
                       ONIGENC_CONSTRUCT_MBCLEN_INVALID();
340
#else
341
  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
342
                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
343
344
  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-4);
345
  s = trans[s][*p++];
346
  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(5) :
347
                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
348
349
  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-5);
350
  s = trans[s][*p++];
351
  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(6) :
352
                       ONIGENC_CONSTRUCT_MBCLEN_INVALID();
353
#endif
354
9.43M
}
355
356
static int
357
is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
358
52.0M
{
359
52.0M
  if (p < end) {
360
52.0M
    if (*p == 0x0a) return 1;
361
362
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
363
    if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1;
364
    if (p + 1 < end) {
365
      if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
366
  return 1;
367
      if (p + 2 < end) {
368
  if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
369
      && *(p+1) == 0x80 && *p == 0xe2)  /* U+2028, U+2029 */
370
    return 1;
371
      }
372
    }
373
#endif
374
52.0M
  }
375
376
51.8M
  return 0;
377
52.0M
}
378
379
static OnigCodePoint
380
mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
381
38.2M
{
382
38.2M
  int c, len;
383
38.2M
  OnigCodePoint n;
384
385
38.2M
  len = mbc_enc_len(p, end, enc);
386
38.2M
  c = *p++;
387
38.2M
  if (len > 1) {
388
192k
    len--;
389
192k
    n = c & ((1 << (6 - len)) - 1);
390
539k
    while (len--) {
391
346k
      c = *p++;
392
346k
      n = (n << 6) | (c & ((1 << 6) - 1));
393
346k
    }
394
192k
    return n;
395
192k
  }
396
38.1M
  else {
397
38.1M
#ifdef USE_INVALID_CODE_SCHEME
398
38.1M
    if (c > 0xfd) {
399
334k
      return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
400
334k
    }
401
37.7M
#endif
402
37.7M
    return (OnigCodePoint )c;
403
38.1M
  }
404
38.2M
}
405
406
static int
407
code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
408
35.9M
{
409
35.9M
  if      ((code & 0xffffff80) == 0) return 1;
410
33.2M
  else if ((code & 0xfffff800) == 0) return 2;
411
22.4M
  else if ((code & 0xffff0000) == 0) return 3;
412
5.64M
#ifndef USE_UTF8_31BITS
413
5.64M
  else if (code <= VALID_CODE_LIMIT) return 4;
414
#else
415
  else if ((code & 0xffe00000) == 0) return 4;
416
  else if ((code & 0xfc000000) == 0) return 5;
417
  else if (code <= VALID_CODE_LIMIT) return 6;
418
#endif
419
1.06k
#ifdef USE_INVALID_CODE_SCHEME
420
1.06k
  else if (code == INVALID_CODE_FE) return 1;
421
1.02k
  else if (code == INVALID_CODE_FF) return 1;
422
2
#endif
423
2
  else
424
2
    return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
425
35.9M
}
426
427
static int
428
code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
429
595k
{
430
595k
#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
431
595k
#define UTF8_TRAIL0(code)        (UChar )(((code) & 0x3f) | 0x80)
432
433
595k
  if ((code & 0xffffff80) == 0) {
434
309k
    *buf = (UChar )code;
435
309k
    return 1;
436
309k
  }
437
286k
  else {
438
286k
    UChar *p = buf;
439
440
286k
    if ((code & 0xfffff800) == 0) {
441
133k
      *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
442
133k
    }
443
153k
    else if ((code & 0xffff0000) == 0) {
444
152k
      *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
445
152k
      *p++ = UTF8_TRAILS(code, 6);
446
152k
    }
447
382
#ifndef USE_UTF8_31BITS
448
382
    else if (code <= VALID_CODE_LIMIT) {
449
316
      *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
450
316
      *p++ = UTF8_TRAILS(code, 12);
451
316
      *p++ = UTF8_TRAILS(code,  6);
452
316
    }
453
#else
454
    else if ((code & 0xffe00000) == 0) {
455
        *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
456
        *p++ = UTF8_TRAILS(code, 12);
457
        *p++ = UTF8_TRAILS(code,  6);
458
    }
459
    else if ((code & 0xfc000000) == 0) {
460
        *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
461
        *p++ = UTF8_TRAILS(code, 18);
462
        *p++ = UTF8_TRAILS(code, 12);
463
        *p++ = UTF8_TRAILS(code,  6);
464
    }
465
    else if (code <= VALID_CODE_LIMIT) {
466
        *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
467
        *p++ = UTF8_TRAILS(code, 24);
468
        *p++ = UTF8_TRAILS(code, 18);
469
        *p++ = UTF8_TRAILS(code, 12);
470
        *p++ = UTF8_TRAILS(code,  6);
471
    }
472
#endif
473
474
66
#ifdef USE_INVALID_CODE_SCHEME
475
66
    else if (code == INVALID_CODE_FE) {
476
65
      *p = 0xfe;
477
65
      return 1;
478
65
    }
479
1
    else if (code == INVALID_CODE_FF) {
480
0
      *p = 0xff;
481
0
      return 1;
482
0
    }
483
1
#endif
484
1
    else {
485
1
      return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
486
1
    }
487
488
286k
    *p++ = UTF8_TRAIL0(code);
489
286k
    return (int )(p - buf);
490
286k
  }
491
595k
}
492
493
static int
494
mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
495
        const UChar* end, UChar* fold, OnigEncoding enc)
496
300k
{
497
300k
  const UChar* p = *pp;
498
499
300k
  if (ONIGENC_IS_MBC_ASCII(p)) {
500
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
501
    if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
502
      if (*p == 0x49) {
503
  *fold++ = 0xc4;
504
  *fold   = 0xb1;
505
  (*pp)++;
506
  return 2;
507
      }
508
    }
509
#endif
510
511
211k
    *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
512
211k
    (*pp)++;
513
211k
    return 1; /* return byte length of converted char to lower */
514
211k
  }
515
88.4k
  else {
516
88.4k
    return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold);
517
88.4k
  }
518
300k
}
519
520
521
static int
522
get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
523
         const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
524
2.43M
{
525
2.43M
  *sb_out = 0x80;
526
2.43M
  return onigenc_unicode_ctype_code_range(ctype, ranges);
527
2.43M
}
528
529
530
static UChar*
531
left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
532
120k
{
533
120k
  const UChar *p;
534
535
120k
  if (s <= start) return (UChar* )s;
536
111k
  p = s;
537
538
143k
  while (!utf8_islead(*p) && p > start) p--;
539
111k
  return (UChar* )p;
540
120k
}
541
542
static int
543
get_case_fold_codes_by_str(OnigCaseFoldType flag,
544
    const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[],
545
    OnigEncoding enc)
546
291k
{
547
291k
  return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items);
548
291k
}
549
550
OnigEncodingDefine(utf_8, UTF_8) = {
551
  mbc_enc_len,
552
  "UTF-8",     /* name */
553
#ifndef USE_UTF8_31BITS
554
  4,           /* max byte length */
555
#else
556
  6,           /* max byte length */
557
#endif
558
  1,           /* min byte length */
559
  is_mbc_newline,
560
  mbc_to_code,
561
  code_to_mbclen,
562
  code_to_mbc,
563
  mbc_case_fold,
564
  onigenc_unicode_apply_all_case_fold,
565
  get_case_fold_codes_by_str,
566
  onigenc_unicode_property_name_to_ctype,
567
  onigenc_unicode_is_code_ctype,
568
  get_ctype_code_range,
569
  left_adjust_char_head,
570
  onigenc_always_true_is_allowed_reverse_match,
571
#ifdef USE_CASE_MAP_API
572
  onigenc_unicode_case_map,
573
#else
574
  NULL,
575
#endif
576
  ENCINDEX_UTF_8,
577
  ONIGENC_FLAG_UNICODE,
578
};
579
/*
 * Additional names for the encoding defined above.
 * NOTE(review): ENC_ALIAS and ENC_REPLICATE are macros defined outside this
 * file; going by their names, the first argument is the new name and the
 * second the existing encoding it refers to - confirm against their
 * definitions.
 */
ENC_ALIAS("CP65001", "UTF-8")
580
581
/*
582
 * Name: UTF8-MAC
583
 * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
584
 * Link: http://developer.apple.com/qa/qa2001/qa1235.html
585
 * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
586
 * Link: http://www.gnu.org/software/emacs/NEWS.23.2
587
 */
588
/* UTF8-MAC is declared on top of UTF-8; the two aliases below refer to UTF8-MAC. */
ENC_REPLICATE("UTF8-MAC", "UTF-8")
589
ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")
590
ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */