/src/testdir/build/lua-master/source/lutf8lib.c
Line | Count | Source |
1 | | /* |
2 | | ** $Id: lutf8lib.c $ |
3 | | ** Standard library for UTF-8 manipulation |
4 | | ** See Copyright Notice in lua.h |
5 | | */ |
6 | | |
7 | | #define lutf8lib_c |
8 | | #define LUA_LIB |
9 | | |
10 | | #include "lprefix.h" |
11 | | |
12 | | |
13 | | #include <limits.h> |
14 | | #include <stdlib.h> |
15 | | #include <string.h> |
16 | | |
17 | | #include "lua.h" |
18 | | |
19 | | #include "lauxlib.h" |
20 | | #include "lualib.h" |
21 | | #include "llimits.h" |
22 | | |
23 | | |
/* largest code point accepted by the decoder in strict mode */
#define MAXUNICODE	0x10FFFFu

/* largest value that the decoder/encoder accepts at all */
#define MAXUTF		0x7FFFFFFFu


/* message used for every malformed-sequence error */
#define MSGInvalid	"invalid UTF-8 code"


/* true if byte 'c' has the form 10xxxxxx (a continuation byte) */
#define iscont(c)	(0x80 == ((c) & 0xC0))
/* same test, applied to the byte that 'p' points to */
#define iscontp(p)	iscont(*(p))
34 | | |
35 | | |
36 | | /* from strlib */ |
37 | | /* translate a relative string position: negative means back from end */ |
38 | 31.7k | static lua_Integer u_posrelat (lua_Integer pos, size_t len) { |
39 | 31.7k | if (pos >= 0) return pos; |
40 | 10.3k | else if (0u - (size_t)pos > len) return 0; |
41 | 10.0k | else return (lua_Integer)len + pos + 1; |
42 | 31.7k | } |
43 | | |
44 | | |
45 | | /* |
46 | | ** Decode one UTF-8 sequence, returning NULL if byte sequence is |
47 | | ** invalid. The array 'limits' stores the minimum value for each |
48 | | ** sequence length, to check for overlong representations. Its first |
49 | | ** entry forces an error for non-ASCII bytes with no continuation |
50 | | ** bytes (count == 0). |
51 | | */ |
52 | 93.8k | static const char *utf8_decode (const char *s, l_uint32 *val, int strict) { |
53 | 93.8k | static const l_uint32 limits[] = |
54 | 93.8k | {~(l_uint32)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u}; |
55 | 93.8k | unsigned int c = (unsigned char)s[0]; |
56 | 93.8k | l_uint32 res = 0; /* final result */ |
57 | 93.8k | if (c < 0x80) /* ASCII? */ |
58 | 83.8k | res = c; |
59 | 9.99k | else { |
60 | 9.99k | int count = 0; /* to count number of continuation bytes */ |
61 | 23.0k | for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ |
62 | 16.3k | unsigned int cc = (unsigned char)s[++count]; /* read next byte */ |
63 | 16.3k | if (!iscont(cc)) /* not a continuation byte? */ |
64 | 3.31k | return NULL; /* invalid byte sequence */ |
65 | 13.0k | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ |
66 | 13.0k | } |
67 | 6.68k | res |= ((l_uint32)(c & 0x7F) << (count * 5)); /* add first byte */ |
68 | 6.68k | if (count > 5 || res > MAXUTF || res < limits[count]) |
69 | 1.94k | return NULL; /* invalid byte sequence */ |
70 | 4.73k | s += count; /* skip continuation bytes read */ |
71 | 4.73k | } |
72 | 88.5k | if (strict) { |
73 | | /* check for invalid code points; too large or surrogates */ |
74 | 87.6k | if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu)) |
75 | 2.39k | return NULL; |
76 | 87.6k | } |
77 | 86.1k | if (val) *val = res; |
78 | 86.1k | return s + 1; /* +1 to include first byte */ |
79 | 88.5k | } |
80 | | |
81 | | |
82 | | /* |
83 | | ** utf8len(s [, i [, j [, lax]]]) --> number of characters that |
84 | | ** start in the range [i,j], or nil + current position if 's' is not |
85 | | ** well formed in that interval |
86 | | */ |
87 | 15.7k | static int utflen (lua_State *L) { |
88 | 15.7k | lua_Integer n = 0; /* counter for the number of characters */ |
89 | 15.7k | size_t len; /* string length in bytes */ |
90 | 15.7k | const char *s = luaL_checklstring(L, 1, &len); |
91 | 15.7k | lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); |
92 | 15.7k | lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); |
93 | 15.7k | int lax = lua_toboolean(L, 4); |
94 | 15.7k | luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, |
95 | 15.7k | "initial position out of bounds"); |
96 | 15.7k | luaL_argcheck(L, --posj < (lua_Integer)len, 3, |
97 | 15.7k | "final position out of bounds"); |
98 | 100k | while (posi <= posj) { |
99 | 92.3k | const char *s1 = utf8_decode(s + posi, NULL, !lax); |
100 | 92.3k | if (s1 == NULL) { /* conversion error? */ |
101 | 7.56k | luaL_pushfail(L); /* return fail ... */ |
102 | 7.56k | lua_pushinteger(L, posi + 1); /* ... and current position */ |
103 | 7.56k | return 2; |
104 | 7.56k | } |
105 | 84.7k | posi = ct_diff2S(s1 - s); |
106 | 84.7k | n++; |
107 | 84.7k | } |
108 | 8.23k | lua_pushinteger(L, n); |
109 | 8.23k | return 1; |
110 | 15.7k | } |
111 | | |
112 | | |
113 | | /* |
114 | | ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all |
115 | | ** characters that start in the range [i,j] |
116 | | */ |
117 | 91 | static int codepoint (lua_State *L) { |
118 | 91 | size_t len; |
119 | 91 | const char *s = luaL_checklstring(L, 1, &len); |
120 | 91 | lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); |
121 | 91 | lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); |
122 | 91 | int lax = lua_toboolean(L, 4); |
123 | 91 | int n; |
124 | 91 | const char *se; |
125 | 91 | luaL_argcheck(L, posi >= 1, 2, "out of bounds"); |
126 | 91 | luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds"); |
127 | 91 | if (posi > pose) return 0; /* empty interval; return no values */ |
128 | 91 | if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ |
129 | 0 | return luaL_error(L, "string slice too long"); |
130 | 91 | n = (int)(pose - posi) + 1; /* upper bound for number of returns */ |
131 | 91 | luaL_checkstack(L, n, "string slice too long"); |
132 | 91 | n = 0; /* count the number of returns */ |
133 | 91 | se = s + pose; /* string end */ |
134 | 179 | for (s += posi - 1; s < se;) { |
135 | 90 | l_uint32 code; |
136 | 90 | s = utf8_decode(s, &code, !lax); |
137 | 90 | if (s == NULL) |
138 | 2 | return luaL_error(L, MSGInvalid); |
139 | 88 | lua_pushinteger(L, l_castU2S(code)); |
140 | 88 | n++; |
141 | 88 | } |
142 | 89 | return n; |
143 | 91 | } |
144 | | |
145 | | |
146 | 1.70k | static void pushutfchar (lua_State *L, int arg) { |
147 | 1.70k | lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg); |
148 | 1.70k | luaL_argcheck(L, code <= MAXUTF, arg, "value out of range"); |
149 | 1.70k | lua_pushfstring(L, "%U", (long)code); |
150 | 1.70k | } |
151 | | |
152 | | |
153 | | /* |
154 | | ** utfchar(n1, n2, ...) -> char(n1)..char(n2)... |
155 | | */ |
156 | 4.00k | static int utfchar (lua_State *L) { |
157 | 4.00k | int n = lua_gettop(L); /* number of arguments */ |
158 | 4.00k | if (n == 1) /* optimize common case of single char */ |
159 | 1.06k | pushutfchar(L, 1); |
160 | 2.94k | else { |
161 | 2.94k | int i; |
162 | 2.94k | luaL_Buffer b; |
163 | 2.94k | luaL_buffinit(L, &b); |
164 | 3.58k | for (i = 1; i <= n; i++) { |
165 | 645 | pushutfchar(L, i); |
166 | 645 | luaL_addvalue(&b); |
167 | 645 | } |
168 | 2.94k | luaL_pushresult(&b); |
169 | 2.94k | } |
170 | 4.00k | return 1; |
171 | 4.00k | } |
172 | | |
173 | | |
174 | | /* |
175 | | ** offset(s, n, [i]) -> indices where n-th character counting from |
176 | | ** position 'i' starts and ends; 0 means character at 'i'. |
177 | | */ |
178 | 11 | static int byteoffset (lua_State *L) { |
179 | 11 | size_t len; |
180 | 11 | const char *s = luaL_checklstring(L, 1, &len); |
181 | 11 | lua_Integer n = luaL_checkinteger(L, 2); |
182 | 11 | lua_Integer posi = (n >= 0) ? 1 : cast_st2S(len) + 1; |
183 | 11 | posi = u_posrelat(luaL_optinteger(L, 3, posi), len); |
184 | 11 | luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, |
185 | 11 | "position out of bounds"); |
186 | 11 | if (n == 0) { |
187 | | /* find beginning of current byte sequence */ |
188 | 0 | while (posi > 0 && iscontp(s + posi)) posi--; |
189 | 0 | } |
190 | 11 | else { |
191 | 11 | if (iscontp(s + posi)) |
192 | 0 | return luaL_error(L, "initial position is a continuation byte"); |
193 | 11 | if (n < 0) { |
194 | 0 | while (n < 0 && posi > 0) { /* move back */ |
195 | 0 | do { /* find beginning of previous character */ |
196 | 0 | posi--; |
197 | 0 | } while (posi > 0 && iscontp(s + posi)); |
198 | 0 | n++; |
199 | 0 | } |
200 | 0 | } |
201 | 11 | else { |
202 | 11 | n--; /* do not move for 1st character */ |
203 | 11 | while (n > 0 && posi < (lua_Integer)len) { |
204 | 0 | do { /* find beginning of next character */ |
205 | 0 | posi++; |
206 | 0 | } while (iscontp(s + posi)); /* (cannot pass final '\0') */ |
207 | 0 | n--; |
208 | 0 | } |
209 | 11 | } |
210 | 11 | } |
211 | 11 | if (n != 0) { /* did not find given character? */ |
212 | 0 | luaL_pushfail(L); |
213 | 0 | return 1; |
214 | 0 | } |
215 | 11 | lua_pushinteger(L, posi + 1); /* initial position */ |
216 | 11 | if ((s[posi] & 0x80) != 0) { /* multi-byte character? */ |
217 | 0 | if (iscont(s[posi])) |
218 | 0 | return luaL_error(L, "initial position is a continuation byte"); |
219 | 0 | while (iscontp(s + posi + 1)) |
220 | 0 | posi++; /* skip to last continuation byte */ |
221 | 0 | } |
222 | | /* else one-byte character: final position is the initial one */ |
223 | 11 | lua_pushinteger(L, posi + 1); /* 'posi' now is the final position */ |
224 | 11 | return 2; |
225 | 11 | } |
226 | | |
227 | | |
228 | 3.37k | static int iter_aux (lua_State *L, int strict) { |
229 | 3.37k | size_t len; |
230 | 3.37k | const char *s = luaL_checklstring(L, 1, &len); |
231 | 3.37k | lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2); |
232 | 3.37k | if (n < len) { |
233 | 1.44k | while (iscontp(s + n)) n++; /* go to next character */ |
234 | 1.44k | } |
235 | 3.37k | if (n >= len) /* (also handles original 'n' being negative) */ |
236 | 1.96k | return 0; /* no more codepoints */ |
237 | 1.41k | else { |
238 | 1.41k | l_uint32 code; |
239 | 1.41k | const char *next = utf8_decode(s + n, &code, strict); |
240 | 1.41k | if (next == NULL || iscontp(next)) |
241 | 140 | return luaL_error(L, MSGInvalid); |
242 | 1.27k | lua_pushinteger(L, l_castU2S(n + 1)); |
243 | 1.27k | lua_pushinteger(L, l_castU2S(code)); |
244 | 1.27k | return 2; |
245 | 1.41k | } |
246 | 3.37k | } |
247 | | |
248 | | |
249 | 3.37k | static int iter_auxstrict (lua_State *L) { |
250 | 3.37k | return iter_aux(L, 1); |
251 | 3.37k | } |
252 | | |
253 | 0 | static int iter_auxlax (lua_State *L) { |
254 | 0 | return iter_aux(L, 0); |
255 | 0 | } |
256 | | |
257 | | |
258 | 4.55k | static int iter_codes (lua_State *L) { |
259 | 4.55k | int lax = lua_toboolean(L, 2); |
260 | 4.55k | const char *s = luaL_checkstring(L, 1); |
261 | 4.55k | luaL_argcheck(L, !iscontp(s), 1, MSGInvalid); |
262 | 4.55k | lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); |
263 | 4.55k | lua_pushvalue(L, 1); |
264 | 4.55k | lua_pushinteger(L, 0); |
265 | 4.55k | return 3; |
266 | 4.55k | } |
267 | | |
268 | | |
/*
** Pattern to match a single UTF-8 character: one byte in the ranges
** 0x00-0x7F or 0xC2-0xFD, followed by any number of continuation
** bytes (0x80-0xBF). The literal contains an embedded '\0', so its
** length must come from 'sizeof', not from 'strlen'.
*/
#define UTF8PATT	"[\0-\x7F\xC2-\xFD]" "[\x80-\xBF]*"
271 | | |
272 | | |
273 | | static const luaL_Reg funcs[] = { |
274 | | {"offset", byteoffset}, |
275 | | {"codepoint", codepoint}, |
276 | | {"char", utfchar}, |
277 | | {"len", utflen}, |
278 | | {"codes", iter_codes}, |
279 | | /* placeholders */ |
280 | | {"charpattern", NULL}, |
281 | | {NULL, NULL} |
282 | | }; |
283 | | |
284 | | |
285 | 27.1k | LUAMOD_API int luaopen_utf8 (lua_State *L) { |
286 | 27.1k | luaL_newlib(L, funcs); |
287 | 27.1k | lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1); |
288 | 27.1k | lua_setfield(L, -2, "charpattern"); |
289 | 27.1k | return 1; |
290 | 27.1k | } |
291 | | |