/src/mupdf/thirdparty/mujs/utf.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * The authors of this software are Rob Pike and Ken Thompson. |
3 | | * Copyright (c) 2002 by Lucent Technologies. |
4 | | * Permission to use, copy, modify, and distribute this software for any |
5 | | * purpose without fee is hereby granted, provided that this entire notice |
6 | | * is included in all copies of any software which is or includes a copy |
7 | | * or modification of this software and in all copies of the supporting |
8 | | * documentation for such software. |
9 | | * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED |
10 | | * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE |
11 | | * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY |
12 | | * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. |
13 | | */ |
14 | | #include <stdlib.h> |
15 | | #include <string.h> |
16 | | |
17 | | #include "utf.h" |
18 | | #include "utfdata.h" |
19 | | |
/*
 * Number of elements in the array `a`, as an int.
 * Only meaningful for true arrays; a pointer argument yields a bogus count.
 * The whole expansion is parenthesized so it behaves as a single operand
 * in any surrounding expression (e.g. `sizeof nelem(a)`).
 */
#define nelem(a) ((int)(sizeof(a) / sizeof((a)[0])))
21 | | |
22 | | typedef unsigned char uchar; |
23 | | |
24 | | enum |
25 | | { |
26 | | Bit1 = 7, |
27 | | Bitx = 6, |
28 | | Bit2 = 5, |
29 | | Bit3 = 4, |
30 | | Bit4 = 3, |
31 | | Bit5 = 2, |
32 | | |
33 | | T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ |
34 | | Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ |
35 | | T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ |
36 | | T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ |
37 | | T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ |
38 | | T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ |
39 | | |
40 | | Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ |
41 | | Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ |
42 | | Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ |
43 | | Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ |
44 | | |
45 | | Maskx = (1<<Bitx)-1, /* 0011 1111 */ |
46 | | Testx = Maskx ^ 0xFF, /* 1100 0000 */ |
47 | | |
48 | | Bad = Runeerror |
49 | | }; |
50 | | |
51 | | int |
52 | | chartorune(Rune *rune, const char *str) |
53 | 0 | { |
54 | 0 | int c, c1, c2, c3; |
55 | 0 | int l; |
56 | | |
57 | | /* overlong null character */ |
58 | 0 | if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) { |
59 | 0 | *rune = 0; |
60 | 0 | return 2; |
61 | 0 | } |
62 | | |
63 | | /* |
64 | | * one character sequence |
65 | | * 00000-0007F => T1 |
66 | | */ |
67 | 0 | c = *(uchar*)str; |
68 | 0 | if(c < Tx) { |
69 | 0 | *rune = c; |
70 | 0 | return 1; |
71 | 0 | } |
72 | | |
73 | | /* |
74 | | * two character sequence |
75 | | * 0080-07FF => T2 Tx |
76 | | */ |
77 | 0 | c1 = *(uchar*)(str+1) ^ Tx; |
78 | 0 | if(c1 & Testx) |
79 | 0 | goto bad; |
80 | 0 | if(c < T3) { |
81 | 0 | if(c < T2) |
82 | 0 | goto bad; |
83 | 0 | l = ((c << Bitx) | c1) & Rune2; |
84 | 0 | if(l <= Rune1) |
85 | 0 | goto bad; |
86 | 0 | *rune = l; |
87 | 0 | return 2; |
88 | 0 | } |
89 | | |
90 | | /* |
91 | | * three character sequence |
92 | | * 0800-FFFF => T3 Tx Tx |
93 | | */ |
94 | 0 | c2 = *(uchar*)(str+2) ^ Tx; |
95 | 0 | if(c2 & Testx) |
96 | 0 | goto bad; |
97 | 0 | if(c < T4) { |
98 | 0 | l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; |
99 | 0 | if(l <= Rune2) |
100 | 0 | goto bad; |
101 | 0 | *rune = l; |
102 | 0 | return 3; |
103 | 0 | } |
104 | | |
105 | | /* |
106 | | * four character sequence |
107 | | * 10000-10FFFF => T4 Tx Tx Tx |
108 | | */ |
109 | 0 | if(UTFmax >= 4) { |
110 | 0 | c3 = *(uchar*)(str+3) ^ Tx; |
111 | 0 | if(c3 & Testx) |
112 | 0 | goto bad; |
113 | 0 | if(c < T5) { |
114 | 0 | l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; |
115 | 0 | if(l <= Rune3) |
116 | 0 | goto bad; |
117 | 0 | if(l > Runemax) |
118 | 0 | goto bad; |
119 | 0 | *rune = l; |
120 | 0 | return 4; |
121 | 0 | } |
122 | 0 | } |
123 | | |
124 | | /* |
125 | | * bad decoding |
126 | | */ |
127 | 0 | bad: |
128 | 0 | *rune = Bad; |
129 | 0 | return 1; |
130 | 0 | } |
131 | | |
132 | | int |
133 | | runetochar(char *str, const Rune *rune) |
134 | 0 | { |
135 | 0 | int c = *rune; |
136 | | |
137 | | /* overlong null character */ |
138 | 0 | if (c == 0) { |
139 | 0 | str[0] = (char)0xc0; |
140 | 0 | str[1] = (char)0x80; |
141 | 0 | return 2; |
142 | 0 | } |
143 | | |
144 | | /* |
145 | | * one character sequence |
146 | | * 00000-0007F => 00-7F |
147 | | */ |
148 | 0 | if(c <= Rune1) { |
149 | 0 | str[0] = c; |
150 | 0 | return 1; |
151 | 0 | } |
152 | | |
153 | | /* |
154 | | * two character sequence |
155 | | * 00080-007FF => T2 Tx |
156 | | */ |
157 | 0 | if(c <= Rune2) { |
158 | 0 | str[0] = T2 | (c >> 1*Bitx); |
159 | 0 | str[1] = Tx | (c & Maskx); |
160 | 0 | return 2; |
161 | 0 | } |
162 | | |
163 | | /* |
164 | | * three character sequence |
165 | | * 00800-0FFFF => T3 Tx Tx |
166 | | */ |
167 | 0 | if(c > Runemax) |
168 | 0 | c = Runeerror; |
169 | 0 | if(c <= Rune3) { |
170 | 0 | str[0] = T3 | (c >> 2*Bitx); |
171 | 0 | str[1] = Tx | ((c >> 1*Bitx) & Maskx); |
172 | 0 | str[2] = Tx | (c & Maskx); |
173 | 0 | return 3; |
174 | 0 | } |
175 | | |
176 | | /* |
177 | | * four character sequence |
178 | | * 010000-1FFFFF => T4 Tx Tx Tx |
179 | | */ |
180 | 0 | str[0] = T4 | (c >> 3*Bitx); |
181 | 0 | str[1] = Tx | ((c >> 2*Bitx) & Maskx); |
182 | 0 | str[2] = Tx | ((c >> 1*Bitx) & Maskx); |
183 | 0 | str[3] = Tx | (c & Maskx); |
184 | 0 | return 4; |
185 | 0 | } |
186 | | |
187 | | int |
188 | | runelen(int c) |
189 | 0 | { |
190 | 0 | Rune rune; |
191 | 0 | char str[10]; |
192 | |
|
193 | 0 | rune = c; |
194 | 0 | return runetochar(str, &rune); |
195 | 0 | } |
196 | | |
197 | | static const Rune * |
198 | | ucd_bsearch(Rune c, const Rune *t, int n, int ne) |
199 | 0 | { |
200 | 0 | const Rune *p; |
201 | 0 | int m; |
202 | |
|
203 | 0 | while(n > 1) { |
204 | 0 | m = n/2; |
205 | 0 | p = t + m*ne; |
206 | 0 | if(c >= p[0]) { |
207 | 0 | t = p; |
208 | 0 | n = n-m; |
209 | 0 | } else |
210 | 0 | n = m; |
211 | 0 | } |
212 | 0 | if(n && c >= t[0]) |
213 | 0 | return t; |
214 | 0 | return 0; |
215 | 0 | } |
216 | | |
217 | | Rune |
218 | | tolowerrune(Rune c) |
219 | 0 | { |
220 | 0 | const Rune *p; |
221 | |
|
222 | 0 | p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3); |
223 | 0 | if(p && c >= p[0] && c <= p[1]) |
224 | 0 | return c + p[2]; |
225 | 0 | p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2); |
226 | 0 | if(p && c == p[0]) |
227 | 0 | return c + p[1]; |
228 | 0 | return c; |
229 | 0 | } |
230 | | |
231 | | Rune |
232 | | toupperrune(Rune c) |
233 | 0 | { |
234 | 0 | const Rune *p; |
235 | |
|
236 | 0 | p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3); |
237 | 0 | if(p && c >= p[0] && c <= p[1]) |
238 | 0 | return c + p[2]; |
239 | 0 | p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2); |
240 | 0 | if(p && c == p[0]) |
241 | 0 | return c + p[1]; |
242 | 0 | return c; |
243 | 0 | } |
244 | | |
245 | | int |
246 | | islowerrune(Rune c) |
247 | 0 | { |
248 | 0 | const Rune *p; |
249 | |
|
250 | 0 | p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3); |
251 | 0 | if(p && c >= p[0] && c <= p[1]) |
252 | 0 | return 1; |
253 | 0 | p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2); |
254 | 0 | if(p && c == p[0]) |
255 | 0 | return 1; |
256 | 0 | return 0; |
257 | 0 | } |
258 | | |
259 | | int |
260 | | isupperrune(Rune c) |
261 | 0 | { |
262 | 0 | const Rune *p; |
263 | |
|
264 | 0 | p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3); |
265 | 0 | if(p && c >= p[0] && c <= p[1]) |
266 | 0 | return 1; |
267 | 0 | p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2); |
268 | 0 | if(p && c == p[0]) |
269 | 0 | return 1; |
270 | 0 | return 0; |
271 | 0 | } |
272 | | |
273 | | int |
274 | | isalpharune(Rune c) |
275 | 0 | { |
276 | 0 | const Rune *p; |
277 | |
|
278 | 0 | p = ucd_bsearch(c, ucd_alpha2, nelem(ucd_alpha2)/2, 2); |
279 | 0 | if(p && c >= p[0] && c <= p[1]) |
280 | 0 | return 1; |
281 | 0 | p = ucd_bsearch(c, ucd_alpha1, nelem(ucd_alpha1), 1); |
282 | 0 | if(p && c == p[0]) |
283 | 0 | return 1; |
284 | 0 | return 0; |
285 | 0 | } |