/src/mupdf/source/fitz/ucdn.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
3 | | * |
4 | | * Permission to use, copy, modify, and/or distribute this software for any |
5 | | * purpose with or without fee is hereby granted, provided that the above |
6 | | * copyright notice and this permission notice appear in all copies. |
7 | | * |
8 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
9 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
10 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
11 | | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
12 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
13 | | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
14 | | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
15 | | */ |
16 | | |
17 | | #include "mupdf/fitz.h" |
18 | | #include "mupdf/ucdn.h" |
19 | | |
20 | | #include <stdio.h> |
21 | | #include <stdlib.h> |
22 | | |
/*
 * Per-code-point property record, as returned by get_ucd_record().
 * Keep the member order unchanged: ucd_records[] is defined outside this
 * file (NOTE(review): presumably with positional initializers in
 * ucdn_db.h -- confirm before reordering).
 */
typedef struct {
	unsigned char category;         /* read by ucdn_get_general_category() */
	unsigned char combining;        /* read by ucdn_get_combining_class() */
	unsigned char bidi_class;       /* read by ucdn_get_bidi_class() */
	unsigned char east_asian_width; /* read by ucdn_get_east_asian_width() */
	unsigned char script;           /* read by ucdn_get_script() */
	unsigned char linebreak_class;  /* read by ucdn_get_linebreak_class() */
} UCDRecord;
31 | | |
/*
 * One entry of the mirroring table searched by ucdn_mirror().
 * Both members are 16 bits wide, so only code points up to 0xFFFF fit.
 */
typedef struct {
	unsigned short from; /* search key */
	unsigned short to;   /* value returned for a matching key */
} MirrorPair;
35 | | |
/*
 * One entry of the paired-bracket table searched by search_bp().
 * The code point members are 16 bits wide, so only code points up to
 * 0xFFFF fit.
 */
typedef struct {
	unsigned short from; /* search key */
	unsigned short to;   /* returned by ucdn_paired_bracket() */
	unsigned char type;  /* returned by ucdn_paired_bracket_type() */
} BracketPair;
40 | | |
/*
 * Range descriptor used by get_comp_index(): code points in
 * [start, start + count] map to consecutive indices beginning at `index`.
 */
typedef struct {
	unsigned int start; /* first code point of the range */
	short count;        /* distance from start to the last code point */
	short index;        /* index assigned to `start` */
} Reindex;
45 | | |
46 | | #include "ucdn_db.h" |
47 | | |
/*
 * Constants required for arithmetic Hangul (de)composition, used by
 * hangul_pair_decompose() and hangul_pair_compose().
 */
#define SBASE 0xAC00	/* first precomposed syllable */
#define LBASE 0x1100	/* base of the leading (L) jamo */
#define VBASE 0x1161	/* base of the vowel (V) jamo */
#define TBASE 0x11A7	/* base of the trailing (T) jamo; index 0 means "no T" */
#define SCOUNT 11172	/* number of precomposed syllables */
#define LCOUNT 19	/* number of L jamo */
#define VCOUNT 21	/* number of V jamo */
#define TCOUNT 28	/* number of T slots per LV syllable, including "no T" */
#define NCOUNT (VCOUNT * TCOUNT)	/* syllables per L jamo */
58 | | |
59 | | static const UCDRecord *get_ucd_record(uint32_t code) |
60 | 66.8k | { |
61 | 66.8k | int index, offset; |
62 | | |
63 | 66.8k | if (code >= 0x110000) |
64 | 0 | index = 0; |
65 | 66.8k | else { |
66 | 66.8k | index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; |
67 | 66.8k | offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); |
68 | 66.8k | index = index1[index + offset] << SHIFT2; |
69 | 66.8k | offset = code & ((1<<SHIFT2) - 1); |
70 | 66.8k | index = index2[index + offset]; |
71 | 66.8k | } |
72 | | |
73 | 66.8k | return &ucd_records[index]; |
74 | 66.8k | } |
75 | | |
76 | | static const unsigned short *get_decomp_record(uint32_t code) |
77 | 0 | { |
78 | 0 | int index, offset; |
79 | |
|
80 | 0 | if (code >= 0x110000) |
81 | 0 | index = 0; |
82 | 0 | else { |
83 | 0 | index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] |
84 | 0 | << DECOMP_SHIFT1; |
85 | 0 | offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); |
86 | 0 | index = decomp_index1[index + offset] << DECOMP_SHIFT2; |
87 | 0 | offset = code & ((1<<DECOMP_SHIFT2) - 1); |
88 | 0 | index = decomp_index2[index + offset]; |
89 | 0 | } |
90 | |
|
91 | 0 | return &decomp_data[index]; |
92 | 0 | } |
93 | | |
94 | | static int compare_reindex(const void *a, const void *b) |
95 | 0 | { |
96 | 0 | Reindex *ra = (Reindex *)a; |
97 | 0 | Reindex *rb = (Reindex *)b; |
98 | |
|
99 | 0 | if (ra->start < rb->start) |
100 | 0 | return -1; |
101 | 0 | else if (ra->start > (rb->start + rb->count)) |
102 | 0 | return 1; |
103 | 0 | else |
104 | 0 | return 0; |
105 | 0 | } |
106 | | |
107 | | static int get_comp_index(uint32_t code, const Reindex *idx, size_t len) |
108 | 0 | { |
109 | 0 | Reindex *res; |
110 | 0 | Reindex r = {0, 0, 0}; |
111 | 0 | r.start = code; |
112 | 0 | res = (Reindex *) bsearch(&r, idx, len, sizeof(Reindex), compare_reindex); |
113 | |
|
114 | 0 | if (res != NULL) |
115 | 0 | return res->index + (code - res->start); |
116 | 0 | else |
117 | 0 | return -1; |
118 | 0 | } |
119 | | |
120 | | static int compare_mp(const void *a, const void *b) |
121 | 0 | { |
122 | 0 | MirrorPair *mpa = (MirrorPair *)a; |
123 | 0 | MirrorPair *mpb = (MirrorPair *)b; |
124 | 0 | return mpa->from - mpb->from; |
125 | 0 | } |
126 | | |
127 | | static int compare_bp(const void *a, const void *b) |
128 | 0 | { |
129 | 0 | BracketPair *bpa = (BracketPair *)a; |
130 | 0 | BracketPair *bpb = (BracketPair *)b; |
131 | 0 | return bpa->from - bpb->from; |
132 | 0 | } |
133 | | |
134 | | static BracketPair *search_bp(uint32_t code) |
135 | 0 | { |
136 | 0 | BracketPair bp = {0,0,2}; |
137 | 0 | BracketPair *res; |
138 | |
|
139 | 0 | bp.from = code; |
140 | 0 | res = (BracketPair *) bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN, |
141 | 0 | sizeof(BracketPair), compare_bp); |
142 | 0 | return res; |
143 | 0 | } |
144 | | |
145 | | static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) |
146 | 0 | { |
147 | 0 | int si = code - SBASE; |
148 | |
|
149 | 0 | if (si < 0 || si >= SCOUNT) |
150 | 0 | return 0; |
151 | | |
152 | 0 | if (si % TCOUNT) { |
153 | | /* LV,T */ |
154 | 0 | *a = SBASE + (si / TCOUNT) * TCOUNT; |
155 | 0 | *b = TBASE + (si % TCOUNT); |
156 | 0 | return 3; |
157 | 0 | } else { |
158 | | /* L,V */ |
159 | 0 | *a = LBASE + (si / NCOUNT); |
160 | 0 | *b = VBASE + (si % NCOUNT) / TCOUNT; |
161 | 0 | return 2; |
162 | 0 | } |
163 | 0 | } |
164 | | |
165 | | static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) |
166 | 0 | { |
167 | 0 | if (a >= SBASE && a < (SBASE + SCOUNT) && b >= TBASE && b < (TBASE + TCOUNT)) { |
168 | | /* LV,T */ |
169 | 0 | *code = a + (b - TBASE); |
170 | 0 | return 3; |
171 | 0 | } else if (a >= LBASE && a < (LBASE + LCOUNT) && b >= VBASE && b < (VBASE + VCOUNT)) { |
172 | | /* L,V */ |
173 | 0 | int li = a - LBASE; |
174 | 0 | int vi = b - VBASE; |
175 | 0 | *code = SBASE + li * NCOUNT + vi * TCOUNT; |
176 | 0 | return 2; |
177 | 0 | } else { |
178 | 0 | return 0; |
179 | 0 | } |
180 | 0 | } |
181 | | |
/*
 * Decode one code point from a UTF-16 sequence and advance the cursor.
 *
 * code_ptr: in/out cursor; advanced by one unit for a single-unit code
 *           point and by two units for a surrogate pair.
 *
 * Returns the decoded code point. A unit that is not a high surrogate
 * (including a lone low surrogate) is returned as-is. The second unit of a
 * pair is not validated.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
	const unsigned short *code = *code_ptr;

	/* High (lead) surrogates are 0xD800..0xDBFF; 0xDC00 is already a low surrogate. */
	if (code[0] < 0xd800 || code[0] > 0xdbff) {
		*code_ptr += 1;
		return (uint32_t)code[0];
	} else {
		*code_ptr += 2;
		return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
			(((uint32_t)code[0] - 0xd800) << 10);
	}
}
195 | | |
196 | | const char *ucdn_get_unicode_version(void) |
197 | 0 | { |
198 | 0 | return UNIDATA_VERSION; |
199 | 0 | } |
200 | | |
201 | | int ucdn_get_combining_class(uint32_t code) |
202 | 0 | { |
203 | 0 | return get_ucd_record(code)->combining; |
204 | 0 | } |
205 | | |
206 | | int ucdn_get_east_asian_width(uint32_t code) |
207 | 0 | { |
208 | 0 | return get_ucd_record(code)->east_asian_width; |
209 | 0 | } |
210 | | |
211 | | int ucdn_get_general_category(uint32_t code) |
212 | 0 | { |
213 | 0 | return get_ucd_record(code)->category; |
214 | 0 | } |
215 | | |
216 | | int ucdn_get_bidi_class(uint32_t code) |
217 | 0 | { |
218 | 0 | return get_ucd_record(code)->bidi_class; |
219 | 0 | } |
220 | | |
/*
 * Return 1 when ucdn_mirror() maps `code` to a different code point,
 * 0 otherwise.
 */
int ucdn_get_mirrored(uint32_t code)
{
	uint32_t mirrored = ucdn_mirror(code);

	return mirrored != code;
}
225 | | |
226 | | int ucdn_get_script(uint32_t code) |
227 | 66.8k | { |
228 | 66.8k | return get_ucd_record(code)->script; |
229 | 66.8k | } |
230 | | |
231 | | int ucdn_get_linebreak_class(uint32_t code) |
232 | 0 | { |
233 | 0 | return get_ucd_record(code)->linebreak_class; |
234 | 0 | } |
235 | | |
236 | | int ucdn_get_resolved_linebreak_class(uint32_t code) |
237 | 0 | { |
238 | 0 | const UCDRecord *record = get_ucd_record(code); |
239 | |
|
240 | 0 | switch (record->linebreak_class) |
241 | 0 | { |
242 | 0 | case UCDN_LINEBREAK_CLASS_AI: |
243 | 0 | case UCDN_LINEBREAK_CLASS_SG: |
244 | 0 | case UCDN_LINEBREAK_CLASS_XX: |
245 | 0 | return UCDN_LINEBREAK_CLASS_AL; |
246 | | |
247 | 0 | case UCDN_LINEBREAK_CLASS_SA: |
248 | 0 | if (record->category == UCDN_GENERAL_CATEGORY_MC || |
249 | 0 | record->category == UCDN_GENERAL_CATEGORY_MN) |
250 | 0 | return UCDN_LINEBREAK_CLASS_CM; |
251 | 0 | return UCDN_LINEBREAK_CLASS_AL; |
252 | | |
253 | 0 | case UCDN_LINEBREAK_CLASS_CJ: |
254 | 0 | return UCDN_LINEBREAK_CLASS_NS; |
255 | | |
256 | 0 | case UCDN_LINEBREAK_CLASS_CB: |
257 | 0 | return UCDN_LINEBREAK_CLASS_B2; |
258 | | |
259 | 0 | case UCDN_LINEBREAK_CLASS_NL: |
260 | 0 | return UCDN_LINEBREAK_CLASS_BK; |
261 | | |
262 | 0 | default: |
263 | 0 | return record->linebreak_class; |
264 | 0 | } |
265 | 0 | } |
266 | | |
267 | | uint32_t ucdn_mirror(uint32_t code) |
268 | 0 | { |
269 | 0 | MirrorPair mp = {0}; |
270 | 0 | MirrorPair *res; |
271 | |
|
272 | 0 | mp.from = code; |
273 | 0 | res = (MirrorPair *) bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, |
274 | 0 | sizeof(MirrorPair), compare_mp); |
275 | |
|
276 | 0 | if (res == NULL) |
277 | 0 | return code; |
278 | 0 | else |
279 | 0 | return res->to; |
280 | 0 | } |
281 | | |
282 | | uint32_t ucdn_paired_bracket(uint32_t code) |
283 | 0 | { |
284 | 0 | BracketPair *res = search_bp(code); |
285 | 0 | if (res == NULL) |
286 | 0 | return code; |
287 | 0 | else |
288 | 0 | return res->to; |
289 | 0 | } |
290 | | |
291 | | int ucdn_paired_bracket_type(uint32_t code) |
292 | 0 | { |
293 | 0 | BracketPair *res = search_bp(code); |
294 | 0 | if (res == NULL) |
295 | 0 | return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE; |
296 | 0 | else |
297 | 0 | return res->type; |
298 | 0 | } |
299 | | |
/*
 * Decompose `code` into a pair of code points.
 *
 * Hangul syllables are handled arithmetically; everything else is looked up
 * in the decomposition table.
 *
 * a, b: receive the result; `*b` is set to 0 when the table entry has only
 *       one code point. Left untouched when 0 is returned.
 *
 * Returns 1 on success, 0 when there is no entry or when the low byte of
 * the record header is non-zero (NOTE(review): presumably that tag marks
 * compatibility decompositions -- confirm against the table generator).
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
	const unsigned short *rec;
	int len, tag;

	if (hangul_pair_decompose(code, a, b))
		return 1;

	rec = get_decomp_record(code);
	len = rec[0] >> 8;
	tag = rec[0] & 0xff;

	if (tag != 0 || len == 0)
		return 0;

	/* skip the header unit; the payload is UTF-16 */
	rec++;
	*a = decode_utf16(&rec);
	*b = (len > 1) ? decode_utf16(&rec) : 0;

	return 1;
}
323 | | |
324 | | int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) |
325 | 0 | { |
326 | 0 | int l, r, index, indexi, offset; |
327 | |
|
328 | 0 | if (hangul_pair_compose(code, a, b)) |
329 | 0 | return 1; |
330 | | |
331 | 0 | l = get_comp_index(a, nfc_first, sizeof(nfc_first) / sizeof(Reindex)); |
332 | 0 | r = get_comp_index(b, nfc_last, sizeof(nfc_last) / sizeof(Reindex)); |
333 | |
|
334 | 0 | if (l < 0 || r < 0) |
335 | 0 | return 0; |
336 | | |
337 | 0 | indexi = l * TOTAL_LAST + r; |
338 | 0 | index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; |
339 | 0 | offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); |
340 | 0 | index = comp_index1[index + offset] << COMP_SHIFT2; |
341 | 0 | offset = indexi & ((1<<COMP_SHIFT2) - 1); |
342 | 0 | *code = comp_data[index + offset]; |
343 | |
|
344 | 0 | return *code != 0; |
345 | 0 | } |
346 | | |
/*
 * Write the full decomposition sequence stored for `code` to `decomposed`.
 *
 * Unlike ucdn_decompose(), the tag in the low byte of the record header is
 * ignored, and Hangul syllables get no arithmetic handling here.
 *
 * decomposed: output buffer. No bound is checked, so the caller must
 *             provide room for every code point of the entry
 *             (NOTE(review): the required capacity is not visible in this
 *             file -- check the public header).
 *
 * Returns the number of code points written, or 0 when there is no entry.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
	const unsigned short *rec = get_decomp_record(code);
	int len = rec[0] >> 8;
	uint32_t *out = decomposed;
	int remaining;

	if (len == 0)
		return 0;

	/* skip the header unit; the payload is UTF-16 */
	rec++;
	for (remaining = len; remaining > 0; remaining--)
		*out++ = decode_utf16(&rec);

	return len;
}