/src/mupdf/source/fitz/bidi.c
Line | Count | Source |
1 | | /* |
2 | | * Bidirectional text processing. |
3 | | * |
4 | | * Processes unicode text by arranging the characters into an order suitable |
5 | | * for display. E.g. Hebrew text will be arranged from right-to-left and |
6 | | * any English within the text will remain in the left-to-right order. |
7 | | * Characters such as parentheses will be substituted for their mirrored |
8 | | * equivalents if they are part of text which must be reversed. |
9 | | * |
10 | | * This is an implementation of the unicode Bidirectional Algorithm which |
11 | | * can be found here: http://www.unicode.org/reports/tr9/ and is based |
12 | | * on the reference implementation of the algorithm found on that page. |
13 | | * |
14 | | * For a nice overview of how it works, read this... |
15 | | * http://www.w3.org/TR/REC-html40/struct/dirlang.html |
16 | | * |
17 | | * Extracted from the SmartOffice code, where it was modified by Ian |
18 | | * Beveridge. |
19 | | * |
20 | | * Copyright (C) Picsel, 2004. All Rights Reserved. |
21 | | */ |
22 | | |
23 | | /* |
24 | | * Original copyright notice from unicode reference implementation. |
25 | | * ---------------------------------------------------------------- |
26 | | * Written by: Asmus Freytag |
27 | | * C++ and Windows dependencies removed, and |
28 | | * command line interface added by: Rick McGowan |
29 | | * |
30 | | * Copyright (C) 1999, ASMUS, Inc. All Rights Reserved |
31 | | */ |
32 | | |
33 | | /* |
34 | | * Includes... |
35 | | */ |
36 | | |
37 | | #include "mupdf/fitz.h" |
38 | | #include "mupdf/ucdn.h" |
39 | | #include "bidi-imp.h" /* standard bidi code interface */ |
40 | | #include <assert.h> |
41 | | |
42 | | /* |
43 | | * Macros... |
44 | | */ |
45 | | |
/* Evaluates to 1 when the lowest bit of x is set, 0 otherwise. */
#define ODD(x) ((x) & 1)

/* True when t is one of the classes ES, ET, CS, NSM, PDF, BN, S, WS or N.
 * Note that t is evaluated more than once. */
#define REPLACEABLE_TYPE(t) ( \
		((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
		((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
		((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )

/* Verbose tracing: forwards the parenthesised argument list to fz_warn
 * when DEBUG_BIDI_VERBOSE is defined, and expands to nothing otherwise. */
#ifdef DEBUG_BIDI_VERBOSE
#define DBUGVF(params) do { fz_warn params; } while (0)
#else
#define DBUGVF(params) do {} while (0)
#endif

/* Outline tracing: as DBUGVF, but controlled by DEBUG_BIDI_OUTLINE. */
#ifdef DEBUG_BIDI_OUTLINE
#define DBUGH(params) do { fz_warn params; } while (0)
#else
#define DBUGH(params) do {} while (0)
#endif

/* Named Unicode code points. */
enum
{
	UNICODE_EOS = 0,
	UNICODE_DIGIT_ZERO = 0x0030,
	UNICODE_DIGIT_NINE = 0x0039,
	UNICODE_SUPERSCRIPT_TWO = 0x00B2,
	UNICODE_SUPERSCRIPT_THREE = 0x00B3,
	UNICODE_SUPERSCRIPT_ONE = 0x00B9,
	UNICODE_RTL_START = 0x0590,
	UNICODE_RTL_END = 0x07BF,
	UNICODE_ARABIC_INDIC_DIGIT_ZERO = 0x0660,
	UNICODE_ARABIC_INDIC_DIGIT_NINE = 0x0669,
	UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO = 0x06F0,
	UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE = 0x06F9,
	UNICODE_ZERO_WIDTH_NON_JOINER = 0x200C,
	UNICODE_SUPERSCRIPT_ZERO = 0x2070,
	UNICODE_SUPERSCRIPT_FOUR = 0x2074,
	UNICODE_SUPERSCRIPT_NINE = 0x2079,
	UNICODE_SUBSCRIPT_ZERO = 0x2080,
	UNICODE_SUBSCRIPT_NINE = 0x2089,
	UNICODE_CIRCLED_DIGIT_ONE = 0x2460,
	UNICODE_NUMBER_TWENTY_FULL_STOP = 0x249B,
	UNICODE_CIRCLED_DIGIT_ZERO = 0x24EA,
	UNICODE_FULLWIDTH_DIGIT_ZERO = 0xFF10,
	UNICODE_FULLWIDTH_DIGIT_NINE = 0xFF19
};

/* Boolean constants, defined only if nothing earlier provided them. */
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif
95 | | |
96 | | /* |
97 | | * Enumerations... |
98 | | */ |
99 | | |
#ifdef DEBUG_BIDI_VERBOSE
/* Debug display support: one printable character per bidi character
 * type. classify_characters() indexes this table with the type value
 * when dumping the "Types:" trace line. */
static const char char_from_types[] =
{
	' ',	/* ON */
	'>',	/* L */
	'<',	/* R */
	'9',	/* AN */
	'1',	/* EN */
	'a',	/* AL */
	'@',	/* NSM */
	'.',	/* CS */
	',',	/* ES */
	'$',	/* ET */
	':',	/* BN */
	'X',	/* S */
	'_',	/* WS */
	'B',	/* B */
	'+',	/* RLO */
	'+',	/* RLE */
	'+',	/* LRO */
	'+',	/* LRE */
	'-',	/* PDF */
	'='	/* LS */
};
#endif
126 | | |
127 | | /* |
128 | | * Functions and static functions... |
129 | | */ |
130 | | |
131 | | /* UCDN uses a different ordering than Bidi does. We cannot |
132 | | * change to the UCDN ordering, as the bidi-std.c code relies |
133 | | * on the exact ordering (at least that N = ON = 0). We |
134 | | * therefore map between the two using this small table. It |
135 | | * also takes care of fudging LRI, RLI, FSI and PDI, that this |
136 | | * code does not currently support. */ |
137 | | static const uint8_t ucdn_to_bidi[] = |
138 | | { |
139 | | BDI_L, /* UCDN_BIDI_CLASS_L = 0 */ |
140 | | BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */ |
141 | | BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */ |
142 | | BDI_R, /* UCDN_BIDI_CLASS_R = 3 */ |
143 | | BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */ |
144 | | BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */ |
145 | | BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */ |
146 | | BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */ |
147 | | BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */ |
148 | | BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */ |
149 | | BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */ |
150 | | BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */ |
151 | | BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */ |
152 | | BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */ |
153 | | BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */ |
154 | | BDI_B, /* UCDN_BIDI_CLASS_B = 15 */ |
155 | | BDI_S, /* UCDN_BIDI_CLASS_S = 16 */ |
156 | | BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */ |
157 | | BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */ |
158 | | BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */ |
159 | | BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */ |
160 | | BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */ |
161 | | BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */ |
162 | | }; |
163 | | |
164 | 0 | #define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)]) |
165 | | |
166 | | /* Return a direction for white-space on the second pass of the algorithm. */ |
167 | | static fz_bidi_chartype class_from_ch_n(uint32_t ch) |
168 | 0 | { |
169 | 0 | fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch); |
170 | 0 | if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS) |
171 | 0 | return BDI_N; |
172 | 0 | return from_ch_ws; |
173 | 0 | } |
174 | | |
175 | | static const unsigned char ucdn_script_from_block_table[256] = { |
176 | | UCDN_SCRIPT_LATIN, /* U+0000 */ |
177 | | UCDN_SCRIPT_LATIN, /* U+0100 */ |
178 | | UCDN_SCRIPT_LATIN, /* U+0200 */ |
179 | | UCDN_SCRIPT_GREEK, /* U+0300 */ |
180 | | UCDN_SCRIPT_CYRILLIC, /* U+0400 */ |
181 | | UCDN_SCRIPT_ARMENIAN, /* U+0500 */ |
182 | | UCDN_SCRIPT_ARABIC, /* U+0600 */ |
183 | | UCDN_SCRIPT_SYRIAC, /* U+0700 */ |
184 | | UCDN_SCRIPT_ARABIC, /* U+0800 */ |
185 | | UCDN_SCRIPT_DEVANAGARI, /* U+0900 */ |
186 | | UCDN_SCRIPT_GUJARATI, /* U+0A00 */ |
187 | | UCDN_SCRIPT_ORIYA, /* U+0B00 */ |
188 | | UCDN_SCRIPT_TELUGU, /* U+0C00 */ |
189 | | UCDN_SCRIPT_MALAYALAM, /* U+0D00 */ |
190 | | UCDN_SCRIPT_THAI, /* U+0E00 */ |
191 | | UCDN_SCRIPT_TIBETAN, /* U+0F00 */ |
192 | | UCDN_SCRIPT_MYANMAR, /* U+1000 */ |
193 | | UCDN_SCRIPT_HANGUL, /* U+1100 */ |
194 | | UCDN_SCRIPT_ETHIOPIC, /* U+1200 */ |
195 | | UCDN_SCRIPT_ETHIOPIC, /* U+1300 */ |
196 | | UCDN_SCRIPT_CANADIAN_ABORIGINAL, /* U+1400 */ |
197 | | UCDN_SCRIPT_CANADIAN_ABORIGINAL, /* U+1500 */ |
198 | | UCDN_SCRIPT_CANADIAN_ABORIGINAL, /* U+1600 */ |
199 | | UCDN_SCRIPT_KHMER, /* U+1700 */ |
200 | | UCDN_SCRIPT_MONGOLIAN, /* U+1800 */ |
201 | | UCDN_SCRIPT_NEW_TAI_LUE, /* U+1900 */ |
202 | | UCDN_SCRIPT_TAI_THAM, /* U+1A00 */ |
203 | | UCDN_SCRIPT_BALINESE, /* U+1B00 */ |
204 | | UCDN_SCRIPT_LEPCHA, /* U+1C00 */ |
205 | | UCDN_SCRIPT_LATIN, /* U+1D00 */ |
206 | | UCDN_SCRIPT_LATIN, /* U+1E00 */ |
207 | | UCDN_SCRIPT_GREEK, /* U+1F00 */ |
208 | | UCDN_SCRIPT_COMMON, /* U+2000 */ |
209 | | UCDN_SCRIPT_LATIN, /* U+2100 */ |
210 | | UCDN_SCRIPT_COMMON, /* U+2200 */ |
211 | | UCDN_SCRIPT_COMMON, /* U+2300 */ |
212 | | UCDN_SCRIPT_COMMON, /* U+2400 */ |
213 | | UCDN_SCRIPT_COMMON, /* U+2500 */ |
214 | | UCDN_SCRIPT_COMMON, /* U+2600 */ |
215 | | UCDN_SCRIPT_COMMON, /* U+2700 */ |
216 | | UCDN_SCRIPT_BRAILLE, /* U+2800 */ |
217 | | UCDN_SCRIPT_COMMON, /* U+2900 */ |
218 | | UCDN_SCRIPT_COMMON, /* U+2A00 */ |
219 | | UCDN_SCRIPT_COMMON, /* U+2B00 */ |
220 | | UCDN_SCRIPT_COPTIC, /* U+2C00 */ |
221 | | UCDN_SCRIPT_ETHIOPIC, /* U+2D00 */ |
222 | | UCDN_SCRIPT_HAN, /* U+2E00 */ |
223 | | UCDN_SCRIPT_HAN, /* U+2F00 */ |
224 | | UCDN_SCRIPT_KATAKANA, /* U+3000 */ |
225 | | UCDN_SCRIPT_HANGUL, /* U+3100 */ |
226 | | UCDN_SCRIPT_HANGUL, /* U+3200 */ |
227 | | UCDN_SCRIPT_KATAKANA, /* U+3300 */ |
228 | | UCDN_SCRIPT_HAN, /* U+3400 */ |
229 | | UCDN_SCRIPT_HAN, /* U+3500 */ |
230 | | UCDN_SCRIPT_HAN, /* U+3600 */ |
231 | | UCDN_SCRIPT_HAN, /* U+3700 */ |
232 | | UCDN_SCRIPT_HAN, /* U+3800 */ |
233 | | UCDN_SCRIPT_HAN, /* U+3900 */ |
234 | | UCDN_SCRIPT_HAN, /* U+3A00 */ |
235 | | UCDN_SCRIPT_HAN, /* U+3B00 */ |
236 | | UCDN_SCRIPT_HAN, /* U+3C00 */ |
237 | | UCDN_SCRIPT_HAN, /* U+3D00 */ |
238 | | UCDN_SCRIPT_HAN, /* U+3E00 */ |
239 | | UCDN_SCRIPT_HAN, /* U+3F00 */ |
240 | | UCDN_SCRIPT_HAN, /* U+4000 */ |
241 | | UCDN_SCRIPT_HAN, /* U+4100 */ |
242 | | UCDN_SCRIPT_HAN, /* U+4200 */ |
243 | | UCDN_SCRIPT_HAN, /* U+4300 */ |
244 | | UCDN_SCRIPT_HAN, /* U+4400 */ |
245 | | UCDN_SCRIPT_HAN, /* U+4500 */ |
246 | | UCDN_SCRIPT_HAN, /* U+4600 */ |
247 | | UCDN_SCRIPT_HAN, /* U+4700 */ |
248 | | UCDN_SCRIPT_HAN, /* U+4800 */ |
249 | | UCDN_SCRIPT_HAN, /* U+4900 */ |
250 | | UCDN_SCRIPT_HAN, /* U+4A00 */ |
251 | | UCDN_SCRIPT_HAN, /* U+4B00 */ |
252 | | UCDN_SCRIPT_HAN, /* U+4C00 */ |
253 | | UCDN_SCRIPT_HAN, /* U+4D00 */ |
254 | | UCDN_SCRIPT_HAN, /* U+4E00 */ |
255 | | UCDN_SCRIPT_HAN, /* U+4F00 */ |
256 | | UCDN_SCRIPT_HAN, /* U+5000 */ |
257 | | UCDN_SCRIPT_HAN, /* U+5100 */ |
258 | | UCDN_SCRIPT_HAN, /* U+5200 */ |
259 | | UCDN_SCRIPT_HAN, /* U+5300 */ |
260 | | UCDN_SCRIPT_HAN, /* U+5400 */ |
261 | | UCDN_SCRIPT_HAN, /* U+5500 */ |
262 | | UCDN_SCRIPT_HAN, /* U+5600 */ |
263 | | UCDN_SCRIPT_HAN, /* U+5700 */ |
264 | | UCDN_SCRIPT_HAN, /* U+5800 */ |
265 | | UCDN_SCRIPT_HAN, /* U+5900 */ |
266 | | UCDN_SCRIPT_HAN, /* U+5A00 */ |
267 | | UCDN_SCRIPT_HAN, /* U+5B00 */ |
268 | | UCDN_SCRIPT_HAN, /* U+5C00 */ |
269 | | UCDN_SCRIPT_HAN, /* U+5D00 */ |
270 | | UCDN_SCRIPT_HAN, /* U+5E00 */ |
271 | | UCDN_SCRIPT_HAN, /* U+5F00 */ |
272 | | UCDN_SCRIPT_HAN, /* U+6000 */ |
273 | | UCDN_SCRIPT_HAN, /* U+6100 */ |
274 | | UCDN_SCRIPT_HAN, /* U+6200 */ |
275 | | UCDN_SCRIPT_HAN, /* U+6300 */ |
276 | | UCDN_SCRIPT_HAN, /* U+6400 */ |
277 | | UCDN_SCRIPT_HAN, /* U+6500 */ |
278 | | UCDN_SCRIPT_HAN, /* U+6600 */ |
279 | | UCDN_SCRIPT_HAN, /* U+6700 */ |
280 | | UCDN_SCRIPT_HAN, /* U+6800 */ |
281 | | UCDN_SCRIPT_HAN, /* U+6900 */ |
282 | | UCDN_SCRIPT_HAN, /* U+6A00 */ |
283 | | UCDN_SCRIPT_HAN, /* U+6B00 */ |
284 | | UCDN_SCRIPT_HAN, /* U+6C00 */ |
285 | | UCDN_SCRIPT_HAN, /* U+6D00 */ |
286 | | UCDN_SCRIPT_HAN, /* U+6E00 */ |
287 | | UCDN_SCRIPT_HAN, /* U+6F00 */ |
288 | | UCDN_SCRIPT_HAN, /* U+7000 */ |
289 | | UCDN_SCRIPT_HAN, /* U+7100 */ |
290 | | UCDN_SCRIPT_HAN, /* U+7200 */ |
291 | | UCDN_SCRIPT_HAN, /* U+7300 */ |
292 | | UCDN_SCRIPT_HAN, /* U+7400 */ |
293 | | UCDN_SCRIPT_HAN, /* U+7500 */ |
294 | | UCDN_SCRIPT_HAN, /* U+7600 */ |
295 | | UCDN_SCRIPT_HAN, /* U+7700 */ |
296 | | UCDN_SCRIPT_HAN, /* U+7800 */ |
297 | | UCDN_SCRIPT_HAN, /* U+7900 */ |
298 | | UCDN_SCRIPT_HAN, /* U+7A00 */ |
299 | | UCDN_SCRIPT_HAN, /* U+7B00 */ |
300 | | UCDN_SCRIPT_HAN, /* U+7C00 */ |
301 | | UCDN_SCRIPT_HAN, /* U+7D00 */ |
302 | | UCDN_SCRIPT_HAN, /* U+7E00 */ |
303 | | UCDN_SCRIPT_HAN, /* U+7F00 */ |
304 | | UCDN_SCRIPT_HAN, /* U+8000 */ |
305 | | UCDN_SCRIPT_HAN, /* U+8100 */ |
306 | | UCDN_SCRIPT_HAN, /* U+8200 */ |
307 | | UCDN_SCRIPT_HAN, /* U+8300 */ |
308 | | UCDN_SCRIPT_HAN, /* U+8400 */ |
309 | | UCDN_SCRIPT_HAN, /* U+8500 */ |
310 | | UCDN_SCRIPT_HAN, /* U+8600 */ |
311 | | UCDN_SCRIPT_HAN, /* U+8700 */ |
312 | | UCDN_SCRIPT_HAN, /* U+8800 */ |
313 | | UCDN_SCRIPT_HAN, /* U+8900 */ |
314 | | UCDN_SCRIPT_HAN, /* U+8A00 */ |
315 | | UCDN_SCRIPT_HAN, /* U+8B00 */ |
316 | | UCDN_SCRIPT_HAN, /* U+8C00 */ |
317 | | UCDN_SCRIPT_HAN, /* U+8D00 */ |
318 | | UCDN_SCRIPT_HAN, /* U+8E00 */ |
319 | | UCDN_SCRIPT_HAN, /* U+8F00 */ |
320 | | UCDN_SCRIPT_HAN, /* U+9000 */ |
321 | | UCDN_SCRIPT_HAN, /* U+9100 */ |
322 | | UCDN_SCRIPT_HAN, /* U+9200 */ |
323 | | UCDN_SCRIPT_HAN, /* U+9300 */ |
324 | | UCDN_SCRIPT_HAN, /* U+9400 */ |
325 | | UCDN_SCRIPT_HAN, /* U+9500 */ |
326 | | UCDN_SCRIPT_HAN, /* U+9600 */ |
327 | | UCDN_SCRIPT_HAN, /* U+9700 */ |
328 | | UCDN_SCRIPT_HAN, /* U+9800 */ |
329 | | UCDN_SCRIPT_HAN, /* U+9900 */ |
330 | | UCDN_SCRIPT_HAN, /* U+9A00 */ |
331 | | UCDN_SCRIPT_HAN, /* U+9B00 */ |
332 | | UCDN_SCRIPT_HAN, /* U+9C00 */ |
333 | | UCDN_SCRIPT_HAN, /* U+9D00 */ |
334 | | UCDN_SCRIPT_HAN, /* U+9E00 */ |
335 | | UCDN_SCRIPT_HAN, /* U+9F00 */ |
336 | | UCDN_SCRIPT_YI, /* U+A000 */ |
337 | | UCDN_SCRIPT_YI, /* U+A100 */ |
338 | | UCDN_SCRIPT_YI, /* U+A200 */ |
339 | | UCDN_SCRIPT_YI, /* U+A300 */ |
340 | | UCDN_SCRIPT_YI, /* U+A400 */ |
341 | | UCDN_SCRIPT_VAI, /* U+A500 */ |
342 | | UCDN_SCRIPT_CYRILLIC, /* U+A600 */ |
343 | | UCDN_SCRIPT_LATIN, /* U+A700 */ |
344 | | UCDN_SCRIPT_SAURASHTRA, /* U+A800 */ |
345 | | UCDN_SCRIPT_JAVANESE, /* U+A900 */ |
346 | | UCDN_SCRIPT_CHAM, /* U+AA00 */ |
347 | | UCDN_SCRIPT_CHEROKEE, /* U+AB00 */ |
348 | | UCDN_SCRIPT_HANGUL, /* U+AC00 */ |
349 | | UCDN_SCRIPT_HANGUL, /* U+AD00 */ |
350 | | UCDN_SCRIPT_HANGUL, /* U+AE00 */ |
351 | | UCDN_SCRIPT_HANGUL, /* U+AF00 */ |
352 | | UCDN_SCRIPT_HANGUL, /* U+B000 */ |
353 | | UCDN_SCRIPT_HANGUL, /* U+B100 */ |
354 | | UCDN_SCRIPT_HANGUL, /* U+B200 */ |
355 | | UCDN_SCRIPT_HANGUL, /* U+B300 */ |
356 | | UCDN_SCRIPT_HANGUL, /* U+B400 */ |
357 | | UCDN_SCRIPT_HANGUL, /* U+B500 */ |
358 | | UCDN_SCRIPT_HANGUL, /* U+B600 */ |
359 | | UCDN_SCRIPT_HANGUL, /* U+B700 */ |
360 | | UCDN_SCRIPT_HANGUL, /* U+B800 */ |
361 | | UCDN_SCRIPT_HANGUL, /* U+B900 */ |
362 | | UCDN_SCRIPT_HANGUL, /* U+BA00 */ |
363 | | UCDN_SCRIPT_HANGUL, /* U+BB00 */ |
364 | | UCDN_SCRIPT_HANGUL, /* U+BC00 */ |
365 | | UCDN_SCRIPT_HANGUL, /* U+BD00 */ |
366 | | UCDN_SCRIPT_HANGUL, /* U+BE00 */ |
367 | | UCDN_SCRIPT_HANGUL, /* U+BF00 */ |
368 | | UCDN_SCRIPT_HANGUL, /* U+C000 */ |
369 | | UCDN_SCRIPT_HANGUL, /* U+C100 */ |
370 | | UCDN_SCRIPT_HANGUL, /* U+C200 */ |
371 | | UCDN_SCRIPT_HANGUL, /* U+C300 */ |
372 | | UCDN_SCRIPT_HANGUL, /* U+C400 */ |
373 | | UCDN_SCRIPT_HANGUL, /* U+C500 */ |
374 | | UCDN_SCRIPT_HANGUL, /* U+C600 */ |
375 | | UCDN_SCRIPT_HANGUL, /* U+C700 */ |
376 | | UCDN_SCRIPT_HANGUL, /* U+C800 */ |
377 | | UCDN_SCRIPT_HANGUL, /* U+C900 */ |
378 | | UCDN_SCRIPT_HANGUL, /* U+CA00 */ |
379 | | UCDN_SCRIPT_HANGUL, /* U+CB00 */ |
380 | | UCDN_SCRIPT_HANGUL, /* U+CC00 */ |
381 | | UCDN_SCRIPT_HANGUL, /* U+CD00 */ |
382 | | UCDN_SCRIPT_HANGUL, /* U+CE00 */ |
383 | | UCDN_SCRIPT_HANGUL, /* U+CF00 */ |
384 | | UCDN_SCRIPT_HANGUL, /* U+D000 */ |
385 | | UCDN_SCRIPT_HANGUL, /* U+D100 */ |
386 | | UCDN_SCRIPT_HANGUL, /* U+D200 */ |
387 | | UCDN_SCRIPT_HANGUL, /* U+D300 */ |
388 | | UCDN_SCRIPT_HANGUL, /* U+D400 */ |
389 | | UCDN_SCRIPT_HANGUL, /* U+D500 */ |
390 | | UCDN_SCRIPT_HANGUL, /* U+D600 */ |
391 | | UCDN_SCRIPT_HANGUL, /* U+D700 */ |
392 | | UCDN_SCRIPT_COMMON, /* U+D800 */ |
393 | | UCDN_SCRIPT_COMMON, /* U+D900 */ |
394 | | UCDN_SCRIPT_COMMON, /* U+DA00 */ |
395 | | UCDN_SCRIPT_COMMON, /* U+DB00 */ |
396 | | UCDN_SCRIPT_COMMON, /* U+DC00 */ |
397 | | UCDN_SCRIPT_COMMON, /* U+DD00 */ |
398 | | UCDN_SCRIPT_COMMON, /* U+DE00 */ |
399 | | UCDN_SCRIPT_COMMON, /* U+DF00 */ |
400 | | UCDN_SCRIPT_COMMON, /* U+E000 */ |
401 | | UCDN_SCRIPT_COMMON, /* U+E100 */ |
402 | | UCDN_SCRIPT_COMMON, /* U+E200 */ |
403 | | UCDN_SCRIPT_COMMON, /* U+E300 */ |
404 | | UCDN_SCRIPT_COMMON, /* U+E400 */ |
405 | | UCDN_SCRIPT_COMMON, /* U+E500 */ |
406 | | UCDN_SCRIPT_COMMON, /* U+E600 */ |
407 | | UCDN_SCRIPT_COMMON, /* U+E700 */ |
408 | | UCDN_SCRIPT_COMMON, /* U+E800 */ |
409 | | UCDN_SCRIPT_COMMON, /* U+E900 */ |
410 | | UCDN_SCRIPT_COMMON, /* U+EA00 */ |
411 | | UCDN_SCRIPT_COMMON, /* U+EB00 */ |
412 | | UCDN_SCRIPT_COMMON, /* U+EC00 */ |
413 | | UCDN_SCRIPT_COMMON, /* U+ED00 */ |
414 | | UCDN_SCRIPT_COMMON, /* U+EE00 */ |
415 | | UCDN_SCRIPT_COMMON, /* U+EF00 */ |
416 | | UCDN_SCRIPT_COMMON, /* U+F000 */ |
417 | | UCDN_SCRIPT_COMMON, /* U+F100 */ |
418 | | UCDN_SCRIPT_COMMON, /* U+F200 */ |
419 | | UCDN_SCRIPT_COMMON, /* U+F300 */ |
420 | | UCDN_SCRIPT_COMMON, /* U+F400 */ |
421 | | UCDN_SCRIPT_COMMON, /* U+F500 */ |
422 | | UCDN_SCRIPT_COMMON, /* U+F600 */ |
423 | | UCDN_SCRIPT_COMMON, /* U+F700 */ |
424 | | UCDN_SCRIPT_COMMON, /* U+F800 */ |
425 | | UCDN_SCRIPT_HAN, /* U+F900 */ |
426 | | UCDN_SCRIPT_HAN, /* U+FA00 */ |
427 | | UCDN_SCRIPT_ARABIC, /* U+FB00 */ |
428 | | UCDN_SCRIPT_ARABIC, /* U+FC00 */ |
429 | | UCDN_SCRIPT_ARABIC, /* U+FD00 */ |
430 | | UCDN_SCRIPT_ARABIC, /* U+FE00 */ |
431 | | UCDN_SCRIPT_KATAKANA, /* U+FF00 */ |
432 | | }; |
433 | | |
434 | | static int |
435 | | guess_script_from_block(int c) |
436 | 0 | { |
437 | 0 | if (c < 0x10000) |
438 | 0 | return ucdn_script_from_block_table[c >> 8]; |
439 | 0 | return UCDN_SCRIPT_COMMON; |
440 | 0 | } |
441 | | |
442 | | /* Split fragments into single scripts (or punctuation + single script) */ |
443 | | static void |
444 | | split_at_script(const uint32_t *fragment, |
445 | | size_t fragment_len, |
446 | | int level, |
447 | | void *arg, |
448 | | fz_bidi_fragment_fn *callback) |
449 | 0 | { |
450 | 0 | int script_guess = UCDN_SCRIPT_COMMON; |
451 | 0 | int script = UCDN_SCRIPT_COMMON; |
452 | 0 | size_t script_start, i; |
453 | |
|
454 | 0 | script_start = 0; |
455 | 0 | for (i = 0; i < fragment_len; i++) |
456 | 0 | { |
457 | 0 | int s = ucdn_get_script(fragment[i]); |
458 | 0 | if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED || s == UCDN_SCRIPT_UNKNOWN) |
459 | 0 | { |
460 | | /* Punctuation etc. This is fine. */ |
461 | | /* Guess script using the unicode block if we've not determined it yet. */ |
462 | 0 | if (script_guess == UCDN_SCRIPT_COMMON) |
463 | 0 | script_guess = guess_script_from_block(fragment[i]); |
464 | 0 | } |
465 | 0 | else if (s == script) |
466 | 0 | { |
467 | | /* Same script. Still fine. */ |
468 | 0 | } |
469 | 0 | else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED || script == UCDN_SCRIPT_UNKNOWN) |
470 | 0 | { |
471 | | /* First non punctuation thing. Set the script. */ |
472 | 0 | script = s; |
473 | 0 | } |
474 | 0 | else |
475 | 0 | { |
476 | | /* Change of script. Break the fragment. */ |
477 | 0 | assert(script != UCDN_SCRIPT_COMMON); |
478 | 0 | (*callback)(&fragment[script_start], i - script_start, level, script, arg); |
479 | 0 | script_start = i; |
480 | 0 | script_guess = UCDN_SCRIPT_COMMON; |
481 | 0 | script = s; |
482 | 0 | } |
483 | 0 | } |
484 | |
|
485 | 0 | if (script_start != fragment_len) |
486 | 0 | { |
487 | 0 | if (script == UCDN_SCRIPT_COMMON) |
488 | 0 | script = script_guess; |
489 | 0 | (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg); |
490 | 0 | } |
491 | 0 | } |
492 | | |
493 | | /* Determines the character classes for all following |
494 | | * passes of the algorithm. A character class is basically the type of Bidi |
495 | | * behaviour that the character exhibits. |
496 | | */ |
497 | | static void |
498 | | classify_characters(const uint32_t *text, |
499 | | fz_bidi_chartype *types, |
500 | | size_t len, |
501 | | fz_bidi_flags flags) |
502 | 0 | { |
503 | 0 | size_t i; |
504 | |
|
505 | 0 | if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0) |
506 | 0 | { |
507 | 0 | for (i = 0; i < len; i++) |
508 | 0 | { |
509 | 0 | types[i] = class_from_ch_ws(text[i]); |
510 | 0 | } |
511 | 0 | } |
512 | 0 | else |
513 | 0 | { |
514 | | #ifdef DEBUG_BIDI_VERBOSE |
515 | | fprintf(stderr, "Text: "); |
516 | | for (i = 0; i < len; i++) |
517 | | { |
518 | | /* So that we can actually sort of read the debug string, any |
519 | | * non-ascii characters are replaced with a 1-digit hash |
520 | | * value from 0-9, making non-english characters appear |
521 | | * as numbers |
522 | | */ |
523 | | fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ? |
524 | | text[i] : text[i] % 9 + '0'); |
525 | | } |
526 | | fprintf(stderr, "\nTypes: "); |
527 | | #endif |
528 | 0 | for (i = 0; i < len; i++) |
529 | 0 | { |
530 | 0 | types[i] = class_from_ch_n(text[i]); |
531 | | #ifdef DEBUG_BIDI_VERBOSE |
532 | | fprintf(stderr, "%c", char_from_types[(int)types[i]]); |
533 | | #endif |
534 | 0 | } |
535 | | #ifdef DEBUG_BIDI_VERBOSE |
536 | | fprintf(stderr, "\n"); |
537 | | #endif |
538 | 0 | } |
539 | 0 | } |
540 | | |
541 | | /* Determines the base level of the text. |
542 | | * Implements rule P2 of the Unicode Bidi Algorithm. |
543 | | * Note: Ignores explicit embeddings |
544 | | */ |
545 | | static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len) |
546 | 0 | { |
547 | 0 | size_t i; |
548 | |
|
549 | 0 | for (i = 0; i < len; i++) |
550 | 0 | { |
551 | 0 | switch (types[i]) |
552 | 0 | { |
553 | | /* strong left */ |
554 | 0 | case BDI_L: |
555 | 0 | return FZ_BIDI_LTR; |
556 | | |
557 | | /* strong right */ |
558 | 0 | case BDI_R: |
559 | 0 | case BDI_AL: |
560 | 0 | return FZ_BIDI_RTL; |
561 | 0 | } |
562 | 0 | } |
563 | 0 | return FZ_BIDI_LTR; |
564 | 0 | } |
565 | | |
566 | | static fz_bidi_direction direction_from_type(fz_bidi_chartype type) |
567 | 0 | { |
568 | 0 | switch (type) |
569 | 0 | { |
570 | 0 | case BDI_L: |
571 | 0 | case BDI_EN: |
572 | 0 | return FZ_BIDI_LTR; |
573 | | |
574 | 0 | case BDI_R: |
575 | 0 | case BDI_AL: |
576 | 0 | return FZ_BIDI_RTL; |
577 | | |
578 | 0 | default: |
579 | 0 | return FZ_BIDI_NEUTRAL; |
580 | 0 | } |
581 | 0 | } |
582 | | |
583 | | static void |
584 | | classify_quoted_blocks(const uint32_t *text, |
585 | | fz_bidi_chartype *types, |
586 | | size_t len) |
587 | 0 | { |
588 | 0 | size_t i; |
589 | 0 | int inQuote = FALSE; |
590 | 0 | int pdfNeeded = FALSE; |
591 | 0 | int ltrFound = FALSE; |
592 | 0 | int rtlFound = FALSE; |
593 | | |
594 | | /* Only do anything special here if there is mixed content |
595 | | * (LTR *and* RTL) in the text. |
596 | | */ |
597 | 0 | for (i = 0; i < len; i++) |
598 | 0 | { |
599 | 0 | switch (direction_from_type(types[i])) |
600 | 0 | { |
601 | 0 | case FZ_BIDI_LTR: |
602 | 0 | ltrFound = TRUE; |
603 | 0 | break; |
604 | | |
605 | 0 | case FZ_BIDI_RTL: |
606 | 0 | rtlFound = TRUE; |
607 | 0 | break; |
608 | | |
609 | 0 | default: |
610 | 0 | break; |
611 | 0 | } |
612 | 0 | } |
613 | | |
614 | | /* Only make any changes if *both* LTR and RTL characters exist |
615 | | * in this text. |
616 | | */ |
617 | 0 | if (!ltrFound || !rtlFound) |
618 | 0 | { |
619 | 0 | return; |
620 | 0 | } |
621 | | |
622 | 0 | for (i = 0; i < len; i++) |
623 | 0 | { |
624 | 0 | if (text[i]=='"') |
625 | 0 | { |
626 | | /* If we're already in a quote then terminate it, |
627 | | * else start a new block. |
628 | | */ |
629 | 0 | if (inQuote) |
630 | 0 | { |
631 | 0 | inQuote = FALSE; |
632 | 0 | if (pdfNeeded) |
633 | 0 | { |
634 | 0 | pdfNeeded = FALSE; |
635 | 0 | types[i] = BDI_PDF; |
636 | 0 | } |
637 | 0 | } |
638 | 0 | else |
639 | 0 | { |
640 | 0 | size_t j; |
641 | 0 | int done = FALSE; |
642 | |
|
643 | 0 | inQuote = TRUE; |
644 | | |
645 | | /* Find the first strong right or left type and |
646 | | * use that to determine whether we should classify |
647 | | * the quote as LRE or RLE. Or neither, if we |
648 | | * hit another quote before any strongly-directional |
649 | | * character. |
650 | | */ |
651 | 0 | for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j) |
652 | 0 | { |
653 | 0 | switch(types[j]) |
654 | 0 | { |
655 | 0 | case BDI_RLE: |
656 | 0 | case BDI_LRE: |
657 | 0 | done = TRUE; |
658 | 0 | break; |
659 | | |
660 | 0 | case BDI_L: |
661 | 0 | case BDI_EN: |
662 | 0 | types[i] = BDI_LRE; |
663 | 0 | pdfNeeded = TRUE; |
664 | 0 | done = TRUE; |
665 | 0 | break; |
666 | | |
667 | 0 | case BDI_R: |
668 | 0 | case BDI_AL: |
669 | 0 | types[i] = BDI_RLE; |
670 | 0 | pdfNeeded = TRUE; |
671 | 0 | done = TRUE; |
672 | 0 | break; |
673 | | |
674 | 0 | default: |
675 | 0 | break; |
676 | 0 | } |
677 | 0 | } |
678 | 0 | } |
679 | 0 | } |
680 | 0 | } |
681 | 0 | } |
682 | | |
683 | | /* Creates a buffer with an embedding level for every character in the |
684 | | * given text. Also determines the base level and returns it in |
685 | | * *baseDir if *baseDir does not initially contain a valid direction. |
686 | | */ |
687 | | static fz_bidi_level * |
688 | | create_levels(fz_context *ctx, |
689 | | const uint32_t *text, |
690 | | size_t len, |
691 | | fz_bidi_direction *baseDir, |
692 | | int resolveWhiteSpace, |
693 | | int flags) |
694 | 0 | { |
695 | 0 | fz_bidi_level *levels, *plevels; |
696 | 0 | fz_bidi_chartype *types = NULL; |
697 | 0 | fz_bidi_chartype *ptypes; |
698 | 0 | fz_bidi_level baseLevel; |
699 | 0 | const uint32_t *ptext; |
700 | 0 | size_t plen, remaining; |
701 | |
|
702 | 0 | levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels"); |
703 | |
|
704 | 0 | fz_var(types); |
705 | |
|
706 | 0 | fz_try(ctx) |
707 | 0 | { |
708 | 0 | types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype)); |
709 | |
|
710 | 0 | classify_characters(text, types, len, flags); |
711 | |
|
712 | 0 | if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL) |
713 | 0 | { |
714 | | /* Derive the base level from the text and |
715 | | * update *baseDir in case the caller wants to know. |
716 | | */ |
717 | 0 | baseLevel = base_level_from_text(types, len); |
718 | 0 | *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR; |
719 | 0 | } |
720 | 0 | else |
721 | 0 | { |
722 | 0 | baseLevel = (fz_bidi_level)*baseDir; |
723 | 0 | } |
724 | |
|
725 | 0 | { |
726 | | /* Replace tab with base direction, i.e. make tab appear as |
727 | | * 'strong left' if the base direction is left-to-right and |
728 | | * 'strong right' if base direction is right-to-left. This |
729 | | * allows Layout to implicitly treat tabs as 'segment separators'. |
730 | | */ |
731 | 0 | size_t i; |
732 | |
|
733 | 0 | for (i = 0u; i < len; i++) |
734 | 0 | { |
735 | 0 | if (text[i]=='\t') |
736 | 0 | { |
737 | 0 | types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L; |
738 | 0 | } |
739 | 0 | } |
740 | 0 | } |
741 | | |
742 | | /* Look for quotation marks. Classify them as RLE or LRE |
743 | | * or leave them alone, depending on what follows them. |
744 | | */ |
745 | 0 | classify_quoted_blocks(text, types, len); |
746 | | |
747 | | /* Work one paragraph at a time. */ |
748 | 0 | plevels = levels; |
749 | 0 | ptypes = types; |
750 | 0 | ptext = text; |
751 | 0 | remaining = len; |
752 | 0 | while (remaining) |
753 | 0 | { |
754 | 0 | plen = fz_bidi_resolve_paragraphs(ptypes, remaining); |
755 | | |
756 | | /* Work out the levels and character types... */ |
757 | 0 | (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0); |
758 | 0 | fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen); |
759 | 0 | fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen); |
760 | 0 | fz_bidi_resolve_implicit(ptypes, plevels, plen); |
761 | |
|
762 | 0 | classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE); |
763 | |
|
764 | 0 | if (resolveWhiteSpace) |
765 | 0 | { |
766 | | /* resolve whitespace */ |
767 | 0 | fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen); |
768 | 0 | } |
769 | |
|
770 | 0 | plevels += plen; |
771 | 0 | ptypes += plen; |
772 | 0 | ptext += plen; |
773 | 0 | remaining -= plen; |
774 | 0 | } |
775 | | |
776 | | /* The levels buffer now has odd and even numbers indicating |
777 | | * rtl or ltr characters, respectively. |
778 | | */ |
779 | | #ifdef DEBUG_BIDI_VERBOSE |
780 | | fprintf(stderr, "Levels: "); |
781 | | { |
782 | | size_t i; |
783 | | for (i = 0; i < len; i++) |
784 | | { |
785 | | fprintf(stderr, "%d", levels[i]>9?0:levels[i]); |
786 | | } |
787 | | fprintf(stderr, "\n"); |
788 | | } |
789 | | #endif |
790 | 0 | } |
791 | 0 | fz_always(ctx) |
792 | 0 | { |
793 | 0 | fz_free(ctx, types); |
794 | 0 | } |
795 | 0 | fz_catch(ctx) |
796 | 0 | { |
797 | 0 | fz_free(ctx, levels); |
798 | 0 | fz_rethrow(ctx); |
799 | 0 | } |
800 | 0 | return levels; |
801 | 0 | } |
802 | | |
803 | | /* Partitions the given character sequence into one or more unidirectional |
804 | | * fragments and invokes the given callback function for each fragment. |
805 | | */ |
806 | | void fz_bidi_fragment_text(fz_context *ctx, |
807 | | const uint32_t *text, |
808 | | size_t textlen, |
809 | | fz_bidi_direction *baseDir, |
810 | | fz_bidi_fragment_fn *callback, |
811 | | void *arg, |
812 | | int flags) |
813 | 0 | { |
814 | 0 | size_t startOfFragment; |
815 | 0 | size_t i; |
816 | 0 | fz_bidi_level *levels; |
817 | |
|
818 | 0 | if (text == NULL || callback == NULL || textlen == 0) |
819 | 0 | return; |
820 | | |
821 | 0 | DBUGH((ctx, "fz_bidi_fragment_text('%S', len = %d)\n", text, textlen)); |
822 | |
|
823 | 0 | levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags); |
824 | | |
825 | | /* We now have an array with an embedding level |
826 | | * for each character in text. |
827 | | */ |
828 | 0 | assert(levels != NULL); |
829 | |
|
830 | 0 | fz_try(ctx) |
831 | 0 | { |
832 | 0 | startOfFragment = 0; |
833 | 0 | for (i = 1; i < textlen; i++) |
834 | 0 | { |
835 | 0 | if (levels[i] != levels[i-1]) |
836 | 0 | { |
837 | | /* We've gone past the end of the fragment. |
838 | | * Create a text object for it, then start |
839 | | * a new fragment. |
840 | | */ |
841 | 0 | split_at_script(&text[startOfFragment], |
842 | 0 | i - startOfFragment, |
843 | 0 | levels[startOfFragment], |
844 | 0 | arg, |
845 | 0 | callback); |
846 | 0 | startOfFragment = i; |
847 | 0 | } |
848 | 0 | } |
849 | | /* Now i == textlen. Deal with the final (or maybe only) fragment. */ |
850 | | /* otherwise create 1 fragment */ |
851 | 0 | split_at_script(&text[startOfFragment], |
852 | 0 | i - startOfFragment, |
853 | 0 | levels[startOfFragment], |
854 | 0 | arg, |
855 | 0 | callback); |
856 | 0 | } |
857 | 0 | fz_always(ctx) |
858 | 0 | { |
859 | 0 | fz_free(ctx, levels); |
860 | 0 | } |
861 | 0 | fz_catch(ctx) |
862 | 0 | { |
863 | 0 | fz_rethrow(ctx); |
864 | 0 | } |
865 | 0 | } |