Coverage Report

Created: 2026-05-16 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/fitz/bidi.c
Line
Count
Source
1
/*
2
 * Bidirectional text processing.
3
 *
4
 * Processes unicode text by arranging the characters into an order suitable
5
 * for display. E.g. Hebrew text will be arranged from right-to-left and
6
 * any English within the text will remain in the left-to-right order.
7
 * Characters such as parenthesis will be substituted for their mirrored
8
 * equivalents if they are part of text which must be reversed.
9
 *
10
 * This is an implementation of the unicode Bidirectional Algorithm which
11
 * can be found here: http://www.unicode.org/reports/tr9/ and is based
12
 * on the reference implementation of the algorithm found on that page.
13
 *
14
 * For a nice overview of how it works, read this...
15
 * http://www.w3.org/TR/REC-html40/struct/dirlang.html
16
 *
17
 * Extracted from the SmartOffice code, where it was modified by Ian
18
 * Beveridge.
19
 *
20
 * Copyright (C) Picsel, 2004. All Rights Reserved.
21
 */
22
23
/*
24
 * Original copyright notice from unicode reference implementation.
25
 * ----------------------------------------------------------------
26
 * Written by: Asmus Freytag
27
 *  C++ and Windows dependencies removed, and
28
 *  command line interface added by: Rick McGowan
29
 *
30
 *  Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
31
 */
32
33
/*
34
 * Includes...
35
 */
36
37
#include "mupdf/fitz.h"
38
#include "mupdf/ucdn.h"
39
#include "bidi-imp.h" /* standard bidi code interface */
40
#include <assert.h>
41
42
/*
43
 * Macros...
44
 */
45
46
0
#define ODD(x) ((x) & 1)
47
48
#define REPLACEABLE_TYPE(t) ( \
49
    ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
50
    ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
51
    ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )
52
53
#ifdef DEBUG_BIDI_VERBOSE
54
#define DBUGVF(params) do { fz_warn params; } while (0)
55
#else
56
#define DBUGVF(params) do {} while (0)
57
#endif
58
59
#ifdef DEBUG_BIDI_OUTLINE
60
#define DBUGH(params) do { fz_warn params; } while (0)
61
#else
62
0
#define DBUGH(params) do {} while (0)
63
#endif
64
65
#define UNICODE_EOS         0
66
#define UNICODE_DIGIT_ZERO        0x0030
67
#define UNICODE_DIGIT_NINE        0x0039
68
#define UNICODE_SUPERSCRIPT_TWO       0x00B2
69
#define UNICODE_SUPERSCRIPT_THREE     0x00B3
70
#define UNICODE_SUPERSCRIPT_ONE       0x00B9
71
#define UNICODE_RTL_START       0x0590
72
#define UNICODE_RTL_END         0x07BF
73
#define UNICODE_ARABIC_INDIC_DIGIT_ZERO     0x0660
74
#define UNICODE_ARABIC_INDIC_DIGIT_NINE     0x0669
75
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO  0x06F0
76
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE  0x06F9
77
#define UNICODE_ZERO_WIDTH_NON_JOINER     0x200C
78
#define UNICODE_SUPERSCRIPT_ZERO      0x2070
79
#define UNICODE_SUPERSCRIPT_FOUR      0x2074
80
#define UNICODE_SUPERSCRIPT_NINE      0x2079
81
#define UNICODE_SUBSCRIPT_ZERO        0x2080
82
#define UNICODE_SUBSCRIPT_NINE        0x2089
83
#define UNICODE_CIRCLED_DIGIT_ONE     0x2460
84
#define UNICODE_NUMBER_TWENTY_FULL_STOP     0x249B
85
#define UNICODE_CIRCLED_DIGIT_ZERO      0x24EA
86
#define UNICODE_FULLWIDTH_DIGIT_ZERO      0xFF10
87
#define UNICODE_FULLWIDTH_DIGIT_NINE      0xFF19
88
89
#ifndef TRUE
90
0
#define TRUE (1)
91
#endif
92
#ifndef FALSE
93
0
#define FALSE (0)
94
#endif
95
96
/*
97
 * Enumerations...
98
 */
99
100
#ifdef DEBUG_BIDI_VERBOSE
101
/* display support: */
102
static const char char_from_types[] =
103
{
104
  ' ',  /* ON */
105
  '>',  /* L */
106
  '<',  /* R */
107
  '9',  /* AN */
108
  '1',  /* EN */
109
  'a',  /* AL */
110
  '@',  /* NSM */
111
  '.',  /* CS */
112
  ',',  /* ES */
113
  '$',  /* ET */
114
  ':',  /* BN */
115
  'X',  /* S */
116
  '_',  /* WS */
117
  'B',  /* B */
118
  '+',  /* RLO */
119
  '+',  /* RLE */
120
  '+',  /* LRO */
121
  '+',  /* LRE */
122
  '-',  /* PDF */
123
  '=' /* LS */
124
};
125
#endif
126
127
/*
128
 * Functions and static functions...
129
 */
130
131
/* UCDN uses a different ordering than Bidi does. We cannot
132
 * change to the UCDN ordering, as the bidi-std.c code relies
133
 * on the exact ordering (at least that N = ON = 0). We
134
 * therefore map between the two using this small table. It
135
 * also takes care of fudging LRI, RLI, FSI and PDI, that this
136
 * code does not currently support. */
137
static const uint8_t ucdn_to_bidi[] =
138
{
139
  BDI_L,    /* UCDN_BIDI_CLASS_L = 0 */
140
  BDI_LRE,  /* UCDN_BIDI_CLASS_LRE = 1 */
141
  BDI_LRO,  /* UCDN_BIDI_CLASS_LRO = 2 */
142
  BDI_R,    /* UCDN_BIDI_CLASS_R = 3 */
143
  BDI_AL,   /* UCDN_BIDI_CLASS_AL = 4 */
144
  BDI_RLE,  /* UCDN_BIDI_CLASS_RLE = 5 */
145
  BDI_RLO,  /* UCDN_BIDI_CLASS_RLO = 6 */
146
  BDI_PDF,  /* UCDN_BIDI_CLASS_PDF = 7 */
147
  BDI_EN,   /* UCDN_BIDI_CLASS_EN = 8 */
148
  BDI_ES,   /* UCDN_BIDI_CLASS_ES = 9 */
149
  BDI_ET,   /* UCDN_BIDI_CLASS_ET = 10 */
150
  BDI_AN,   /* UCDN_BIDI_CLASS_AN = 11 */
151
  BDI_CS,   /* UCDN_BIDI_CLASS_CS = 12 */
152
  BDI_NSM,  /* UCDN_BIDI_CLASS_NSM = 13 */
153
  BDI_BN,   /* UCDN_BIDI_CLASS_BN = 14 */
154
  BDI_B,    /* UCDN_BIDI_CLASS_B = 15 */
155
  BDI_S,    /* UCDN_BIDI_CLASS_S = 16 */
156
  BDI_WS,   /* UCDN_BIDI_CLASS_WS = 17 */
157
  BDI_ON,   /* UCDN_BIDI_CLASS_ON = 18 */
158
  BDI_LRE,  /* UCDN_BIDI_CLASS_LRI = 19 */
159
  BDI_RLE,  /* UCDN_BIDI_CLASS_RLI = 20 */
160
  BDI_N,    /* UCDN_BIDI_CLASS_FSI = 21 */
161
  BDI_N,    /* UCDN_BIDI_CLASS_PDI = 22 */
162
};
163
164
0
#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
165
166
/* Return a direction for white-space on the second pass of the algorithm. */
167
static fz_bidi_chartype class_from_ch_n(uint32_t ch)
168
0
{
169
0
  fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);
170
0
  if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS)
171
0
    return BDI_N;
172
0
  return from_ch_ws;
173
0
}
174
175
static const unsigned char ucdn_script_from_block_table[256] = {
176
  UCDN_SCRIPT_LATIN, /* U+0000 */
177
  UCDN_SCRIPT_LATIN, /* U+0100 */
178
  UCDN_SCRIPT_LATIN, /* U+0200 */
179
  UCDN_SCRIPT_GREEK, /* U+0300 */
180
  UCDN_SCRIPT_CYRILLIC, /* U+0400 */
181
  UCDN_SCRIPT_ARMENIAN, /* U+0500 */
182
  UCDN_SCRIPT_ARABIC, /* U+0600 */
183
  UCDN_SCRIPT_SYRIAC, /* U+0700 */
184
  UCDN_SCRIPT_ARABIC, /* U+0800 */
185
  UCDN_SCRIPT_DEVANAGARI, /* U+0900 */
186
  UCDN_SCRIPT_GUJARATI, /* U+0A00 */
187
  UCDN_SCRIPT_ORIYA, /* U+0B00 */
188
  UCDN_SCRIPT_TELUGU, /* U+0C00 */
189
  UCDN_SCRIPT_MALAYALAM, /* U+0D00 */
190
  UCDN_SCRIPT_THAI, /* U+0E00 */
191
  UCDN_SCRIPT_TIBETAN, /* U+0F00 */
192
  UCDN_SCRIPT_MYANMAR, /* U+1000 */
193
  UCDN_SCRIPT_HANGUL, /* U+1100 */
194
  UCDN_SCRIPT_ETHIOPIC, /* U+1200 */
195
  UCDN_SCRIPT_ETHIOPIC, /* U+1300 */
196
  UCDN_SCRIPT_CANADIAN_ABORIGINAL, /* U+1400 */
197
  UCDN_SCRIPT_CANADIAN_ABORIGINAL, /* U+1500 */
198
  UCDN_SCRIPT_CANADIAN_ABORIGINAL, /* U+1600 */
199
  UCDN_SCRIPT_KHMER, /* U+1700 */
200
  UCDN_SCRIPT_MONGOLIAN, /* U+1800 */
201
  UCDN_SCRIPT_NEW_TAI_LUE, /* U+1900 */
202
  UCDN_SCRIPT_TAI_THAM, /* U+1A00 */
203
  UCDN_SCRIPT_BALINESE, /* U+1B00 */
204
  UCDN_SCRIPT_LEPCHA, /* U+1C00 */
205
  UCDN_SCRIPT_LATIN, /* U+1D00 */
206
  UCDN_SCRIPT_LATIN, /* U+1E00 */
207
  UCDN_SCRIPT_GREEK, /* U+1F00 */
208
  UCDN_SCRIPT_COMMON, /* U+2000 */
209
  UCDN_SCRIPT_LATIN, /* U+2100 */
210
  UCDN_SCRIPT_COMMON, /* U+2200 */
211
  UCDN_SCRIPT_COMMON, /* U+2300 */
212
  UCDN_SCRIPT_COMMON, /* U+2400 */
213
  UCDN_SCRIPT_COMMON, /* U+2500 */
214
  UCDN_SCRIPT_COMMON, /* U+2600 */
215
  UCDN_SCRIPT_COMMON, /* U+2700 */
216
  UCDN_SCRIPT_BRAILLE, /* U+2800 */
217
  UCDN_SCRIPT_COMMON, /* U+2900 */
218
  UCDN_SCRIPT_COMMON, /* U+2A00 */
219
  UCDN_SCRIPT_COMMON, /* U+2B00 */
220
  UCDN_SCRIPT_COPTIC, /* U+2C00 */
221
  UCDN_SCRIPT_ETHIOPIC, /* U+2D00 */
222
  UCDN_SCRIPT_HAN, /* U+2E00 */
223
  UCDN_SCRIPT_HAN, /* U+2F00 */
224
  UCDN_SCRIPT_KATAKANA, /* U+3000 */
225
  UCDN_SCRIPT_HANGUL, /* U+3100 */
226
  UCDN_SCRIPT_HANGUL, /* U+3200 */
227
  UCDN_SCRIPT_KATAKANA, /* U+3300 */
228
  UCDN_SCRIPT_HAN, /* U+3400 */
229
  UCDN_SCRIPT_HAN, /* U+3500 */
230
  UCDN_SCRIPT_HAN, /* U+3600 */
231
  UCDN_SCRIPT_HAN, /* U+3700 */
232
  UCDN_SCRIPT_HAN, /* U+3800 */
233
  UCDN_SCRIPT_HAN, /* U+3900 */
234
  UCDN_SCRIPT_HAN, /* U+3A00 */
235
  UCDN_SCRIPT_HAN, /* U+3B00 */
236
  UCDN_SCRIPT_HAN, /* U+3C00 */
237
  UCDN_SCRIPT_HAN, /* U+3D00 */
238
  UCDN_SCRIPT_HAN, /* U+3E00 */
239
  UCDN_SCRIPT_HAN, /* U+3F00 */
240
  UCDN_SCRIPT_HAN, /* U+4000 */
241
  UCDN_SCRIPT_HAN, /* U+4100 */
242
  UCDN_SCRIPT_HAN, /* U+4200 */
243
  UCDN_SCRIPT_HAN, /* U+4300 */
244
  UCDN_SCRIPT_HAN, /* U+4400 */
245
  UCDN_SCRIPT_HAN, /* U+4500 */
246
  UCDN_SCRIPT_HAN, /* U+4600 */
247
  UCDN_SCRIPT_HAN, /* U+4700 */
248
  UCDN_SCRIPT_HAN, /* U+4800 */
249
  UCDN_SCRIPT_HAN, /* U+4900 */
250
  UCDN_SCRIPT_HAN, /* U+4A00 */
251
  UCDN_SCRIPT_HAN, /* U+4B00 */
252
  UCDN_SCRIPT_HAN, /* U+4C00 */
253
  UCDN_SCRIPT_HAN, /* U+4D00 */
254
  UCDN_SCRIPT_HAN, /* U+4E00 */
255
  UCDN_SCRIPT_HAN, /* U+4F00 */
256
  UCDN_SCRIPT_HAN, /* U+5000 */
257
  UCDN_SCRIPT_HAN, /* U+5100 */
258
  UCDN_SCRIPT_HAN, /* U+5200 */
259
  UCDN_SCRIPT_HAN, /* U+5300 */
260
  UCDN_SCRIPT_HAN, /* U+5400 */
261
  UCDN_SCRIPT_HAN, /* U+5500 */
262
  UCDN_SCRIPT_HAN, /* U+5600 */
263
  UCDN_SCRIPT_HAN, /* U+5700 */
264
  UCDN_SCRIPT_HAN, /* U+5800 */
265
  UCDN_SCRIPT_HAN, /* U+5900 */
266
  UCDN_SCRIPT_HAN, /* U+5A00 */
267
  UCDN_SCRIPT_HAN, /* U+5B00 */
268
  UCDN_SCRIPT_HAN, /* U+5C00 */
269
  UCDN_SCRIPT_HAN, /* U+5D00 */
270
  UCDN_SCRIPT_HAN, /* U+5E00 */
271
  UCDN_SCRIPT_HAN, /* U+5F00 */
272
  UCDN_SCRIPT_HAN, /* U+6000 */
273
  UCDN_SCRIPT_HAN, /* U+6100 */
274
  UCDN_SCRIPT_HAN, /* U+6200 */
275
  UCDN_SCRIPT_HAN, /* U+6300 */
276
  UCDN_SCRIPT_HAN, /* U+6400 */
277
  UCDN_SCRIPT_HAN, /* U+6500 */
278
  UCDN_SCRIPT_HAN, /* U+6600 */
279
  UCDN_SCRIPT_HAN, /* U+6700 */
280
  UCDN_SCRIPT_HAN, /* U+6800 */
281
  UCDN_SCRIPT_HAN, /* U+6900 */
282
  UCDN_SCRIPT_HAN, /* U+6A00 */
283
  UCDN_SCRIPT_HAN, /* U+6B00 */
284
  UCDN_SCRIPT_HAN, /* U+6C00 */
285
  UCDN_SCRIPT_HAN, /* U+6D00 */
286
  UCDN_SCRIPT_HAN, /* U+6E00 */
287
  UCDN_SCRIPT_HAN, /* U+6F00 */
288
  UCDN_SCRIPT_HAN, /* U+7000 */
289
  UCDN_SCRIPT_HAN, /* U+7100 */
290
  UCDN_SCRIPT_HAN, /* U+7200 */
291
  UCDN_SCRIPT_HAN, /* U+7300 */
292
  UCDN_SCRIPT_HAN, /* U+7400 */
293
  UCDN_SCRIPT_HAN, /* U+7500 */
294
  UCDN_SCRIPT_HAN, /* U+7600 */
295
  UCDN_SCRIPT_HAN, /* U+7700 */
296
  UCDN_SCRIPT_HAN, /* U+7800 */
297
  UCDN_SCRIPT_HAN, /* U+7900 */
298
  UCDN_SCRIPT_HAN, /* U+7A00 */
299
  UCDN_SCRIPT_HAN, /* U+7B00 */
300
  UCDN_SCRIPT_HAN, /* U+7C00 */
301
  UCDN_SCRIPT_HAN, /* U+7D00 */
302
  UCDN_SCRIPT_HAN, /* U+7E00 */
303
  UCDN_SCRIPT_HAN, /* U+7F00 */
304
  UCDN_SCRIPT_HAN, /* U+8000 */
305
  UCDN_SCRIPT_HAN, /* U+8100 */
306
  UCDN_SCRIPT_HAN, /* U+8200 */
307
  UCDN_SCRIPT_HAN, /* U+8300 */
308
  UCDN_SCRIPT_HAN, /* U+8400 */
309
  UCDN_SCRIPT_HAN, /* U+8500 */
310
  UCDN_SCRIPT_HAN, /* U+8600 */
311
  UCDN_SCRIPT_HAN, /* U+8700 */
312
  UCDN_SCRIPT_HAN, /* U+8800 */
313
  UCDN_SCRIPT_HAN, /* U+8900 */
314
  UCDN_SCRIPT_HAN, /* U+8A00 */
315
  UCDN_SCRIPT_HAN, /* U+8B00 */
316
  UCDN_SCRIPT_HAN, /* U+8C00 */
317
  UCDN_SCRIPT_HAN, /* U+8D00 */
318
  UCDN_SCRIPT_HAN, /* U+8E00 */
319
  UCDN_SCRIPT_HAN, /* U+8F00 */
320
  UCDN_SCRIPT_HAN, /* U+9000 */
321
  UCDN_SCRIPT_HAN, /* U+9100 */
322
  UCDN_SCRIPT_HAN, /* U+9200 */
323
  UCDN_SCRIPT_HAN, /* U+9300 */
324
  UCDN_SCRIPT_HAN, /* U+9400 */
325
  UCDN_SCRIPT_HAN, /* U+9500 */
326
  UCDN_SCRIPT_HAN, /* U+9600 */
327
  UCDN_SCRIPT_HAN, /* U+9700 */
328
  UCDN_SCRIPT_HAN, /* U+9800 */
329
  UCDN_SCRIPT_HAN, /* U+9900 */
330
  UCDN_SCRIPT_HAN, /* U+9A00 */
331
  UCDN_SCRIPT_HAN, /* U+9B00 */
332
  UCDN_SCRIPT_HAN, /* U+9C00 */
333
  UCDN_SCRIPT_HAN, /* U+9D00 */
334
  UCDN_SCRIPT_HAN, /* U+9E00 */
335
  UCDN_SCRIPT_HAN, /* U+9F00 */
336
  UCDN_SCRIPT_YI, /* U+A000 */
337
  UCDN_SCRIPT_YI, /* U+A100 */
338
  UCDN_SCRIPT_YI, /* U+A200 */
339
  UCDN_SCRIPT_YI, /* U+A300 */
340
  UCDN_SCRIPT_YI, /* U+A400 */
341
  UCDN_SCRIPT_VAI, /* U+A500 */
342
  UCDN_SCRIPT_CYRILLIC, /* U+A600 */
343
  UCDN_SCRIPT_LATIN, /* U+A700 */
344
  UCDN_SCRIPT_SAURASHTRA, /* U+A800 */
345
  UCDN_SCRIPT_JAVANESE, /* U+A900 */
346
  UCDN_SCRIPT_CHAM, /* U+AA00 */
347
  UCDN_SCRIPT_CHEROKEE, /* U+AB00 */
348
  UCDN_SCRIPT_HANGUL, /* U+AC00 */
349
  UCDN_SCRIPT_HANGUL, /* U+AD00 */
350
  UCDN_SCRIPT_HANGUL, /* U+AE00 */
351
  UCDN_SCRIPT_HANGUL, /* U+AF00 */
352
  UCDN_SCRIPT_HANGUL, /* U+B000 */
353
  UCDN_SCRIPT_HANGUL, /* U+B100 */
354
  UCDN_SCRIPT_HANGUL, /* U+B200 */
355
  UCDN_SCRIPT_HANGUL, /* U+B300 */
356
  UCDN_SCRIPT_HANGUL, /* U+B400 */
357
  UCDN_SCRIPT_HANGUL, /* U+B500 */
358
  UCDN_SCRIPT_HANGUL, /* U+B600 */
359
  UCDN_SCRIPT_HANGUL, /* U+B700 */
360
  UCDN_SCRIPT_HANGUL, /* U+B800 */
361
  UCDN_SCRIPT_HANGUL, /* U+B900 */
362
  UCDN_SCRIPT_HANGUL, /* U+BA00 */
363
  UCDN_SCRIPT_HANGUL, /* U+BB00 */
364
  UCDN_SCRIPT_HANGUL, /* U+BC00 */
365
  UCDN_SCRIPT_HANGUL, /* U+BD00 */
366
  UCDN_SCRIPT_HANGUL, /* U+BE00 */
367
  UCDN_SCRIPT_HANGUL, /* U+BF00 */
368
  UCDN_SCRIPT_HANGUL, /* U+C000 */
369
  UCDN_SCRIPT_HANGUL, /* U+C100 */
370
  UCDN_SCRIPT_HANGUL, /* U+C200 */
371
  UCDN_SCRIPT_HANGUL, /* U+C300 */
372
  UCDN_SCRIPT_HANGUL, /* U+C400 */
373
  UCDN_SCRIPT_HANGUL, /* U+C500 */
374
  UCDN_SCRIPT_HANGUL, /* U+C600 */
375
  UCDN_SCRIPT_HANGUL, /* U+C700 */
376
  UCDN_SCRIPT_HANGUL, /* U+C800 */
377
  UCDN_SCRIPT_HANGUL, /* U+C900 */
378
  UCDN_SCRIPT_HANGUL, /* U+CA00 */
379
  UCDN_SCRIPT_HANGUL, /* U+CB00 */
380
  UCDN_SCRIPT_HANGUL, /* U+CC00 */
381
  UCDN_SCRIPT_HANGUL, /* U+CD00 */
382
  UCDN_SCRIPT_HANGUL, /* U+CE00 */
383
  UCDN_SCRIPT_HANGUL, /* U+CF00 */
384
  UCDN_SCRIPT_HANGUL, /* U+D000 */
385
  UCDN_SCRIPT_HANGUL, /* U+D100 */
386
  UCDN_SCRIPT_HANGUL, /* U+D200 */
387
  UCDN_SCRIPT_HANGUL, /* U+D300 */
388
  UCDN_SCRIPT_HANGUL, /* U+D400 */
389
  UCDN_SCRIPT_HANGUL, /* U+D500 */
390
  UCDN_SCRIPT_HANGUL, /* U+D600 */
391
  UCDN_SCRIPT_HANGUL, /* U+D700 */
392
  UCDN_SCRIPT_COMMON, /* U+D800 */
393
  UCDN_SCRIPT_COMMON, /* U+D900 */
394
  UCDN_SCRIPT_COMMON, /* U+DA00 */
395
  UCDN_SCRIPT_COMMON, /* U+DB00 */
396
  UCDN_SCRIPT_COMMON, /* U+DC00 */
397
  UCDN_SCRIPT_COMMON, /* U+DD00 */
398
  UCDN_SCRIPT_COMMON, /* U+DE00 */
399
  UCDN_SCRIPT_COMMON, /* U+DF00 */
400
  UCDN_SCRIPT_COMMON, /* U+E000 */
401
  UCDN_SCRIPT_COMMON, /* U+E100 */
402
  UCDN_SCRIPT_COMMON, /* U+E200 */
403
  UCDN_SCRIPT_COMMON, /* U+E300 */
404
  UCDN_SCRIPT_COMMON, /* U+E400 */
405
  UCDN_SCRIPT_COMMON, /* U+E500 */
406
  UCDN_SCRIPT_COMMON, /* U+E600 */
407
  UCDN_SCRIPT_COMMON, /* U+E700 */
408
  UCDN_SCRIPT_COMMON, /* U+E800 */
409
  UCDN_SCRIPT_COMMON, /* U+E900 */
410
  UCDN_SCRIPT_COMMON, /* U+EA00 */
411
  UCDN_SCRIPT_COMMON, /* U+EB00 */
412
  UCDN_SCRIPT_COMMON, /* U+EC00 */
413
  UCDN_SCRIPT_COMMON, /* U+ED00 */
414
  UCDN_SCRIPT_COMMON, /* U+EE00 */
415
  UCDN_SCRIPT_COMMON, /* U+EF00 */
416
  UCDN_SCRIPT_COMMON, /* U+F000 */
417
  UCDN_SCRIPT_COMMON, /* U+F100 */
418
  UCDN_SCRIPT_COMMON, /* U+F200 */
419
  UCDN_SCRIPT_COMMON, /* U+F300 */
420
  UCDN_SCRIPT_COMMON, /* U+F400 */
421
  UCDN_SCRIPT_COMMON, /* U+F500 */
422
  UCDN_SCRIPT_COMMON, /* U+F600 */
423
  UCDN_SCRIPT_COMMON, /* U+F700 */
424
  UCDN_SCRIPT_COMMON, /* U+F800 */
425
  UCDN_SCRIPT_HAN, /* U+F900 */
426
  UCDN_SCRIPT_HAN, /* U+FA00 */
427
  UCDN_SCRIPT_ARABIC, /* U+FB00 */
428
  UCDN_SCRIPT_ARABIC, /* U+FC00 */
429
  UCDN_SCRIPT_ARABIC, /* U+FD00 */
430
  UCDN_SCRIPT_ARABIC, /* U+FE00 */
431
  UCDN_SCRIPT_KATAKANA, /* U+FF00 */
432
};
433
434
static int
435
guess_script_from_block(int c)
436
0
{
437
0
  if (c < 0x10000)
438
0
    return ucdn_script_from_block_table[c >> 8];
439
0
  return UCDN_SCRIPT_COMMON;
440
0
}
441
442
/* Split fragments into single scripts (or punctuation + single script) */
443
static void
444
split_at_script(const uint32_t *fragment,
445
    size_t fragment_len,
446
    int level,
447
    void *arg,
448
    fz_bidi_fragment_fn *callback)
449
0
{
450
0
  int script_guess = UCDN_SCRIPT_COMMON;
451
0
  int script = UCDN_SCRIPT_COMMON;
452
0
  size_t script_start, i;
453
454
0
  script_start = 0;
455
0
  for (i = 0; i < fragment_len; i++)
456
0
  {
457
0
    int s = ucdn_get_script(fragment[i]);
458
0
    if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED || s == UCDN_SCRIPT_UNKNOWN)
459
0
    {
460
      /* Punctuation etc. This is fine. */
461
      /* Guess script using the unicode block if we've not determined it yet. */
462
0
      if (script_guess == UCDN_SCRIPT_COMMON)
463
0
        script_guess = guess_script_from_block(fragment[i]);
464
0
    }
465
0
    else if (s == script)
466
0
    {
467
      /* Same script. Still fine. */
468
0
    }
469
0
    else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED || script == UCDN_SCRIPT_UNKNOWN)
470
0
    {
471
      /* First non punctuation thing. Set the script. */
472
0
      script = s;
473
0
    }
474
0
    else
475
0
    {
476
      /* Change of script. Break the fragment. */
477
0
      assert(script != UCDN_SCRIPT_COMMON);
478
0
      (*callback)(&fragment[script_start], i - script_start, level, script, arg);
479
0
      script_start = i;
480
0
      script_guess = UCDN_SCRIPT_COMMON;
481
0
      script = s;
482
0
    }
483
0
  }
484
485
0
  if (script_start != fragment_len)
486
0
  {
487
0
    if (script == UCDN_SCRIPT_COMMON)
488
0
      script = script_guess;
489
0
    (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
490
0
  }
491
0
}
492
493
/* Determines the character classes for all following
494
 * passes of the algorithm. A character class is basically the type of Bidi
495
 * behaviour that the character exhibits.
496
 */
497
static void
498
classify_characters(const uint32_t *text,
499
    fz_bidi_chartype *types,
500
    size_t len,
501
    fz_bidi_flags flags)
502
0
{
503
0
  size_t i;
504
505
0
  if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0)
506
0
  {
507
0
    for (i = 0; i < len; i++)
508
0
    {
509
0
      types[i] = class_from_ch_ws(text[i]);
510
0
    }
511
0
  }
512
0
  else
513
0
  {
514
#ifdef DEBUG_BIDI_VERBOSE
515
    fprintf(stderr, "Text:  ");
516
    for (i = 0; i < len; i++)
517
    {
518
      /* So that we can actually sort of read the debug string, any
519
       * non-ascii characters are replaced with a 1-digit hash
520
       * value from 0-9, making non-english characters appear
521
       * as numbers
522
       */
523
      fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ?
524
          text[i] : text[i] % 9 + '0');
525
    }
526
    fprintf(stderr, "\nTypes: ");
527
#endif
528
0
    for (i = 0; i < len; i++)
529
0
    {
530
0
      types[i] = class_from_ch_n(text[i]);
531
#ifdef DEBUG_BIDI_VERBOSE
532
      fprintf(stderr, "%c", char_from_types[(int)types[i]]);
533
#endif
534
0
    }
535
#ifdef DEBUG_BIDI_VERBOSE
536
    fprintf(stderr, "\n");
537
#endif
538
0
  }
539
0
}
540
541
/* Determines the base level of the text.
542
 * Implements rule P2 of the Unicode Bidi Algorithm.
543
 * Note: Ignores explicit embeddings
544
 */
545
static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
546
0
{
547
0
  size_t i;
548
549
0
  for (i = 0; i < len; i++)
550
0
  {
551
0
    switch (types[i])
552
0
    {
553
    /* strong left */
554
0
    case BDI_L:
555
0
      return FZ_BIDI_LTR;
556
557
    /* strong right */
558
0
    case BDI_R:
559
0
    case BDI_AL:
560
0
      return FZ_BIDI_RTL;
561
0
    }
562
0
  }
563
0
  return FZ_BIDI_LTR;
564
0
}
565
566
/* Map a character class onto the direction it implies: BDI_L and BDI_EN
 * give FZ_BIDI_LTR, BDI_R and BDI_AL give FZ_BIDI_RTL, and every other
 * class gives FZ_BIDI_NEUTRAL.
 */
static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
{
  if (type == BDI_L || type == BDI_EN)
    return FZ_BIDI_LTR;

  if (type == BDI_R || type == BDI_AL)
    return FZ_BIDI_RTL;

  return FZ_BIDI_NEUTRAL;
}
582
583
static void
584
classify_quoted_blocks(const uint32_t *text,
585
    fz_bidi_chartype *types,
586
    size_t len)
587
0
{
588
0
  size_t i;
589
0
  int inQuote = FALSE;
590
0
  int pdfNeeded = FALSE;
591
0
  int ltrFound = FALSE;
592
0
  int rtlFound = FALSE;
593
594
  /* Only do anything special here if there is mixed content
595
   * (LTR *and* RTL) in the text.
596
   */
597
0
  for (i = 0; i < len; i++)
598
0
  {
599
0
    switch (direction_from_type(types[i]))
600
0
    {
601
0
    case FZ_BIDI_LTR:
602
0
      ltrFound = TRUE;
603
0
      break;
604
605
0
    case FZ_BIDI_RTL:
606
0
      rtlFound = TRUE;
607
0
      break;
608
609
0
    default:
610
0
      break;
611
0
    }
612
0
  }
613
614
  /* Only make any changes if *both* LTR and RTL characters exist
615
   * in this text.
616
   */
617
0
  if (!ltrFound || !rtlFound)
618
0
  {
619
0
    return;
620
0
  }
621
622
0
  for (i = 0; i < len; i++)
623
0
  {
624
0
    if (text[i]=='"')
625
0
    {
626
      /* If we're already in a quote then terminate it,
627
       * else start a new block.
628
       */
629
0
      if (inQuote)
630
0
      {
631
0
        inQuote = FALSE;
632
0
        if (pdfNeeded)
633
0
        {
634
0
          pdfNeeded = FALSE;
635
0
          types[i] = BDI_PDF;
636
0
        }
637
0
      }
638
0
      else
639
0
      {
640
0
        size_t j;
641
0
        int done = FALSE;
642
643
0
        inQuote = TRUE;
644
645
        /* Find the first strong right or left type and
646
         * use that to determine whether we should classify
647
         * the quote as LRE or RLE. Or neither, if we
648
         * hit another quote before any strongly-directional
649
         * character.
650
         */
651
0
        for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j)
652
0
        {
653
0
          switch(types[j])
654
0
          {
655
0
          case BDI_RLE:
656
0
          case BDI_LRE:
657
0
            done = TRUE;
658
0
            break;
659
660
0
          case BDI_L:
661
0
          case BDI_EN:
662
0
            types[i] = BDI_LRE;
663
0
            pdfNeeded = TRUE;
664
0
            done = TRUE;
665
0
            break;
666
667
0
          case BDI_R:
668
0
          case BDI_AL:
669
0
            types[i] = BDI_RLE;
670
0
            pdfNeeded = TRUE;
671
0
            done = TRUE;
672
0
            break;
673
674
0
          default:
675
0
            break;
676
0
          }
677
0
        }
678
0
      }
679
0
    }
680
0
  }
681
0
}
682
683
/* Creates a buffer with an embedding level for every character in the
684
 * given text. Also determines the base level and returns it in
685
 * *baseDir if *baseDir does not initially contain a valid direction.
686
 */
687
static fz_bidi_level *
688
create_levels(fz_context *ctx,
689
    const uint32_t *text,
690
    size_t len,
691
    fz_bidi_direction *baseDir,
692
    int resolveWhiteSpace,
693
    int flags)
694
0
{
695
0
  fz_bidi_level *levels, *plevels;
696
0
  fz_bidi_chartype *types = NULL;
697
0
  fz_bidi_chartype *ptypes;
698
0
  fz_bidi_level baseLevel;
699
0
  const uint32_t *ptext;
700
0
  size_t plen, remaining;
701
702
0
  levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels");
703
704
0
  fz_var(types);
705
706
0
  fz_try(ctx)
707
0
  {
708
0
    types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
709
710
0
    classify_characters(text, types, len, flags);
711
712
0
    if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL)
713
0
    {
714
      /* Derive the base level from the text and
715
       * update *baseDir in case the caller wants to know.
716
       */
717
0
      baseLevel = base_level_from_text(types, len);
718
0
      *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
719
0
    }
720
0
    else
721
0
    {
722
0
      baseLevel = (fz_bidi_level)*baseDir;
723
0
    }
724
725
0
    {
726
      /* Replace tab with base direction, i.e. make tab appear as
727
       * 'strong left' if the base direction is left-to-right and
728
       * 'strong right' if base direction is right-to-left. This
729
       * allows Layout to implicitly treat tabs as 'segment separators'.
730
       */
731
0
      size_t i;
732
733
0
      for (i = 0u; i < len; i++)
734
0
      {
735
0
        if (text[i]=='\t')
736
0
        {
737
0
          types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
738
0
        }
739
0
      }
740
0
    }
741
742
    /* Look for quotation marks. Classify them as RLE or LRE
743
     * or leave them alone, depending on what follows them.
744
     */
745
0
    classify_quoted_blocks(text, types, len);
746
747
    /* Work one paragraph at a time. */
748
0
    plevels = levels;
749
0
    ptypes = types;
750
0
    ptext = text;
751
0
    remaining = len;
752
0
    while (remaining)
753
0
    {
754
0
      plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
755
756
      /* Work out the levels and character types... */
757
0
      (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
758
0
      fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
759
0
      fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
760
0
      fz_bidi_resolve_implicit(ptypes, plevels, plen);
761
762
0
      classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
763
764
0
      if (resolveWhiteSpace)
765
0
      {
766
        /* resolve whitespace */
767
0
        fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
768
0
      }
769
770
0
      plevels += plen;
771
0
      ptypes += plen;
772
0
      ptext += plen;
773
0
      remaining -= plen;
774
0
    }
775
776
    /* The levels buffer now has odd and even numbers indicating
777
     * rtl or ltr characters, respectively.
778
     */
779
#ifdef DEBUG_BIDI_VERBOSE
780
    fprintf(stderr, "Levels: ");
781
    {
782
      size_t i;
783
      for (i = 0; i < len; i++)
784
      {
785
        fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
786
      }
787
      fprintf(stderr, "\n");
788
    }
789
#endif
790
0
  }
791
0
  fz_always(ctx)
792
0
  {
793
0
    fz_free(ctx, types);
794
0
  }
795
0
  fz_catch(ctx)
796
0
  {
797
0
    fz_free(ctx, levels);
798
0
    fz_rethrow(ctx);
799
0
  }
800
0
  return levels;
801
0
}
802
803
/* Partitions the given character sequence into one or more unidirectional
804
 * fragments and invokes the given callback function for each fragment.
805
 */
806
void fz_bidi_fragment_text(fz_context *ctx,
807
    const uint32_t *text,
808
    size_t textlen,
809
    fz_bidi_direction *baseDir,
810
    fz_bidi_fragment_fn *callback,
811
    void *arg,
812
    int flags)
813
0
{
814
0
  size_t startOfFragment;
815
0
  size_t i;
816
0
  fz_bidi_level *levels;
817
818
0
  if (text == NULL || callback == NULL || textlen == 0)
819
0
    return;
820
821
0
  DBUGH((ctx, "fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));
822
823
0
  levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
824
825
  /* We now have an array with an embedding level
826
   * for each character in text.
827
   */
828
0
  assert(levels != NULL);
829
830
0
  fz_try(ctx)
831
0
  {
832
0
    startOfFragment = 0;
833
0
    for (i = 1; i < textlen; i++)
834
0
    {
835
0
      if (levels[i] != levels[i-1])
836
0
      {
837
        /* We've gone past the end of the fragment.
838
         * Create a text object for it, then start
839
         * a new fragment.
840
         */
841
0
        split_at_script(&text[startOfFragment],
842
0
            i - startOfFragment,
843
0
            levels[startOfFragment],
844
0
            arg,
845
0
            callback);
846
0
        startOfFragment = i;
847
0
      }
848
0
    }
849
    /* Now i == textlen. Deal with the final (or maybe only) fragment. */
850
    /* otherwise create 1 fragment */
851
0
    split_at_script(&text[startOfFragment],
852
0
        i - startOfFragment,
853
0
        levels[startOfFragment],
854
0
        arg,
855
0
        callback);
856
0
  }
857
0
  fz_always(ctx)
858
0
  {
859
0
    fz_free(ctx, levels);
860
0
  }
861
0
  fz_catch(ctx)
862
0
  {
863
0
    fz_rethrow(ctx);
864
0
  }
865
0
}