Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/source/fitz/bidi.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Bidirectional text processing.
3
 *
4
 * Processes unicode text by arranging the characters into an order suitable
5
 * for display. E.g. Hebrew text will be arranged from right-to-left and
6
 * any English within the text will remain in the left-to-right order.
7
 * Characters such as parenthesis will be substituted for their mirrored
8
 * equivalents if they are part of text which must be reversed.
9
 *
10
 * This is an implementation of the unicode Bidirectional Algorithm which
11
 * can be found here: http://www.unicode.org/reports/tr9/ and is based
12
 * on the reference implementation of the algorithm found on that page.
13
 *
14
 * For a nice overview of how it works, read this...
15
 * http://www.w3.org/TR/REC-html40/struct/dirlang.html
16
 *
17
 * Extracted from the SmartOffice code, where it was modified by Ian
18
 * Beveridge.
19
 *
20
 * Copyright (C) Picsel, 2004. All Rights Reserved.
21
 */
22
23
/*
24
 * Original copyright notice from unicode reference implementation.
25
 * ----------------------------------------------------------------
26
 * Written by: Asmus Freytag
27
 *  C++ and Windows dependencies removed, and
28
 *  command line interface added by: Rick McGowan
29
 *
30
 *  Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
31
 */
32
33
/*
34
 * Includes...
35
 */
36
37
#include "mupdf/fitz.h"
38
#include "mupdf/ucdn.h"
39
#include "bidi-imp.h" /* standard bidi code interface */
40
#include <assert.h>
41
42
/*
43
 * Macros...
44
 */
45
46
0
#define ODD(x) ((x) & 1)
47
48
#define REPLACEABLE_TYPE(t) ( \
49
    ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
50
    ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
51
    ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )
52
53
#ifdef DEBUG_BIDI_VERBOSE
54
#define DBUGVF(params) do { fz_warn params; } while (0)
55
#else
56
#define DBUGVF(params) do {} while (0)
57
#endif
58
59
#ifdef DEBUG_BIDI_OUTLINE
60
#define DBUGH(params) do { fz_warn params; } while (0)
61
#else
62
244
#define DBUGH(params) do {} while (0)
63
#endif
64
65
#define UNICODE_EOS         0
66
#define UNICODE_DIGIT_ZERO        0x0030
67
#define UNICODE_DIGIT_NINE        0x0039
68
#define UNICODE_SUPERSCRIPT_TWO       0x00B2
69
#define UNICODE_SUPERSCRIPT_THREE     0x00B3
70
#define UNICODE_SUPERSCRIPT_ONE       0x00B9
71
#define UNICODE_RTL_START       0x0590
72
#define UNICODE_RTL_END         0x07BF
73
#define UNICODE_ARABIC_INDIC_DIGIT_ZERO     0x0660
74
#define UNICODE_ARABIC_INDIC_DIGIT_NINE     0x0669
75
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO  0x06F0
76
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE  0x06F9
77
#define UNICODE_ZERO_WIDTH_NON_JOINER     0x200C
78
#define UNICODE_SUPERSCRIPT_ZERO      0x2070
79
#define UNICODE_SUPERSCRIPT_FOUR      0x2074
80
#define UNICODE_SUPERSCRIPT_NINE      0x2079
81
#define UNICODE_SUBSCRIPT_ZERO        0x2080
82
#define UNICODE_SUBSCRIPT_NINE        0x2089
83
#define UNICODE_CIRCLED_DIGIT_ONE     0x2460
84
#define UNICODE_NUMBER_TWENTY_FULL_STOP     0x249B
85
#define UNICODE_CIRCLED_DIGIT_ZERO      0x24EA
86
#define UNICODE_FULLWIDTH_DIGIT_ZERO      0xFF10
87
#define UNICODE_FULLWIDTH_DIGIT_NINE      0xFF19
88
89
#ifndef TRUE
90
12.5k
#define TRUE (1)
91
#endif
92
#ifndef FALSE
93
1.22k
#define FALSE (0)
94
#endif
95
96
/*
97
 * Enumerations...
98
 */
99
100
#ifdef DEBUG_BIDI_VERBOSE
101
/* display support: */
102
static const char char_from_types[] =
103
{
104
  ' ',  /* ON */
105
  '>',  /* L */
106
  '<',  /* R */
107
  '9',  /* AN */
108
  '1',  /* EN */
109
  'a',  /* AL */
110
  '@',  /* NSM */
111
  '.',  /* CS */
112
  ',',  /* ES */
113
  '$',  /* ET */
114
  ':',  /* BN */
115
  'X',  /* S */
116
  '_',  /* WS */
117
  'B',  /* B */
118
  '+',  /* RLO */
119
  '+',  /* RLE */
120
  '+',  /* LRO */
121
  '+',  /* LRE */
122
  '-',  /* PDF */
123
  '=' /* LS */
124
};
125
#endif
126
127
/*
128
 * Functions and static functions...
129
 */
130
131
/* UCDN uses a different ordering than Bidi does. We cannot
132
 * change to the UCDN ordering, as the bidi-std.c code relies
133
 * on the exact ordering (at least that N = ON = 0). We
134
 * therefore map between the two using this small table. It
135
 * also takes care of fudging LRI, RLI, FSI and PDI, that this
136
 * code does not currently support. */
137
static const uint8_t ucdn_to_bidi[] =
138
{
139
  BDI_L,    /* UCDN_BIDI_CLASS_L = 0 */
140
  BDI_LRE,  /* UCDN_BIDI_CLASS_LRE = 1 */
141
  BDI_LRO,  /* UCDN_BIDI_CLASS_LRO = 2 */
142
  BDI_R,    /* UCDN_BIDI_CLASS_R = 3 */
143
  BDI_AL,   /* UCDN_BIDI_CLASS_AL = 4 */
144
  BDI_RLE,  /* UCDN_BIDI_CLASS_RLE = 5 */
145
  BDI_RLO,  /* UCDN_BIDI_CLASS_RLO = 6 */
146
  BDI_PDF,  /* UCDN_BIDI_CLASS_PDF = 7 */
147
  BDI_EN,   /* UCDN_BIDI_CLASS_EN = 8 */
148
  BDI_ES,   /* UCDN_BIDI_CLASS_ES = 9 */
149
  BDI_ET,   /* UCDN_BIDI_CLASS_ET = 10 */
150
  BDI_AN,   /* UCDN_BIDI_CLASS_AN = 11 */
151
  BDI_CS,   /* UCDN_BIDI_CLASS_CS = 12 */
152
  BDI_NSM,  /* UCDN_BIDI_CLASS_NSM = 13 */
153
  BDI_BN,   /* UCDN_BIDI_CLASS_BN = 14 */
154
  BDI_B,    /* UCDN_BIDI_CLASS_B = 15 */
155
  BDI_S,    /* UCDN_BIDI_CLASS_S = 16 */
156
  BDI_WS,   /* UCDN_BIDI_CLASS_WS = 17 */
157
  BDI_ON,   /* UCDN_BIDI_CLASS_ON = 18 */
158
  BDI_LRE,  /* UCDN_BIDI_CLASS_LRI = 19 */
159
  BDI_RLE,  /* UCDN_BIDI_CLASS_RLI = 20 */
160
  BDI_N,    /* UCDN_BIDI_CLASS_FSI = 21 */
161
  BDI_N,    /* UCDN_BIDI_CLASS_PDI = 22 */
162
};
163
164
30.4k
#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
165
166
/* Return a direction for white-space on the second pass of the algorithm. */
167
static fz_bidi_chartype class_from_ch_n(uint32_t ch)
168
15.2k
{
169
15.2k
  fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);
170
15.2k
  if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS)
171
1.57k
    return BDI_N;
172
13.6k
  return from_ch_ws;
173
15.2k
}
174
175
/* Split fragments into single scripts (or punctuation + single script) */
176
static void
177
split_at_script(const uint32_t *fragment,
178
    size_t fragment_len,
179
    int level,
180
    void *arg,
181
    fz_bidi_fragment_fn *callback)
182
244
{
183
244
  int script = UCDN_SCRIPT_COMMON;
184
244
  size_t script_start, i;
185
186
244
  script_start = 0;
187
15.4k
  for (i = 0; i < fragment_len; i++)
188
15.2k
  {
189
15.2k
    int s = ucdn_get_script(fragment[i]);
190
15.2k
    if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED)
191
3.19k
    {
192
      /* Punctuation etc. This is fine. */
193
3.19k
    }
194
12.0k
    else if (s == script)
195
11.8k
    {
196
      /* Same script. Still fine. */
197
11.8k
    }
198
216
    else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED)
199
216
    {
200
      /* First non punctuation thing. Set the script. */
201
216
      script = s;
202
216
    }
203
0
    else
204
0
    {
205
      /* Change of script. Break the fragment. */
206
0
      (*callback)(&fragment[script_start], i - script_start, level, script, arg);
207
0
      script_start = i;
208
0
      script = s;
209
0
    }
210
15.2k
  }
211
244
  if (script_start != fragment_len)
212
244
  {
213
244
    (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
214
244
  }
215
244
}
216
217
/* Determines the character classes for all following
218
 * passes of the algorithm. A character class is basically the type of Bidi
219
 * behaviour that the character exhibits.
220
 */
221
static void
222
classify_characters(const uint32_t *text,
223
    fz_bidi_chartype *types,
224
    size_t len,
225
    fz_bidi_flags flags)
226
488
{
227
488
  size_t i;
228
229
488
  if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0)
230
244
  {
231
15.4k
    for (i = 0; i < len; i++)
232
15.2k
    {
233
15.2k
      types[i] = class_from_ch_ws(text[i]);
234
15.2k
    }
235
244
  }
236
244
  else
237
244
  {
238
#ifdef DEBUG_BIDI_VERBOSE
239
    fprintf(stderr, "Text:  ");
240
    for (i = 0; i < len; i++)
241
    {
242
      /* So that we can actually sort of read the debug string, any
243
       * non-ascii characters are replaced with a 1-digit hash
244
       * value from 0-9, making non-english characters appear
245
       * as numbers
246
       */
247
      fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ?
248
          text[i] : text[i] % 9 + '0');
249
    }
250
    fprintf(stderr, "\nTypes: ");
251
#endif
252
15.4k
    for (i = 0; i < len; i++)
253
15.2k
    {
254
15.2k
      types[i] = class_from_ch_n(text[i]);
255
#ifdef DEBUG_BIDI_VERBOSE
256
      fprintf(stderr, "%c", char_from_types[(int)types[i]]);
257
#endif
258
15.2k
    }
259
#ifdef DEBUG_BIDI_VERBOSE
260
    fprintf(stderr, "\n");
261
#endif
262
244
  }
263
488
}
264
265
/* Determines the base level of the text.
266
 * Implements rule P2 of the Unicode Bidi Algorithm.
267
 * Note: Ignores explicit embeddings
268
 */
269
static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
270
0
{
271
0
  size_t i;
272
273
0
  for (i = 0; i < len; i++)
274
0
  {
275
0
    switch (types[i])
276
0
    {
277
    /* strong left */
278
0
    case BDI_L:
279
0
      return FZ_BIDI_LTR;
280
281
    /* strong right */
282
0
    case BDI_R:
283
0
    case BDI_AL:
284
0
      return FZ_BIDI_RTL;
285
0
    }
286
0
  }
287
0
  return FZ_BIDI_LTR;
288
0
}
289
290
static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
291
15.2k
{
292
15.2k
  switch (type)
293
15.2k
  {
294
12.0k
  case BDI_L:
295
12.5k
  case BDI_EN:
296
12.5k
    return FZ_BIDI_LTR;
297
298
0
  case BDI_R:
299
0
  case BDI_AL:
300
0
    return FZ_BIDI_RTL;
301
302
2.64k
  default:
303
2.64k
    return FZ_BIDI_NEUTRAL;
304
15.2k
  }
305
15.2k
}
306
307
static void
308
classify_quoted_blocks(const uint32_t *text,
309
    fz_bidi_chartype *types,
310
    size_t len)
311
244
{
312
244
  size_t i;
313
244
  int inQuote = FALSE;
314
244
  int pdfNeeded = FALSE;
315
244
  int ltrFound = FALSE;
316
244
  int rtlFound = FALSE;
317
318
  /* Only do anything special here if there is mixed content
319
   * (LTR *and* RTL) in the text.
320
   */
321
15.4k
  for (i = 0; i < len; i++)
322
15.2k
  {
323
15.2k
    switch (direction_from_type(types[i]))
324
15.2k
    {
325
12.5k
    case FZ_BIDI_LTR:
326
12.5k
      ltrFound = TRUE;
327
12.5k
      break;
328
329
0
    case FZ_BIDI_RTL:
330
0
      rtlFound = TRUE;
331
0
      break;
332
333
2.64k
    default:
334
2.64k
      break;
335
15.2k
    }
336
15.2k
  }
337
338
  /* Only make any changes if *both* LTR and RTL characters exist
339
   * in this text.
340
   */
341
244
  if (!ltrFound || !rtlFound)
342
244
  {
343
244
    return;
344
244
  }
345
346
0
  for (i = 0; i < len; i++)
347
0
  {
348
0
    if (text[i]=='"')
349
0
    {
350
      /* If we're already in a quote then terminate it,
351
       * else start a new block.
352
       */
353
0
      if (inQuote)
354
0
      {
355
0
        inQuote = FALSE;
356
0
        if (pdfNeeded)
357
0
        {
358
0
          pdfNeeded = FALSE;
359
0
          types[i] = BDI_PDF;
360
0
        }
361
0
      }
362
0
      else
363
0
      {
364
0
        size_t j;
365
0
        int done = FALSE;
366
367
0
        inQuote = TRUE;
368
369
        /* Find the first strong right or left type and
370
         * use that to determine whether we should classify
371
         * the quote as LRE or RLE. Or neither, if we
372
         * hit another quote before any strongly-directional
373
         * character.
374
         */
375
0
        for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j)
376
0
        {
377
0
          switch(types[j])
378
0
          {
379
0
          case BDI_RLE:
380
0
          case BDI_LRE:
381
0
            done = TRUE;
382
0
            break;
383
384
0
          case BDI_L:
385
0
          case BDI_EN:
386
0
            types[i] = BDI_LRE;
387
0
            pdfNeeded = TRUE;
388
0
            done = TRUE;
389
0
            break;
390
391
0
          case BDI_R:
392
0
          case BDI_AL:
393
0
            types[i] = BDI_RLE;
394
0
            pdfNeeded = TRUE;
395
0
            done = TRUE;
396
0
            break;
397
398
0
          default:
399
0
            break;
400
0
          }
401
0
        }
402
0
      }
403
0
    }
404
0
  }
405
0
}
406
407
/* Creates a buffer with an embedding level for every character in the
408
 * given text. Also determines the base level and returns it in
409
 * *baseDir if *baseDir does not initially contain a valid direction.
410
 */
411
static fz_bidi_level *
412
create_levels(fz_context *ctx,
413
    const uint32_t *text,
414
    size_t len,
415
    fz_bidi_direction *baseDir,
416
    int resolveWhiteSpace,
417
    int flags)
418
244
{
419
244
  fz_bidi_level *levels, *plevels;
420
244
  fz_bidi_chartype *types = NULL;
421
244
  fz_bidi_chartype *ptypes;
422
244
  fz_bidi_level baseLevel;
423
244
  const uint32_t *ptext;
424
244
  size_t plen, remaining;
425
426
244
  levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels");
427
428
244
  fz_var(types);
429
430
488
  fz_try(ctx)
431
488
  {
432
244
    types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
433
434
244
    classify_characters(text, types, len, flags);
435
436
244
    if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL)
437
0
    {
438
      /* Derive the base level from the text and
439
       * update *baseDir in case the caller wants to know.
440
       */
441
0
      baseLevel = base_level_from_text(types, len);
442
0
      *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
443
0
    }
444
244
    else
445
244
    {
446
244
      baseLevel = (fz_bidi_level)*baseDir;
447
244
    }
448
449
244
    {
450
      /* Replace tab with base direction, i.e. make tab appear as
451
       * 'strong left' if the base direction is left-to-right and
452
       * 'strong right' if base direction is right-to-left. This
453
       * allows Layout to implicitly treat tabs as 'segment separators'.
454
       */
455
244
      size_t i;
456
457
15.4k
      for (i = 0u; i < len; i++)
458
15.2k
      {
459
15.2k
        if (text[i]=='\t')
460
0
        {
461
0
          types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
462
0
        }
463
15.2k
      }
464
244
    }
465
466
    /* Look for quotation marks. Classify them as RLE or LRE
467
     * or leave them alone, depending on what follows them.
468
     */
469
244
    classify_quoted_blocks(text, types, len);
470
471
    /* Work one paragraph at a time. */
472
244
    plevels = levels;
473
244
    ptypes = types;
474
244
    ptext = text;
475
244
    remaining = len;
476
488
    while (remaining)
477
244
    {
478
244
      plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
479
480
      /* Work out the levels and character types... */
481
244
      (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
482
244
      fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
483
244
      fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
484
244
      fz_bidi_resolve_implicit(ptypes, plevels, plen);
485
486
244
      classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
487
488
244
      if (resolveWhiteSpace)
489
0
      {
490
        /* resolve whitespace */
491
0
        fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
492
0
      }
493
494
244
      plevels += plen;
495
244
      ptypes += plen;
496
244
      ptext += plen;
497
244
      remaining -= plen;
498
244
    }
499
500
    /* The levels buffer now has odd and even numbers indicating
501
     * rtl or ltr characters, respectively.
502
     */
503
#ifdef DEBUG_BIDI_VERBOSE
504
    fprintf(stderr, "Levels: ");
505
    {
506
      size_t i;
507
      for (i = 0; i < len; i++)
508
      {
509
        fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
510
      }
511
      fprintf(stderr, "\n");
512
    }
513
#endif
514
244
  }
515
488
  fz_always(ctx)
516
244
  {
517
244
    fz_free(ctx, types);
518
244
  }
519
244
  fz_catch(ctx)
520
0
  {
521
0
    fz_free(ctx, levels);
522
0
    fz_rethrow(ctx);
523
0
  }
524
244
  return levels;
525
244
}
526
527
/* Partitions the given character sequence into one or more unidirectional
528
 * fragments and invokes the given callback function for each fragment.
529
 */
530
void fz_bidi_fragment_text(fz_context *ctx,
531
    const uint32_t *text,
532
    size_t textlen,
533
    fz_bidi_direction *baseDir,
534
    fz_bidi_fragment_fn *callback,
535
    void *arg,
536
    int flags)
537
244
{
538
244
  size_t startOfFragment;
539
244
  size_t i;
540
244
  fz_bidi_level *levels;
541
542
244
  if (text == NULL || callback == NULL || textlen == 0)
543
0
    return;
544
545
244
  DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));
546
547
244
  levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
548
549
  /* We now have an array with an embedding level
550
   * for each character in text.
551
   */
552
244
  assert(levels != NULL);
553
554
488
  fz_try(ctx)
555
488
  {
556
244
    startOfFragment = 0;
557
15.2k
    for (i = 1; i < textlen; i++)
558
14.9k
    {
559
14.9k
      if (levels[i] != levels[i-1])
560
0
      {
561
        /* We've gone past the end of the fragment.
562
         * Create a text object for it, then start
563
         * a new fragment.
564
         */
565
0
        split_at_script(&text[startOfFragment],
566
0
            i - startOfFragment,
567
0
            levels[startOfFragment],
568
0
            arg,
569
0
            callback);
570
0
        startOfFragment = i;
571
0
      }
572
14.9k
    }
573
    /* Now i == textlen. Deal with the final (or maybe only) fragment. */
574
    /* otherwise create 1 fragment */
575
244
    split_at_script(&text[startOfFragment],
576
244
        i - startOfFragment,
577
244
        levels[startOfFragment],
578
244
        arg,
579
244
        callback);
580
244
  }
581
488
  fz_always(ctx)
582
244
  {
583
244
    fz_free(ctx, levels);
584
244
  }
585
244
  fz_catch(ctx)
586
0
  {
587
0
    fz_rethrow(ctx);
588
0
  }
589
244
}