/src/mupdf/source/fitz/bidi.c

Source (jump to first uncovered line)
/*
 * Bidirectional text processing.
 *
 * Processes unicode text by arranging the characters into an order suitable
 * for display. E.g. Hebrew text will be arranged from right-to-left and
 * any English within the text will remain in the left-to-right order.
 * Characters such as parentheses will be replaced by their mirrored
 * equivalents if they are part of text which must be reversed.
 *
 * This is an implementation of the unicode Bidirectional Algorithm which
 * can be found here: http://www.unicode.org/reports/tr9/ and is based
 * on the reference implementation of the algorithm found on that page.
 *
 * For a nice overview of how it works, read this...
 * http://www.w3.org/TR/REC-html40/struct/dirlang.html
 *
 * Extracted from the SmartOffice code, where it was modified by Ian
 * Beveridge.
 *
 * Copyright (C) Picsel, 2004. All Rights Reserved.
 */

/*
 * Original copyright notice from unicode reference implementation.
 * ----------------------------------------------------------------
 * Written by: Asmus Freytag
 *  C++ and Windows dependencies removed, and
 *  command line interface added by: Rick McGowan
 *
 *  Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
 */

/*
 * Includes...
 */

#include "mupdf/fitz.h"
#include "mupdf/ucdn.h"
#include "bidi-imp.h" /* standard bidi code interface */
#include <assert.h>

/*
 * Macros...
 */

/* Evaluates to the lowest bit of x: 1 when x is odd, 0 when it is even. */
#define ODD(x) ((x) & 1)

/* True when t is one of ES, ET, CS, NSM, PDF, BN, S, WS or N.
 * NOTE(review): not referenced anywhere in the visible part of this file.
 */
#define REPLACEABLE_TYPE(t) ( \
    ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
    ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
    ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )

/* Verbose debug trace. 'params' is a complete, parenthesised argument
 * list that is pasted directly after fz_warn. Expands to an empty
 * statement unless DEBUG_BIDI_VERBOSE is defined.
 */
#ifdef DEBUG_BIDI_VERBOSE
#define DBUGVF(params) do { fz_warn params; } while (0)
#else
#define DBUGVF(params) do {} while (0)
#endif

/* Outline-level debug trace; same calling convention as DBUGVF, i.e.
 * DBUGH((ctx-less fz_warn argument list)). Expands to an empty statement
 * unless DEBUG_BIDI_OUTLINE is defined.
 */
#ifdef DEBUG_BIDI_OUTLINE
#define DBUGH(params) do { fz_warn params; } while (0)
#else
#define DBUGH(params) do {} while (0)
#endif

/* Named Unicode code points and range boundaries.
 * NOTE(review): none of the UNICODE_* names are referenced in the visible
 * part of this file; they may be leftovers from the code this was
 * extracted from -- confirm before removing.
 */
#define UNICODE_EOS         0
#define UNICODE_DIGIT_ZERO        0x0030
#define UNICODE_DIGIT_NINE        0x0039
#define UNICODE_SUPERSCRIPT_TWO       0x00B2
#define UNICODE_SUPERSCRIPT_THREE     0x00B3
#define UNICODE_SUPERSCRIPT_ONE       0x00B9
#define UNICODE_RTL_START       0x0590
#define UNICODE_RTL_END         0x07BF
#define UNICODE_ARABIC_INDIC_DIGIT_ZERO     0x0660
#define UNICODE_ARABIC_INDIC_DIGIT_NINE     0x0669
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO  0x06F0
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE  0x06F9
#define UNICODE_ZERO_WIDTH_NON_JOINER     0x200C
#define UNICODE_SUPERSCRIPT_ZERO      0x2070
#define UNICODE_SUPERSCRIPT_FOUR      0x2074
#define UNICODE_SUPERSCRIPT_NINE      0x2079
#define UNICODE_SUBSCRIPT_ZERO        0x2080
#define UNICODE_SUBSCRIPT_NINE        0x2089
#define UNICODE_CIRCLED_DIGIT_ONE     0x2460
#define UNICODE_NUMBER_TWENTY_FULL_STOP     0x249B
#define UNICODE_CIRCLED_DIGIT_ZERO      0x24EA
#define UNICODE_FULLWIDTH_DIGIT_ZERO      0xFF10
#define UNICODE_FULLWIDTH_DIGIT_NINE      0xFF19

/* Boolean constants, defined only if no earlier header supplied them. */
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif

/*
 * Enumerations...
 */

#ifdef DEBUG_BIDI_VERBOSE
/* Display support: one printable character per bidi character type,
 * used by the verbose debug dump in classify_characters, which indexes
 * this table with the numeric type value. Only compiled when
 * DEBUG_BIDI_VERBOSE is defined.
 * NOTE(review): the entry order presumably has to match the numeric
 * order of the BDI_* values declared elsewhere -- verify when either
 * side changes.
 */
static const char char_from_types[] =
{
  ' ',  /* ON */
  '>',  /* L */
  '<',  /* R */
  '9',  /* AN */
  '1',  /* EN */
  'a',  /* AL */
  '@',  /* NSM */
  '.',  /* CS */
  ',',  /* ES */
  '$',  /* ET */
  ':',  /* BN */
  'X',  /* S */
  '_',  /* WS */
  'B',  /* B */
  '+',  /* RLO */
  '+',  /* RLE */
  '+',  /* LRO */
  '+',  /* LRE */
  '-',  /* PDF */
  '=' /* LS */
};
#endif

/*
 * Functions and static functions...
 */

/* UCDN uses a different ordering than Bidi does. We cannot
 * change to the UCDN ordering, as the bidi-std.c code relies
 * on the exact ordering (at least that N = ON = 0). We
 * therefore map between the two using this small table. It
 * also takes care of fudging LRI, RLI, FSI and PDI, that this
 * code does not currently support.
 *
 * The table is indexed by the UCDN bidi class value noted beside each
 * entry. The unsupported isolate classes are approximated as follows:
 * LRI is treated as LRE, RLI as RLE, and FSI and PDI as neutral (BDI_N).
 */
static const uint8_t ucdn_to_bidi[] =
{
  BDI_L,    /* UCDN_BIDI_CLASS_L = 0 */
  BDI_LRE,  /* UCDN_BIDI_CLASS_LRE = 1 */
  BDI_LRO,  /* UCDN_BIDI_CLASS_LRO = 2 */
  BDI_R,    /* UCDN_BIDI_CLASS_R = 3 */
  BDI_AL,   /* UCDN_BIDI_CLASS_AL = 4 */
  BDI_RLE,  /* UCDN_BIDI_CLASS_RLE = 5 */
  BDI_RLO,  /* UCDN_BIDI_CLASS_RLO = 6 */
  BDI_PDF,  /* UCDN_BIDI_CLASS_PDF = 7 */
  BDI_EN,   /* UCDN_BIDI_CLASS_EN = 8 */
  BDI_ES,   /* UCDN_BIDI_CLASS_ES = 9 */
  BDI_ET,   /* UCDN_BIDI_CLASS_ET = 10 */
  BDI_AN,   /* UCDN_BIDI_CLASS_AN = 11 */
  BDI_CS,   /* UCDN_BIDI_CLASS_CS = 12 */
  BDI_NSM,  /* UCDN_BIDI_CLASS_NSM = 13 */
  BDI_BN,   /* UCDN_BIDI_CLASS_BN = 14 */
  BDI_B,    /* UCDN_BIDI_CLASS_B = 15 */
  BDI_S,    /* UCDN_BIDI_CLASS_S = 16 */
  BDI_WS,   /* UCDN_BIDI_CLASS_WS = 17 */
  BDI_ON,   /* UCDN_BIDI_CLASS_ON = 18 */
  BDI_LRE,  /* UCDN_BIDI_CLASS_LRI = 19 */
  BDI_RLE,  /* UCDN_BIDI_CLASS_RLI = 20 */
  BDI_N,    /* UCDN_BIDI_CLASS_FSI = 21 */
  BDI_N,    /* UCDN_BIDI_CLASS_PDI = 22 */
};

/* Bidi class of code point ch, with the white-space classes (BDI_S and
 * BDI_WS) left as they are. The table lookup is not bounds checked.
 * NOTE(review): assumes ucdn_get_bidi_class never returns a value above
 * 22 -- confirm against the UCDN version in use.
 */
#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])

/* Classify a code point with white-space neutralised: the two
 * white-space classes (BDI_S and BDI_WS) are reported as BDI_N, every
 * other class is returned exactly as class_from_ch_ws reports it.
 * classify_characters uses this when FZ_BIDI_CLASSIFY_WHITE_SPACE is
 * not requested.
 */
static fz_bidi_chartype class_from_ch_n(uint32_t ch)
{
  fz_bidi_chartype cls = class_from_ch_ws(ch);

  return (cls == BDI_S || cls == BDI_WS) ? BDI_N : cls;
}

/* Cut one same-level fragment into runs that each contain a single
 * script, and report every run through the callback.
 *
 * Characters whose script is COMMON or INHERITED (punctuation and the
 * like) never start a new run; they stay attached to the run they
 * appear in. Leading characters of that kind are therefore reported
 * together with the first real script that follows them.
 *
 * fragment, fragment_len: the characters to split.
 * level: embedding level, passed through to the callback unchanged.
 * arg: opaque caller data, passed through to the callback unchanged.
 * callback: invoked once per run with (start, length, level, script, arg).
 *   A run made up only of COMMON/INHERITED characters is reported with
 *   script UCDN_SCRIPT_COMMON. Nothing is reported for an empty fragment.
 */
static void
split_at_script(const uint32_t *fragment,
    size_t fragment_len,
    int level,
    void *arg,
    fz_bidi_fragment_fn *callback)
{
  int current = UCDN_SCRIPT_COMMON;
  size_t run_start = 0;
  size_t pos;

  for (pos = 0; pos < fragment_len; pos++)
  {
    int found = ucdn_get_script(fragment[pos]);

    /* Script-less characters, and characters in the script we are
     * already tracking, simply extend the current run.
     */
    if (found == UCDN_SCRIPT_COMMON || found == UCDN_SCRIPT_INHERITED || found == current)
      continue;

    /* A different real script: if a script had already been chosen
     * for this run, flush the run before switching.
     */
    if (current != UCDN_SCRIPT_COMMON && current != UCDN_SCRIPT_INHERITED)
    {
      (*callback)(&fragment[run_start], pos - run_start, level, current, arg);
      run_start = pos;
    }
    current = found;
  }

  /* Flush whatever is left over (the whole fragment if it never split). */
  if (run_start != fragment_len)
    (*callback)(&fragment[run_start], fragment_len - run_start, level, current, arg);
}

/* Fill types[0..len) with the bidi character class of each code point
 * in text[0..len).
 *
 * With FZ_BIDI_CLASSIFY_WHITE_SPACE set in flags, the classes are stored
 * exactly as looked up, so white-space keeps its own classes.
 * Without it, every character goes through class_from_ch_n, which
 * reports white-space as neutral; in DEBUG_BIDI_VERBOSE builds this
 * path additionally dumps the text and the resulting types to stderr.
 */
static void
classify_characters(const uint32_t *text,
    fz_bidi_chartype *types,
    size_t len,
    fz_bidi_flags flags)
{
  size_t i;

  if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE) != 0)
  {
    for (i = 0; i < len; i++)
      types[i] = class_from_ch_ws(text[i]);
    return;
  }

#ifdef DEBUG_BIDI_VERBOSE
  fprintf(stderr, "Text:  ");
  for (i = 0; i < len; i++)
  {
    /* Keep the dump readable: printable ASCII is shown as is,
     * anything else is shown as a digit '0'..'8' derived from
     * the code point modulo 9.
     */
    uint32_t shown = text[i];

    if (shown > 127 || shown < 32)
      shown = shown % 9 + '0';
    fprintf(stderr, "%c", shown);
  }
  fprintf(stderr, "\nTypes: ");
#endif

  for (i = 0; i < len; i++)
  {
    types[i] = class_from_ch_n(text[i]);
#ifdef DEBUG_BIDI_VERBOSE
    fprintf(stderr, "%c", char_from_types[(int)types[i]]);
#endif
  }

#ifdef DEBUG_BIDI_VERBOSE
  fprintf(stderr, "\n");
#endif
}

/* Work out the base level of a piece of text from its character types
 * (rule P2 of the Unicode Bidi Algorithm). Explicit embeddings are not
 * taken into account.
 *
 * The first strong character decides: BDI_L gives FZ_BIDI_LTR, BDI_R or
 * BDI_AL gives FZ_BIDI_RTL. Text without any strong character (including
 * empty text) yields FZ_BIDI_LTR.
 */
static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
{
  size_t pos = 0;

  while (pos < len)
  {
    fz_bidi_chartype t = types[pos++];

    if (t == BDI_L)
      return FZ_BIDI_LTR;
    if (t == BDI_R || t == BDI_AL)
      return FZ_BIDI_RTL;
  }

  /* No strong character found. */
  return FZ_BIDI_LTR;
}

/* Map a bidi character type to the direction it implies:
 * BDI_L and BDI_EN are left-to-right, BDI_R and BDI_AL are
 * right-to-left, and every other type is neutral.
 */
static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
{
  if (type == BDI_L || type == BDI_EN)
    return FZ_BIDI_LTR;

  if (type == BDI_R || type == BDI_AL)
    return FZ_BIDI_RTL;

  return FZ_BIDI_NEUTRAL;
}

/* Turn pairs of ASCII double quotes into explicit embeddings when the
 * text mixes directions.
 *
 * Nothing is changed unless types[] contains at least one left-to-right
 * and one right-to-left character (as judged by direction_from_type).
 *
 * Otherwise quotes are paired up in order of appearance. For each
 * opening quote the characters up to the next quote are scanned:
 *  - BDI_L or BDI_EN first: the opening quote becomes BDI_LRE;
 *  - BDI_R or BDI_AL first: the opening quote becomes BDI_RLE;
 *  - an existing BDI_RLE/BDI_LRE first, or no such character before the
 *    next quote: the opening quote is left alone.
 * The matching closing quote becomes BDI_PDF only if its opening quote
 * was turned into an embedding.
 *
 * text and types both hold len entries; only types is modified.
 */
static void
classify_quoted_blocks(const uint32_t *text,
    fz_bidi_chartype *types,
    size_t len)
{
  size_t pos;
  int quote_open = FALSE;
  int pop_pending = FALSE;
  int seen_ltr = FALSE;
  int seen_rtl = FALSE;

  /* First pass: find out which directions occur at all. */
  for (pos = 0; pos < len; pos++)
  {
    int dir = direction_from_type(types[pos]);

    if (dir == FZ_BIDI_LTR)
      seen_ltr = TRUE;
    else if (dir == FZ_BIDI_RTL)
      seen_rtl = TRUE;
  }

  /* Single-direction text needs no special quote handling. */
  if (!(seen_ltr && seen_rtl))
    return;

  /* Second pass: pair up the quotes. */
  for (pos = 0; pos < len; pos++)
  {
    size_t look;

    if (text[pos] != '"')
      continue;

    if (quote_open)
    {
      /* Closing quote: pop the embedding if one was pushed. */
      quote_open = FALSE;
      if (pop_pending)
      {
        types[pos] = BDI_PDF;
        pop_pending = FALSE;
      }
      continue;
    }

    /* Opening quote: look ahead, no further than the next quote,
     * for the first character that settles the direction.
     */
    quote_open = TRUE;
    for (look = pos + 1; look < len && text[look] != '"'; look++)
    {
      fz_bidi_chartype t = types[look];

      if (t == BDI_RLE || t == BDI_LRE)
      {
        /* Already explicitly embedded; leave the quote as is. */
        break;
      }
      if (t == BDI_L || t == BDI_EN)
      {
        types[pos] = BDI_LRE;
        pop_pending = TRUE;
        break;
      }
      if (t == BDI_R || t == BDI_AL)
      {
        types[pos] = BDI_RLE;
        pop_pending = TRUE;
        break;
      }
    }
  }
}

/* Build an array holding one embedding level for every character of
 * text[0..len).
 *
 * ctx: context used for allocation and exception handling.
 * baseDir: in/out. If *baseDir is FZ_BIDI_LTR or FZ_BIDI_RTL on entry it
 *   is used as the base level; otherwise the base level is derived from
 *   the text and the resulting direction is written back to *baseDir.
 * resolveWhiteSpace: when non-zero, fz_bidi_resolve_whitespace is run on
 *   each paragraph after the levels have been resolved.
 * flags: handed to classify_characters for the initial classification.
 *
 * Returns a buffer of len levels obtained from fz_malloc; the caller is
 * responsible for freeing it. If anything throws, the buffer is freed
 * here and the exception is rethrown.
 */
static fz_bidi_level *
create_levels(fz_context *ctx,
    const uint32_t *text,
    size_t len,
    fz_bidi_direction *baseDir,
    int resolveWhiteSpace,
    int flags)
{
  fz_bidi_level *levels;
  fz_bidi_chartype *types = NULL;
  fz_bidi_level baseLevel;
  size_t offset, remaining, plen;
  size_t k;

  levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels");

  fz_var(types);

  fz_try(ctx)
  {
    types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));

    classify_characters(text, types, len, flags);

    if (*baseDir == FZ_BIDI_LTR || *baseDir == FZ_BIDI_RTL)
    {
      /* The caller dictated the base direction. */
      baseLevel = (fz_bidi_level)*baseDir;
    }
    else
    {
      /* No valid direction given: take it from the text and
       * report it back through *baseDir.
       */
      baseLevel = base_level_from_text(types, len);
      *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
    }

    /* Give every tab the strong type of the base direction
     * ('strong right' for right-to-left, 'strong left' otherwise).
     * This allows Layout to implicitly treat tabs as
     * 'segment separators'.
     */
    for (k = 0u; k < len; k++)
    {
      if (text[k] != '\t')
        continue;
      types[k] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
    }

    /* Quotation marks may become RLE/LRE/PDF, depending on what
     * follows them.
     */
    classify_quoted_blocks(text, types, len);

    /* Resolve the levels one paragraph at a time. */
    offset = 0;
    remaining = len;
    while (remaining != 0)
    {
      fz_bidi_chartype *ptypes = types + offset;
      fz_bidi_level *plevels = levels + offset;

      plen = fz_bidi_resolve_paragraphs(ptypes, remaining);

      /* Work out the levels and character types... */
      (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
      fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
      fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
      fz_bidi_resolve_implicit(ptypes, plevels, plen);

      /* Reclassify this paragraph, this time keeping the
       * white-space classes.
       */
      classify_characters(text + offset, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);

      if (resolveWhiteSpace)
        fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);

      offset += plen;
      remaining -= plen;
    }

    /* The levels buffer now has odd and even numbers indicating
     * rtl or ltr characters, respectively.
     */
#ifdef DEBUG_BIDI_VERBOSE
    fprintf(stderr, "Levels: ");
    for (k = 0; k < len; k++)
    {
      fprintf(stderr, "%d", levels[k]>9?0:levels[k]);
    }
    fprintf(stderr, "\n");
#endif
  }
  fz_always(ctx)
  {
    /* The type array is scratch space only. */
    fz_free(ctx, types);
  }
  fz_catch(ctx)
  {
    fz_free(ctx, levels);
    fz_rethrow(ctx);
  }
  return levels;
}

/* Split a character sequence into unidirectional fragments and hand
 * each one to the callback.
 *
 * The text is first given an embedding level per character. Every
 * maximal run of characters sharing one level is then passed to
 * split_at_script, which invokes the callback once per single-script
 * piece of that run.
 *
 * ctx: context used for allocation and exception handling.
 * text, textlen: the characters to process.
 * baseDir: in/out base direction; if it does not hold FZ_BIDI_LTR or
 *   FZ_BIDI_RTL on entry, the direction derived from the text is
 *   stored there.
 * callback, arg: fragment receiver and its opaque data.
 * flags: passed on to the character classification.
 *
 * Returns immediately, without touching *baseDir or calling the
 * callback, when text or callback is NULL or textlen is 0.
 */
void fz_bidi_fragment_text(fz_context *ctx,
    const uint32_t *text,
    size_t textlen,
    fz_bidi_direction *baseDir,
    fz_bidi_fragment_fn *callback,
    void *arg,
    int flags)
{
  size_t run_begin;
  size_t idx;
  fz_bidi_level *levels;

  if (text == NULL || callback == NULL || textlen == 0)
    return;

  DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));

  /* One embedding level per character of text. */
  levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);

  assert(levels != NULL);

  fz_try(ctx)
  {
    run_begin = 0;
    for (idx = 1; idx < textlen; idx++)
    {
      if (levels[idx] == levels[idx-1])
        continue;

      /* The level changed, so the run [run_begin, idx) is
       * complete: emit it and start the next run here.
       */
      split_at_script(&text[run_begin],
          idx - run_begin,
          levels[run_begin],
          arg,
          callback);
      run_begin = idx;
    }

    /* Emit the final run, which is the whole text if the level
     * never changed.
     */
    split_at_script(&text[run_begin],
        textlen - run_begin,
        levels[run_begin],
        arg,
        callback);
  }
  fz_always(ctx)
  {
    fz_free(ctx, levels);
  }
  fz_catch(ctx)
  {
    fz_rethrow(ctx);
  }
}

Coverage Report

Created: 2024-05-20 06:23

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Bidirectional text processing.
3		*
4		* Processes unicode text by arranging the characters into an order suitable
5		* for display. E.g. Hebrew text will be arranged from right-to-left and
6		* any English within the text will remain in the left-to-right order.
7		* Characters such as parenthesis will be substituted for their mirrored
8		* equivalents if they are part of text which must be reversed.
9		*
10		* This is an implementation of the unicode Bidirectional Algorithm which
11		* can be found here: http://www.unicode.org/reports/tr9/ and is based
12		* on the reference implementation of the algorithm found on that page.
13		*
14		* For a nice overview of how it works, read this...
15		* http://www.w3.org/TR/REC-html40/struct/dirlang.html
16		*
17		* Extracted from the SmartOffice code, where it was modified by Ian
18		* Beveridge.
19		*
20		* Copyright (C) Picsel, 2004. All Rights Reserved.
21		*/
22
23		/*
24		* Original copyright notice from unicode reference implementation.
25		* ----------------------------------------------------------------
26		* Written by: Asmus Freytag
27		* C++ and Windows dependencies removed, and
28		* command line interface added by: Rick McGowan
29		*
30		* Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
31		*/
32
33		/*
34		* Includes...
35		*/
36
37		#include "mupdf/fitz.h"
38		#include "mupdf/ucdn.h"
39		#include "bidi-imp.h" /* standard bidi code interface */
40		#include <assert.h>
41
42		/*
43		* Macros...
44		*/
45
46	0	#define ODD(x) ((x) & 1)
47
48		#define REPLACEABLE_TYPE(t) ( \
49		((t)==BDI_ES) \|\| ((t)==BDI_ET) \|\| ((t)==BDI_CS) \|\| \
50		((t)==BDI_NSM) \|\| ((t)==BDI_PDF) \|\| ((t)==BDI_BN) \|\| \
51		((t)==BDI_S) \|\| ((t)==BDI_WS) \|\| ((t)==BDI_N) )
52
53		#ifdef DEBUG_BIDI_VERBOSE
54		#define DBUGVF(params) do { fz_warn params; } while (0)
55		#else
56		#define DBUGVF(params) do {} while (0)
57		#endif
58
59		#ifdef DEBUG_BIDI_OUTLINE
60		#define DBUGH(params) do { fz_warn params; } while (0)
61		#else
62	244	#define DBUGH(params) do {} while (0)
63		#endif
64
65		#define UNICODE_EOS 0
66		#define UNICODE_DIGIT_ZERO 0x0030
67		#define UNICODE_DIGIT_NINE 0x0039
68		#define UNICODE_SUPERSCRIPT_TWO 0x00B2
69		#define UNICODE_SUPERSCRIPT_THREE 0x00B3
70		#define UNICODE_SUPERSCRIPT_ONE 0x00B9
71		#define UNICODE_RTL_START 0x0590
72		#define UNICODE_RTL_END 0x07BF
73		#define UNICODE_ARABIC_INDIC_DIGIT_ZERO 0x0660
74		#define UNICODE_ARABIC_INDIC_DIGIT_NINE 0x0669
75		#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0
76		#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9
77		#define UNICODE_ZERO_WIDTH_NON_JOINER 0x200C
78		#define UNICODE_SUPERSCRIPT_ZERO 0x2070
79		#define UNICODE_SUPERSCRIPT_FOUR 0x2074
80		#define UNICODE_SUPERSCRIPT_NINE 0x2079
81		#define UNICODE_SUBSCRIPT_ZERO 0x2080
82		#define UNICODE_SUBSCRIPT_NINE 0x2089
83		#define UNICODE_CIRCLED_DIGIT_ONE 0x2460
84		#define UNICODE_NUMBER_TWENTY_FULL_STOP 0x249B
85		#define UNICODE_CIRCLED_DIGIT_ZERO 0x24EA
86		#define UNICODE_FULLWIDTH_DIGIT_ZERO 0xFF10
87		#define UNICODE_FULLWIDTH_DIGIT_NINE 0xFF19
88
89		#ifndef TRUE
90	12.5k	#define TRUE (1)
91		#endif
92		#ifndef FALSE
93	1.22k	#define FALSE (0)
94		#endif
95
96		/*
97		* Enumerations...
98		*/
99
100		#ifdef DEBUG_BIDI_VERBOSE
101		/* display support: */
102		static const char char_from_types[] =
103		{
104		' ', /* ON */
105		'>', /* L */
106		'<', /* R */
107		'9', /* AN */
108		'1', /* EN */
109		'a', /* AL */
110		'@', /* NSM */
111		'.', /* CS */
112		',', /* ES */
113		'$', /* ET */
114		':', /* BN */
115		'X', /* S */
116		'_', /* WS */
117		'B', /* B */
118		'+', /* RLO */
119		'+', /* RLE */
120		'+', /* LRO */
121		'+', /* LRE */
122		'-', /* PDF */
123		'=' /* LS */
124		};
125		#endif
126
127		/*
128		* Functions and static functions...
129		*/
130
131		/* UCDN uses a different ordering than Bidi does. We cannot
132		* change to the UCDN ordering, as the bidi-std.c code relies
133		* on the exact ordering (at least that N = ON = 0). We
134		* therefore map between the two using this small table. It
135		* also takes care of fudging LRI, RLI, FSI and PDI, that this
136		* code does not currently support. */
137		static const uint8_t ucdn_to_bidi[] =
138		{
139		BDI_L, /* UCDN_BIDI_CLASS_L = 0 */
140		BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */
141		BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */
142		BDI_R, /* UCDN_BIDI_CLASS_R = 3 */
143		BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */
144		BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */
145		BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */
146		BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */
147		BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */
148		BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */
149		BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */
150		BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */
151		BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */
152		BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */
153		BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */
154		BDI_B, /* UCDN_BIDI_CLASS_B = 15 */
155		BDI_S, /* UCDN_BIDI_CLASS_S = 16 */
156		BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */
157		BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */
158		BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */
159		BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */
160		BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */
161		BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */
162		};
163
164	30.4k	#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
165
166		/* Return a direction for white-space on the second pass of the algorithm. */
167		static fz_bidi_chartype class_from_ch_n(uint32_t ch)
168	15.2k	{
169	15.2k	fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);
170	15.2k	if (from_ch_ws == BDI_S \|\| from_ch_ws == BDI_WS)
171	1.57k	return BDI_N;
172	13.6k	return from_ch_ws;
173	15.2k	}
174
175		/* Split fragments into single scripts (or punctuation + single script) */
176		static void
177		split_at_script(const uint32_t *fragment,
178		size_t fragment_len,
179		int level,
180		void *arg,
181		fz_bidi_fragment_fn *callback)
182	244	{
183	244	int script = UCDN_SCRIPT_COMMON;
184	244	size_t script_start, i;
185
186	244	script_start = 0;
187	15.4k	for (i = 0; i < fragment_len; i++)
188	15.2k	{
189	15.2k	int s = ucdn_get_script(fragment[i]);
190	15.2k	if (s == UCDN_SCRIPT_COMMON \|\| s == UCDN_SCRIPT_INHERITED)
191	3.19k	{
192		/* Punctuation etc. This is fine. */
193	3.19k	}
194	12.0k	else if (s == script)
195	11.8k	{
196		/* Same script. Still fine. */
197	11.8k	}
198	216	else if (script == UCDN_SCRIPT_COMMON \|\| script == UCDN_SCRIPT_INHERITED)
199	216	{
200		/* First non punctuation thing. Set the script. */
201	216	script = s;
202	216	}
203	0	else
204	0	{
205		/* Change of script. Break the fragment. */
206	0	(*callback)(&fragment[script_start], i - script_start, level, script, arg);
207	0	script_start = i;
208	0	script = s;
209	0	}
210	15.2k	}
211	244	if (script_start != fragment_len)
212	244	{
213	244	(*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
214	244	}
215	244	}
216
217		/* Determines the character classes for all following
218		* passes of the algorithm. A character class is basically the type of Bidi
219		* behaviour that the character exhibits.
220		*/
221		static void
222		classify_characters(const uint32_t *text,
223		fz_bidi_chartype *types,
224		size_t len,
225		fz_bidi_flags flags)
226	488	{
227	488	size_t i;
228
229	488	if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0)
230	244	{
231	15.4k	for (i = 0; i < len; i++)
232	15.2k	{
233	15.2k	types[i] = class_from_ch_ws(text[i]);
234	15.2k	}
235	244	}
236	244	else
237	244	{
238		#ifdef DEBUG_BIDI_VERBOSE
239		fprintf(stderr, "Text: ");
240		for (i = 0; i < len; i++)
241		{
242		/* So that we can actually sort of read the debug string, any
243		* non-ascii characters are replaced with a 1-digit hash
244		* value from 0-9, making non-english characters appear
245		* as numbers
246		*/
247		fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ?
248		text[i] : text[i] % 9 + '0');
249		}
250		fprintf(stderr, "\nTypes: ");
251		#endif
252	15.4k	for (i = 0; i < len; i++)
253	15.2k	{
254	15.2k	types[i] = class_from_ch_n(text[i]);
255		#ifdef DEBUG_BIDI_VERBOSE
256		fprintf(stderr, "%c", char_from_types[(int)types[i]]);
257		#endif
258	15.2k	}
259		#ifdef DEBUG_BIDI_VERBOSE
260		fprintf(stderr, "\n");
261		#endif
262	244	}
263	488	}
264
265		/* Determines the base level of the text.
266		* Implements rule P2 of the Unicode Bidi Algorithm.
267		* Note: Ignores explicit embeddings
268		*/
269		static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
270	0	{
271	0	size_t i;
272
273	0	for (i = 0; i < len; i++)
274	0	{
275	0	switch (types[i])
276	0	{
277		/* strong left */
278	0	case BDI_L:
279	0	return FZ_BIDI_LTR;
280
281		/* strong right */
282	0	case BDI_R:
283	0	case BDI_AL:
284	0	return FZ_BIDI_RTL;
285	0	}
286	0	}
287	0	return FZ_BIDI_LTR;
288	0	}
289
290		static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
291	15.2k	{
292	15.2k	switch (type)
293	15.2k	{
294	12.0k	case BDI_L:
295	12.5k	case BDI_EN:
296	12.5k	return FZ_BIDI_LTR;
297
298	0	case BDI_R:
299	0	case BDI_AL:
300	0	return FZ_BIDI_RTL;
301
302	2.64k	default:
303	2.64k	return FZ_BIDI_NEUTRAL;
304	15.2k	}
305	15.2k	}
306
307		static void
308		classify_quoted_blocks(const uint32_t *text,
309		fz_bidi_chartype *types,
310		size_t len)
311	244	{
312	244	size_t i;
313	244	int inQuote = FALSE;
314	244	int pdfNeeded = FALSE;
315	244	int ltrFound = FALSE;
316	244	int rtlFound = FALSE;
317
318		/* Only do anything special here if there is mixed content
319		* (LTR and RTL) in the text.
320		*/
321	15.4k	for (i = 0; i < len; i++)
322	15.2k	{
323	15.2k	switch (direction_from_type(types[i]))
324	15.2k	{
325	12.5k	case FZ_BIDI_LTR:
326	12.5k	ltrFound = TRUE;
327	12.5k	break;
328
329	0	case FZ_BIDI_RTL:
330	0	rtlFound = TRUE;
331	0	break;
332
333	2.64k	default:
334	2.64k	break;
335	15.2k	}
336	15.2k	}
337
338		/* Only make any changes if both LTR and RTL characters exist
339		* in this text.
340		*/
341	244	if (!ltrFound \|\| !rtlFound)
342	244	{
343	244	return;
344	244	}
345
346	0	for (i = 0; i < len; i++)
347	0	{
348	0	if (text[i]=='"')
349	0	{
350		/* If we're already in a quote then terminate it,
351		* else start a new block.
352		*/
353	0	if (inQuote)
354	0	{
355	0	inQuote = FALSE;
356	0	if (pdfNeeded)
357	0	{
358	0	pdfNeeded = FALSE;
359	0	types[i] = BDI_PDF;
360	0	}
361	0	}
362	0	else
363	0	{
364	0	size_t j;
365	0	int done = FALSE;
366
367	0	inQuote = TRUE;
368
369		/* Find the first strong right or left type and
370		* use that to determine whether we should classify
371		* the quote as LRE or RLE. Or neither, if we
372		* hit another quote before any strongly-directional
373		* character.
374		*/
375	0	for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j)
376	0	{
377	0	switch(types[j])
378	0	{
379	0	case BDI_RLE:
380	0	case BDI_LRE:
381	0	done = TRUE;
382	0	break;
383
384	0	case BDI_L:
385	0	case BDI_EN:
386	0	types[i] = BDI_LRE;
387	0	pdfNeeded = TRUE;
388	0	done = TRUE;
389	0	break;
390
391	0	case BDI_R:
392	0	case BDI_AL:
393	0	types[i] = BDI_RLE;
394	0	pdfNeeded = TRUE;
395	0	done = TRUE;
396	0	break;
397
398	0	default:
399	0	break;
400	0	}
401	0	}
402	0	}
403	0	}
404	0	}
405	0	}
406
407		/* Creates a buffer with an embedding level for every character in the
408		* given text. Also determines the base level and returns it in
409		* baseDir if baseDir does not initially contain a valid direction.
410		*/
411		static fz_bidi_level *
412		create_levels(fz_context *ctx,
413		const uint32_t *text,
414		size_t len,
415		fz_bidi_direction *baseDir,
416		int resolveWhiteSpace,
417		int flags)
418	244	{
419	244	fz_bidi_level levels, plevels;
420	244	fz_bidi_chartype *types = NULL;
421	244	fz_bidi_chartype *ptypes;
422	244	fz_bidi_level baseLevel;
423	244	const uint32_t *ptext;
424	244	size_t plen, remaining;
425
426	244	levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels");
427
428	244	fz_var(types);
429
430	488	fz_try(ctx)
431	488	{
432	244	types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
433
434	244	classify_characters(text, types, len, flags);
435
436	244	if (baseDir != FZ_BIDI_LTR && baseDir != FZ_BIDI_RTL)
437	0	{
438		/* Derive the base level from the text and
439		* update *baseDir in case the caller wants to know.
440		*/
441	0	baseLevel = base_level_from_text(types, len);
442	0	*baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
443	0	}
444	244	else
445	244	{
446	244	baseLevel = (fz_bidi_level)*baseDir;
447	244	}
448
449	244	{
450		/* Replace tab with base direction, i.e. make tab appear as
451		* 'strong left' if the base direction is left-to-right and
452		* 'strong right' if base direction is right-to-left. This
453		* allows Layout to implicitly treat tabs as 'segment separators'.
454		*/
455	244	size_t i;
456
457	15.4k	for (i = 0u; i < len; i++)
458	15.2k	{
459	15.2k	if (text[i]=='\t')
460	0	{
461	0	types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
462	0	}
463	15.2k	}
464	244	}
465
466		/* Look for quotation marks. Classify them as RLE or LRE
467		* or leave them alone, depending on what follows them.
468		*/
469	244	classify_quoted_blocks(text, types, len);
470
471		/* Work one paragraph at a time. */
472	244	plevels = levels;
473	244	ptypes = types;
474	244	ptext = text;
475	244	remaining = len;
476	488	while (remaining)
477	244	{
478	244	plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
479
480		/* Work out the levels and character types... */
481	244	(void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
482	244	fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
483	244	fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
484	244	fz_bidi_resolve_implicit(ptypes, plevels, plen);
485
486	244	classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
487
488	244	if (resolveWhiteSpace)
489	0	{
490		/* resolve whitespace */
491	0	fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
492	0	}
493
494	244	plevels += plen;
495	244	ptypes += plen;
496	244	ptext += plen;
497	244	remaining -= plen;
498	244	}
499
500		/* The levels buffer now has odd and even numbers indicating
501		* rtl or ltr characters, respectively.
502		*/
503		#ifdef DEBUG_BIDI_VERBOSE
504		fprintf(stderr, "Levels: ");
505		{
506		size_t i;
507		for (i = 0; i < len; i++)
508		{
509		fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
510		}
511		fprintf(stderr, "\n");
512		}
513		#endif
514	244	}
515	488	fz_always(ctx)
516	244	{
517	244	fz_free(ctx, types);
518	244	}
519	244	fz_catch(ctx)
520	0	{
521	0	fz_free(ctx, levels);
522	0	fz_rethrow(ctx);
523	0	}
524	244	return levels;
525	244	}
526
527		/* Partitions the given character sequence into one or more unidirectional
528		* fragments and invokes the given callback function for each fragment.
529		*/
530		void fz_bidi_fragment_text(fz_context *ctx,
531		const uint32_t *text,
532		size_t textlen,
533		fz_bidi_direction *baseDir,
534		fz_bidi_fragment_fn *callback,
535		void *arg,
536		int flags)
537	244	{
538	244	size_t startOfFragment;
539	244	size_t i;
540	244	fz_bidi_level *levels;
541
542	244	if (text == NULL \|\| callback == NULL \|\| textlen == 0)
543	0	return;
544
545	244	DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));
546
547	244	levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
548
549		/* We now have an array with an embedding level
550		* for each character in text.
551		*/
552	244	assert(levels != NULL);
553
554	488	fz_try(ctx)
555	488	{
556	244	startOfFragment = 0;
557	15.2k	for (i = 1; i < textlen; i++)
558	14.9k	{
559	14.9k	if (levels[i] != levels[i-1])
560	0	{
561		/* We've gone past the end of the fragment.
562		* Create a text object for it, then start
563		* a new fragment.
564		*/
565	0	split_at_script(&text[startOfFragment],
566	0	i - startOfFragment,
567	0	levels[startOfFragment],
568	0	arg,
569	0	callback);
570	0	startOfFragment = i;
571	0	}
572	14.9k	}
573		/* Now i == textlen. Deal with the final (or maybe only) fragment. */
574		/* otherwise create 1 fragment */
575	244	split_at_script(&text[startOfFragment],
576	244	i - startOfFragment,
577	244	levels[startOfFragment],
578	244	arg,
579	244	callback);
580	244	}
581	488	fz_always(ctx)
582	244	{
583	244	fz_free(ctx, levels);
584	244	}
585	244	fz_catch(ctx)
586	0	{
587	0	fz_rethrow(ctx);
588	0	}
589	244	}