Coverage Report

Created: 2025-07-18 06:20

/src/xpdf-4.05/xpdf/TextOutputDev.h
Line
Count
Source (jump to first uncovered line)
1
//========================================================================
2
//
3
// TextOutputDev.h
4
//
5
// Copyright 1997-2012 Glyph & Cog, LLC
6
//
7
//========================================================================
8
9
#ifndef TEXTOUTPUTDEV_H
10
#define TEXTOUTPUTDEV_H
11
12
#include <aconf.h>
13
14
#include <stdio.h>
15
#include "gtypes.h"
16
#include "GfxFont.h"
17
#include "OutputDev.h"
18
19
// Forward declarations: these types are referred to below only through
// pointers or friend declarations.
class GList;
20
class UnicodeMap;
21
class UnicodeRemapping;
22
23
class TextBlock;
24
class TextChar;
25
class TextGaps;
26
class TextLink;
27
class TextPage;   // defined later in this header
28
29
//------------------------------------------------------------------------
30
31
// Output callback type.  It is handed an opaque <stream> pointer (see
// TextPage::write and the stream-based TextOutputDev constructor, which
// both take the function together with a void* stream) and a buffer
// <text> of length <len>.
typedef void (*TextOutputFunc)(void *stream, const char *text, int len);
32
33
//------------------------------------------------------------------------
34
// TextOutputControl
35
//------------------------------------------------------------------------
36
37
// Selects how extracted text is ordered and laid out (stored in
// TextOutputControl::mode).
enum TextOutputMode {
38
  textOutReadingOrder,    // format into reading order
39
  textOutPhysLayout,    // maintain original physical layout
40
  textOutSimpleLayout,    // simple one-column physical layout
41
  textOutSimple2Layout,   // second simple one-column layout variant (presumably the one written by TextPage::writeSimple2Layout -- confirm)
42
  textOutTableLayout,   // similar to PhysLayout, but optimized
43
        //   for tables
44
  textOutLinePrinter,   // strict fixed-pitch/height layout
45
  textOutRawOrder   // keep text in content stream order
46
};
47
48
// Selects how text that overlaps other text is treated (stored in
// TextOutputControl::overlapHandling).
enum TextOutputOverlapHandling {
49
  textOutIgnoreOverlaps,  // no special handling for overlaps
50
  textOutAppendOverlaps,  // append overlapping text to main text
51
  textOutDiscardOverlaps  // discard overlapping text
52
};
53
54
// Formatting parameters for text extraction.  A pointer to one of these
// is passed to the TextPage and TextOutputDev constructors, and both of
// those classes hold a TextOutputControl member by value.
class TextOutputControl {
55
public:
56
57
  TextOutputControl();    // defined out of line; default values are not visible in this header
58
0
  ~TextOutputControl() {}
59
60
  TextOutputMode mode;    // formatting mode
61
  double fixedPitch;    // if this is non-zero, assume fixed-pitch
62
        //   characters with this width
63
        //   (only relevant for PhysLayout, Table,
64
        //   and LinePrinter modes)
65
  double fixedLineSpacing;  // fixed line spacing (only relevant for
66
        //   LinePrinter mode)
67
  GBool html;     // enable extra processing for HTML
68
  GBool clipText;   // separate clipped text and add it back
69
        //   in after forming columns
70
  GBool discardDiagonalText;  // discard all text that's not close to
71
        //   0/90/180/270 degrees
72
  GBool discardRotatedText; // discard all text that's not horizontal
73
        //   (0 degrees)
74
  GBool discardInvisibleText; // discard all invisible characters
75
  GBool discardClippedText; // discard all clipped characters
76
  GBool splitRotatedWords;  // do not combine horizontal and
77
        //   non-horizontal chars in a single
78
        //   word
79
  TextOutputOverlapHandling // how to handle overlapping text
80
               overlapHandling;
81
  GBool separateLargeChars; // separate "large" characters from
82
        //   "regular" characters
83
  GBool insertBOM;    // insert a Unicode BOM at the start of
84
        //   the text output
85
  double marginLeft,    // characters outside the margins are
86
         marginRight,   //   discarded
87
         marginTop,
88
         marginBottom;
89
};
90
91
//------------------------------------------------------------------------
92
// TextFontInfo
93
//------------------------------------------------------------------------
94
95
// Font information for text on a page: font ID, name, descriptor
// flags, and metrics.  TextPage::getFonts() returns the list of these
// objects used on a page.
class TextFontInfo {
96
public:
97
98
  // Create a TextFontInfo for the current font in [state].
99
  TextFontInfo(GfxState *state);
100
101
  // Create a dummy TextFontInfo.
102
  TextFontInfo();
103
104
  ~TextFontInfo();
105
106
  GBool matches(GfxState *state);   // NOTE(review): presumably true if this object describes the current font in [state] -- confirm
107
108
  // Get the font name (which may be NULL).
109
0
  GString *getFontName() { return fontName; }
110
111
  // Get font descriptor flags (each returns flags masked with the corresponding font* bit, i.e. zero if the flag is clear).
112
0
  GBool isFixedWidth() { return flags & fontFixedWidth; }
113
0
  GBool isSerif() { return flags & fontSerif; }
114
0
  GBool isSymbolic() { return flags & fontSymbolic; }
115
0
  GBool isItalic() { return flags & fontItalic; }
116
0
  GBool isBold() { return flags & fontBold; }
117
118
  // Get the width of the 'm' character, if available.
119
0
  double getMWidth() { return mWidth; }
120
121
0
  double getAscent() { return ascent; }
122
0
  double getDescent() { return descent; }
123
124
0
  Ref getFontID() { return fontID; }
125
126
private:
127
128
  Ref fontID;
129
  GString *fontName;    // may be NULL
130
  int flags;    // font descriptor flags (tested against the font* bits above)
131
  double mWidth;    // width of the 'm' character, if available
132
  double ascent, descent;
133
134
  friend class TextLine;
135
  friend class TextPage;
136
  friend class TextWord;
137
};
138
139
//------------------------------------------------------------------------
140
// TextWord
141
//------------------------------------------------------------------------
142
143
// A word: its Unicode text together with one font, font size, color,
// rotation and direction, a bounding box, and per-character
// content-stream positions and edge coordinates.
class TextWord {
144
public:
145
146
  // NOTE(review): presumably builds the word from <lenA> entries of
  // <chars> starting at index <start> -- confirm in the implementation.
  TextWord(GList *chars, int start, int lenA,
147
     int rotA, GBool rotatedA, int dirA, GBool spaceAfterA);
148
  ~TextWord();
149
0
  TextWord *copy() { return new TextWord(this); }   // returns a newly allocated TextWord made by the private copy constructor
150
151
  // Get the TextFontInfo object associated with this word.
152
0
  TextFontInfo *getFontInfo() { return font; }
153
154
0
  int getLength() { return len; }
155
0
  Unicode getChar(int idx) { return text[idx]; }   // no bounds check on idx
156
  GString *getText();
157
0
  GString *getFontName() { return font->fontName; }   // dereferences font, which must be non-NULL
158
  void getColor(double *r, double *g, double *b)
159
0
    { *r = colorR; *g = colorG; *b = colorB; }
160
0
  GBool isInvisible() { return invisible; }
161
  void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
162
0
    { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
163
  void getCharBBox(int charIdx, double *xMinA, double *yMinA,
164
       double *xMaxA, double *yMaxA);
165
0
  double getFontSize() { return fontSize; }
166
0
  int getRotation() { return rot; }
167
0
  GBool isRotated() { return (GBool)rotated; }
168
0
  int getCharPos() { return charPos[0]; }   // content-stream position of the first char
169
0
  int getCharLen() { return charPos[len] - charPos[0]; }   // uses the extra trailing charPos entry
170
0
  int getDirection() { return dir; }
171
0
  GBool getSpaceAfter() { return spaceAfter; }
172
  double getBaseline();
173
0
  GBool isUnderlined() { return underlined; }
174
  GString *getLinkURI();
175
176
private:
177
178
  TextWord(TextWord *word);   // used by copy()
179
  static int cmpYX(const void *p1, const void *p2);
180
  static int cmpCharPos(const void *p1, const void *p2);
181
182
  double xMin, xMax;    // bounding box x coordinates
183
  double yMin, yMax;    // bounding box y coordinates
184
  Unicode *text;    // the text
185
  int *charPos;     // character position (within content stream)
186
        //   of each char (plus one extra entry for
187
        //   the last char)
188
  double *edge;     // "near" edge x or y coord of each char
189
        //   (plus one extra entry for the last char)
190
  int len;      // number of characters
191
  TextFontInfo *font;   // font information
192
  double fontSize;    // font size
193
  TextLink *link;   // NOTE(review): presumably the link covering this word, if any (see getLinkURI) -- confirm
194
  double colorR,    // word color
195
         colorG,
196
         colorB;
197
  GBool invisible;    // set for invisible text (render mode 3)
198
199
  // group the byte-size fields to minimize object size
200
  Guchar rot;     // rotation, multiple of 90 degrees
201
        //   (0, 1, 2, or 3)
202
  char rotated;     // set if this word is non-horizontal
203
  char dir;     // character direction (+1 = left-to-right;
204
        //   -1 = right-to-left; 0 = neither)
205
  char spaceAfter;    // set if there is a space between this
206
        //   word and the next word on the line
207
  char underlined;    // value returned by isUnderlined()
208
209
  friend class TextBlock;
210
  friend class TextLine;
211
  friend class TextPage;
212
};
213
214
//------------------------------------------------------------------------
215
// TextLine
216
//------------------------------------------------------------------------
217
218
// A line of text: a list of TextWords plus the line's Unicode text
// (including spaces between words), per-character edge coordinates,
// and a bounding box.
class TextLine {
219
public:
220
221
  TextLine(GList *wordsA, double xMinA, double yMinA,
222
     double xMaxA, double yMaxA, double fontSizeA);
223
  ~TextLine();
224
225
0
  double getXMin() { return xMin; }
226
0
  double getYMin() { return yMin; }
227
0
  double getXMax() { return xMax; }
228
0
  double getYMax() { return yMax; }
229
  double getBaseline();
230
0
  int getRotation() { return rot; }
231
0
  GList *getWords() { return words; }
232
0
  Unicode *getUnicode() { return text; }
233
0
  int getLength() { return len; }
234
0
  double getEdge(int idx) { return edge[idx]; }   // no bounds check on idx
235
0
  GBool getHyphenated() { return hyphenated; }
236
237
private:
238
239
  static int cmpX(const void *p1, const void *p2);
240
241
  GList *words;     // [TextWord]
242
  int rot;      // rotation, multiple of 90 degrees
243
        //   (0, 1, 2, or 3)
244
  double xMin, xMax;    // bounding box x coordinates
245
  double yMin, yMax;    // bounding box y coordinates
246
  double fontSize;    // main (max) font size for this line
247
  Unicode *text;    // Unicode text of the line, including
248
        //   spaces between words
249
  double *edge;     // "near" edge x or y coord of each char
250
        //   (plus one extra entry for the last char)
251
  int len;      // number of Unicode chars
252
  GBool hyphenated;   // set if last char is a hyphen
253
  int px;     // x offset (in characters, relative to
254
        //   containing column) in physical layout mode
255
  int pw;     // line width (in characters) in physical
256
        //   layout mode
257
258
  friend class TextSuperLine;
259
  friend class TextPage;
260
  friend class TextParagraph;
261
};
262
263
//------------------------------------------------------------------------
264
// TextParagraph
265
//------------------------------------------------------------------------
266
267
// A paragraph: a list of TextLines with a bounding box and a drop-cap
// flag.
class TextParagraph {
268
public:
269
270
  TextParagraph(GList *linesA, GBool dropCapA);
271
  ~TextParagraph();
272
273
  // Get the list of TextLine objects.
274
0
  GList *getLines() { return lines; }
275
276
0
  GBool hasDropCap() { return dropCap; }
277
278
0
  double getXMin() { return xMin; }
279
0
  double getYMin() { return yMin; }
280
0
  double getXMax() { return xMax; }
281
0
  double getYMax() { return yMax; }
282
283
private:
284
285
  GList *lines;     // [TextLine]
286
  GBool dropCap;    // paragraph starts with a drop capital
287
  double xMin, xMax;    // bounding box x coordinates
288
  double yMin, yMax;    // bounding box y coordinates
289
290
  friend class TextPage;
291
};
292
293
//------------------------------------------------------------------------
294
// TextColumn
295
//------------------------------------------------------------------------
296
297
// A column: a list of TextParagraphs with a bounding box and, for
// physical layout mode, a position and size measured in characters.
class TextColumn {
298
public:
299
300
  TextColumn(GList *paragraphsA, double xMinA, double yMinA,
301
       double xMaxA, double yMaxA);
302
  ~TextColumn();
303
304
  // Get the list of TextParagraph objects.
305
0
  GList *getParagraphs() { return paragraphs; }
306
307
0
  double getXMin() { return xMin; }
308
0
  double getYMin() { return yMin; }
309
0
  double getXMax() { return xMax; }
310
0
  double getYMax() { return yMax; }
311
312
  int getRotation();    // NOTE(review): no rotation member is declared in this class, so the value is presumably derived from the contents -- confirm
313
314
private:
315
316
  static int cmpX(const void *p1, const void *p2);
317
  static int cmpY(const void *p1, const void *p2);
318
  static int cmpPX(const void *p1, const void *p2);
319
320
  GList *paragraphs;    // [TextParagraph]
321
  double xMin, xMax;    // bounding box x coordinates
322
  double yMin, yMax;    // bounding box y coordinates
323
  int px, py;     // x, y position (in characters) in physical
324
        //   layout mode
325
  int pw, ph;     // column width, height (in characters) in
326
        //   physical layout mode
327
328
  friend class TextPage;
329
};
330
331
//------------------------------------------------------------------------
332
// TextWordList
333
//------------------------------------------------------------------------
334
335
// A flat list of TextWords plus the primary text direction; this is the
// type returned by the makeWordList() and makeWordListForRect() methods
// of TextPage and TextOutputDev.
class TextWordList {
336
public:
337
338
  TextWordList(GList *wordsA, GBool primaryLRA);    // NOTE(review): whether this takes ownership of <wordsA> is not visible in this header
339
340
  ~TextWordList();
341
342
  // Return the number of words on the list.
343
  int getLength();
344
345
  // Return the <idx>th word from the list.
346
  TextWord *get(int idx);
347
348
  // Returns true if primary direction is left-to-right, or false if
349
  // right-to-left.
350
0
  GBool getPrimaryLR() { return primaryLR; }
351
352
private:
353
354
  GList *words;     // [TextWord]
355
  GBool primaryLR;    // value returned by getPrimaryLR()
356
};
357
358
//------------------------------------------------------------------------
359
// TextPosition
360
//------------------------------------------------------------------------
361
362
// Position within a TextColumn tree.  The position is in column
363
// [colIdx], paragraph [parIdx], line [lineIdx], before character
364
// [charIdx].
365
// Plain value type holding four public indexes (column, paragraph,
// line, character); the default constructor sets all four to zero.
class TextPosition {
366
public:
367
368
0
  TextPosition(): colIdx(0), parIdx(0), lineIdx(0), charIdx(0) {}
369
  TextPosition(int colIdxA, int parIdxA, int lineIdxA, int charIdxA):
370
0
    colIdx(colIdxA), parIdx(parIdxA), lineIdx(lineIdxA), charIdx(charIdxA) {}
371
372
  // Comparison operators.  Each takes its argument by value and returns
  // int; they are not declared const.  NOTE(review): the ordering is
  // presumably lexicographic on (colIdx, parIdx, lineIdx, charIdx) --
  // confirm in the implementation.
  int operator==(TextPosition pos);
373
  int operator!=(TextPosition pos);
374
  int operator<(TextPosition pos);
375
  int operator>(TextPosition pos);
376
377
  int colIdx, parIdx, lineIdx, charIdx;
378
};
379
380
//------------------------------------------------------------------------
381
// TextPage
382
//------------------------------------------------------------------------
383
384
// Holds the text of a page and offers output, searching, hit-testing,
// and column / word-list construction on it.  The methods that add
// content (startPage, addChar, addUnderline, addLink, ...) are private;
// TextOutputDev is declared a friend at the end of the class.
class TextPage {
385
public:
386
387
  TextPage(TextOutputControl *controlA);    // NOTE(review): presumably copies *controlA into the by-value member <control> -- confirm
388
  ~TextPage();
389
390
  // Write contents of page to a stream.
391
  void write(void *outputStream, TextOutputFunc outputFunc);
392
393
  // Find a string.  If <startAtTop> is true, starts looking at the
394
  // top of the page; else if <startAtLast> is true, starts looking
395
  // immediately after the last find result; else starts looking at
396
  // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
397
  // bottom of the page; else if <stopAtLast> is true, stops looking
398
  // just before the last find result; else stops looking at
399
  // <xMax>,<yMax>.
400
  GBool findText(Unicode *s, int len,
401
     GBool startAtTop, GBool stopAtBottom,
402
     GBool startAtLast, GBool stopAtLast,
403
     GBool caseSensitive, GBool backward,
404
     GBool wholeWord,
405
     double *xMin, double *yMin,
406
     double *xMax, double *yMax);
407
408
  // Get the text which is inside the specified rectangle.  Multi-line
409
  // text always includes end-of-line markers at the end of each line.
410
  // If <forceEOL> is true, an end-of-line marker will be appended to
411
  // single-line text as well.
412
  GString *getText(double xMin, double yMin,
413
       double xMax, double yMax,
414
       GBool forceEOL = gFalse);
415
416
  // Find a string by character position and length.  If found, sets
417
  // the text bounding rectangle and returns true; otherwise returns
418
  // false.
419
  GBool findCharRange(int pos, int length,
420
          double *xMin, double *yMin,
421
          double *xMax, double *yMax);
422
423
  // Returns true if x,y falls inside a column.
424
  GBool checkPointInside(double x, double y);
425
426
  // Find a point inside a column.  Returns false if x,y fall outside
427
  // all columns.
428
  GBool findPointInside(double x, double y, TextPosition *pos);
429
430
  // Find a point in the nearest column.  Returns false only if there
431
  // are no columns.
432
  GBool findPointNear(double x, double y, TextPosition *pos);
433
434
  // Find the start and end of a word inside a column.  Returns false
435
  // if x,y fall outside all columns.
436
  GBool findWordPoints(double x, double y,
437
           TextPosition *startPos, TextPosition *endPos);
438
439
  // Find the start and end of a line inside a column.  Returns false
440
  // if x,y fall outside all columns.
441
  GBool findLinePoints(double x, double y,
442
           TextPosition *startPos, TextPosition *endPos);
443
444
  // Get the upper point of a TextPosition.
445
  void convertPosToPointUpper(TextPosition *pos, double *x, double *y);
446
447
  // Get the lower point of a TextPosition.
448
  void convertPosToPointLower(TextPosition *pos, double *x, double *y);
449
450
  // Get the upper left corner of the line containing a TextPosition.
451
  void convertPosToPointLeftEdge(TextPosition *pos, double *x, double *y);
452
453
  // Get the lower right corner of the line containing a TextPosition.
454
  void convertPosToPointRightEdge(TextPosition *pos, double *x, double *y);
455
456
  // Get the upper right corner of a column.
457
  void getColumnUpperRight(int colIdx, double *x, double *y);
458
459
  // Get the lower left corner of a column.
460
  void getColumnLowerLeft(int colIdx, double *x, double *y);
461
462
  // Create and return a list of TextColumn objects.
463
  GList *makeColumns();
464
465
  // Get the list of all TextFontInfo objects used on this page.
466
0
  GList *getFonts() { return fonts; }
467
468
  // Build a flat word list.  NOTE(review): no ordering is passed in; it is presumably derived from control.mode -- confirm.
469
  TextWordList *makeWordList();
470
471
  // Build a word list containing only words inside the specified
472
  // rectangle.
473
  TextWordList *makeWordListForRect(double xMin, double yMin,
474
            double xMax, double yMax);
475
476
  // Get the primary rotation of text on the page.
477
0
  int getPrimaryRotation() { return primaryRot; }
478
479
  // Returns true if the primary character direction is left-to-right,
480
  // false if it is right-to-left.
481
  GBool primaryDirectionIsLR();
482
483
  // Get the counter values.
484
0
  int getNumVisibleChars() { return nVisibleChars; }
485
0
  int getNumInvisibleChars() { return nInvisibleChars; }
486
0
  int getNumRemovedDupChars() { return nRemovedDupChars; }
487
488
  // Returns true if any of the fonts used on this page are likely to
489
  // be problematic when converting text to Unicode.
490
0
  GBool problematicForUnicode() { return problematic; }
491
492
  // Add a 'special' character to this TextPage.  This is currently
493
  // used by pdftohtml to insert markers for form fields.
494
  void addSpecialChar(double xMin, double yMin, double xMax, double yMax,
495
          int rot, TextFontInfo *font, double fontSize,
496
          Unicode u);
497
498
  // Remove characters that fall inside a region.
499
  void removeChars(double xMin, double yMin, double xMax, double yMax,
500
       double xOverlapThresh, double yOverlapThresh);
501
502
private:
503
504
  // page construction (private; reachable from the friend class
  // TextOutputDev, which declares similarly named device callbacks)
  void startPage(GfxState *state);
505
  void clear();
506
  void updateFont(GfxState *state);
507
  void addChar(GfxState *state, double x, double y,
508
         double dx, double dy,
509
         CharCode c, int nBytes, Unicode *u, int uLen);
510
  void incCharCount(int nChars);
511
  void beginActualText(GfxState *state, Unicode *u, int uLen);
512
  void endActualText(GfxState *state);
513
  void addUnderline(double x0, double y0, double x1, double y1);
514
  void addLink(double xMin, double yMin, double xMax, double yMax,
515
         Link *link);
516
517
  // output
518
  void writeReadingOrder(void *outputStream,
519
       TextOutputFunc outputFunc,
520
       UnicodeMap *uMap,
521
       char *space, int spaceLen,
522
       char *eol, int eolLen);
523
  void writePhysLayout(void *outputStream,
524
           TextOutputFunc outputFunc,
525
           UnicodeMap *uMap,
526
           char *space, int spaceLen,
527
           char *eol, int eolLen);
528
  void writeSimpleLayout(void *outputStream,
529
       TextOutputFunc outputFunc,
530
       UnicodeMap *uMap,
531
       char *space, int spaceLen,
532
       char *eol, int eolLen);
533
  void writeSimple2Layout(void *outputStream,
534
        TextOutputFunc outputFunc,
535
        UnicodeMap *uMap,
536
        char *space, int spaceLen,
537
        char *eol, int eolLen);
538
  void writeLinePrinter(void *outputStream,
539
      TextOutputFunc outputFunc,
540
      UnicodeMap *uMap,
541
      char *space, int spaceLen,
542
      char *eol, int eolLen);
543
  void writeRaw(void *outputStream,
544
    TextOutputFunc outputFunc,
545
    UnicodeMap *uMap,
546
    char *space, int spaceLen,
547
    char *eol, int eolLen);
548
  void encodeFragment(Unicode *text, int len, UnicodeMap *uMap,
549
          GBool primaryLR, GString *s);
550
  GBool unicodeEffectiveTypeLOrNum(Unicode u, Unicode left, Unicode right);
551
  GBool unicodeEffectiveTypeR(Unicode u, Unicode left, Unicode right);
552
553
  // analysis
554
  int rotateChars(GList *charsA);
555
  void rotateCharsToZero(GList *charsA);
556
  void rotateUnderlinesAndLinks(int rot);
557
  void unrotateChars(GList *charsA, int rot);
558
  void unrotateCharsFromZero(GList *charsA);
559
  void unrotateColumnsFromZero(GList *columns);
560
  void unrotateColumns(GList *columns, int rot);
561
  void unrotateWords(GList *words, int rot);
562
  GBool checkPrimaryLR(GList *charsA);
563
  void removeDuplicates(GList *charsA, int rot);
564
  GList *separateOverlappingText(GList *charsA);
565
  TextColumn *buildOverlappingTextColumn(GList *overlappingChars);
566
  TextBlock *splitChars(GList *charsA);
567
  TextBlock *split(GList *charsA, int rot);
568
  GList *getChars(GList *charsA, double xMin, double yMin,
569
      double xMax, double yMax);
570
  void findGaps(GList *charsA, int rot,
571
    double *xMinOut, double *yMinOut,
572
    double *xMaxOut, double *yMaxOut,
573
    double *avgFontSizeOut, double *minFontSizeOut,
574
    GList *splitLines,
575
    TextGaps *horizGaps, TextGaps *vertGaps);
576
  void mergeSplitLines(GList *charsA, int rot, GList *splitLines);
577
  void tagBlock(TextBlock *blk);
578
  void insertLargeChars(GList *largeChars, TextBlock *blk);
579
  void insertLargeCharsInFirstLeaf(GList *largeChars, TextBlock *blk);
580
  void insertLargeCharInLeaf(TextChar *ch, TextBlock *blk);
581
  void insertIntoTree(TextBlock *subtree, TextBlock *primaryTree,
582
          GBool doReorder);
583
  void reorderBlocks(TextBlock *blk);
584
  void insertColumnIntoTree(TextBlock *column, TextBlock *tree);
585
  void insertClippedChars(GList *clippedChars, TextBlock *tree);
586
  TextBlock *findClippedCharLeaf(TextChar *ch, TextBlock *tree);
587
  GList *buildColumns(TextBlock *tree, GBool primaryLR);
588
  void buildColumns2(TextBlock *blk, GList *columns, GBool primaryLR);
589
  TextColumn *buildColumn(TextBlock *tree);
590
  double getLineIndent(TextLine *line, TextBlock *blk);
591
  double getAverageLineSpacing(GList *lines);
592
  double getLineSpacing(TextLine *line0, TextLine *line1);
593
  void buildLines(TextBlock *blk, GList *lines, GBool splitSuperLines);
594
  GList *buildSimple2Columns(GList *charsA);
595
  GList *buildSimple2Lines(GList *charsA, int rot);
596
  TextLine *buildLine(TextBlock *blk);
597
  TextLine *buildLine(GList *charsA, int rot,
598
          double xMin, double yMin, double xMax, double yMax);
599
  void getLineChars(TextBlock *blk, GList *charsA);
600
  double computeWordSpacingThreshold(GList *charsA, int rot);
601
  void adjustCombiningChars(GList *charsA, int rot);
602
  int getCharDirection(TextChar *ch);
603
  int getCharDirection(TextChar *ch, TextChar *left, TextChar *right);
604
  int assignPhysLayoutPositions(GList *columns);
605
  void assignLinePhysPositions(GList *columns);
606
  void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap);
607
  int assignColumnPhysPositions(GList *columns);
608
  void buildSuperLines(TextBlock *blk, GList *superLines);
609
  void assignSimpleLayoutPositions(GList *superLines, UnicodeMap *uMap);
610
  void generateUnderlinesAndLinks(GList *columns);
611
  void findPointInColumn(TextColumn *col, double x, double y,
612
       TextPosition *pos);
613
  void buildFindCols();
614
615
  // debug (compiled out by the #if 0 below)
616
#if 0 //~debug
617
  void dumpChars(GList *charsA);
618
  void dumpTree(TextBlock *tree, int indent = 0);
619
  void dumpColumns(GList *columns, GBool dumpWords = gFalse);
620
  void dumpUnderlines();
621
#endif
622
623
  // word list
624
  TextWordList *makeWordListForChars(GList *charList);
625
626
  TextOutputControl control;  // formatting parameters
627
628
  UnicodeRemapping *remapping;
629
  Unicode *uBuf;    // NOTE(review): presumably a scratch buffer sized by uBufSize -- confirm
630
  int uBufSize;
631
632
  double pageWidth, pageHeight; // width and height of current page
633
  int charPos;      // next character position (within content
634
        //   stream)
635
  TextFontInfo *curFont;  // current font
636
  double curFontSize;   // current font size
637
  int curRot;     // current rotation
638
  GBool diagonal;   // set if rotation is not close to
639
        //   0/90/180/270 degrees
640
  GBool rotated;    // set if text is not horizontal (0 degrees)
641
  int nTinyChars;   // number of "tiny" chars seen so far
642
  Unicode *actualText;    // current "ActualText" span
643
  int actualTextLen;
644
  double actualTextX0,    // NOTE(review): presumably the extent of the current ActualText span -- confirm
645
         actualTextY0,
646
         actualTextX1,
647
         actualTextY1;
648
  int actualTextNBytes;
649
650
  GList *chars;     // [TextChar]
651
  GList *fonts;     // all font info objects used on this
652
        //   page [TextFontInfo]
653
  int primaryRot;   // primary rotation
654
655
  GList *underlines;    // [TextUnderline]
656
  GList *links;     // [TextLink]
657
658
  int nVisibleChars;    // number of visible chars on the page
659
  int nInvisibleChars;    // number of invisible chars on the page
660
  int nRemovedDupChars;   // number of duplicate chars removed
661
662
  GList *findCols;    // text used by the findText**/findPoint**
663
        //   functions [TextColumn]
664
  double lastFindXMin,    // coordinates of the last "find" result
665
         lastFindYMin;
666
  GBool haveLastFind;   // NOTE(review): presumably set once lastFindXMin/lastFindYMin are valid -- confirm
667
668
  GBool problematic;    // true if any of the fonts used on this
669
        //   page are marked as problematic for
670
        //   Unicode conversion
671
672
  friend class TextOutputDev;
673
};
674
675
//------------------------------------------------------------------------
676
// TextOutputDev
677
//------------------------------------------------------------------------
678
679
// OutputDev subclass for text extraction.  It holds a TextPage for the
// current page, an output function and stream, and its own
// TextOutputControl.
class TextOutputDev: public OutputDev {
680
public:
681
682
  // Open a text output file.  If <fileName> is NULL, no file is
683
  // written (this is useful, e.g., for searching text).  The
684
  // output format (reading order, physical layout, raw order, etc.)
685
  // is selected by <controlA> (see TextOutputControl::mode), not by
686
  // separate physLayout/rawOrder flags.
687
  TextOutputDev(char *fileName, TextOutputControl *controlA,
688
    GBool append, GBool fileNameIsUTF8 = gFalse);
689
690
  // Create a TextOutputDev which will write to a generic stream via
691
  // <func> (presumably called with <stream> as its first argument).
692
  // The output format is selected by <controlA> (see
693
  // TextOutputControl::mode).
694
  TextOutputDev(TextOutputFunc func, void *stream,
695
    TextOutputControl *controlA);
696
697
  // Destructor.
698
  virtual ~TextOutputDev();
699
700
  // Check if file was successfully created.
701
0
  virtual GBool isOk() { return ok; }
702
703
  //---- get info about output device
704
705
  // Does this device use upside-down coordinates?
706
  // (Upside-down means (0,0) is the top left corner of the page.)
707
0
  virtual GBool upsideDown() { return gTrue; }
708
709
  // Does this device use drawChar() or drawString()?
710
0
  virtual GBool useDrawChar() { return gTrue; }
711
712
  // Does this device use beginType3Char/endType3Char?  Otherwise,
713
  // text in Type 3 fonts will be drawn with drawChar/drawString.
714
0
  virtual GBool interpretType3Chars() { return gFalse; }
715
716
  // Does this device need non-text content?
717
0
  virtual GBool needNonText() { return gFalse; }
718
719
  // Does this device require incCharCount to be called for text on
720
  // non-shown layers?
721
0
  virtual GBool needCharCount() { return gTrue; }
722
723
  //----- initialization and control
724
725
  // Start a page.
726
  virtual void startPage(int pageNum, GfxState *state);
727
728
  // End a page.
729
  virtual void endPage();
730
731
  //----- save/restore graphics state
732
  virtual void restoreState(GfxState *state);
733
734
  //----- update text state
735
  virtual void updateFont(GfxState *state);
736
737
  //----- text drawing
738
  virtual void beginString(GfxState *state, GString *s);
739
  virtual void endString(GfxState *state);
740
  virtual void drawChar(GfxState *state, double x, double y,
741
      double dx, double dy,
742
      double originX, double originY,
743
      CharCode c, int nBytes, Unicode *u, int uLen,
744
      GBool fill, GBool stroke, GBool makePath);
745
  virtual void incCharCount(int nChars);
746
  virtual void beginActualText(GfxState *state, Unicode *u, int uLen);
747
  virtual void endActualText(GfxState *state);
748
749
  //----- path painting
750
  virtual void stroke(GfxState *state);
751
  virtual void fill(GfxState *state);
752
  virtual void eoFill(GfxState *state);
753
754
  //----- link borders
755
  virtual void processLink(Link *link);
756
757
  //----- special access
758
759
  // Find a string.  If <startAtTop> is true, starts looking at the
760
  // top of the page; else if <startAtLast> is true, starts looking
761
  // immediately after the last find result; else starts looking at
762
  // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
763
  // bottom of the page; else if <stopAtLast> is true, stops looking
764
  // just before the last find result; else stops looking at
765
  // <xMax>,<yMax>.
766
  GBool findText(Unicode *s, int len,
767
     GBool startAtTop, GBool stopAtBottom,
768
     GBool startAtLast, GBool stopAtLast,
769
     GBool caseSensitive, GBool backward,
770
     GBool wholeWord,
771
     double *xMin, double *yMin,
772
     double *xMax, double *yMax);
773
774
  // Get the text which is inside the specified rectangle.
775
  GString *getText(double xMin, double yMin,
776
       double xMax, double yMax);
777
778
  // Find a string by character position and length.  If found, sets
779
  // the text bounding rectangle and returns true; otherwise returns
780
  // false.
781
  GBool findCharRange(int pos, int length,
782
          double *xMin, double *yMin,
783
          double *xMax, double *yMax);
784
785
  // Build a flat word list.  NOTE(review): this comment used to refer
786
  // to this->rawOrder and this->physLayout, which are not members of
787
  // this class; the ordering is presumably governed by control.mode
788
  // -- confirm against the implementation.
789
  TextWordList *makeWordList();
790
791
  // Build a word list containing only words inside the specified
792
  // rectangle.
793
  TextWordList *makeWordListForRect(double xMin, double yMin,
794
            double xMax, double yMax);
795
796
  // Returns the TextPage object for the last rasterized page,
797
  // transferring ownership to the caller.
798
  TextPage *takeText();
799
800
  // Turn extra processing for HTML conversion on or off.
801
0
  void enableHTMLExtras(GBool html) { control.html = html; }
802
803
  // Get the counter values (read through <text>, which must be non-NULL).
804
0
  int getNumVisibleChars() { return text->nVisibleChars; }
805
0
  int getNumInvisibleChars() { return text->nInvisibleChars; }
806
0
  int getNumRemovedDupChars() { return text->nRemovedDupChars; }
807
808
private:
809
810
  void generateBOM();
811
812
  TextOutputFunc outputFunc;  // output function
813
  void *outputStream;   // output stream
814
  GBool needClose;    // need to close the output file?
815
        //   (only if outputStream is a FILE*)
816
  TextPage *text;   // text for the current page
817
  TextOutputControl control;  // formatting parameters
818
  GBool ok;     // set up ok?
819
};
820
821
#endif