/src/xpdf-4.05/xpdf/TextOutputDev.h
Line | Count | Source (jump to first uncovered line) |
1 | | //======================================================================== |
2 | | // |
3 | | // TextOutputDev.h |
4 | | // |
5 | | // Copyright 1997-2012 Glyph & Cog, LLC |
6 | | // |
7 | | //======================================================================== |
8 | | |
9 | | #ifndef TEXTOUTPUTDEV_H |
10 | | #define TEXTOUTPUTDEV_H |
11 | | |
12 | | #include <aconf.h> |
13 | | |
14 | | #include <stdio.h> |
15 | | #include "gtypes.h" |
16 | | #include "GfxFont.h" |
17 | | #include "OutputDev.h" |
18 | | |
19 | | class GList; |
20 | | class UnicodeMap; |
21 | | class UnicodeRemapping; |
22 | | |
23 | | class TextBlock; |
24 | | class TextChar; |
25 | | class TextGaps; |
26 | | class TextLink; |
27 | | class TextPage; |
28 | | |
29 | | //------------------------------------------------------------------------ |
30 | | |
31 | | typedef void (*TextOutputFunc)(void *stream, const char *text, int len); |
32 | | |
33 | | //------------------------------------------------------------------------ |
34 | | // TextOutputControl |
35 | | //------------------------------------------------------------------------ |
36 | | |
37 | | enum TextOutputMode { |
38 | | textOutReadingOrder, // format into reading order |
39 | | textOutPhysLayout, // maintain original physical layout |
40 | | textOutSimpleLayout, // simple one-column physical layout |
41 | | textOutSimple2Layout, // second simple one-column physical layout variant (presumably the mode written by TextPage::writeSimple2Layout -- confirm) |
42 | | textOutTableLayout, // similar to PhysLayout, but optimized |
43 | | // for tables |
44 | | textOutLinePrinter, // strict fixed-pitch/height layout |
45 | | textOutRawOrder // keep text in content stream order |
46 | | }; |
47 | | |
48 | | enum TextOutputOverlapHandling { |
49 | | textOutIgnoreOverlaps, // no special handling for overlaps |
50 | | textOutAppendOverlaps, // append overlapping text to main text |
51 | | textOutDiscardOverlaps // discard overlapping text |
52 | | }; |
53 | | |
54 | | class TextOutputControl { |
55 | | public: |
56 | | |
57 | | TextOutputControl(); |
58 | 0 | ~TextOutputControl() {} |
59 | | |
60 | | TextOutputMode mode; // formatting mode |
61 | | double fixedPitch; // if this is non-zero, assume fixed-pitch |
62 | | // characters with this width |
63 | | // (only relevant for PhysLayout, Table, |
64 | | // and LinePrinter modes) |
65 | | double fixedLineSpacing; // fixed line spacing (only relevant for |
66 | | // LinePrinter mode) |
67 | | GBool html; // enable extra processing for HTML conversion (set by TextOutputDev::enableHTMLExtras) |
68 | | GBool clipText; // separate clipped text and add it back |
69 | | // in after forming columns |
70 | | GBool discardDiagonalText; // discard all text that's not close to |
71 | | // 0/90/180/270 degrees |
72 | | GBool discardRotatedText; // discard all text that's not horizontal |
73 | | // (0 degrees) |
74 | | GBool discardInvisibleText; // discard all invisible characters |
75 | | GBool discardClippedText; // discard all clipped characters |
76 | | GBool splitRotatedWords; // do not combine horizontal and |
77 | | // non-horizontal chars in a single |
78 | | // word |
79 | | TextOutputOverlapHandling // how to handle overlapping text |
80 | | overlapHandling; |
81 | | GBool separateLargeChars; // separate "large" characters from |
82 | | // "regular" characters |
83 | | GBool insertBOM; // insert a Unicode BOM at the start of |
84 | | // the text output |
85 | | double marginLeft, // characters outside the margins are |
86 | | marginRight, // discarded |
87 | | marginTop, |
88 | | marginBottom; |
89 | | }; |
90 | | |
91 | | //------------------------------------------------------------------------ |
92 | | // TextFontInfo |
93 | | //------------------------------------------------------------------------ |
94 | | |
95 | | class TextFontInfo { |
96 | | public: |
97 | | |
98 | | // Create a TextFontInfo for the current font in [state]. |
99 | | TextFontInfo(GfxState *state); |
100 | | |
101 | | // Create a dummy TextFontInfo. |
102 | | TextFontInfo(); |
103 | | |
104 | | ~TextFontInfo(); |
105 | | |
106 | | GBool matches(GfxState *state); |
107 | | |
108 | | // Get the font name (which may be NULL). |
109 | 0 | GString *getFontName() { return fontName; } |
110 | | |
111 | | // Test font descriptor flag bits; each returns the raw masked bit of [flags] (non-zero if set), not a normalized boolean. |
112 | 0 | GBool isFixedWidth() { return flags & fontFixedWidth; } |
113 | 0 | GBool isSerif() { return flags & fontSerif; } |
114 | 0 | GBool isSymbolic() { return flags & fontSymbolic; } |
115 | 0 | GBool isItalic() { return flags & fontItalic; } |
116 | 0 | GBool isBold() { return flags & fontBold; } |
117 | | |
118 | | // Get the width of the 'm' character, if available. |
119 | 0 | double getMWidth() { return mWidth; } |
120 | | |
121 | 0 | double getAscent() { return ascent; } |
122 | 0 | double getDescent() { return descent; } |
123 | | |
124 | 0 | Ref getFontID() { return fontID; } |
125 | | |
126 | | private: |
127 | | |
128 | | Ref fontID; |
129 | | GString *fontName; |
130 | | int flags; |
131 | | double mWidth; |
132 | | double ascent, descent; |
133 | | |
134 | | friend class TextLine; |
135 | | friend class TextPage; |
136 | | friend class TextWord; |
137 | | }; |
138 | | |
139 | | //------------------------------------------------------------------------ |
140 | | // TextWord |
141 | | //------------------------------------------------------------------------ |
142 | | |
143 | | class TextWord { |
144 | | public: |
145 | | |
146 | | TextWord(GList *chars, int start, int lenA, |
147 | | int rotA, GBool rotatedA, int dirA, GBool spaceAfterA); |
148 | | ~TextWord(); |
149 | 0 | TextWord *copy() { return new TextWord(this); } |
150 | | |
151 | | // Get the TextFontInfo object associated with this word. |
152 | 0 | TextFontInfo *getFontInfo() { return font; } |
153 | | |
154 | 0 | int getLength() { return len; } |
155 | 0 | Unicode getChar(int idx) { return text[idx]; } |
156 | | GString *getText(); |
157 | 0 | GString *getFontName() { return font->fontName; } |
158 | | void getColor(double *r, double *g, double *b) |
159 | 0 | { *r = colorR; *g = colorG; *b = colorB; } |
160 | 0 | GBool isInvisible() { return invisible; } |
161 | | void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) |
162 | 0 | { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; } |
163 | | void getCharBBox(int charIdx, double *xMinA, double *yMinA, |
164 | | double *xMaxA, double *yMaxA); |
165 | 0 | double getFontSize() { return fontSize; } |
166 | 0 | int getRotation() { return rot; } |
167 | 0 | GBool isRotated() { return (GBool)rotated; } |
168 | 0 | int getCharPos() { return charPos[0]; } |
169 | 0 | int getCharLen() { return charPos[len] - charPos[0]; } |
170 | 0 | int getDirection() { return dir; } |
171 | 0 | GBool getSpaceAfter() { return spaceAfter; } |
172 | | double getBaseline(); |
173 | 0 | GBool isUnderlined() { return underlined; } |
174 | | GString *getLinkURI(); |
175 | | |
176 | | private: |
177 | | |
178 | | TextWord(TextWord *word); |
179 | | static int cmpYX(const void *p1, const void *p2); |
180 | | static int cmpCharPos(const void *p1, const void *p2); |
181 | | |
182 | | double xMin, xMax; // bounding box x coordinates |
183 | | double yMin, yMax; // bounding box y coordinates |
184 | | Unicode *text; // the text |
185 | | int *charPos; // character position (within content stream) |
186 | | // of each char; len + 1 entries, since |
187 | | // getCharLen() also reads charPos[len] |
188 | | double *edge; // "near" edge x or y coord of each char |
189 | | // (plus one extra entry for the last char) |
190 | | int len; // number of characters |
191 | | TextFontInfo *font; // font information |
192 | | double fontSize; // font size |
193 | | TextLink *link; |
194 | | double colorR, // word color |
195 | | colorG, |
196 | | colorB; |
197 | | GBool invisible; // set for invisible text (render mode 3) |
198 | | |
199 | | // group the byte-size fields to minimize object size |
200 | | Guchar rot; // rotation, multiple of 90 degrees |
201 | | // (0, 1, 2, or 3) |
202 | | char rotated; // set if this word is non-horizontal |
203 | | char dir; // character direction (+1 = left-to-right; |
204 | | // -1 = right-to-left; 0 = neither) |
205 | | char spaceAfter; // set if there is a space between this |
206 | | // word and the next word on the line |
207 | | char underlined; |
208 | | |
209 | | friend class TextBlock; |
210 | | friend class TextLine; |
211 | | friend class TextPage; |
212 | | }; |
213 | | |
214 | | //------------------------------------------------------------------------ |
215 | | // TextLine |
216 | | //------------------------------------------------------------------------ |
217 | | |
218 | | class TextLine { |
219 | | public: |
220 | | |
221 | | TextLine(GList *wordsA, double xMinA, double yMinA, |
222 | | double xMaxA, double yMaxA, double fontSizeA); |
223 | | ~TextLine(); |
224 | | |
225 | 0 | double getXMin() { return xMin; } |
226 | 0 | double getYMin() { return yMin; } |
227 | 0 | double getXMax() { return xMax; } |
228 | 0 | double getYMax() { return yMax; } |
229 | | double getBaseline(); |
230 | 0 | int getRotation() { return rot; } |
231 | 0 | GList *getWords() { return words; } |
232 | 0 | Unicode *getUnicode() { return text; } |
233 | 0 | int getLength() { return len; } |
234 | 0 | double getEdge(int idx) { return edge[idx]; } |
235 | 0 | GBool getHyphenated() { return hyphenated; } |
236 | | |
237 | | private: |
238 | | |
239 | | static int cmpX(const void *p1, const void *p2); |
240 | | |
241 | | GList *words; // the line's words, as returned by getWords() [TextWord] |
242 | | int rot; // rotation, multiple of 90 degrees |
243 | | // (0, 1, 2, or 3) |
244 | | double xMin, xMax; // bounding box x coordinates |
245 | | double yMin, yMax; // bounding box y coordinates |
246 | | double fontSize; // main (max) font size for this line |
247 | | Unicode *text; // Unicode text of the line, including |
248 | | // spaces between words |
249 | | double *edge; // "near" edge x or y coord of each char |
250 | | // (plus one extra entry for the last char) |
251 | | int len; // number of Unicode chars |
252 | | GBool hyphenated; // set if last char is a hyphen |
253 | | int px; // x offset (in characters, relative to |
254 | | // containing column) in physical layout mode |
255 | | int pw; // line width (in characters) in physical |
256 | | // layout mode |
257 | | |
258 | | friend class TextSuperLine; |
259 | | friend class TextPage; |
260 | | friend class TextParagraph; |
261 | | }; |
262 | | |
263 | | //------------------------------------------------------------------------ |
264 | | // TextParagraph |
265 | | //------------------------------------------------------------------------ |
266 | | |
267 | | class TextParagraph { |
268 | | public: |
269 | | |
270 | | TextParagraph(GList *linesA, GBool dropCapA); |
271 | | ~TextParagraph(); |
272 | | |
273 | | // Get the list of TextLine objects (returns the stored list pointer itself, not a copy). |
274 | 0 | GList *getLines() { return lines; } |
275 | | |
276 | 0 | GBool hasDropCap() { return dropCap; } |
277 | | |
278 | 0 | double getXMin() { return xMin; } |
279 | 0 | double getYMin() { return yMin; } |
280 | 0 | double getXMax() { return xMax; } |
281 | 0 | double getYMax() { return yMax; } |
282 | | |
283 | | private: |
284 | | |
285 | | GList *lines; // [TextLine] |
286 | | GBool dropCap; // paragraph starts with a drop capital |
287 | | double xMin, xMax; // bounding box x coordinates |
288 | | double yMin, yMax; // bounding box y coordinates |
289 | | |
290 | | friend class TextPage; |
291 | | }; |
292 | | |
293 | | //------------------------------------------------------------------------ |
294 | | // TextColumn |
295 | | //------------------------------------------------------------------------ |
296 | | |
297 | | class TextColumn { |
298 | | public: |
299 | | |
300 | | TextColumn(GList *paragraphsA, double xMinA, double yMinA, |
301 | | double xMaxA, double yMaxA); |
302 | | ~TextColumn(); |
303 | | |
304 | | // Get the list of TextParagraph objects (returns the stored list pointer itself, not a copy). |
305 | 0 | GList *getParagraphs() { return paragraphs; } |
306 | | |
307 | 0 | double getXMin() { return xMin; } |
308 | 0 | double getYMin() { return yMin; } |
309 | 0 | double getXMax() { return xMax; } |
310 | 0 | double getYMax() { return yMax; } |
311 | | |
312 | | int getRotation(); |
313 | | |
314 | | private: |
315 | | |
316 | | static int cmpX(const void *p1, const void *p2); |
317 | | static int cmpY(const void *p1, const void *p2); |
318 | | static int cmpPX(const void *p1, const void *p2); |
319 | | |
320 | | GList *paragraphs; // [TextParagraph] |
321 | | double xMin, xMax; // bounding box x coordinates |
322 | | double yMin, yMax; // bounding box y coordinates |
323 | | int px, py; // x, y position (in characters) in physical |
324 | | // layout mode |
325 | | int pw, ph; // column width, height (in characters) in |
326 | | // physical layout mode |
327 | | |
328 | | friend class TextPage; |
329 | | }; |
330 | | |
331 | | //------------------------------------------------------------------------ |
332 | | // TextWordList |
333 | | //------------------------------------------------------------------------ |
334 | | |
335 | | class TextWordList { |
336 | | public: |
337 | | |
338 | | TextWordList(GList *wordsA, GBool primaryLRA); |
339 | | |
340 | | ~TextWordList(); |
341 | | |
342 | | // Return the number of words on the list. |
343 | | int getLength(); |
344 | | |
345 | | // Return the <idx>th word from the list. |
346 | | TextWord *get(int idx); |
347 | | |
348 | | // Returns true if primary direction is left-to-right, or false if |
349 | | // right-to-left. |
350 | 0 | GBool getPrimaryLR() { return primaryLR; } |
351 | | |
352 | | private: |
353 | | |
354 | | GList *words; // the words on this list [TextWord] |
355 | | GBool primaryLR; |
356 | | }; |
357 | | |
358 | | //------------------------------------------------------------------------ |
359 | | // TextPosition |
360 | | //------------------------------------------------------------------------ |
361 | | |
362 | | // Position within a TextColumn tree. The position is in column |
363 | | // [colIdx], paragraph [parIdx], line [lineIdx], before character |
364 | | // [charIdx]. The default constructor sets all four indexes to 0. |
365 | | class TextPosition { |
366 | | public: |
367 | | |
368 | 0 | TextPosition(): colIdx(0), parIdx(0), lineIdx(0), charIdx(0) {} |
369 | | TextPosition(int colIdxA, int parIdxA, int lineIdxA, int charIdxA): |
370 | 0 | colIdx(colIdxA), parIdx(parIdxA), lineIdx(lineIdxA), charIdx(charIdxA) {} |
371 | | |
372 | | int operator==(TextPosition pos); |
373 | | int operator!=(TextPosition pos); |
374 | | int operator<(TextPosition pos); |
375 | | int operator>(TextPosition pos); |
376 | | |
377 | | int colIdx, parIdx, lineIdx, charIdx; |
378 | | }; |
379 | | |
380 | | //------------------------------------------------------------------------ |
381 | | // TextPage |
382 | | //------------------------------------------------------------------------ |
383 | | |
384 | | class TextPage { |
385 | | public: |
386 | | |
387 | | TextPage(TextOutputControl *controlA); |
388 | | ~TextPage(); |
389 | | |
390 | | // Write contents of page to a stream. |
391 | | void write(void *outputStream, TextOutputFunc outputFunc); |
392 | | |
393 | | // Find a string. If <startAtTop> is true, starts looking at the |
394 | | // top of the page; else if <startAtLast> is true, starts looking |
395 | | // immediately after the last find result; else starts looking at |
396 | | // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the |
397 | | // bottom of the page; else if <stopAtLast> is true, stops looking |
398 | | // just before the last find result; else stops looking at |
399 | | // <xMax>,<yMax>. |
400 | | GBool findText(Unicode *s, int len, |
401 | | GBool startAtTop, GBool stopAtBottom, |
402 | | GBool startAtLast, GBool stopAtLast, |
403 | | GBool caseSensitive, GBool backward, |
404 | | GBool wholeWord, |
405 | | double *xMin, double *yMin, |
406 | | double *xMax, double *yMax); |
407 | | |
408 | | // Get the text which is inside the specified rectangle. Multi-line |
409 | | // text always includes end-of-line markers at the end of each line. |
410 | | // If <forceEOL> is true, an end-of-line marker will be appended to |
411 | | // single-line text as well. |
412 | | GString *getText(double xMin, double yMin, |
413 | | double xMax, double yMax, |
414 | | GBool forceEOL = gFalse); |
415 | | |
416 | | // Find a string by character position and length. If found, sets |
417 | | // the text bounding rectangle and returns true; otherwise returns |
418 | | // false. |
419 | | GBool findCharRange(int pos, int length, |
420 | | double *xMin, double *yMin, |
421 | | double *xMax, double *yMax); |
422 | | |
423 | | // Returns true if x,y falls inside a column. |
424 | | GBool checkPointInside(double x, double y); |
425 | | |
426 | | // Find a point inside a column. Returns false if x,y fall outside |
427 | | // all columns. |
428 | | GBool findPointInside(double x, double y, TextPosition *pos); |
429 | | |
430 | | // Find a point in the nearest column. Returns false only if there |
431 | | // are no columns. |
432 | | GBool findPointNear(double x, double y, TextPosition *pos); |
433 | | |
434 | | // Find the start and end of a word inside a column. Returns false |
435 | | // if x,y fall outside all columns. |
436 | | GBool findWordPoints(double x, double y, |
437 | | TextPosition *startPos, TextPosition *endPos); |
438 | | |
439 | | // Find the start and end of a line inside a column. Returns false |
440 | | // if x,y fall outside all columns. |
441 | | GBool findLinePoints(double x, double y, |
442 | | TextPosition *startPos, TextPosition *endPos); |
443 | | |
444 | | // Get the upper point of a TextPosition. |
445 | | void convertPosToPointUpper(TextPosition *pos, double *x, double *y); |
446 | | |
447 | | // Get the lower point of a TextPosition. |
448 | | void convertPosToPointLower(TextPosition *pos, double *x, double *y); |
449 | | |
450 | | // Get the upper left corner of the line containing a TextPosition. |
451 | | void convertPosToPointLeftEdge(TextPosition *pos, double *x, double *y); |
452 | | |
453 | | // Get the lower right corner of the line containing a TextPosition. |
454 | | void convertPosToPointRightEdge(TextPosition *pos, double *x, double *y); |
455 | | |
456 | | // Get the upper right corner of a column. |
457 | | void getColumnUpperRight(int colIdx, double *x, double *y); |
458 | | |
459 | | // Get the lower left corner of a column. |
460 | | void getColumnLowerLeft(int colIdx, double *x, double *y); |
461 | | |
462 | | // Create and return a list of TextColumn objects. |
463 | | GList *makeColumns(); |
464 | | |
465 | | // Get the list of all TextFontInfo objects used on this page. |
466 | 0 | GList *getFonts() { return fonts; } |
467 | | |
468 | | // Build a flat word list. NOTE(review): no ordering argument is taken; the ordering presumably follows control.mode -- confirm. |
469 | | TextWordList *makeWordList(); |
470 | | |
471 | | // Build a word list containing only words inside the specified |
472 | | // rectangle. |
473 | | TextWordList *makeWordListForRect(double xMin, double yMin, |
474 | | double xMax, double yMax); |
475 | | |
476 | | // Get the primary rotation of text on the page. |
477 | 0 | int getPrimaryRotation() { return primaryRot; } |
478 | | |
479 | | // Returns true if the primary character direction is left-to-right, |
480 | | // false if it is right-to-left. |
481 | | GBool primaryDirectionIsLR(); |
482 | | |
483 | | // Get the counter values. |
484 | 0 | int getNumVisibleChars() { return nVisibleChars; } |
485 | 0 | int getNumInvisibleChars() { return nInvisibleChars; } |
486 | 0 | int getNumRemovedDupChars() { return nRemovedDupChars; } |
487 | | |
488 | | // Returns true if any of the fonts used on this page are likely to |
489 | | // be problematic when converting text to Unicode. |
490 | 0 | GBool problematicForUnicode() { return problematic; } |
491 | | |
492 | | // Add a 'special' character to this TextPage. This is currently |
493 | | // used by pdftohtml to insert markers for form fields. |
494 | | void addSpecialChar(double xMin, double yMin, double xMax, double yMax, |
495 | | int rot, TextFontInfo *font, double fontSize, |
496 | | Unicode u); |
497 | | |
498 | | // Remove characters that fall inside a region. |
499 | | void removeChars(double xMin, double yMin, double xMax, double yMax, |
500 | | double xOverlapThresh, double yOverlapThresh); |
501 | | |
502 | | private: |
503 | | |
504 | | void startPage(GfxState *state); |
505 | | void clear(); |
506 | | void updateFont(GfxState *state); |
507 | | void addChar(GfxState *state, double x, double y, |
508 | | double dx, double dy, |
509 | | CharCode c, int nBytes, Unicode *u, int uLen); |
510 | | void incCharCount(int nChars); |
511 | | void beginActualText(GfxState *state, Unicode *u, int uLen); |
512 | | void endActualText(GfxState *state); |
513 | | void addUnderline(double x0, double y0, double x1, double y1); |
514 | | void addLink(double xMin, double yMin, double xMax, double yMax, |
515 | | Link *link); |
516 | | |
517 | | // output |
518 | | void writeReadingOrder(void *outputStream, |
519 | | TextOutputFunc outputFunc, |
520 | | UnicodeMap *uMap, |
521 | | char *space, int spaceLen, |
522 | | char *eol, int eolLen); |
523 | | void writePhysLayout(void *outputStream, |
524 | | TextOutputFunc outputFunc, |
525 | | UnicodeMap *uMap, |
526 | | char *space, int spaceLen, |
527 | | char *eol, int eolLen); |
528 | | void writeSimpleLayout(void *outputStream, |
529 | | TextOutputFunc outputFunc, |
530 | | UnicodeMap *uMap, |
531 | | char *space, int spaceLen, |
532 | | char *eol, int eolLen); |
533 | | void writeSimple2Layout(void *outputStream, |
534 | | TextOutputFunc outputFunc, |
535 | | UnicodeMap *uMap, |
536 | | char *space, int spaceLen, |
537 | | char *eol, int eolLen); |
538 | | void writeLinePrinter(void *outputStream, |
539 | | TextOutputFunc outputFunc, |
540 | | UnicodeMap *uMap, |
541 | | char *space, int spaceLen, |
542 | | char *eol, int eolLen); |
543 | | void writeRaw(void *outputStream, |
544 | | TextOutputFunc outputFunc, |
545 | | UnicodeMap *uMap, |
546 | | char *space, int spaceLen, |
547 | | char *eol, int eolLen); |
548 | | void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, |
549 | | GBool primaryLR, GString *s); |
550 | | GBool unicodeEffectiveTypeLOrNum(Unicode u, Unicode left, Unicode right); |
551 | | GBool unicodeEffectiveTypeR(Unicode u, Unicode left, Unicode right); |
552 | | |
553 | | // analysis |
554 | | int rotateChars(GList *charsA); |
555 | | void rotateCharsToZero(GList *charsA); |
556 | | void rotateUnderlinesAndLinks(int rot); |
557 | | void unrotateChars(GList *charsA, int rot); |
558 | | void unrotateCharsFromZero(GList *charsA); |
559 | | void unrotateColumnsFromZero(GList *columns); |
560 | | void unrotateColumns(GList *columns, int rot); |
561 | | void unrotateWords(GList *words, int rot); |
562 | | GBool checkPrimaryLR(GList *charsA); |
563 | | void removeDuplicates(GList *charsA, int rot); |
564 | | GList *separateOverlappingText(GList *charsA); |
565 | | TextColumn *buildOverlappingTextColumn(GList *overlappingChars); |
566 | | TextBlock *splitChars(GList *charsA); |
567 | | TextBlock *split(GList *charsA, int rot); |
568 | | GList *getChars(GList *charsA, double xMin, double yMin, |
569 | | double xMax, double yMax); |
570 | | void findGaps(GList *charsA, int rot, |
571 | | double *xMinOut, double *yMinOut, |
572 | | double *xMaxOut, double *yMaxOut, |
573 | | double *avgFontSizeOut, double *minFontSizeOut, |
574 | | GList *splitLines, |
575 | | TextGaps *horizGaps, TextGaps *vertGaps); |
576 | | void mergeSplitLines(GList *charsA, int rot, GList *splitLines); |
577 | | void tagBlock(TextBlock *blk); |
578 | | void insertLargeChars(GList *largeChars, TextBlock *blk); |
579 | | void insertLargeCharsInFirstLeaf(GList *largeChars, TextBlock *blk); |
580 | | void insertLargeCharInLeaf(TextChar *ch, TextBlock *blk); |
581 | | void insertIntoTree(TextBlock *subtree, TextBlock *primaryTree, |
582 | | GBool doReorder); |
583 | | void reorderBlocks(TextBlock *blk); |
584 | | void insertColumnIntoTree(TextBlock *column, TextBlock *tree); |
585 | | void insertClippedChars(GList *clippedChars, TextBlock *tree); |
586 | | TextBlock *findClippedCharLeaf(TextChar *ch, TextBlock *tree); |
587 | | GList *buildColumns(TextBlock *tree, GBool primaryLR); |
588 | | void buildColumns2(TextBlock *blk, GList *columns, GBool primaryLR); |
589 | | TextColumn *buildColumn(TextBlock *tree); |
590 | | double getLineIndent(TextLine *line, TextBlock *blk); |
591 | | double getAverageLineSpacing(GList *lines); |
592 | | double getLineSpacing(TextLine *line0, TextLine *line1); |
593 | | void buildLines(TextBlock *blk, GList *lines, GBool splitSuperLines); |
594 | | GList *buildSimple2Columns(GList *charsA); |
595 | | GList *buildSimple2Lines(GList *charsA, int rot); |
596 | | TextLine *buildLine(TextBlock *blk); |
597 | | TextLine *buildLine(GList *charsA, int rot, |
598 | | double xMin, double yMin, double xMax, double yMax); |
599 | | void getLineChars(TextBlock *blk, GList *charsA); |
600 | | double computeWordSpacingThreshold(GList *charsA, int rot); |
601 | | void adjustCombiningChars(GList *charsA, int rot); |
602 | | int getCharDirection(TextChar *ch); |
603 | | int getCharDirection(TextChar *ch, TextChar *left, TextChar *right); |
604 | | int assignPhysLayoutPositions(GList *columns); |
605 | | void assignLinePhysPositions(GList *columns); |
606 | | void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap); |
607 | | int assignColumnPhysPositions(GList *columns); |
608 | | void buildSuperLines(TextBlock *blk, GList *superLines); |
609 | | void assignSimpleLayoutPositions(GList *superLines, UnicodeMap *uMap); |
610 | | void generateUnderlinesAndLinks(GList *columns); |
611 | | void findPointInColumn(TextColumn *col, double x, double y, |
612 | | TextPosition *pos); |
613 | | void buildFindCols(); |
614 | | |
615 | | // debug |
616 | | #if 0 //~debug |
617 | | void dumpChars(GList *charsA); |
618 | | void dumpTree(TextBlock *tree, int indent = 0); |
619 | | void dumpColumns(GList *columns, GBool dumpWords = gFalse); |
620 | | void dumpUnderlines(); |
621 | | #endif |
622 | | |
623 | | // word list |
624 | | TextWordList *makeWordListForChars(GList *charList); |
625 | | |
626 | | TextOutputControl control; // formatting parameters |
627 | | |
628 | | UnicodeRemapping *remapping; |
629 | | Unicode *uBuf; |
630 | | int uBufSize; |
631 | | |
632 | | double pageWidth, pageHeight; // width and height of current page |
633 | | int charPos; // next character position (within content |
634 | | // stream) |
635 | | TextFontInfo *curFont; // current font |
636 | | double curFontSize; // current font size |
637 | | int curRot; // current rotation |
638 | | GBool diagonal; // set if rotation is not close to |
639 | | // 0/90/180/270 degrees |
640 | | GBool rotated; // set if text is not horizontal (0 degrees) |
641 | | int nTinyChars; // number of "tiny" chars seen so far |
642 | | Unicode *actualText; // current "ActualText" span |
643 | | int actualTextLen; |
644 | | double actualTextX0, |
645 | | actualTextY0, |
646 | | actualTextX1, |
647 | | actualTextY1; |
648 | | int actualTextNBytes; |
649 | | |
650 | | GList *chars; // [TextChar] |
651 | | GList *fonts; // all font info objects used on this |
652 | | // page [TextFontInfo] |
653 | | int primaryRot; // primary rotation |
654 | | |
655 | | GList *underlines; // [TextUnderline] |
656 | | GList *links; // [TextLink] |
657 | | |
658 | | int nVisibleChars; // number of visible chars on the page |
659 | | int nInvisibleChars; // number of invisible chars on the page |
660 | | int nRemovedDupChars; // number of duplicate chars removed |
661 | | |
662 | | GList *findCols; // text used by the findText**/findPoint** |
663 | | // functions [TextColumn] |
664 | | double lastFindXMin, // coordinates of the last "find" result |
665 | | lastFindYMin; |
666 | | GBool haveLastFind; |
667 | | |
668 | | GBool problematic; // true if any of the fonts used on this |
669 | | // page are marked as problematic for |
670 | | // Unicode conversion |
671 | | |
672 | | friend class TextOutputDev; |
673 | | }; |
674 | | |
675 | | //------------------------------------------------------------------------ |
676 | | // TextOutputDev |
677 | | //------------------------------------------------------------------------ |
678 | | |
679 | | class TextOutputDev: public OutputDev { |
680 | | public: |
681 | | |
682 | | // Open a text output file. If <fileName> is NULL, no file is |
683 | | // written (this is useful, e.g., for searching text). The |
684 | | // formatting (reading order, physical layout, raw content stream |
685 | | // order, etc.) is selected through <controlA>; see |
686 | | // TextOutputControl and TextOutputMode. |
687 | | TextOutputDev(char *fileName, TextOutputControl *controlA, |
688 | | GBool append, GBool fileNameIsUTF8 = gFalse); |
689 | | |
690 | | // Create a TextOutputDev which will write to a generic stream. The |
691 | | // formatting (reading order, physical layout, raw content stream |
692 | | // order, etc.) is selected through <controlA>; see |
693 | | // TextOutputControl and TextOutputMode. |
694 | | TextOutputDev(TextOutputFunc func, void *stream, |
695 | | TextOutputControl *controlA); |
696 | | |
697 | | // Destructor. |
698 | | virtual ~TextOutputDev(); |
699 | | |
700 | | // Check if file was successfully created. |
701 | 0 | virtual GBool isOk() { return ok; } |
702 | | |
703 | | //---- get info about output device |
704 | | |
705 | | // Does this device use upside-down coordinates? |
706 | | // (Upside-down means (0,0) is the top left corner of the page.) |
707 | 0 | virtual GBool upsideDown() { return gTrue; } |
708 | | |
709 | | // Does this device use drawChar() or drawString()? |
710 | 0 | virtual GBool useDrawChar() { return gTrue; } |
711 | | |
712 | | // Does this device use beginType3Char/endType3Char? Otherwise, |
713 | | // text in Type 3 fonts will be drawn with drawChar/drawString. |
714 | 0 | virtual GBool interpretType3Chars() { return gFalse; } |
715 | | |
716 | | // Does this device need non-text content? |
717 | 0 | virtual GBool needNonText() { return gFalse; } |
718 | | |
719 | | // Does this device require incCharCount to be called for text on |
720 | | // non-shown layers? |
721 | 0 | virtual GBool needCharCount() { return gTrue; } |
722 | | |
723 | | //----- initialization and control |
724 | | |
725 | | // Start a page. |
726 | | virtual void startPage(int pageNum, GfxState *state); |
727 | | |
728 | | // End a page. |
729 | | virtual void endPage(); |
730 | | |
731 | | //----- save/restore graphics state |
732 | | virtual void restoreState(GfxState *state); |
733 | | |
734 | | //----- update text state |
735 | | virtual void updateFont(GfxState *state); |
736 | | |
737 | | //----- text drawing |
738 | | virtual void beginString(GfxState *state, GString *s); |
739 | | virtual void endString(GfxState *state); |
740 | | virtual void drawChar(GfxState *state, double x, double y, |
741 | | double dx, double dy, |
742 | | double originX, double originY, |
743 | | CharCode c, int nBytes, Unicode *u, int uLen, |
744 | | GBool fill, GBool stroke, GBool makePath); |
745 | | virtual void incCharCount(int nChars); |
746 | | virtual void beginActualText(GfxState *state, Unicode *u, int uLen); |
747 | | virtual void endActualText(GfxState *state); |
748 | | |
749 | | //----- path painting |
750 | | virtual void stroke(GfxState *state); |
751 | | virtual void fill(GfxState *state); |
752 | | virtual void eoFill(GfxState *state); |
753 | | |
754 | | //----- link borders |
755 | | virtual void processLink(Link *link); |
756 | | |
757 | | //----- special access |
758 | | |
759 | | // Find a string. If <startAtTop> is true, starts looking at the |
760 | | // top of the page; else if <startAtLast> is true, starts looking |
761 | | // immediately after the last find result; else starts looking at |
762 | | // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the |
763 | | // bottom of the page; else if <stopAtLast> is true, stops looking |
764 | | // just before the last find result; else stops looking at |
765 | | // <xMax>,<yMax>. |
766 | | GBool findText(Unicode *s, int len, |
767 | | GBool startAtTop, GBool stopAtBottom, |
768 | | GBool startAtLast, GBool stopAtLast, |
769 | | GBool caseSensitive, GBool backward, |
770 | | GBool wholeWord, |
771 | | double *xMin, double *yMin, |
772 | | double *xMax, double *yMax); |
773 | | |
774 | | // Get the text which is inside the specified rectangle. |
775 | | GString *getText(double xMin, double yMin, |
776 | | double xMax, double yMax); |
777 | | |
778 | | // Find a string by character position and length. If found, sets |
779 | | // the text bounding rectangle and returns true; otherwise returns |
780 | | // false. |
781 | | GBool findCharRange(int pos, int length, |
782 | | double *xMin, double *yMin, |
783 | | double *xMax, double *yMax); |
784 | | |
785 | | // Build a flat word list. NOTE(review): this comment used to refer |
786 | | // to this->rawOrder and this->physLayout, which are not members of |
787 | | // this class; the ordering presumably follows control.mode (see |
788 | | // TextOutputMode) -- confirm against the implementation. |
789 | | TextWordList *makeWordList(); |
790 | | |
791 | | // Build a word list containing only words inside the specified |
792 | | // rectangle. |
793 | | TextWordList *makeWordListForRect(double xMin, double yMin, |
794 | | double xMax, double yMax); |
795 | | |
796 | | // Returns the TextPage object for the last rasterized page, |
797 | | // transferring ownership to the caller. |
798 | | TextPage *takeText(); |
799 | | |
800 | | // Turn extra processing for HTML conversion on or off. |
801 | 0 | void enableHTMLExtras(GBool html) { control.html = html; } |
802 | | |
803 | | // Get the counter values (read from the current TextPage, <text>). |
804 | 0 | int getNumVisibleChars() { return text->nVisibleChars; } |
805 | 0 | int getNumInvisibleChars() { return text->nInvisibleChars; } |
806 | 0 | int getNumRemovedDupChars() { return text->nRemovedDupChars; } |
807 | | |
808 | | private: |
809 | | |
810 | | void generateBOM(); |
811 | | |
812 | | TextOutputFunc outputFunc; // output function |
813 | | void *outputStream; // output stream |
814 | | GBool needClose; // need to close the output file? |
815 | | // (only if outputStream is a FILE*) |
816 | | TextPage *text; // text for the current page |
817 | | TextOutputControl control; // formatting parameters |
818 | | GBool ok; // set up ok? |
819 | | }; |
820 | | |
821 | | #endif |