/src/mupdf/source/fitz/stext-output.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2025 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | |
25 | | #define SUBSCRIPT_OFFSET 0.2f |
26 | | #define SUPERSCRIPT_OFFSET -0.2f |
27 | | |
28 | | #include <ft2build.h> |
29 | | #include FT_FREETYPE_H |
30 | | |
31 | | // Text black color when converted from DeviceCMYK to RGB |
32 | 0 | #define CMYK_BLACK 0x221f1f |
33 | | |
34 | | static void |
35 | | scale_run(fz_context *ctx, fz_stext_block *block, float scale) |
36 | 0 | { |
37 | 0 | fz_matrix m = fz_scale(scale, scale); |
38 | 0 | fz_stext_line *line; |
39 | 0 | fz_stext_char *ch; |
40 | |
|
41 | 0 | while (block) |
42 | 0 | { |
43 | 0 | block->bbox = fz_transform_rect(block->bbox, m); |
44 | 0 | switch (block->type) |
45 | 0 | { |
46 | 0 | case FZ_STEXT_BLOCK_TEXT: |
47 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
48 | 0 | { |
49 | 0 | line->bbox = fz_transform_rect(block->bbox, m); |
50 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
51 | 0 | { |
52 | 0 | ch->origin = fz_transform_point(ch->origin, m); |
53 | 0 | ch->quad = fz_transform_quad(ch->quad, m); |
54 | 0 | ch->size = ch->size * scale; |
55 | 0 | } |
56 | 0 | } |
57 | 0 | break; |
58 | | |
59 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
60 | 0 | block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale); |
61 | 0 | break; |
62 | | |
63 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
64 | 0 | if (block->u.s.down) |
65 | 0 | scale_run(ctx, block->u.s.down->first_block, scale); |
66 | 0 | break; |
67 | 0 | } |
68 | 0 | block = block->next; |
69 | 0 | } |
70 | 0 | } |
71 | | |
72 | | static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale) |
73 | 0 | { |
74 | 0 | scale_run(ctx, page->first_block, scale); |
75 | 0 | } |
76 | | |
77 | | /* HTML output (visual formatting with preserved layout) */ |
78 | | |
79 | | static int |
80 | | detect_super_script(fz_stext_line *line, fz_stext_char *ch) |
81 | 0 | { |
82 | 0 | if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) |
83 | 0 | return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; |
84 | 0 | return 0; |
85 | 0 | } |
86 | | |
87 | | static const char * |
88 | | font_full_name(fz_context *ctx, fz_font *font) |
89 | 0 | { |
90 | 0 | const char *name = fz_font_name(ctx, font); |
91 | 0 | const char *s = strchr(name, '+'); |
92 | 0 | return s ? s + 1 : name; |
93 | 0 | } |
94 | | |
/*
	Map a font name onto a common HTML font family where a well known
	substring is recognised.

	"Times" wins over everything else. "Arial" or "Helvetica" map to
	"Arial", or to "Arial Narrow" when the name also mentions "Narrow"
	or "Condensed". "Courier" maps to "Courier". Any other name is
	returned unchanged (the same pointer that was passed in).
*/
static const char *
html_clean_font_name(const char *fontname)
{
	int arial_like;
	int narrow;

	if (strstr(fontname, "Times") != NULL)
		return "Times New Roman";

	arial_like = strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL;
	if (!arial_like)
		return strstr(fontname, "Courier") != NULL ? "Courier" : fontname;

	narrow = strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL;
	return narrow ? "Arial Narrow" : "Arial";
}
110 | | |
111 | | static void |
112 | | font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif) |
113 | 0 | { |
114 | 0 | const char *name = html_clean_font_name(font_full_name(ctx, font)); |
115 | 0 | char *s; |
116 | 0 | fz_strlcpy(buf, name, size); |
117 | 0 | s = strrchr(buf, '-'); |
118 | 0 | if (s) |
119 | 0 | *s = 0; |
120 | 0 | if (is_mono) |
121 | 0 | fz_strlcat(buf, ",monospace", size); |
122 | 0 | else |
123 | 0 | fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size); |
124 | 0 | } |
125 | | |
126 | | static void |
127 | | fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color) |
128 | 0 | { |
129 | 0 | char family[80]; |
130 | |
|
131 | 0 | int is_bold = fz_font_is_bold(ctx, font); |
132 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
133 | 0 | int is_serif = fz_font_is_serif(ctx, font); |
134 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
135 | |
|
136 | 0 | font_family_name(ctx, font, family, sizeof family, is_mono, is_serif); |
137 | |
|
138 | 0 | if (sup) fz_write_string(ctx, out, "<sup>"); |
139 | 0 | if (is_mono) fz_write_string(ctx, out, "<tt>"); |
140 | 0 | if (is_bold) fz_write_string(ctx, out, "<b>"); |
141 | 0 | if (is_italic) fz_write_string(ctx, out, "<i>"); |
142 | 0 | fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size); |
143 | 0 | if (color != 0 && color != CMYK_BLACK) |
144 | 0 | fz_write_printf(ctx, out, ";color:#%06x", color & 0xffffff); |
145 | 0 | fz_write_printf(ctx, out, "\">"); |
146 | 0 | } |
147 | | |
148 | | static void |
149 | | fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color) |
150 | 0 | { |
151 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
152 | 0 | int is_bold = fz_font_is_bold(ctx,font); |
153 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
154 | |
|
155 | 0 | fz_write_string(ctx, out, "</span>"); |
156 | 0 | if (is_italic) fz_write_string(ctx, out, "</i>"); |
157 | 0 | if (is_bold) fz_write_string(ctx, out, "</b>"); |
158 | 0 | if (is_mono) fz_write_string(ctx, out, "</tt>"); |
159 | 0 | if (sup) fz_write_string(ctx, out, "</sup>"); |
160 | 0 | } |
161 | | |
162 | | static void |
163 | | fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
164 | 0 | { |
165 | 0 | fz_matrix ctm = block->u.i.transform; |
166 | |
|
167 | 0 | #define USE_CSS_MATRIX_TRANSFORMS |
168 | 0 | #ifdef USE_CSS_MATRIX_TRANSFORMS |
169 | | /* Matrix maths notes. |
170 | | * When we get here ctm maps the unit square to the position in device |
171 | | * space occupied by the image. |
172 | | * |
173 | | * That is to say that mapping the 4 corners of the unit square through |
174 | | * the transform, give us the 4 target corners. We extend the corners |
175 | | * by adding an extra '1' into them to allow transforms to work. Thus |
176 | | * (x,y) maps through ctm = (a b c d e f) as: |
177 | | * |
178 | | * (x y 1) (a b 0) = (X Y 1) |
179 | | * (c d 0) |
180 | | * (e f 1) |
181 | | * |
182 | | * To simplify reading of matrix maths, we use the trick where we |
183 | | * 'drop' the first matrix down the page. Thus the corners c0=(0,0), |
184 | | * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively: |
185 | | * |
186 | | * ( a b 0) |
187 | | * ( c d 0) |
188 | | * ( e f 1) |
189 | | * (0 0 1) ( e f 1) |
190 | | * (0 1 1) ( c+e d+f 1) |
191 | | * (1 1 1) (a+c+e b+d+f 1) |
192 | | * (1 0 1) ( a+e b+f 1) |
193 | | * |
194 | | * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f) |
195 | | * |
196 | | * Unfortunately, the CSS matrix transform, does not map the unit square. |
197 | | * Rather it does something moderately mad. As far as I can work out, the |
198 | | * top left corner of a (0,0) -> (w, h) box is transformed using the .e |
199 | | * and .f entries of the matrix. Then the image from within that square |
200 | | * is transformed using the centre of that square as the origin. |
201 | | * |
202 | | * So, an image placed at (0,0) in destination space with 1:1 transform |
203 | | * will result in an image a (0,0) as you'd expect. But an image at (0,0) |
204 | | * with a scale of 2, will result in 25% of the image off the left of the |
205 | | * screen, and 25% off the top. |
206 | | * |
207 | | * Accordingly, we have to adjust the ctm in several steps. |
208 | | */ |
209 | | /* Move to moving the centre of the image. */ |
210 | 0 | ctm.e += (ctm.a+ctm.c)/2; |
211 | 0 | ctm.f += (ctm.b+ctm.d)/2; |
212 | | /* Move from transforming the unit square to w/h */ |
213 | 0 | ctm.a /= block->u.i.image->w; |
214 | 0 | ctm.b /= block->u.i.image->w; |
215 | 0 | ctm.c /= block->u.i.image->h; |
216 | 0 | ctm.d /= block->u.i.image->h; |
217 | | /* Move from points to pixels */ |
218 | 0 | ctm.a *= 96.0f/72; |
219 | 0 | ctm.b *= 96.0f/72; |
220 | 0 | ctm.c *= 96.0f/72; |
221 | 0 | ctm.d *= 96.0f/72; |
222 | 0 | ctm.e *= 96.0f/72; |
223 | 0 | ctm.f *= 96.0f/72; |
224 | | /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */ |
225 | 0 | ctm.e -= block->u.i.image->w/2; |
226 | 0 | ctm.f -= block->u.i.image->h/2; |
227 | |
|
228 | 0 | fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"", |
229 | 0 | ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f); |
230 | | #else |
231 | | /* Alternative version of the code that uses scaleX/Y and rotate |
232 | | * instead, but only copes with axis aligned cases. */ |
233 | | int t; |
234 | | |
235 | | int x = block->bbox.x0; |
236 | | int y = block->bbox.y0; |
237 | | int w = block->bbox.x1 - block->bbox.x0; |
238 | | int h = block->bbox.y1 - block->bbox.y0; |
239 | | |
240 | | const char *flip = ""; |
241 | | |
242 | | if (ctm.b == 0 && ctm.c == 0) |
243 | | { |
244 | | if (ctm.a < 0 && ctm.d < 0) |
245 | | flip = "transform: scaleX(-1) scaleY(-1);"; |
246 | | else if (ctm.a < 0) |
247 | | { |
248 | | flip = "transform: scaleX(-1);"; |
249 | | } |
250 | | else if (ctm.d < 0) |
251 | | { |
252 | | flip = "transform: scaleY(-1);"; |
253 | | } |
254 | | } else if (ctm.a == 0 && ctm.d == 0) { |
255 | | if (ctm.b < 0 && ctm.c < 0) |
256 | | { |
257 | | flip = "transform: scaleY(-1) rotate(90deg);"; |
258 | | x += (w-h)/2; |
259 | | y -= (w-h)/2; |
260 | | t = w; w = h; h = t; |
261 | | } |
262 | | else if (ctm.b < 0) |
263 | | { |
264 | | flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);"; |
265 | | x += (w-h)/2; |
266 | | y -= (w-h)/2; |
267 | | t = w; w = h; h = t; |
268 | | } |
269 | | else if (ctm.c < 0) |
270 | | { |
271 | | flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);"; |
272 | | x += (w-h)/2; |
273 | | y -= (w-h)/2; |
274 | | t = w; w = h; h = t; |
275 | | } |
276 | | else |
277 | | { |
278 | | flip = "transform: scaleY(-1) rotate(270deg);"; |
279 | | x += (w-h)/2; |
280 | | y -= (w-h)/2; |
281 | | t = w; w = h; h = t; |
282 | | } |
283 | | } |
284 | | |
285 | | fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h); |
286 | | #endif |
287 | 0 | fz_write_image_as_data_uri(ctx, out, block->u.i.image); |
288 | 0 | fz_write_string(ctx, out, "\">\n"); |
289 | 0 | } |
290 | | |
291 | | void |
292 | | fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
293 | 0 | { |
294 | 0 | fz_stext_line *line; |
295 | 0 | fz_stext_char *ch; |
296 | 0 | float x, y, h; |
297 | |
|
298 | 0 | fz_font *font = NULL; |
299 | 0 | float size = 0; |
300 | 0 | int sup = 0; |
301 | 0 | uint32_t color = 0; |
302 | |
|
303 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
304 | 0 | { |
305 | 0 | x = line->bbox.x0; |
306 | 0 | y = line->bbox.y0; |
307 | 0 | h = line->bbox.y1 - line->bbox.y0; |
308 | |
|
309 | 0 | if (line->first_char) |
310 | 0 | { |
311 | 0 | h = line->first_char->size; |
312 | 0 | y = line->first_char->origin.y - h * 0.8f; |
313 | 0 | } |
314 | |
|
315 | 0 | fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h); |
316 | 0 | font = NULL; |
317 | |
|
318 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
319 | 0 | { |
320 | 0 | int ch_sup = detect_super_script(line, ch); |
321 | 0 | if (ch->font != font || ch->size != size || ch_sup != sup || ch->argb != color) |
322 | 0 | { |
323 | 0 | if (font) |
324 | 0 | fz_print_style_end_html(ctx, out, font, size, sup, color); |
325 | 0 | font = ch->font; |
326 | 0 | size = ch->size; |
327 | 0 | color = ch->argb; |
328 | 0 | sup = ch_sup; |
329 | 0 | fz_print_style_begin_html(ctx, out, font, size, sup, color); |
330 | 0 | } |
331 | |
|
332 | 0 | switch (ch->c) |
333 | 0 | { |
334 | 0 | default: |
335 | 0 | if (ch->c >= 32 && ch->c <= 127) |
336 | 0 | fz_write_byte(ctx, out, ch->c); |
337 | 0 | else |
338 | 0 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
339 | 0 | break; |
340 | 0 | case '<': fz_write_string(ctx, out, "<"); break; |
341 | 0 | case '>': fz_write_string(ctx, out, ">"); break; |
342 | 0 | case '&': fz_write_string(ctx, out, "&"); break; |
343 | 0 | case '"': fz_write_string(ctx, out, """); break; |
344 | 0 | case '\'': fz_write_string(ctx, out, "'"); break; |
345 | 0 | } |
346 | 0 | } |
347 | | |
348 | 0 | if (font) |
349 | 0 | fz_print_style_end_html(ctx, out, font, size, sup, color); |
350 | |
|
351 | 0 | fz_write_string(ctx, out, "</p>\n"); |
352 | 0 | } |
353 | 0 | } |
354 | | |
355 | | static const char * |
356 | | html_tag_for_struct(fz_stext_struct *s) |
357 | 0 | { |
358 | 0 | const char *raw; |
359 | |
|
360 | 0 | if (s == NULL) |
361 | 0 | return "DIV"; |
362 | | |
363 | 0 | raw = s->raw; |
364 | 0 | if (raw == NULL) |
365 | 0 | raw = fz_structure_to_string(s->standard); |
366 | |
|
367 | 0 | if (!fz_strcasecmp(raw, "blockquote")) |
368 | 0 | return "blockquote"; |
369 | 0 | if (!fz_strcasecmp(raw, "title")) |
370 | 0 | return "h1"; |
371 | 0 | if (!fz_strcasecmp(raw, "sub")) |
372 | 0 | return "sub"; |
373 | 0 | if (!fz_strcasecmp(raw, "p")) |
374 | 0 | return "p"; |
375 | 0 | if (!fz_strcasecmp(raw, "h")) |
376 | 0 | return "h1"; /* Pick one! */ |
377 | 0 | if (!fz_strcasecmp(raw, "h1")) |
378 | 0 | return "h1"; |
379 | 0 | if (!fz_strcasecmp(raw, "h2")) |
380 | 0 | return "h2"; |
381 | 0 | if (!fz_strcasecmp(raw, "h3")) |
382 | 0 | return "h3"; |
383 | 0 | if (!fz_strcasecmp(raw, "h4")) |
384 | 0 | return "h4"; |
385 | 0 | if (!fz_strcasecmp(raw, "h5")) |
386 | 0 | return "h5"; |
387 | 0 | if (!fz_strcasecmp(raw, "h6")) |
388 | 0 | return "h6"; |
389 | | |
390 | 0 | if (!fz_strcasecmp(raw, "list")) |
391 | 0 | return "ul"; |
392 | 0 | if (!fz_strcasecmp(raw, "listitem")) |
393 | 0 | return "li"; |
394 | 0 | if (!fz_strcasecmp(raw, "table")) |
395 | 0 | return "table"; |
396 | 0 | if (!fz_strcasecmp(raw, "tr")) |
397 | 0 | return "tr"; |
398 | 0 | if (!fz_strcasecmp(raw, "th")) |
399 | 0 | return "th"; |
400 | 0 | if (!fz_strcasecmp(raw, "td")) |
401 | 0 | return "td"; |
402 | 0 | if (!fz_strcasecmp(raw, "thead")) |
403 | 0 | return "thead"; |
404 | 0 | if (!fz_strcasecmp(raw, "tbody")) |
405 | 0 | return "tbody"; |
406 | 0 | if (!fz_strcasecmp(raw, "tfoot")) |
407 | 0 | return "tfoot"; |
408 | | |
409 | 0 | if (!fz_strcasecmp(raw, "span")) |
410 | 0 | return "span"; |
411 | 0 | if (!fz_strcasecmp(raw, "code")) |
412 | 0 | return "code"; |
413 | 0 | if (!fz_strcasecmp(raw, "em")) |
414 | 0 | return "em"; |
415 | 0 | if (!fz_strcasecmp(raw, "strong")) |
416 | 0 | return "strong"; |
417 | | |
418 | 0 | return "div"; |
419 | 0 | } |
420 | | |
421 | | static void |
422 | | print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block); |
423 | | |
424 | | static void |
425 | | fz_print_stext_struct_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
426 | 0 | { |
427 | 0 | const char *tag; |
428 | |
|
429 | 0 | if (block->u.s.down == NULL) |
430 | 0 | return; |
431 | | |
432 | 0 | tag = html_tag_for_struct(block->u.s.down); |
433 | |
|
434 | 0 | fz_write_printf(ctx, out, "<%s>\n", tag); |
435 | |
|
436 | 0 | print_blocks_as_html(ctx, out, block->u.s.down->first_block); |
437 | |
|
438 | 0 | fz_write_printf(ctx, out, "</%s>\n", tag); |
439 | 0 | } |
440 | | |
441 | | static void |
442 | | print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
443 | 0 | { |
444 | 0 | for (; block; block = block->next) |
445 | 0 | { |
446 | 0 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
447 | 0 | fz_print_stext_image_as_html(ctx, out, block); |
448 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
449 | 0 | fz_print_stext_block_as_html(ctx, out, block); |
450 | 0 | else if (block->type == FZ_STEXT_BLOCK_STRUCT) |
451 | 0 | fz_print_stext_struct_as_html(ctx, out, block); |
452 | 0 | } |
453 | 0 | } |
454 | | |
455 | | void |
456 | | fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) |
457 | 0 | { |
458 | 0 | float w = page->mediabox.x1 - page->mediabox.x0; |
459 | 0 | float h = page->mediabox.y1 - page->mediabox.y0; |
460 | |
|
461 | 0 | fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h); |
462 | |
|
463 | 0 | print_blocks_as_html(ctx, out, page->first_block); |
464 | |
|
465 | 0 | fz_write_string(ctx, out, "</div>\n"); |
466 | 0 | } |
467 | | |
468 | | void |
469 | | fz_print_stext_header_as_html(fz_context *ctx, fz_output *out) |
470 | 0 | { |
471 | 0 | fz_write_string(ctx, out, "<!DOCTYPE html>\n"); |
472 | 0 | fz_write_string(ctx, out, "<html>\n"); |
473 | 0 | fz_write_string(ctx, out, "<head>\n"); |
474 | 0 | fz_write_string(ctx, out, "<style>\n"); |
475 | 0 | fz_write_string(ctx, out, "body{background-color:slategray}\n"); |
476 | 0 | fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n"); |
477 | 0 | fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n"); |
478 | 0 | fz_write_string(ctx, out, "</style>\n"); |
479 | 0 | fz_write_string(ctx, out, "</head>\n"); |
480 | 0 | fz_write_string(ctx, out, "<body>\n"); |
481 | 0 | } |
482 | | |
483 | | void |
484 | | fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out) |
485 | 0 | { |
486 | 0 | fz_write_string(ctx, out, "</body>\n"); |
487 | 0 | fz_write_string(ctx, out, "</html>\n"); |
488 | 0 | } |
489 | | |
490 | | /* XHTML output (semantic, little layout, suitable for reflow) */ |
491 | | |
492 | | static void |
493 | | find_table_pos(fz_stext_grid_positions *xs, float x0, float x1, int *ix0, int *ix1) |
494 | 0 | { |
495 | 0 | int i; |
496 | |
|
497 | 0 | *ix0 = -1; |
498 | 0 | *ix1 = -1; |
499 | |
|
500 | 0 | for (i = 1; i < xs->len; i++) |
501 | 0 | if (x0 < xs->list[i].pos) |
502 | 0 | { |
503 | 0 | *ix0 = i-1; |
504 | 0 | break; |
505 | 0 | } |
506 | 0 | for (; i < xs->len; i++) |
507 | 0 | if (x1 < xs->list[i].pos) |
508 | 0 | { |
509 | 0 | *ix1 = i-1; |
510 | 0 | break; |
511 | 0 | } |
512 | 0 | if (i == xs->len) |
513 | 0 | *ix1 = i-1; |
514 | 0 | } |
515 | | |
516 | | static void |
517 | | run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out); |
518 | | |
519 | | static void |
520 | | fz_print_stext_table_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
521 | 0 | { |
522 | 0 | fz_stext_block *grid, *tr, *td; |
523 | 0 | int w, h; |
524 | 0 | int x, y; |
525 | 0 | uint8_t *cells; |
526 | 0 | int malformed = 0; |
527 | |
|
528 | 0 | for (grid = block; grid != NULL; grid = grid->next) |
529 | 0 | if (grid->type == FZ_STEXT_BLOCK_GRID) |
530 | 0 | break; |
531 | 0 | if (grid == NULL) |
532 | 0 | { |
533 | 0 | fz_warn(ctx, "Malformed table data"); |
534 | 0 | return; |
535 | 0 | } |
536 | 0 | w = grid->u.b.xs->len; |
537 | 0 | h = grid->u.b.ys->len; |
538 | 0 | cells = fz_calloc(ctx, w, h); |
539 | |
|
540 | 0 | fz_try(ctx) |
541 | 0 | { |
542 | 0 | fz_write_printf(ctx, out, "<table>\n"); |
543 | |
|
544 | 0 | y = 0; |
545 | 0 | for (tr = grid->next; tr != NULL; tr = tr->next) |
546 | 0 | { |
547 | 0 | if (tr->type != FZ_STEXT_BLOCK_STRUCT || tr->u.s.down == NULL || tr->u.s.down->standard != FZ_STRUCTURE_TR) |
548 | 0 | { |
549 | 0 | malformed = 1; |
550 | 0 | continue; |
551 | 0 | } |
552 | 0 | fz_write_printf(ctx, out, "<tr>\n"); |
553 | 0 | x = 0; |
554 | 0 | for (td = tr->u.s.down->first_block; td != NULL; td = td->next) |
555 | 0 | { |
556 | 0 | int x0, y0, x1, y1; |
557 | 0 | if (td->type != FZ_STEXT_BLOCK_STRUCT || td->u.s.down == NULL || td->u.s.down->standard != FZ_STRUCTURE_TD) |
558 | 0 | { |
559 | 0 | malformed = 1; |
560 | 0 | continue; |
561 | 0 | } |
562 | 0 | find_table_pos(grid->u.b.xs, td->bbox.x0, td->bbox.x1, &x0, &x1); |
563 | 0 | find_table_pos(grid->u.b.ys, td->bbox.y0, td->bbox.y1, &y0, &y1); |
564 | 0 | if (x0 < 0 || x1 < 0 || x1 >= w) |
565 | 0 | { |
566 | 0 | malformed = 1; |
567 | 0 | x0 = x; |
568 | 0 | x1 = x+1; |
569 | 0 | } |
570 | 0 | if (y0 < 0 || y1 < 0 || y1 >= h) |
571 | 0 | { |
572 | 0 | malformed = 1; |
573 | 0 | y0 = y; |
574 | 0 | y1 = y+1; |
575 | 0 | } |
576 | 0 | if (y < y0) |
577 | 0 | { |
578 | 0 | malformed = 1; |
579 | 0 | continue; |
580 | 0 | } |
581 | 0 | if (x > x0) |
582 | 0 | { |
583 | 0 | malformed = 1; |
584 | 0 | } |
585 | 0 | while (x < x0) |
586 | 0 | { |
587 | 0 | uint8_t *c = &cells[x + w*y]; |
588 | 0 | if (*c == 0) |
589 | 0 | { |
590 | 0 | fz_write_printf(ctx, out, "<td></td>"); |
591 | 0 | *c = 1; |
592 | 0 | } |
593 | 0 | x++; |
594 | 0 | } |
595 | 0 | fz_write_string(ctx, out, "<td"); |
596 | 0 | if (x1 > x0+1) |
597 | 0 | fz_write_printf(ctx, out, " rowspan=%d", x1-x0); |
598 | 0 | if (y1 > y0+1) |
599 | 0 | fz_write_printf(ctx, out, " colspan=%d", y1-y0); |
600 | 0 | fz_write_string(ctx, out, ">\n"); |
601 | 0 | run_to_xhtml(ctx, td->u.s.down->first_block, out); |
602 | 0 | fz_write_printf(ctx, out, "</td>\n"); |
603 | 0 | for ( ; y0 < y1; y0++) |
604 | 0 | for (x = x0; x < x1; x++) |
605 | 0 | { |
606 | 0 | uint8_t *c = &cells[x + w*y0]; |
607 | 0 | if (*c != 0) |
608 | 0 | malformed = 1; |
609 | 0 | *c = 1; |
610 | 0 | } |
611 | 0 | } |
612 | 0 | fz_write_printf(ctx, out, "</tr>\n"); |
613 | 0 | y++; |
614 | 0 | } |
615 | |
|
616 | 0 | fz_write_printf(ctx, out, "</table>\n"); |
617 | 0 | } |
618 | 0 | fz_always(ctx) |
619 | 0 | fz_free(ctx, cells); |
620 | 0 | fz_catch(ctx) |
621 | 0 | fz_rethrow(ctx); |
622 | | |
623 | 0 | if (malformed) |
624 | 0 | fz_warn(ctx, "Malformed table data"); |
625 | 0 | } |
626 | | |
627 | | static void |
628 | | fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
629 | 0 | { |
630 | 0 | int w = block->bbox.x1 - block->bbox.x0; |
631 | 0 | int h = block->bbox.y1 - block->bbox.y0; |
632 | |
|
633 | 0 | fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h); |
634 | 0 | fz_write_image_as_data_uri(ctx, out, block->u.i.image); |
635 | 0 | fz_write_string(ctx, out, "\"/></p>\n"); |
636 | 0 | } |
637 | | |
638 | | static void |
639 | | fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup) |
640 | 0 | { |
641 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
642 | 0 | int is_bold = fz_font_is_bold(ctx, font); |
643 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
644 | |
|
645 | 0 | if (sup) |
646 | 0 | fz_write_string(ctx, out, "<sup>"); |
647 | 0 | if (is_mono) |
648 | 0 | fz_write_string(ctx, out, "<tt>"); |
649 | 0 | if (is_bold) |
650 | 0 | fz_write_string(ctx, out, "<b>"); |
651 | 0 | if (is_italic) |
652 | 0 | fz_write_string(ctx, out, "<i>"); |
653 | 0 | } |
654 | | |
655 | | static void |
656 | | fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup) |
657 | 0 | { |
658 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
659 | 0 | int is_bold = fz_font_is_bold(ctx, font); |
660 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
661 | |
|
662 | 0 | if (is_italic) |
663 | 0 | fz_write_string(ctx, out, "</i>"); |
664 | 0 | if (is_bold) |
665 | 0 | fz_write_string(ctx, out, "</b>"); |
666 | 0 | if (is_mono) |
667 | 0 | fz_write_string(ctx, out, "</tt>"); |
668 | 0 | if (sup) |
669 | 0 | fz_write_string(ctx, out, "</sup>"); |
670 | 0 | } |
671 | | |
672 | | static float avg_font_size_of_line(fz_stext_char *ch) |
673 | 0 | { |
674 | 0 | float size = 0; |
675 | 0 | int n = 0; |
676 | 0 | if (!ch) |
677 | 0 | return 0; |
678 | 0 | while (ch) |
679 | 0 | { |
680 | 0 | size += ch->size; |
681 | 0 | ++n; |
682 | 0 | ch = ch->next; |
683 | 0 | } |
684 | 0 | return size / n; |
685 | 0 | } |
686 | | |
/*
	Pick a block level tag from a font size: "h1" from 20 upwards,
	"h2" from 15, "h3" from 12, and "p" for anything smaller.
*/
static const char *tag_from_font_size(float size)
{
	const char *tag = "p";

	/* Later checks override earlier ones, so the largest threshold wins. */
	if (size >= 12)
		tag = "h3";
	if (size >= 15)
		tag = "h2";
	if (size >= 20)
		tag = "h1";
	return tag;
}
694 | | |
695 | | static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
696 | 0 | { |
697 | 0 | fz_stext_line *line; |
698 | 0 | fz_stext_char *ch; |
699 | |
|
700 | 0 | fz_font *font = NULL; |
701 | 0 | int sup = 0; |
702 | 0 | int sp = 1; |
703 | 0 | const char *tag = NULL; |
704 | 0 | const char *new_tag; |
705 | |
|
706 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
707 | 0 | { |
708 | 0 | new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char)); |
709 | 0 | if (tag != new_tag) |
710 | 0 | { |
711 | 0 | if (tag) |
712 | 0 | { |
713 | 0 | if (font) |
714 | 0 | fz_print_style_end_xhtml(ctx, out, font, sup); |
715 | 0 | fz_write_printf(ctx, out, "</%s>", tag); |
716 | 0 | } |
717 | 0 | tag = new_tag; |
718 | 0 | fz_write_printf(ctx, out, "<%s>", tag); |
719 | 0 | if (font) |
720 | 0 | fz_print_style_begin_xhtml(ctx, out, font, sup); |
721 | 0 | } |
722 | |
|
723 | 0 | if (!sp) |
724 | 0 | fz_write_byte(ctx, out, ' '); |
725 | |
|
726 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
727 | 0 | { |
728 | 0 | int ch_sup = detect_super_script(line, ch); |
729 | 0 | if (ch->font != font || ch_sup != sup) |
730 | 0 | { |
731 | 0 | if (font) |
732 | 0 | fz_print_style_end_xhtml(ctx, out, font, sup); |
733 | 0 | font = ch->font; |
734 | 0 | sup = ch_sup; |
735 | 0 | fz_print_style_begin_xhtml(ctx, out, font, sup); |
736 | 0 | } |
737 | |
|
738 | 0 | sp = (ch->c == ' '); |
739 | 0 | switch (ch->c) |
740 | 0 | { |
741 | 0 | default: |
742 | 0 | if (ch->c >= 32 && ch->c <= 127) |
743 | 0 | fz_write_byte(ctx, out, ch->c); |
744 | 0 | else |
745 | 0 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
746 | 0 | break; |
747 | 0 | case '<': fz_write_string(ctx, out, "<"); break; |
748 | 0 | case '>': fz_write_string(ctx, out, ">"); break; |
749 | 0 | case '&': fz_write_string(ctx, out, "&"); break; |
750 | 0 | case '"': fz_write_string(ctx, out, """); break; |
751 | 0 | case '\'': fz_write_string(ctx, out, "'"); break; |
752 | 0 | } |
753 | 0 | } |
754 | 0 | } |
755 | | |
756 | 0 | if (font) |
757 | 0 | fz_print_style_end_xhtml(ctx, out, font, sup); |
758 | 0 | fz_write_printf(ctx, out, "</%s>\n", tag); |
759 | 0 | } |
760 | | |
761 | | static void |
762 | | fz_print_struct_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
763 | 0 | { |
764 | 0 | const char *tag; |
765 | |
|
766 | 0 | if (block->u.s.down == NULL) |
767 | 0 | return; |
768 | | |
769 | 0 | if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) |
770 | 0 | { |
771 | 0 | fz_print_stext_table_as_xhtml(ctx, out, block->u.s.down->first_block); |
772 | 0 | return; |
773 | 0 | } |
774 | | |
775 | 0 | tag = html_tag_for_struct(block->u.s.down); |
776 | |
|
777 | 0 | fz_write_printf(ctx, out, "<%s>\n", tag); |
778 | |
|
779 | 0 | run_to_xhtml(ctx, block->u.s.down->first_block, out); |
780 | |
|
781 | 0 | fz_write_printf(ctx, out, "</%s>\n", tag); |
782 | 0 | } |
783 | | |
784 | | static void |
785 | | run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out) |
786 | 0 | { |
787 | 0 | while (block) |
788 | 0 | { |
789 | 0 | switch(block->type) |
790 | 0 | { |
791 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
792 | 0 | fz_print_stext_image_as_xhtml(ctx, out, block); |
793 | 0 | break; |
794 | 0 | case FZ_STEXT_BLOCK_TEXT: |
795 | 0 | fz_print_stext_block_as_xhtml(ctx, out, block); |
796 | 0 | break; |
797 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
798 | 0 | fz_print_struct_as_xhtml(ctx, out, block); |
799 | 0 | break; |
800 | 0 | } |
801 | 0 | block = block->next; |
802 | 0 | } |
803 | 0 | } |
804 | | |
805 | | void |
806 | | fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) |
807 | 0 | { |
808 | 0 | fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id); |
809 | |
|
810 | 0 | run_to_xhtml(ctx, page->first_block, out); |
811 | |
|
812 | 0 | fz_write_string(ctx, out, "</div>\n"); |
813 | 0 | } |
814 | | |
815 | | void |
816 | | fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out) |
817 | 0 | { |
818 | 0 | fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n"); |
819 | 0 | fz_write_string(ctx, out, "<!DOCTYPE html"); |
820 | 0 | fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""); |
821 | 0 | fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"); |
822 | 0 | fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"); |
823 | 0 | fz_write_string(ctx, out, "<head>\n"); |
824 | 0 | fz_write_string(ctx, out, "<style>\n"); |
825 | 0 | fz_write_string(ctx, out, "p{white-space:pre-wrap}\n"); |
826 | 0 | fz_write_string(ctx, out, "</style>\n"); |
827 | 0 | fz_write_string(ctx, out, "</head>\n"); |
828 | 0 | fz_write_string(ctx, out, "<body>\n"); |
829 | 0 | } |
830 | | |
831 | | void |
832 | | fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out) |
833 | 0 | { |
834 | 0 | fz_write_string(ctx, out, "</body>\n"); |
835 | 0 | fz_write_string(ctx, out, "</html>\n"); |
836 | 0 | } |
837 | | |
838 | | /* Detailed XML dump of the entire structured text data */ |
839 | | |
840 | | static void |
841 | | xml_write_char(fz_context *ctx, fz_output *out, int c) |
842 | 0 | { |
843 | 0 | switch (c) |
844 | 0 | { |
845 | 0 | case '<': fz_write_string(ctx, out, "<"); break; |
846 | 0 | case '>': fz_write_string(ctx, out, ">"); break; |
847 | 0 | case '&': fz_write_string(ctx, out, "&"); break; |
848 | 0 | case '"': fz_write_string(ctx, out, """); break; |
849 | 0 | case '\'': fz_write_string(ctx, out, "'"); break; |
850 | 0 | default: |
851 | 0 | if (c >= 32 && c <= 127) |
852 | 0 | fz_write_printf(ctx, out, "%c", c); |
853 | 0 | else |
854 | 0 | fz_write_printf(ctx, out, "&#x%x;", c); |
855 | 0 | break; |
856 | 0 | } |
857 | 0 | } |
858 | | |
859 | | static void |
860 | | as_xml(fz_context *ctx, fz_stext_block *block, fz_output *out) |
861 | 0 | { |
862 | 0 | fz_stext_line *line; |
863 | 0 | fz_stext_char *ch; |
864 | 0 | int i; |
865 | |
|
866 | 0 | while (block) |
867 | 0 | { |
868 | 0 | switch (block->type) |
869 | 0 | { |
870 | 0 | case FZ_STEXT_BLOCK_TEXT: |
871 | 0 | fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\"", |
872 | 0 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); |
873 | 0 | if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_UNKNOWN) |
874 | 0 | fz_write_printf(ctx, out, " justify=\"unknown\""); |
875 | 0 | if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_LEFT) |
876 | 0 | fz_write_printf(ctx, out, " justify=\"left\""); |
877 | 0 | if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_CENTRE) |
878 | 0 | fz_write_printf(ctx, out, " justify=\"centre\""); |
879 | 0 | if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_RIGHT) |
880 | 0 | fz_write_printf(ctx, out, " justify=\"right\""); |
881 | 0 | if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) |
882 | 0 | fz_write_printf(ctx, out, " justify=\"full\""); |
883 | 0 | fz_write_printf(ctx, out, ">\n"); |
884 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
885 | 0 | { |
886 | 0 | fz_font *font = NULL; |
887 | 0 | float size = 0; |
888 | 0 | const char *name = NULL; |
889 | |
|
890 | 0 | fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\" flags=\"%d\"", |
891 | 0 | line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1, |
892 | 0 | line->wmode, |
893 | 0 | line->dir.x, line->dir.y, line->flags); |
894 | | |
895 | | /* This is duplication of information, but it makes it MUCH easier to search for |
896 | | * text fragments in large output. */ |
897 | 0 | { |
898 | 0 | int valid = 1; |
899 | 0 | fz_write_printf(ctx, out, " text=\""); |
900 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
901 | 0 | { |
902 | 0 | if (valid) |
903 | 0 | valid = fz_is_valid_xml_char(ch->c); |
904 | 0 | xml_write_char(ctx, out, fz_range_limit_xml_char(ch->c)); |
905 | 0 | } |
906 | 0 | if (!valid) |
907 | 0 | { |
908 | 0 | fz_write_printf(ctx, out, "\" hextext=\""); |
909 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
910 | 0 | { |
911 | 0 | char text[8]; |
912 | 0 | int n = fz_runetochar(text, ch->c); |
913 | 0 | for (i = 0; i < n; i++) |
914 | 0 | fz_write_printf(ctx, out, "%02x", text[i]); |
915 | 0 | } |
916 | 0 | } |
917 | 0 | fz_write_printf(ctx, out, "\""); |
918 | 0 | } |
919 | |
|
920 | 0 | fz_write_printf(ctx, out, ">\n"); |
921 | |
|
922 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
923 | 0 | { |
924 | 0 | if (ch->font != font || ch->size != size) |
925 | 0 | { |
926 | 0 | const char *s; |
927 | 0 | if (font) |
928 | 0 | fz_write_string(ctx, out, "</font>\n"); |
929 | 0 | font = ch->font; |
930 | 0 | size = ch->size; |
931 | 0 | s = name = font_full_name(ctx, font); |
932 | 0 | while (*s) |
933 | 0 | { |
934 | 0 | int c = *s++; |
935 | 0 | if (c < 32 || c >= 127) |
936 | 0 | break; |
937 | 0 | } |
938 | 0 | if (*s) |
939 | 0 | fz_write_printf(ctx, out, "<font hexname=%>", name); |
940 | 0 | else |
941 | 0 | fz_write_printf(ctx, out, "<font name=\"%s\"", name); |
942 | 0 | fz_write_printf(ctx, out, " size=\"%g\">\n", size); |
943 | 0 | } |
944 | 0 | fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" alpha=\"#%02x\" flags=\"%d\" c=\"", |
945 | 0 | ch->quad.ul.x, ch->quad.ul.y, |
946 | 0 | ch->quad.ur.x, ch->quad.ur.y, |
947 | 0 | ch->quad.ll.x, ch->quad.ll.y, |
948 | 0 | ch->quad.lr.x, ch->quad.lr.y, |
949 | 0 | ch->origin.x, ch->origin.y, |
950 | 0 | ch->bidi, |
951 | 0 | ch->argb & 0xFFFFFF, |
952 | 0 | ch->argb>>24, |
953 | 0 | ch->flags); |
954 | 0 | xml_write_char(ctx, out, ch->c); |
955 | 0 | if (!fz_is_valid_xml_char(ch->c)) |
956 | 0 | { |
957 | 0 | char text[8]; |
958 | 0 | int n = fz_runetochar(text, ch->c); |
959 | 0 | fz_write_string(ctx, out, "\" hexc=\""); |
960 | 0 | for (i = 0; i < n; i++) |
961 | 0 | fz_write_printf(ctx, out, "%02x", text[i]); |
962 | 0 | } |
963 | 0 | fz_write_string(ctx, out, "\"/>\n"); |
964 | 0 | } |
965 | |
|
966 | 0 | if (font) |
967 | 0 | fz_write_string(ctx, out, "</font>\n"); |
968 | |
|
969 | 0 | fz_write_string(ctx, out, "</line>\n"); |
970 | 0 | } |
971 | 0 | fz_write_string(ctx, out, "</block>\n"); |
972 | 0 | break; |
973 | | |
974 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
975 | 0 | fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n", |
976 | 0 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); |
977 | 0 | break; |
978 | | |
979 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
980 | 0 | fz_write_printf(ctx, out, "<struct idx=\"%d\" bbox=\"%g %g %g %g\"", block->u.s.index, |
981 | 0 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); |
982 | 0 | if (block->u.s.down) |
983 | 0 | fz_write_printf(ctx, out, " raw=\"%s\" std=\"%s\"", |
984 | 0 | block->u.s.down->raw, fz_structure_to_string(block->u.s.down->standard)); |
985 | 0 | fz_write_printf(ctx, out, ">\n"); |
986 | 0 | if (block->u.s.down) |
987 | 0 | as_xml(ctx, block->u.s.down->first_block, out); |
988 | 0 | fz_write_printf(ctx, out, "</struct>\n"); |
989 | 0 | break; |
990 | | |
991 | 0 | case FZ_STEXT_BLOCK_VECTOR: |
992 | 0 | fz_write_printf(ctx, out, "<vector bbox=\"%g %g %g %g\" stroke=\"%d\" rectangle=\"%d\" continues=\"%d\" argb=\"%08x\"/>\n", |
993 | 0 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1, |
994 | 0 | !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED), |
995 | 0 | !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE), |
996 | 0 | !!(block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES), |
997 | 0 | block->u.v.argb); |
998 | 0 | break; |
999 | | |
1000 | 0 | case FZ_STEXT_BLOCK_GRID: |
1001 | 0 | fz_write_printf(ctx, out, "<grid xpos=\""); |
1002 | 0 | for (i = 0; i < block->u.b.xs->len; i++) |
1003 | 0 | fz_write_printf(ctx, out, "%g ", block->u.b.xs->list[i].pos); |
1004 | 0 | fz_write_printf(ctx, out, "\" xuncertainty=\""); |
1005 | 0 | for (i = 0; i < block->u.b.xs->len; i++) |
1006 | 0 | fz_write_printf(ctx, out, "%d ", block->u.b.xs->list[i].uncertainty); |
1007 | 0 | fz_write_printf(ctx, out, "\" xmaxuncertainty=\"%d\" ypos=\"", block->u.b.xs->max_uncertainty); |
1008 | 0 | for (i = 0; i < block->u.b.ys->len; i++) |
1009 | 0 | fz_write_printf(ctx, out, "%g ", block->u.b.ys->list[i].pos); |
1010 | 0 | fz_write_printf(ctx, out, "\" yuncertainty=\""); |
1011 | 0 | for (i = 0; i < block->u.b.ys->len; i++) |
1012 | 0 | fz_write_printf(ctx, out, "%d ", block->u.b.ys->list[i].uncertainty); |
1013 | 0 | fz_write_printf(ctx, out, "\" ymaxuncertainty=\"%d\" />\n", block->u.b.ys->max_uncertainty); |
1014 | 0 | break; |
1015 | 0 | } |
1016 | 0 | block = block->next; |
1017 | 0 | } |
1018 | 0 | } |
1019 | | |
1020 | | void |
1021 | | fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) |
1022 | 0 | { |
1023 | 0 | fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id, |
1024 | 0 | page->mediabox.x1 - page->mediabox.x0, |
1025 | 0 | page->mediabox.y1 - page->mediabox.y0); |
1026 | |
|
1027 | 0 | as_xml(ctx, page->first_block, out); |
1028 | |
|
1029 | 0 | fz_write_string(ctx, out, "</page>\n"); |
1030 | 0 | } |
1031 | | |
1032 | | /* JSON dump */ |
1033 | | |
1034 | | static void |
1035 | | as_json(fz_context *ctx, fz_stext_block *block, fz_output *out, float scale) |
1036 | 0 | { |
1037 | 0 | fz_stext_line *line; |
1038 | 0 | fz_stext_char *ch; |
1039 | 0 | int comma = 0; |
1040 | |
|
1041 | 0 | while (block) |
1042 | 0 | { |
1043 | 0 | if (comma) |
1044 | 0 | fz_write_string(ctx, out, ","); |
1045 | 0 | comma = 1; |
1046 | |
|
1047 | 0 | switch (block->type) |
1048 | 0 | { |
1049 | 0 | case FZ_STEXT_BLOCK_TEXT: |
1050 | 0 | fz_write_printf(ctx, out, "{%q:%q,", "type", "text"); |
1051 | 0 | fz_write_printf(ctx, out, "%q:{", "bbox"); |
1052 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale)); |
1053 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale)); |
1054 | 0 | fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale)); |
1055 | 0 | fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale)); |
1056 | 0 | fz_write_printf(ctx, out, "%q:[", "lines"); |
1057 | |
|
1058 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
1059 | 0 | { |
1060 | 0 | if (line != block->u.t.first_line) |
1061 | 0 | fz_write_string(ctx, out, ","); |
1062 | 0 | fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode); |
1063 | 0 | fz_write_printf(ctx, out, "%q:{", "bbox"); |
1064 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale)); |
1065 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale)); |
1066 | 0 | fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale)); |
1067 | 0 | fz_write_printf(ctx, out, "%q:%d,", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale)); |
1068 | 0 | fz_write_printf(ctx, out, "%q:%d},", "flags", line->flags); |
1069 | | |
1070 | | /* Since we force preserve-spans, the first char has the style for the entire line. */ |
1071 | 0 | if (line->first_char) |
1072 | 0 | { |
1073 | 0 | fz_font *font = line->first_char->font; |
1074 | 0 | char *font_family = "sans-serif"; |
1075 | 0 | char *font_weight = "normal"; |
1076 | 0 | char *font_style = "normal"; |
1077 | 0 | if (fz_font_is_monospaced(ctx, font)) font_family = "monospace"; |
1078 | 0 | else if (fz_font_is_serif(ctx, font)) font_family = "serif"; |
1079 | 0 | if (fz_font_is_bold(ctx, font)) font_weight = "bold"; |
1080 | 0 | if (fz_font_is_italic(ctx, font)) font_style = "italic"; |
1081 | 0 | fz_write_printf(ctx, out, "%q:{", "font"); |
1082 | 0 | fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font)); |
1083 | 0 | fz_write_printf(ctx, out, "%q:%q,", "family", font_family); |
1084 | 0 | fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight); |
1085 | 0 | fz_write_printf(ctx, out, "%q:%q,", "style", font_style); |
1086 | 0 | fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale)); |
1087 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale)); |
1088 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale)); |
1089 | 0 | } |
1090 | |
|
1091 | 0 | fz_write_printf(ctx, out, "%q:\"", "text"); |
1092 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
1093 | 0 | { |
1094 | 0 | if (ch->c == '"' || ch->c == '\\') |
1095 | 0 | fz_write_printf(ctx, out, "\\%c", ch->c); |
1096 | 0 | else if (ch->c < 32) |
1097 | 0 | fz_write_printf(ctx, out, "\\u%04x", ch->c); |
1098 | 0 | else |
1099 | 0 | fz_write_printf(ctx, out, "%C", ch->c); |
1100 | 0 | } |
1101 | 0 | fz_write_printf(ctx, out, "\"}"); |
1102 | 0 | } |
1103 | 0 | fz_write_string(ctx, out, "]}"); |
1104 | 0 | break; |
1105 | | |
1106 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
1107 | 0 | fz_write_printf(ctx, out, "{%q:%q,", "type", "image"); |
1108 | 0 | fz_write_printf(ctx, out, "%q:{", "bbox"); |
1109 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale)); |
1110 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale)); |
1111 | 0 | fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale)); |
1112 | 0 | fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale)); |
1113 | 0 | break; |
1114 | | |
1115 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
1116 | 0 | fz_write_printf(ctx, out, "{%q:%q,", "type", "structure"); |
1117 | 0 | fz_write_printf(ctx, out, "%q:%d", "index", block->u.s.index); |
1118 | 0 | if (block->u.s.down) |
1119 | 0 | { |
1120 | 0 | fz_write_printf(ctx, out, ",%q:%q", "raw", block->u.s.down->raw); |
1121 | 0 | fz_write_printf(ctx, out, ",%q:%q", "std", fz_structure_to_string(block->u.s.down->standard)); |
1122 | 0 | fz_write_printf(ctx, out, ",%q:[", "contents"); |
1123 | 0 | as_json(ctx, block->u.s.down->first_block, out, scale); |
1124 | 0 | fz_write_printf(ctx, out, "]"); |
1125 | 0 | } |
1126 | 0 | fz_write_printf(ctx, out, "}"); |
1127 | 0 | break; |
1128 | |
|
1129 | 0 | } |
1130 | 0 | block = block->next; |
1131 | 0 | } |
1132 | 0 | } |
1133 | | |
1134 | | void |
1135 | | fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale) |
1136 | 0 | { |
1137 | 0 | fz_write_printf(ctx, out, "{%q:[", "blocks"); |
1138 | |
|
1139 | 0 | as_json(ctx, page->first_block, out, scale); |
1140 | |
|
1141 | 0 | fz_write_string(ctx, out, "]}"); |
1142 | 0 | } |
1143 | | |
1144 | | /* Plain text */ |
1145 | | |
1146 | | static void |
1147 | | do_as_text(fz_context *ctx, fz_output *out, fz_stext_block *first_block) |
1148 | 0 | { |
1149 | 0 | fz_stext_block *block; |
1150 | 0 | fz_stext_line *line; |
1151 | 0 | fz_stext_char *ch; |
1152 | 0 | char utf[10]; |
1153 | 0 | int i, n; |
1154 | |
|
1155 | 0 | for (block = first_block; block; block = block->next) |
1156 | 0 | { |
1157 | 0 | switch (block->type) |
1158 | 0 | { |
1159 | 0 | case FZ_STEXT_BLOCK_TEXT: |
1160 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
1161 | 0 | { |
1162 | 0 | int break_line = 1; |
1163 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
1164 | 0 | { |
1165 | 0 | if (ch->next == NULL && (line->flags & FZ_STEXT_LINE_FLAGS_JOINED) != 0) |
1166 | 0 | { |
1167 | 0 | break_line = 0; |
1168 | 0 | continue; |
1169 | 0 | } |
1170 | 0 | n = fz_runetochar(utf, ch->c); |
1171 | 0 | for (i = 0; i < n; i++) |
1172 | 0 | fz_write_byte(ctx, out, utf[i]); |
1173 | 0 | } |
1174 | 0 | if (break_line) |
1175 | 0 | fz_write_string(ctx, out, "\n"); |
1176 | 0 | } |
1177 | 0 | fz_write_string(ctx, out, "\n"); |
1178 | 0 | break; |
1179 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
1180 | 0 | if (block->u.s.down != NULL) |
1181 | 0 | do_as_text(ctx, out, block->u.s.down->first_block); |
1182 | 0 | break; |
1183 | 0 | } |
1184 | 0 | } |
1185 | 0 | } |
1186 | | |
1187 | | void |
1188 | | fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) |
1189 | 0 | { |
1190 | 0 | do_as_text(ctx, out, page->first_block); |
1191 | 0 | } |
1192 | | |
1193 | | /* Text output writer */ |
1194 | | |
/* Output formats understood by the text document writer. */
enum {
	FZ_FORMAT_TEXT, /* selected by "text"; also the fallback for unrecognised names */
	FZ_FORMAT_HTML, /* selected by "html" */
	FZ_FORMAT_XHTML, /* selected by "xhtml" */
	FZ_FORMAT_STEXT_XML, /* selected by "stext" or "stext.xml" */
	FZ_FORMAT_STEXT_JSON, /* selected by "stext.json" */
};
1202 | | |
/* State for a document writer that emits structured text in one of the FZ_FORMAT_* formats. */
typedef struct
{
	fz_document_writer super; /* base writer; kept first so the struct can be cast to and from fz_document_writer */
	int format; /* one of the FZ_FORMAT_* values */
	int number; /* incremented at the start of every page; passed to the per-page printers as the page id */
	fz_stext_options opts; /* options parsed from the writer's option string */
	fz_stext_page *page; /* page currently being collected, or NULL between pages */
	fz_output *out; /* destination output; dropped when the writer is dropped */
} fz_text_writer;
1212 | | |
1213 | | static fz_device * |
1214 | | text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox) |
1215 | 0 | { |
1216 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
1217 | 0 | float s = wri->opts.scale; |
1218 | |
|
1219 | 0 | if (wri->page) |
1220 | 0 | { |
1221 | 0 | fz_drop_stext_page(ctx, wri->page); |
1222 | 0 | wri->page = NULL; |
1223 | 0 | } |
1224 | |
|
1225 | 0 | wri->number++; |
1226 | |
|
1227 | 0 | wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s))); |
1228 | 0 | return fz_new_stext_device(ctx, wri->page, &wri->opts); |
1229 | 0 | } |
1230 | | |
1231 | | static void |
1232 | | text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) |
1233 | 0 | { |
1234 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
1235 | 0 | float s = wri->opts.scale; |
1236 | |
|
1237 | 0 | fz_scale_stext_page(ctx, wri->page, s); |
1238 | |
|
1239 | 0 | fz_try(ctx) |
1240 | 0 | { |
1241 | 0 | fz_close_device(ctx, dev); |
1242 | 0 | switch (wri->format) |
1243 | 0 | { |
1244 | 0 | default: |
1245 | 0 | case FZ_FORMAT_TEXT: |
1246 | 0 | fz_print_stext_page_as_text(ctx, wri->out, wri->page); |
1247 | 0 | break; |
1248 | 0 | case FZ_FORMAT_HTML: |
1249 | 0 | fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number); |
1250 | 0 | break; |
1251 | 0 | case FZ_FORMAT_XHTML: |
1252 | 0 | fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number); |
1253 | 0 | break; |
1254 | 0 | case FZ_FORMAT_STEXT_XML: |
1255 | 0 | fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number); |
1256 | 0 | break; |
1257 | 0 | case FZ_FORMAT_STEXT_JSON: |
1258 | 0 | if (wri->number > 1) |
1259 | 0 | fz_write_string(ctx, wri->out, ","); |
1260 | 0 | fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1); |
1261 | 0 | break; |
1262 | 0 | } |
1263 | 0 | } |
1264 | 0 | fz_always(ctx) |
1265 | 0 | { |
1266 | 0 | fz_drop_device(ctx, dev); |
1267 | 0 | fz_drop_stext_page(ctx, wri->page); |
1268 | 0 | wri->page = NULL; |
1269 | 0 | } |
1270 | 0 | fz_catch(ctx) |
1271 | 0 | fz_rethrow(ctx); |
1272 | 0 | } |
1273 | | |
1274 | | static void |
1275 | | text_close_writer(fz_context *ctx, fz_document_writer *wri_) |
1276 | 0 | { |
1277 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
1278 | 0 | switch (wri->format) |
1279 | 0 | { |
1280 | 0 | case FZ_FORMAT_HTML: |
1281 | 0 | fz_print_stext_trailer_as_html(ctx, wri->out); |
1282 | 0 | break; |
1283 | 0 | case FZ_FORMAT_XHTML: |
1284 | 0 | fz_print_stext_trailer_as_xhtml(ctx, wri->out); |
1285 | 0 | break; |
1286 | 0 | case FZ_FORMAT_STEXT_XML: |
1287 | 0 | fz_write_string(ctx, wri->out, "</document>\n"); |
1288 | 0 | break; |
1289 | 0 | case FZ_FORMAT_STEXT_JSON: |
1290 | 0 | fz_write_string(ctx, wri->out, "]\n"); |
1291 | 0 | break; |
1292 | 0 | } |
1293 | 0 | fz_close_output(ctx, wri->out); |
1294 | 0 | } |
1295 | | |
1296 | | static void |
1297 | | text_drop_writer(fz_context *ctx, fz_document_writer *wri_) |
1298 | 0 | { |
1299 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
1300 | 0 | fz_drop_stext_page(ctx, wri->page); |
1301 | 0 | fz_drop_output(ctx, wri->out); |
1302 | 0 | } |
1303 | | |
1304 | | fz_document_writer * |
1305 | | fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options) |
1306 | 0 | { |
1307 | 0 | fz_text_writer *wri = NULL; |
1308 | |
|
1309 | 0 | fz_var(wri); |
1310 | |
|
1311 | 0 | fz_try(ctx) |
1312 | 0 | { |
1313 | 0 | wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer); |
1314 | 0 | fz_parse_stext_options(ctx, &wri->opts, options); |
1315 | |
|
1316 | 0 | wri->format = FZ_FORMAT_TEXT; |
1317 | 0 | if (!strcmp(format, "text")) |
1318 | 0 | wri->format = FZ_FORMAT_TEXT; |
1319 | 0 | else if (!strcmp(format, "html")) |
1320 | 0 | wri->format = FZ_FORMAT_HTML; |
1321 | 0 | else if (!strcmp(format, "xhtml")) |
1322 | 0 | wri->format = FZ_FORMAT_XHTML; |
1323 | 0 | else if (!strcmp(format, "stext")) |
1324 | 0 | wri->format = FZ_FORMAT_STEXT_XML; |
1325 | 0 | else if (!strcmp(format, "stext.xml")) |
1326 | 0 | wri->format = FZ_FORMAT_STEXT_XML; |
1327 | 0 | else if (!strcmp(format, "stext.json")) |
1328 | 0 | { |
1329 | 0 | wri->format = FZ_FORMAT_STEXT_JSON; |
1330 | 0 | wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS; |
1331 | 0 | } |
1332 | |
|
1333 | 0 | wri->out = out; |
1334 | |
|
1335 | 0 | switch (wri->format) |
1336 | 0 | { |
1337 | 0 | case FZ_FORMAT_HTML: |
1338 | 0 | fz_print_stext_header_as_html(ctx, wri->out); |
1339 | 0 | break; |
1340 | 0 | case FZ_FORMAT_XHTML: |
1341 | 0 | fz_print_stext_header_as_xhtml(ctx, wri->out); |
1342 | 0 | break; |
1343 | 0 | case FZ_FORMAT_STEXT_XML: |
1344 | 0 | fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n"); |
1345 | 0 | fz_write_string(ctx, wri->out, "<document>\n"); |
1346 | 0 | break; |
1347 | 0 | case FZ_FORMAT_STEXT_JSON: |
1348 | 0 | fz_write_string(ctx, wri->out, "["); |
1349 | 0 | break; |
1350 | 0 | } |
1351 | 0 | } |
1352 | 0 | fz_catch(ctx) |
1353 | 0 | { |
1354 | 0 | fz_drop_output(ctx, out); |
1355 | 0 | fz_free(ctx, wri); |
1356 | 0 | fz_rethrow(ctx); |
1357 | 0 | } |
1358 | | |
1359 | 0 | return (fz_document_writer*)wri; |
1360 | 0 | } |
1361 | | |
1362 | | fz_document_writer * |
1363 | | fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options) |
1364 | 0 | { |
1365 | 0 | fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0); |
1366 | 0 | return fz_new_text_writer_with_output(ctx, format, out, options); |
1367 | 0 | } |