/src/mupdf/source/fitz/stext-output.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2021 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | |
25 | | #define SUBSCRIPT_OFFSET 0.2f |
26 | | #define SUPERSCRIPT_OFFSET -0.2f |
27 | | |
28 | | #include <ft2build.h> |
29 | | #include FT_FREETYPE_H |
30 | | |
31 | | // Text black color when converted from DeviceCMYK to RGB |
32 | 0 | #define CMYK_BLACK 0x221f1f |
33 | | |
34 | | static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale) |
35 | 0 | { |
36 | 0 | fz_matrix m = fz_scale(scale, scale); |
37 | 0 | fz_stext_block *block; |
38 | 0 | fz_stext_line *line; |
39 | 0 | fz_stext_char *ch; |
40 | |
|
41 | 0 | for (block = page->first_block; block; block = block->next) |
42 | 0 | { |
43 | 0 | block->bbox = fz_transform_rect(block->bbox, m); |
44 | 0 | switch (block->type) |
45 | 0 | { |
46 | 0 | case FZ_STEXT_BLOCK_TEXT: |
47 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
48 | 0 | { |
49 | 0 | line->bbox = fz_transform_rect(block->bbox, m); |
50 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
51 | 0 | { |
52 | 0 | ch->origin = fz_transform_point(ch->origin, m); |
53 | 0 | ch->quad = fz_transform_quad(ch->quad, m); |
54 | 0 | ch->size = ch->size * scale; |
55 | 0 | } |
56 | 0 | } |
57 | 0 | break; |
58 | | |
59 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
60 | 0 | block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale); |
61 | 0 | break; |
62 | 0 | } |
63 | 0 | } |
64 | 0 | } |
65 | | |
66 | | /* HTML output (visual formatting with preserved layout) */ |
67 | | |
68 | | static int |
69 | | detect_super_script(fz_stext_line *line, fz_stext_char *ch) |
70 | 0 | { |
71 | 0 | if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) |
72 | 0 | return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; |
73 | 0 | return 0; |
74 | 0 | } |
75 | | |
76 | | static const char * |
77 | | font_full_name(fz_context *ctx, fz_font *font) |
78 | 0 | { |
79 | 0 | const char *name = fz_font_name(ctx, font); |
80 | 0 | const char *s = strchr(name, '+'); |
81 | 0 | return s ? s + 1 : name; |
82 | 0 | } |
83 | | |
/*
	Map a font name onto a common family name for use in HTML.

	Names containing "Times" become "Times New Roman"; names containing
	"Arial" or "Helvetica" become "Arial" (or "Arial Narrow" when they
	also contain "Narrow" or "Condensed"); names containing "Courier"
	become "Courier". Any other name is returned unchanged (the same
	pointer that was passed in).
*/
static const char *
html_clean_font_name(const char *fontname)
{
	int narrow;

	if (strstr(fontname, "Times") != NULL)
		return "Times New Roman";

	if (strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL)
	{
		narrow = strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL;
		return narrow ? "Arial Narrow" : "Arial";
	}

	if (strstr(fontname, "Courier") != NULL)
		return "Courier";

	return fontname;
}
99 | | |
100 | | static void |
101 | | font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif) |
102 | 0 | { |
103 | 0 | const char *name = html_clean_font_name(font_full_name(ctx, font)); |
104 | 0 | char *s; |
105 | 0 | fz_strlcpy(buf, name, size); |
106 | 0 | s = strrchr(buf, '-'); |
107 | 0 | if (s) |
108 | 0 | *s = 0; |
109 | 0 | if (is_mono) |
110 | 0 | fz_strlcat(buf, ",monospace", size); |
111 | 0 | else |
112 | 0 | fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size); |
113 | 0 | } |
114 | | |
115 | | static void |
116 | | fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color) |
117 | 0 | { |
118 | 0 | char family[80]; |
119 | |
|
120 | 0 | int is_bold = fz_font_is_bold(ctx, font); |
121 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
122 | 0 | int is_serif = fz_font_is_serif(ctx, font); |
123 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
124 | |
|
125 | 0 | font_family_name(ctx, font, family, sizeof family, is_mono, is_serif); |
126 | |
|
127 | 0 | if (sup) fz_write_string(ctx, out, "<sup>"); |
128 | 0 | if (is_mono) fz_write_string(ctx, out, "<tt>"); |
129 | 0 | if (is_bold) fz_write_string(ctx, out, "<b>"); |
130 | 0 | if (is_italic) fz_write_string(ctx, out, "<i>"); |
131 | 0 | fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size); |
132 | 0 | if (color != 0 && color != CMYK_BLACK) |
133 | 0 | fz_write_printf(ctx, out, ";color:#%06x", color); |
134 | 0 | fz_write_printf(ctx, out, "\">"); |
135 | 0 | } |
136 | | |
137 | | static void |
138 | | fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color) |
139 | 0 | { |
140 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
141 | 0 | int is_bold = fz_font_is_bold(ctx,font); |
142 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
143 | |
|
144 | 0 | fz_write_string(ctx, out, "</span>"); |
145 | 0 | if (is_italic) fz_write_string(ctx, out, "</i>"); |
146 | 0 | if (is_bold) fz_write_string(ctx, out, "</b>"); |
147 | 0 | if (is_mono) fz_write_string(ctx, out, "</tt>"); |
148 | 0 | if (sup) fz_write_string(ctx, out, "</sup>"); |
149 | 0 | } |
150 | | |
151 | | static void |
152 | | fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
153 | 0 | { |
154 | 0 | fz_matrix ctm = block->u.i.transform; |
155 | |
|
156 | 0 | #define USE_CSS_MATRIX_TRANSFORMS |
157 | 0 | #ifdef USE_CSS_MATRIX_TRANSFORMS |
158 | | /* Matrix maths notes. |
159 | | * When we get here ctm maps the unit square to the position in device |
160 | | * space occupied by the image. |
161 | | * |
162 | | * That is to say that mapping the 4 corners of the unit square through |
163 | | * the transform, give us the 4 target corners. We extend the corners |
164 | | * by adding an extra '1' into them to allow transforms to work. Thus |
165 | | * (x,y) maps through ctm = (a b c d e f) as: |
166 | | * |
167 | | * (x y 1) (a b 0) = (X Y 1) |
168 | | * (c d 0) |
169 | | * (e f 1) |
170 | | * |
171 | | * To simplify reading of matrix maths, we use the trick where we |
172 | | * 'drop' the first matrix down the page. Thus the corners c0=(0,0), |
173 | | * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively: |
174 | | * |
175 | | * ( a b 0) |
176 | | * ( c d 0) |
177 | | * ( e f 1) |
178 | | * (0 0 1) ( e f 1) |
179 | | * (0 1 1) ( c+e d+f 1) |
180 | | * (1 1 1) (a+c+e b+d+f 1) |
181 | | * (1 0 1) ( a+e b+f 1) |
182 | | * |
183 | | * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f) |
184 | | * |
185 | | * Unfortunately, the CSS matrix transform, does not map the unit square. |
186 | | * Rather it does something moderately mad. As far as I can work out, the |
187 | | * top left corner of a (0,0) -> (w, h) box is transformed using the .e |
188 | | * and .f entries of the matrix. Then the image from within that square |
189 | | * is transformed using the centre of that square as the origin. |
190 | | * |
191 | | * So, an image placed at (0,0) in destination space with 1:1 transform |
192 | | * will result in an image a (0,0) as you'd expect. But an image at (0,0) |
193 | | * with a scale of 2, will result in 25% of the image off the left of the |
194 | | * screen, and 25% off the top. |
195 | | * |
196 | | * Accordingly, we have to adjust the ctm in several steps. |
197 | | */ |
198 | | /* Move to moving the centre of the image. */ |
199 | 0 | ctm.e += (ctm.a+ctm.c)/2; |
200 | 0 | ctm.f += (ctm.b+ctm.d)/2; |
201 | | /* Move from transforming the unit square to w/h */ |
202 | 0 | ctm.a /= block->u.i.image->w; |
203 | 0 | ctm.b /= block->u.i.image->w; |
204 | 0 | ctm.c /= block->u.i.image->h; |
205 | 0 | ctm.d /= block->u.i.image->h; |
206 | | /* Move from points to pixels */ |
207 | 0 | ctm.a *= 96.0f/72; |
208 | 0 | ctm.b *= 96.0f/72; |
209 | 0 | ctm.c *= 96.0f/72; |
210 | 0 | ctm.d *= 96.0f/72; |
211 | 0 | ctm.e *= 96.0f/72; |
212 | 0 | ctm.f *= 96.0f/72; |
213 | | /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */ |
214 | 0 | ctm.e -= block->u.i.image->w/2; |
215 | 0 | ctm.f -= block->u.i.image->h/2; |
216 | |
|
217 | 0 | fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"", |
218 | 0 | ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f); |
219 | | #else |
220 | | /* Alternative version of the code that uses scaleX/Y and rotate |
221 | | * instead, but only copes with axis aligned cases. */ |
222 | | int t; |
223 | | |
224 | | int x = block->bbox.x0; |
225 | | int y = block->bbox.y0; |
226 | | int w = block->bbox.x1 - block->bbox.x0; |
227 | | int h = block->bbox.y1 - block->bbox.y0; |
228 | | |
229 | | const char *flip = ""; |
230 | | |
231 | | if (ctm.b == 0 && ctm.c == 0) |
232 | | { |
233 | | if (ctm.a < 0 && ctm.d < 0) |
234 | | flip = "transform: scaleX(-1) scaleY(-1);"; |
235 | | else if (ctm.a < 0) |
236 | | { |
237 | | flip = "transform: scaleX(-1);"; |
238 | | } |
239 | | else if (ctm.d < 0) |
240 | | { |
241 | | flip = "transform: scaleY(-1);"; |
242 | | } |
243 | | } else if (ctm.a == 0 && ctm.d == 0) { |
244 | | if (ctm.b < 0 && ctm.c < 0) |
245 | | { |
246 | | flip = "transform: scaleY(-1) rotate(90deg);"; |
247 | | x += (w-h)/2; |
248 | | y -= (w-h)/2; |
249 | | t = w; w = h; h = t; |
250 | | } |
251 | | else if (ctm.b < 0) |
252 | | { |
253 | | flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);"; |
254 | | x += (w-h)/2; |
255 | | y -= (w-h)/2; |
256 | | t = w; w = h; h = t; |
257 | | } |
258 | | else if (ctm.c < 0) |
259 | | { |
260 | | flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);"; |
261 | | x += (w-h)/2; |
262 | | y -= (w-h)/2; |
263 | | t = w; w = h; h = t; |
264 | | } |
265 | | else |
266 | | { |
267 | | flip = "transform: scaleY(-1) rotate(270deg);"; |
268 | | x += (w-h)/2; |
269 | | y -= (w-h)/2; |
270 | | t = w; w = h; h = t; |
271 | | } |
272 | | } |
273 | | |
274 | | fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h); |
275 | | #endif |
276 | 0 | fz_write_image_as_data_uri(ctx, out, block->u.i.image); |
277 | 0 | fz_write_string(ctx, out, "\">\n"); |
278 | 0 | } |
279 | | |
280 | | void |
281 | | fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
282 | 0 | { |
283 | 0 | fz_stext_line *line; |
284 | 0 | fz_stext_char *ch; |
285 | 0 | float x, y, h; |
286 | |
|
287 | 0 | fz_font *font = NULL; |
288 | 0 | float size = 0; |
289 | 0 | int sup = 0; |
290 | 0 | int color = 0; |
291 | |
|
292 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
293 | 0 | { |
294 | 0 | x = line->bbox.x0; |
295 | 0 | y = line->bbox.y0; |
296 | 0 | h = line->bbox.y1 - line->bbox.y0; |
297 | |
|
298 | 0 | if (line->first_char) |
299 | 0 | { |
300 | 0 | h = line->first_char->size; |
301 | 0 | y = line->first_char->origin.y - h * 0.8f; |
302 | 0 | } |
303 | |
|
304 | 0 | fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h); |
305 | 0 | font = NULL; |
306 | |
|
307 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
308 | 0 | { |
309 | 0 | int ch_sup = detect_super_script(line, ch); |
310 | 0 | if (ch->font != font || ch->size != size || ch_sup != sup || ch->color != color) |
311 | 0 | { |
312 | 0 | if (font) |
313 | 0 | fz_print_style_end_html(ctx, out, font, size, sup, color); |
314 | 0 | font = ch->font; |
315 | 0 | size = ch->size; |
316 | 0 | color = ch->color; |
317 | 0 | sup = ch_sup; |
318 | 0 | fz_print_style_begin_html(ctx, out, font, size, sup, color); |
319 | 0 | } |
320 | |
|
321 | 0 | switch (ch->c) |
322 | 0 | { |
323 | 0 | default: |
324 | 0 | if (ch->c >= 32 && ch->c <= 127) |
325 | 0 | fz_write_byte(ctx, out, ch->c); |
326 | 0 | else |
327 | 0 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
328 | 0 | break; |
329 | 0 | case '<': fz_write_string(ctx, out, "<"); break; |
330 | 0 | case '>': fz_write_string(ctx, out, ">"); break; |
331 | 0 | case '&': fz_write_string(ctx, out, "&"); break; |
332 | 0 | case '"': fz_write_string(ctx, out, """); break; |
333 | 0 | case '\'': fz_write_string(ctx, out, "'"); break; |
334 | 0 | } |
335 | 0 | } |
336 | | |
337 | 0 | if (font) |
338 | 0 | fz_print_style_end_html(ctx, out, font, size, sup, color); |
339 | |
|
340 | 0 | fz_write_string(ctx, out, "</p>\n"); |
341 | 0 | } |
342 | 0 | } |
343 | | |
344 | | void |
345 | | fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) |
346 | 0 | { |
347 | 0 | fz_stext_block *block; |
348 | |
|
349 | 0 | float w = page->mediabox.x1 - page->mediabox.x0; |
350 | 0 | float h = page->mediabox.y1 - page->mediabox.y0; |
351 | |
|
352 | 0 | fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h); |
353 | |
|
354 | 0 | for (block = page->first_block; block; block = block->next) |
355 | 0 | { |
356 | 0 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
357 | 0 | fz_print_stext_image_as_html(ctx, out, block); |
358 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
359 | 0 | fz_print_stext_block_as_html(ctx, out, block); |
360 | 0 | } |
361 | |
|
362 | 0 | fz_write_string(ctx, out, "</div>\n"); |
363 | 0 | } |
364 | | |
365 | | void |
366 | | fz_print_stext_header_as_html(fz_context *ctx, fz_output *out) |
367 | 0 | { |
368 | 0 | fz_write_string(ctx, out, "<!DOCTYPE html>\n"); |
369 | 0 | fz_write_string(ctx, out, "<html>\n"); |
370 | 0 | fz_write_string(ctx, out, "<head>\n"); |
371 | 0 | fz_write_string(ctx, out, "<style>\n"); |
372 | 0 | fz_write_string(ctx, out, "body{background-color:slategray}\n"); |
373 | 0 | fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n"); |
374 | 0 | fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n"); |
375 | 0 | fz_write_string(ctx, out, "</style>\n"); |
376 | 0 | fz_write_string(ctx, out, "</head>\n"); |
377 | 0 | fz_write_string(ctx, out, "<body>\n"); |
378 | 0 | } |
379 | | |
380 | | void |
381 | | fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out) |
382 | 0 | { |
383 | 0 | fz_write_string(ctx, out, "</body>\n"); |
384 | 0 | fz_write_string(ctx, out, "</html>\n"); |
385 | 0 | } |
386 | | |
387 | | /* XHTML output (semantic, little layout, suitable for reflow) */ |
388 | | |
389 | | static void |
390 | | fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
391 | 0 | { |
392 | 0 | int w = block->bbox.x1 - block->bbox.x0; |
393 | 0 | int h = block->bbox.y1 - block->bbox.y0; |
394 | |
|
395 | 0 | fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h); |
396 | 0 | fz_write_image_as_data_uri(ctx, out, block->u.i.image); |
397 | 0 | fz_write_string(ctx, out, "\"/></p>\n"); |
398 | 0 | } |
399 | | |
400 | | static void |
401 | | fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup) |
402 | 0 | { |
403 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
404 | 0 | int is_bold = fz_font_is_bold(ctx, font); |
405 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
406 | |
|
407 | 0 | if (sup) |
408 | 0 | fz_write_string(ctx, out, "<sup>"); |
409 | 0 | if (is_mono) |
410 | 0 | fz_write_string(ctx, out, "<tt>"); |
411 | 0 | if (is_bold) |
412 | 0 | fz_write_string(ctx, out, "<b>"); |
413 | 0 | if (is_italic) |
414 | 0 | fz_write_string(ctx, out, "<i>"); |
415 | 0 | } |
416 | | |
417 | | static void |
418 | | fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup) |
419 | 0 | { |
420 | 0 | int is_mono = fz_font_is_monospaced(ctx, font); |
421 | 0 | int is_bold = fz_font_is_bold(ctx, font); |
422 | 0 | int is_italic = fz_font_is_italic(ctx, font); |
423 | |
|
424 | 0 | if (is_italic) |
425 | 0 | fz_write_string(ctx, out, "</i>"); |
426 | 0 | if (is_bold) |
427 | 0 | fz_write_string(ctx, out, "</b>"); |
428 | 0 | if (is_mono) |
429 | 0 | fz_write_string(ctx, out, "</tt>"); |
430 | 0 | if (sup) |
431 | 0 | fz_write_string(ctx, out, "</sup>"); |
432 | 0 | } |
433 | | |
434 | | static float avg_font_size_of_line(fz_stext_char *ch) |
435 | 0 | { |
436 | 0 | float size = 0; |
437 | 0 | int n = 0; |
438 | 0 | if (!ch) |
439 | 0 | return 0; |
440 | 0 | while (ch) |
441 | 0 | { |
442 | 0 | size += ch->size; |
443 | 0 | ++n; |
444 | 0 | ch = ch->next; |
445 | 0 | } |
446 | 0 | return size / n; |
447 | 0 | } |
448 | | |
/*
	Pick the XHTML element used for a line from its font size:
	"h1" from 20 up, "h2" from 15 up, "h3" from 12 up, otherwise "p".
*/
static const char *tag_from_font_size(float size)
{
	static const struct
	{
		float min_size;
		const char *tag;
	} headings[] = {
		{ 20, "h1" },
		{ 15, "h2" },
		{ 12, "h3" },
	};
	int i;

	for (i = 0; i < (int)(sizeof headings / sizeof headings[0]); ++i)
	{
		if (size >= headings[i].min_size)
			return headings[i].tag;
	}
	return "p";
}
456 | | |
457 | | static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
458 | 0 | { |
459 | 0 | fz_stext_line *line; |
460 | 0 | fz_stext_char *ch; |
461 | |
|
462 | 0 | fz_font *font = NULL; |
463 | 0 | int sup = 0; |
464 | 0 | int sp = 1; |
465 | 0 | const char *tag = NULL; |
466 | 0 | const char *new_tag; |
467 | |
|
468 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
469 | 0 | { |
470 | 0 | new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char)); |
471 | 0 | if (tag != new_tag) |
472 | 0 | { |
473 | 0 | if (tag) |
474 | 0 | { |
475 | 0 | if (font) |
476 | 0 | fz_print_style_end_xhtml(ctx, out, font, sup); |
477 | 0 | fz_write_printf(ctx, out, "</%s>", tag); |
478 | 0 | } |
479 | 0 | tag = new_tag; |
480 | 0 | fz_write_printf(ctx, out, "<%s>", tag); |
481 | 0 | if (font) |
482 | 0 | fz_print_style_begin_xhtml(ctx, out, font, sup); |
483 | 0 | } |
484 | |
|
485 | 0 | if (!sp) |
486 | 0 | fz_write_byte(ctx, out, ' '); |
487 | |
|
488 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
489 | 0 | { |
490 | 0 | int ch_sup = detect_super_script(line, ch); |
491 | 0 | if (ch->font != font || ch_sup != sup) |
492 | 0 | { |
493 | 0 | if (font) |
494 | 0 | fz_print_style_end_xhtml(ctx, out, font, sup); |
495 | 0 | font = ch->font; |
496 | 0 | sup = ch_sup; |
497 | 0 | fz_print_style_begin_xhtml(ctx, out, font, sup); |
498 | 0 | } |
499 | |
|
500 | 0 | sp = (ch->c == ' '); |
501 | 0 | switch (ch->c) |
502 | 0 | { |
503 | 0 | default: |
504 | 0 | if (ch->c >= 32 && ch->c <= 127) |
505 | 0 | fz_write_byte(ctx, out, ch->c); |
506 | 0 | else |
507 | 0 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
508 | 0 | break; |
509 | 0 | case '<': fz_write_string(ctx, out, "<"); break; |
510 | 0 | case '>': fz_write_string(ctx, out, ">"); break; |
511 | 0 | case '&': fz_write_string(ctx, out, "&"); break; |
512 | 0 | case '"': fz_write_string(ctx, out, """); break; |
513 | 0 | case '\'': fz_write_string(ctx, out, "'"); break; |
514 | 0 | } |
515 | 0 | } |
516 | 0 | } |
517 | | |
518 | 0 | if (font) |
519 | 0 | fz_print_style_end_xhtml(ctx, out, font, sup); |
520 | 0 | fz_write_printf(ctx, out, "</%s>\n", tag); |
521 | 0 | } |
522 | | |
523 | | void |
524 | | fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) |
525 | 0 | { |
526 | 0 | fz_stext_block *block; |
527 | |
|
528 | 0 | fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id); |
529 | |
|
530 | 0 | for (block = page->first_block; block; block = block->next) |
531 | 0 | { |
532 | 0 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
533 | 0 | fz_print_stext_image_as_xhtml(ctx, out, block); |
534 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
535 | 0 | fz_print_stext_block_as_xhtml(ctx, out, block); |
536 | 0 | } |
537 | |
|
538 | 0 | fz_write_string(ctx, out, "</div>\n"); |
539 | 0 | } |
540 | | |
541 | | void |
542 | | fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out) |
543 | 0 | { |
544 | 0 | fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n"); |
545 | 0 | fz_write_string(ctx, out, "<!DOCTYPE html"); |
546 | 0 | fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""); |
547 | 0 | fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"); |
548 | 0 | fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"); |
549 | 0 | fz_write_string(ctx, out, "<head>\n"); |
550 | 0 | fz_write_string(ctx, out, "<style>\n"); |
551 | 0 | fz_write_string(ctx, out, "p{white-space:pre-wrap}\n"); |
552 | 0 | fz_write_string(ctx, out, "</style>\n"); |
553 | 0 | fz_write_string(ctx, out, "</head>\n"); |
554 | 0 | fz_write_string(ctx, out, "<body>\n"); |
555 | 0 | } |
556 | | |
557 | | void |
558 | | fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out) |
559 | 0 | { |
560 | 0 | fz_write_string(ctx, out, "</body>\n"); |
561 | 0 | fz_write_string(ctx, out, "</html>\n"); |
562 | 0 | } |
563 | | |
564 | | /* Detailed XML dump of the entire structured text data */ |
565 | | |
566 | | void |
567 | | fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) |
568 | 0 | { |
569 | 0 | fz_stext_block *block; |
570 | 0 | fz_stext_line *line; |
571 | 0 | fz_stext_char *ch; |
572 | |
|
573 | 0 | fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id, |
574 | 0 | page->mediabox.x1 - page->mediabox.x0, |
575 | 0 | page->mediabox.y1 - page->mediabox.y0); |
576 | |
|
577 | 0 | for (block = page->first_block; block; block = block->next) |
578 | 0 | { |
579 | 0 | switch (block->type) |
580 | 0 | { |
581 | 0 | case FZ_STEXT_BLOCK_TEXT: |
582 | 0 | fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n", |
583 | 0 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); |
584 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
585 | 0 | { |
586 | 0 | fz_font *font = NULL; |
587 | 0 | float size = 0; |
588 | 0 | const char *name = NULL; |
589 | |
|
590 | 0 | fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\">\n", |
591 | 0 | line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1, |
592 | 0 | line->wmode, |
593 | 0 | line->dir.x, line->dir.y); |
594 | |
|
595 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
596 | 0 | { |
597 | 0 | if (ch->font != font || ch->size != size) |
598 | 0 | { |
599 | 0 | if (font) |
600 | 0 | fz_write_string(ctx, out, "</font>\n"); |
601 | 0 | font = ch->font; |
602 | 0 | size = ch->size; |
603 | 0 | name = font_full_name(ctx, font); |
604 | 0 | fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", name, size); |
605 | 0 | } |
606 | 0 | fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" c=\"", |
607 | 0 | ch->quad.ul.x, ch->quad.ul.y, |
608 | 0 | ch->quad.ur.x, ch->quad.ur.y, |
609 | 0 | ch->quad.ll.x, ch->quad.ll.y, |
610 | 0 | ch->quad.lr.x, ch->quad.lr.y, |
611 | 0 | ch->origin.x, ch->origin.y, |
612 | 0 | ch->bidi, |
613 | 0 | ch->color); |
614 | 0 | switch (ch->c) |
615 | 0 | { |
616 | 0 | case '<': fz_write_string(ctx, out, "<"); break; |
617 | 0 | case '>': fz_write_string(ctx, out, ">"); break; |
618 | 0 | case '&': fz_write_string(ctx, out, "&"); break; |
619 | 0 | case '"': fz_write_string(ctx, out, """); break; |
620 | 0 | case '\'': fz_write_string(ctx, out, "'"); break; |
621 | 0 | default: |
622 | 0 | if (ch->c >= 32 && ch->c <= 127) |
623 | 0 | fz_write_printf(ctx, out, "%c", ch->c); |
624 | 0 | else |
625 | 0 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
626 | 0 | break; |
627 | 0 | } |
628 | 0 | fz_write_string(ctx, out, "\"/>\n"); |
629 | 0 | } |
630 | | |
631 | 0 | if (font) |
632 | 0 | fz_write_string(ctx, out, "</font>\n"); |
633 | |
|
634 | 0 | fz_write_string(ctx, out, "</line>\n"); |
635 | 0 | } |
636 | 0 | fz_write_string(ctx, out, "</block>\n"); |
637 | 0 | break; |
638 | | |
639 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
640 | 0 | fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n", |
641 | 0 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); |
642 | 0 | break; |
643 | 0 | } |
644 | 0 | } |
645 | 0 | fz_write_string(ctx, out, "</page>\n"); |
646 | 0 | } |
647 | | |
648 | | /* JSON dump */ |
649 | | |
650 | | void |
651 | | fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale) |
652 | 0 | { |
653 | 0 | fz_stext_block *block; |
654 | 0 | fz_stext_line *line; |
655 | 0 | fz_stext_char *ch; |
656 | |
|
657 | 0 | fz_write_printf(ctx, out, "{%q:[", "blocks"); |
658 | |
|
659 | 0 | for (block = page->first_block; block; block = block->next) |
660 | 0 | { |
661 | 0 | if (block != page->first_block) |
662 | 0 | fz_write_string(ctx, out, ","); |
663 | 0 | switch (block->type) |
664 | 0 | { |
665 | 0 | case FZ_STEXT_BLOCK_TEXT: |
666 | 0 | fz_write_printf(ctx, out, "{%q:%q,", "type", "text"); |
667 | 0 | fz_write_printf(ctx, out, "%q:{", "bbox"); |
668 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale)); |
669 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale)); |
670 | 0 | fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale)); |
671 | 0 | fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale)); |
672 | 0 | fz_write_printf(ctx, out, "%q:[", "lines"); |
673 | |
|
674 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
675 | 0 | { |
676 | 0 | if (line != block->u.t.first_line) |
677 | 0 | fz_write_string(ctx, out, ","); |
678 | 0 | fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode); |
679 | 0 | fz_write_printf(ctx, out, "%q:{", "bbox"); |
680 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale)); |
681 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale)); |
682 | 0 | fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale)); |
683 | 0 | fz_write_printf(ctx, out, "%q:%d},", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale)); |
684 | | |
685 | | /* Since we force preserve-spans, the first char has the style for the entire line. */ |
686 | 0 | if (line->first_char) |
687 | 0 | { |
688 | 0 | fz_font *font = line->first_char->font; |
689 | 0 | char *font_family = "sans-serif"; |
690 | 0 | char *font_weight = "normal"; |
691 | 0 | char *font_style = "normal"; |
692 | 0 | if (fz_font_is_monospaced(ctx, font)) font_family = "monospace"; |
693 | 0 | else if (fz_font_is_serif(ctx, font)) font_family = "serif"; |
694 | 0 | if (fz_font_is_bold(ctx, font)) font_weight = "bold"; |
695 | 0 | if (fz_font_is_italic(ctx, font)) font_style = "italic"; |
696 | 0 | fz_write_printf(ctx, out, "%q:{", "font"); |
697 | 0 | fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font)); |
698 | 0 | fz_write_printf(ctx, out, "%q:%q,", "family", font_family); |
699 | 0 | fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight); |
700 | 0 | fz_write_printf(ctx, out, "%q:%q,", "style", font_style); |
701 | 0 | fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale)); |
702 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale)); |
703 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale)); |
704 | 0 | } |
705 | |
|
706 | 0 | fz_write_printf(ctx, out, "%q:\"", "text"); |
707 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
708 | 0 | { |
709 | 0 | if (ch->c == '"' || ch->c == '\\') |
710 | 0 | fz_write_printf(ctx, out, "\\%c", ch->c); |
711 | 0 | else if (ch->c < 32) |
712 | 0 | fz_write_printf(ctx, out, "\\u%04x", ch->c); |
713 | 0 | else |
714 | 0 | fz_write_printf(ctx, out, "%C", ch->c); |
715 | 0 | } |
716 | 0 | fz_write_printf(ctx, out, "\"}"); |
717 | 0 | } |
718 | 0 | fz_write_string(ctx, out, "]}"); |
719 | 0 | break; |
720 | | |
721 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
722 | 0 | fz_write_printf(ctx, out, "{%q:%q,", "type", "image"); |
723 | 0 | fz_write_printf(ctx, out, "%q:{", "bbox"); |
724 | 0 | fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale)); |
725 | 0 | fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale)); |
726 | 0 | fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale)); |
727 | 0 | fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale)); |
728 | 0 | break; |
729 | 0 | } |
730 | 0 | } |
731 | 0 | fz_write_string(ctx, out, "]}"); |
732 | 0 | } |
733 | | |
734 | | /* Plain text */ |
735 | | |
736 | | void |
737 | | fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) |
738 | 0 | { |
739 | 0 | fz_stext_block *block; |
740 | 0 | fz_stext_line *line; |
741 | 0 | fz_stext_char *ch; |
742 | 0 | char utf[10]; |
743 | 0 | int i, n; |
744 | |
|
745 | 0 | for (block = page->first_block; block; block = block->next) |
746 | 0 | { |
747 | 0 | if (block->type == FZ_STEXT_BLOCK_TEXT) |
748 | 0 | { |
749 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
750 | 0 | { |
751 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
752 | 0 | { |
753 | 0 | n = fz_runetochar(utf, ch->c); |
754 | 0 | for (i = 0; i < n; i++) |
755 | 0 | fz_write_byte(ctx, out, utf[i]); |
756 | 0 | } |
757 | 0 | fz_write_string(ctx, out, "\n"); |
758 | 0 | } |
759 | 0 | fz_write_string(ctx, out, "\n"); |
760 | 0 | } |
761 | 0 | } |
762 | 0 | } |
763 | | |
764 | | /* Text output writer */ |
765 | | |
/* Output formats understood by the text document writer. */
enum {
	FZ_FORMAT_TEXT = 0,
	FZ_FORMAT_HTML = 1,
	FZ_FORMAT_XHTML = 2,
	FZ_FORMAT_STEXT_XML = 3,
	FZ_FORMAT_STEXT_JSON = 4,
};
773 | | |
/* State of a document writer that emits structured text. */
typedef struct
{
	fz_document_writer super; /* Base writer; the callbacks cast a fz_document_writer pointer to this struct. */
	int format; /* One of the FZ_FORMAT_* values; selects the per-page printer and the trailer. */
	int number; /* Incremented at the start of every page; passed to the printers as the page id. */
	fz_stext_options opts; /* Options handed to the stext device; opts.scale also scales each page. */
	fz_stext_page *page; /* Page being collected between begin and end of a page, otherwise NULL. */
	fz_output *out; /* Destination stream; closed when the writer is closed and dropped with it. */
} fz_text_writer;
783 | | |
784 | | static fz_device * |
785 | | text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox) |
786 | 0 | { |
787 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
788 | 0 | float s = wri->opts.scale; |
789 | |
|
790 | 0 | if (wri->page) |
791 | 0 | { |
792 | 0 | fz_drop_stext_page(ctx, wri->page); |
793 | 0 | wri->page = NULL; |
794 | 0 | } |
795 | |
|
796 | 0 | wri->number++; |
797 | |
|
798 | 0 | wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s))); |
799 | 0 | return fz_new_stext_device(ctx, wri->page, &wri->opts); |
800 | 0 | } |
801 | | |
802 | | static void |
803 | | text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) |
804 | 0 | { |
805 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
806 | 0 | float s = wri->opts.scale; |
807 | |
|
808 | 0 | fz_scale_stext_page(ctx, wri->page, s); |
809 | |
|
810 | 0 | fz_try(ctx) |
811 | 0 | { |
812 | 0 | fz_close_device(ctx, dev); |
813 | 0 | switch (wri->format) |
814 | 0 | { |
815 | 0 | default: |
816 | 0 | case FZ_FORMAT_TEXT: |
817 | 0 | fz_print_stext_page_as_text(ctx, wri->out, wri->page); |
818 | 0 | break; |
819 | 0 | case FZ_FORMAT_HTML: |
820 | 0 | fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number); |
821 | 0 | break; |
822 | 0 | case FZ_FORMAT_XHTML: |
823 | 0 | fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number); |
824 | 0 | break; |
825 | 0 | case FZ_FORMAT_STEXT_XML: |
826 | 0 | fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number); |
827 | 0 | break; |
828 | 0 | case FZ_FORMAT_STEXT_JSON: |
829 | 0 | if (wri->number > 1) |
830 | 0 | fz_write_string(ctx, wri->out, ","); |
831 | 0 | fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1); |
832 | 0 | break; |
833 | 0 | } |
834 | 0 | } |
835 | 0 | fz_always(ctx) |
836 | 0 | { |
837 | 0 | fz_drop_device(ctx, dev); |
838 | 0 | fz_drop_stext_page(ctx, wri->page); |
839 | 0 | wri->page = NULL; |
840 | 0 | } |
841 | 0 | fz_catch(ctx) |
842 | 0 | fz_rethrow(ctx); |
843 | 0 | } |
844 | | |
845 | | static void |
846 | | text_close_writer(fz_context *ctx, fz_document_writer *wri_) |
847 | 0 | { |
848 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
849 | 0 | switch (wri->format) |
850 | 0 | { |
851 | 0 | case FZ_FORMAT_HTML: |
852 | 0 | fz_print_stext_trailer_as_html(ctx, wri->out); |
853 | 0 | break; |
854 | 0 | case FZ_FORMAT_XHTML: |
855 | 0 | fz_print_stext_trailer_as_xhtml(ctx, wri->out); |
856 | 0 | break; |
857 | 0 | case FZ_FORMAT_STEXT_XML: |
858 | 0 | fz_write_string(ctx, wri->out, "</document>\n"); |
859 | 0 | break; |
860 | 0 | case FZ_FORMAT_STEXT_JSON: |
861 | 0 | fz_write_string(ctx, wri->out, "]\n"); |
862 | 0 | break; |
863 | 0 | } |
864 | 0 | fz_close_output(ctx, wri->out); |
865 | 0 | } |
866 | | |
867 | | static void |
868 | | text_drop_writer(fz_context *ctx, fz_document_writer *wri_) |
869 | 0 | { |
870 | 0 | fz_text_writer *wri = (fz_text_writer*)wri_; |
871 | 0 | fz_drop_stext_page(ctx, wri->page); |
872 | 0 | fz_drop_output(ctx, wri->out); |
873 | 0 | } |
874 | | |
875 | | fz_document_writer * |
876 | | fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options) |
877 | 0 | { |
878 | 0 | fz_text_writer *wri = NULL; |
879 | |
|
880 | 0 | fz_var(wri); |
881 | |
|
882 | 0 | fz_try(ctx) |
883 | 0 | { |
884 | 0 | wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer); |
885 | 0 | fz_parse_stext_options(ctx, &wri->opts, options); |
886 | |
|
887 | 0 | wri->format = FZ_FORMAT_TEXT; |
888 | 0 | if (!strcmp(format, "text")) |
889 | 0 | wri->format = FZ_FORMAT_TEXT; |
890 | 0 | else if (!strcmp(format, "html")) |
891 | 0 | wri->format = FZ_FORMAT_HTML; |
892 | 0 | else if (!strcmp(format, "xhtml")) |
893 | 0 | wri->format = FZ_FORMAT_XHTML; |
894 | 0 | else if (!strcmp(format, "stext")) |
895 | 0 | wri->format = FZ_FORMAT_STEXT_XML; |
896 | 0 | else if (!strcmp(format, "stext.xml")) |
897 | 0 | wri->format = FZ_FORMAT_STEXT_XML; |
898 | 0 | else if (!strcmp(format, "stext.json")) |
899 | 0 | { |
900 | 0 | wri->format = FZ_FORMAT_STEXT_JSON; |
901 | 0 | wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS; |
902 | 0 | } |
903 | |
|
904 | 0 | wri->out = out; |
905 | |
|
906 | 0 | switch (wri->format) |
907 | 0 | { |
908 | 0 | case FZ_FORMAT_HTML: |
909 | 0 | fz_print_stext_header_as_html(ctx, wri->out); |
910 | 0 | break; |
911 | 0 | case FZ_FORMAT_XHTML: |
912 | 0 | fz_print_stext_header_as_xhtml(ctx, wri->out); |
913 | 0 | break; |
914 | 0 | case FZ_FORMAT_STEXT_XML: |
915 | 0 | fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n"); |
916 | 0 | fz_write_string(ctx, wri->out, "<document>\n"); |
917 | 0 | break; |
918 | 0 | case FZ_FORMAT_STEXT_JSON: |
919 | 0 | fz_write_string(ctx, wri->out, "["); |
920 | 0 | break; |
921 | 0 | } |
922 | 0 | } |
923 | 0 | fz_catch(ctx) |
924 | 0 | { |
925 | 0 | fz_drop_output(ctx, out); |
926 | 0 | fz_free(ctx, wri); |
927 | 0 | fz_rethrow(ctx); |
928 | 0 | } |
929 | | |
930 | 0 | return (fz_document_writer*)wri; |
931 | 0 | } |
932 | | |
933 | | fz_document_writer * |
934 | | fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options) |
935 | 0 | { |
936 | 0 | fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0); |
937 | 0 | return fz_new_text_writer_with_output(ctx, format, out, options); |
938 | 0 | } |