/src/mupdf/source/fitz/output-docx.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2021 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | |
25 | | #if FZ_ENABLE_DOCX_OUTPUT |
26 | | |
27 | | #include "glyphbox.h" |
28 | | #include "extract/extract.h" |
29 | | #include "extract/buffer.h" |
30 | | |
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
34 | | |
35 | | |
/*
 * Document writer state shared by the writer callbacks and the per-page
 * device. Pointers to this struct are obtained by casting the
 * fz_document_writer pointer passed to the writer callbacks.
 */
typedef struct
{
	fz_document_writer super;
	/* Extract allocator; created with s_realloc_fn() and this writer as state. */
	extract_alloc_t *alloc;

	/*
	 * .ctx is needed for the callbacks we get from the Extract library, for
	 * example s_realloc_fn(). Each of our main device callbacks sets .ctx on
	 * entry, and resets back to NULL before returning.
	 */
	fz_context *ctx;

	/* Destination that buffer_write() sends the generated data to. */
	fz_output *output;
	/* Extract library instance that receives text, images and paths. */
	extract_t *extract;
	/* .spacing, .rotation and .images are passed to extract_process(). */
	int spacing;
	int rotation;
	int images;
	/* If non-zero, dev_text() skips glyphs entirely outside .mediabox. */
	int mediabox_clip;
	fz_rect mediabox; /* As passed to writer_begin_page(). */
	/* Scratch buffer handed to Extract by buffer_cache(). */
	char output_cache[1024];
} fz_docx_writer;
57 | | |
58 | | |
/*
 * Per-page device created by writer_begin_page(). The device callbacks cast
 * the fz_device pointer they receive to this type to reach the writer.
 */
typedef struct
{
	fz_device super;
	/* Writer that created this device; set in writer_begin_page(). */
	fz_docx_writer *writer;
} fz_docx_device;
64 | | |
65 | | |
66 | | static void dev_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, |
67 | | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
68 | 0 | { |
69 | 0 | fz_docx_device *dev = (fz_docx_device*) dev_; |
70 | 0 | fz_text_span *span; |
71 | 0 | assert(!dev->writer->ctx); |
72 | 0 | dev->writer->ctx = ctx; |
73 | 0 | fz_try(ctx) |
74 | 0 | { |
75 | 0 | for (span = text->head; span; span = span->next) |
76 | 0 | { |
77 | 0 | int i; |
78 | 0 | fz_matrix combined, trm; |
79 | 0 | fz_rect bbox; |
80 | |
|
81 | 0 | combined = fz_concat(span->trm, ctm); |
82 | |
|
83 | 0 | bbox = span->font->bbox; |
84 | 0 | if (extract_span_begin( |
85 | 0 | dev->writer->extract, |
86 | 0 | span->font->name, |
87 | 0 | span->font->flags.is_bold, |
88 | 0 | span->font->flags.is_italic, |
89 | 0 | span->wmode, |
90 | 0 | combined.a, |
91 | 0 | combined.b, |
92 | 0 | combined.c, |
93 | 0 | combined.d, |
94 | 0 | bbox.x0, |
95 | 0 | bbox.y0, |
96 | 0 | bbox.x1, |
97 | 0 | bbox.y1)) |
98 | 0 | { |
99 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin span"); |
100 | 0 | } |
101 | | |
102 | 0 | trm = span->trm; |
103 | 0 | for (i=0; i<span->len; ++i) |
104 | 0 | { |
105 | 0 | fz_text_item *item = &span->items[i]; |
106 | 0 | float adv = 0; |
107 | 0 | fz_rect bounds; |
108 | 0 | fz_matrix combined; |
109 | |
|
110 | 0 | trm.e = item->x; |
111 | 0 | trm.f = item->y; |
112 | 0 | combined = fz_concat(trm, ctm); |
113 | |
|
114 | 0 | if (dev->writer->mediabox_clip) |
115 | 0 | if (fz_glyph_entirely_outside_box(ctx, &ctm, span, item, &dev->writer->mediabox)) |
116 | 0 | continue; |
117 | | |
118 | 0 | if (span->items[i].gid >= 0) |
119 | 0 | adv = fz_advance_glyph(ctx, span->font, span->items[i].gid, span->wmode); |
120 | |
|
121 | 0 | bounds = fz_bound_glyph(ctx, span->font, span->items[i].gid, combined); |
122 | 0 | if (extract_add_char(dev->writer->extract, combined.e, combined.f, item->ucs, adv, |
123 | 0 | bounds.x0, bounds.y0, bounds.x1, bounds.y1)) |
124 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to add char"); |
125 | 0 | } |
126 | | |
127 | 0 | if (extract_span_end(dev->writer->extract)) |
128 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end span"); |
129 | 0 | } |
130 | 0 | } |
131 | 0 | fz_always(ctx) |
132 | 0 | { |
133 | 0 | dev->writer->ctx = NULL; |
134 | 0 | } |
135 | 0 | fz_catch(ctx) |
136 | 0 | { |
137 | 0 | fz_rethrow(ctx); |
138 | 0 | } |
139 | 0 | } |
140 | | |
141 | | static void dev_fill_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, |
142 | | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
143 | 0 | { |
144 | 0 | dev_text(ctx, dev_, text, ctm, colorspace, color, alpha, color_params); |
145 | 0 | } |
146 | | |
147 | | static void dev_stroke_text(fz_context *ctx, fz_device *dev_, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, |
148 | | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
149 | 0 | { |
150 | 0 | dev_text(ctx, dev_, text, ctm, colorspace, color, alpha, color_params); |
151 | 0 | } |
152 | | |
153 | | static void dev_clip_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, fz_rect scissor) |
154 | 0 | { |
155 | 0 | dev_text(ctx, dev_, text, ctm, NULL, NULL, 0 /*alpha*/, fz_default_color_params); |
156 | 0 | } |
157 | | |
158 | | static void dev_clip_stroke_text(fz_context *ctx, fz_device *dev_, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) |
159 | 0 | { |
160 | 0 | dev_text(ctx, dev_, text, ctm, NULL, 0, 0, fz_default_color_params); |
161 | 0 | } |
162 | | |
163 | | static void |
164 | | dev_ignore_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm) |
165 | 0 | { |
166 | 0 | } |
167 | | |
168 | | static void writer_image_free(void *handle, void *image_data) |
169 | 0 | { |
170 | 0 | fz_docx_writer *writer = handle; |
171 | 0 | fz_free(writer->ctx, image_data); |
172 | 0 | } |
173 | | |
174 | | static void dev_fill_image(fz_context *ctx, fz_device *dev_, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) |
175 | 0 | { |
176 | 0 | fz_docx_device *dev = (fz_docx_device*) dev_; |
177 | 0 | const char *type = NULL; |
178 | 0 | fz_compressed_buffer *compressed = fz_compressed_image_buffer(ctx, img); |
179 | |
|
180 | 0 | assert(!dev->writer->ctx); |
181 | 0 | dev->writer->ctx = ctx; |
182 | 0 | fz_try(ctx) |
183 | 0 | { |
184 | 0 | if (compressed) |
185 | 0 | { |
186 | 0 | if (0) { /* For alignment */ } |
187 | 0 | else if (compressed->params.type == FZ_IMAGE_RAW) type = "raw"; |
188 | 0 | else if (compressed->params.type == FZ_IMAGE_FAX) type = "fax"; |
189 | 0 | else if (compressed->params.type == FZ_IMAGE_FLATE) type = "flate"; |
190 | 0 | else if (compressed->params.type == FZ_IMAGE_LZW) type = "lzw"; |
191 | 0 | else if (compressed->params.type == FZ_IMAGE_BMP) type = "bmp"; |
192 | 0 | else if (compressed->params.type == FZ_IMAGE_GIF) type = "gif"; |
193 | 0 | else if (compressed->params.type == FZ_IMAGE_JBIG2) type = "jbig2"; |
194 | 0 | else if (compressed->params.type == FZ_IMAGE_JPEG) type = "jpeg"; |
195 | 0 | else if (compressed->params.type == FZ_IMAGE_JPX) type = "jpx"; |
196 | 0 | else if (compressed->params.type == FZ_IMAGE_JXR) type = "jxr"; |
197 | 0 | else if (compressed->params.type == FZ_IMAGE_PNG) type = "png"; |
198 | 0 | else if (compressed->params.type == FZ_IMAGE_PNM) type = "pnm"; |
199 | 0 | else if (compressed->params.type == FZ_IMAGE_TIFF) type = "tiff"; |
200 | |
|
201 | 0 | if (type) |
202 | 0 | { |
203 | | /* Write out raw data. */ |
204 | 0 | unsigned char *data; |
205 | 0 | size_t datasize = fz_buffer_extract(ctx, compressed->buffer, &data); |
206 | 0 | if (extract_add_image( |
207 | 0 | dev->writer->extract, |
208 | 0 | type, |
209 | 0 | ctm.e /*x*/, |
210 | 0 | ctm.f /*y*/, |
211 | 0 | img->w /*w*/, |
212 | 0 | img->h /*h*/, |
213 | 0 | data, |
214 | 0 | datasize, |
215 | 0 | writer_image_free, |
216 | 0 | dev->writer |
217 | 0 | )) |
218 | 0 | { |
219 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to add image type=%s", type); |
220 | 0 | } |
221 | 0 | } |
222 | 0 | else |
223 | 0 | { |
224 | | /* We don't recognise this image type, so ignore. */ |
225 | 0 | } |
226 | 0 | } |
227 | 0 | else |
228 | 0 | { |
229 | | /* |
230 | | * Compressed data not available, so we could write out |
231 | | * raw pixel values. But for now we ignore. |
232 | | */ |
233 | 0 | } |
234 | 0 | } |
235 | 0 | fz_always(ctx) |
236 | 0 | { |
237 | 0 | dev->writer->ctx = NULL; |
238 | 0 | } |
239 | 0 | fz_catch(ctx) |
240 | 0 | { |
241 | 0 | fz_rethrow(ctx); |
242 | 0 | } |
243 | 0 | } |
244 | | |
/*
 * Support for sending information to Extract when walking stroke/fill path
 * with fz_walk_path().
 *
 * NOTE(review): nothing in the visible part of this file refers to
 * walker_info_t; s_walk_path() builds a local fz_path_walker and passes the
 * extract_t pointer directly as the callback argument. This type looks
 * unused - confirm before removing.
 */
typedef struct
{
	fz_path_walker walker;
	extract_t *extract;
} walker_info_t;
254 | | |
255 | | static void s_moveto(fz_context *ctx, void *arg, float x, float y) |
256 | 0 | { |
257 | 0 | extract_t* extract = arg; |
258 | 0 | if (extract_moveto(extract, x, y)) |
259 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_moveto() failed"); |
260 | 0 | } |
261 | | |
262 | | static void s_lineto(fz_context *ctx, void *arg, float x, float y) |
263 | 0 | { |
264 | 0 | extract_t* extract = arg; |
265 | 0 | if (extract_lineto(extract, x, y)) |
266 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_lineto() failed"); |
267 | 0 | } |
268 | | |
269 | | static void s_curveto(fz_context *ctx, void *arg, float x1, float y1, |
270 | | float x2, float y2, float x3, float y3) |
271 | 0 | { |
272 | | /* We simply move to the end point of the curve so that subsequent |
273 | | (straight) lines will be handled correctly. */ |
274 | 0 | extract_t* extract = arg; |
275 | 0 | if (extract_moveto(extract, x3, y3)) |
276 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_moveto() failed"); |
277 | 0 | } |
278 | | |
279 | | static void s_closepath(fz_context *ctx, void *arg) |
280 | 0 | { |
281 | 0 | extract_t* extract = arg; |
282 | 0 | if (extract_closepath(extract)) |
283 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_closepath() failed"); |
284 | 0 | } |
285 | | |
286 | | /* |
287 | | * Calls extract_*() path functions on <path> using fz_walk_path() and the |
288 | | * above callbacks. |
289 | | */ |
290 | | static void s_walk_path(fz_context *ctx, fz_docx_device *dev, extract_t *extract, const fz_path *path) |
291 | 0 | { |
292 | 0 | fz_path_walker walker; |
293 | 0 | walker.moveto = s_moveto; |
294 | 0 | walker.lineto = s_lineto; |
295 | 0 | walker.curveto = s_curveto; |
296 | 0 | walker.closepath = s_closepath; |
297 | 0 | walker.quadto = NULL; |
298 | 0 | walker.curvetov = NULL; |
299 | 0 | walker.curvetoy = NULL; |
300 | 0 | walker.rectto = NULL; |
301 | |
|
302 | 0 | assert(dev->writer->ctx == ctx); |
303 | 0 | fz_walk_path(ctx, path, &walker, extract /*arg*/); |
304 | 0 | } |
305 | | |
306 | | void dev_fill_path(fz_context *ctx, fz_device *dev_, const fz_path *path, int even_odd, |
307 | | fz_matrix matrix, fz_colorspace * colorspace, const float *color, float alpha, |
308 | | fz_color_params color_params) |
309 | 0 | { |
310 | 0 | fz_docx_device *dev = (fz_docx_device*) dev_; |
311 | 0 | extract_t *extract = dev->writer->extract; |
312 | |
|
313 | 0 | assert(!dev->writer->ctx); |
314 | 0 | dev->writer->ctx = ctx; |
315 | |
|
316 | 0 | fz_try(ctx) |
317 | 0 | { |
318 | 0 | if (extract_fill_begin( |
319 | 0 | extract, |
320 | 0 | matrix.a, |
321 | 0 | matrix.b, |
322 | 0 | matrix.c, |
323 | 0 | matrix.d, |
324 | 0 | matrix.e, |
325 | 0 | matrix.f, |
326 | 0 | color[0] |
327 | 0 | )) |
328 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin fill"); |
329 | 0 | s_walk_path(ctx, dev, extract, path); |
330 | 0 | if (extract_fill_end(extract)) |
331 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_fill_end() failed"); |
332 | 0 | } |
333 | 0 | fz_always(ctx) |
334 | 0 | { |
335 | 0 | dev->writer->ctx = NULL; |
336 | 0 | } |
337 | 0 | fz_catch(ctx) |
338 | 0 | { |
339 | 0 | fz_rethrow(ctx); |
340 | 0 | } |
341 | 0 | } |
342 | | |
343 | | |
344 | | static void |
345 | | dev_stroke_path(fz_context *ctx, fz_device *dev_, const fz_path *path, |
346 | | const fz_stroke_state *stroke, fz_matrix in_ctm, |
347 | | fz_colorspace *colorspace_in, const float *color, float alpha, |
348 | | fz_color_params color_params) |
349 | 0 | { |
350 | 0 | fz_docx_device *dev = (fz_docx_device*) dev_; |
351 | 0 | extract_t *extract = dev->writer->extract; |
352 | |
|
353 | 0 | assert(!dev->writer->ctx); |
354 | 0 | dev->writer->ctx = ctx; |
355 | 0 | fz_try(ctx) |
356 | 0 | { |
357 | 0 | if (extract_stroke_begin( |
358 | 0 | extract, |
359 | 0 | in_ctm.a, |
360 | 0 | in_ctm.b, |
361 | 0 | in_ctm.c, |
362 | 0 | in_ctm.d, |
363 | 0 | in_ctm.e, |
364 | 0 | in_ctm.f, |
365 | 0 | stroke->linewidth, |
366 | 0 | color[0] |
367 | 0 | )) |
368 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin stroke"); |
369 | 0 | s_walk_path(ctx, dev, extract, path); |
370 | 0 | if (extract_stroke_end(extract)) |
371 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_stroke_end() failed"); |
372 | 0 | } |
373 | 0 | fz_always(ctx) |
374 | 0 | { |
375 | 0 | dev->writer->ctx = NULL; |
376 | 0 | } |
377 | 0 | fz_catch(ctx) |
378 | 0 | { |
379 | 0 | fz_rethrow(ctx); |
380 | 0 | } |
381 | 0 | } |
382 | | |
383 | | static extract_struct_t |
384 | | fz_struct_to_extract(fz_structure type) |
385 | 0 | { |
386 | 0 | switch (type) |
387 | 0 | { |
388 | 0 | default: |
389 | 0 | return extract_struct_INVALID; |
390 | | |
391 | 0 | case FZ_STRUCTURE_DOCUMENT: |
392 | 0 | return extract_struct_DOCUMENT; |
393 | 0 | case FZ_STRUCTURE_PART: |
394 | 0 | return extract_struct_PART; |
395 | 0 | case FZ_STRUCTURE_ART: |
396 | 0 | return extract_struct_ART; |
397 | 0 | case FZ_STRUCTURE_SECT: |
398 | 0 | return extract_struct_SECT; |
399 | 0 | case FZ_STRUCTURE_DIV: |
400 | 0 | return extract_struct_DIV; |
401 | 0 | case FZ_STRUCTURE_BLOCKQUOTE: |
402 | 0 | return extract_struct_BLOCKQUOTE; |
403 | 0 | case FZ_STRUCTURE_CAPTION: |
404 | 0 | return extract_struct_CAPTION; |
405 | 0 | case FZ_STRUCTURE_TOC: |
406 | 0 | return extract_struct_TOC; |
407 | 0 | case FZ_STRUCTURE_TOCI: |
408 | 0 | return extract_struct_TOCI; |
409 | 0 | case FZ_STRUCTURE_INDEX: |
410 | 0 | return extract_struct_INDEX; |
411 | 0 | case FZ_STRUCTURE_NONSTRUCT: |
412 | 0 | return extract_struct_NONSTRUCT; |
413 | 0 | case FZ_STRUCTURE_PRIVATE: |
414 | 0 | return extract_struct_PRIVATE; |
415 | | /* Grouping elements (PDF 2.0 - Table 364) */ |
416 | 0 | case FZ_STRUCTURE_DOCUMENTFRAGMENT: |
417 | 0 | return extract_struct_DOCUMENTFRAGMENT; |
418 | | /* Grouping elements (PDF 2.0 - Table 365) */ |
419 | 0 | case FZ_STRUCTURE_ASIDE: |
420 | 0 | return extract_struct_ASIDE; |
421 | | /* Grouping elements (PDF 2.0 - Table 366) */ |
422 | 0 | case FZ_STRUCTURE_TITLE: |
423 | 0 | return extract_struct_TITLE; |
424 | 0 | case FZ_STRUCTURE_FENOTE: |
425 | 0 | return extract_struct_FENOTE; |
426 | | /* Grouping elements (PDF 2.0 - Table 367) */ |
427 | 0 | case FZ_STRUCTURE_SUB: |
428 | 0 | return extract_struct_SUB; |
429 | | |
430 | | /* Paragraphlike elements (PDF 1.7 - Table 10.21) */ |
431 | 0 | case FZ_STRUCTURE_P: |
432 | 0 | return extract_struct_P; |
433 | 0 | case FZ_STRUCTURE_H: |
434 | 0 | return extract_struct_H; |
435 | 0 | case FZ_STRUCTURE_H1: |
436 | 0 | return extract_struct_H1; |
437 | 0 | case FZ_STRUCTURE_H2: |
438 | 0 | return extract_struct_H2; |
439 | 0 | case FZ_STRUCTURE_H3: |
440 | 0 | return extract_struct_H3; |
441 | 0 | case FZ_STRUCTURE_H4: |
442 | 0 | return extract_struct_H4; |
443 | 0 | case FZ_STRUCTURE_H5: |
444 | 0 | return extract_struct_H5; |
445 | 0 | case FZ_STRUCTURE_H6: |
446 | 0 | return extract_struct_H6; |
447 | | |
448 | | /* List elements (PDF 1.7 - Table 10.23) */ |
449 | 0 | case FZ_STRUCTURE_LIST: |
450 | 0 | return extract_struct_LIST; |
451 | 0 | case FZ_STRUCTURE_LISTITEM: |
452 | 0 | return extract_struct_LISTITEM; |
453 | 0 | case FZ_STRUCTURE_LABEL: |
454 | 0 | return extract_struct_LABEL; |
455 | 0 | case FZ_STRUCTURE_LISTBODY: |
456 | 0 | return extract_struct_LISTBODY; |
457 | | |
458 | | /* Table elements (PDF 1.7 - Table 10.24) */ |
459 | 0 | case FZ_STRUCTURE_TABLE: |
460 | 0 | return extract_struct_TABLE; |
461 | 0 | case FZ_STRUCTURE_TR: |
462 | 0 | return extract_struct_TR; |
463 | 0 | case FZ_STRUCTURE_TH: |
464 | 0 | return extract_struct_TH; |
465 | 0 | case FZ_STRUCTURE_TD: |
466 | 0 | return extract_struct_TD; |
467 | 0 | case FZ_STRUCTURE_THEAD: |
468 | 0 | return extract_struct_THEAD; |
469 | 0 | case FZ_STRUCTURE_TBODY: |
470 | 0 | return extract_struct_TBODY; |
471 | 0 | case FZ_STRUCTURE_TFOOT: |
472 | 0 | return extract_struct_TFOOT; |
473 | | |
474 | | /* Inline elements (PDF 1.7 - Table 10.25) */ |
475 | 0 | case FZ_STRUCTURE_SPAN: |
476 | 0 | return extract_struct_SPAN; |
477 | 0 | case FZ_STRUCTURE_QUOTE: |
478 | 0 | return extract_struct_QUOTE; |
479 | 0 | case FZ_STRUCTURE_NOTE: |
480 | 0 | return extract_struct_NOTE; |
481 | 0 | case FZ_STRUCTURE_REFERENCE: |
482 | 0 | return extract_struct_REFERENCE; |
483 | 0 | case FZ_STRUCTURE_BIBENTRY: |
484 | 0 | return extract_struct_BIBENTRY; |
485 | 0 | case FZ_STRUCTURE_CODE: |
486 | 0 | return extract_struct_CODE; |
487 | 0 | case FZ_STRUCTURE_LINK: |
488 | 0 | return extract_struct_LINK; |
489 | 0 | case FZ_STRUCTURE_ANNOT: |
490 | 0 | return extract_struct_ANNOT; |
491 | | /* Inline elements (PDF 2.0 - Table 368) */ |
492 | 0 | case FZ_STRUCTURE_EM: |
493 | 0 | return extract_struct_EM; |
494 | 0 | case FZ_STRUCTURE_STRONG: |
495 | 0 | return extract_struct_STRONG; |
496 | | |
497 | | /* Ruby inline element (PDF 1.7 - Table 10.26) */ |
498 | 0 | case FZ_STRUCTURE_RUBY: |
499 | 0 | return extract_struct_RUBY; |
500 | 0 | case FZ_STRUCTURE_RB: |
501 | 0 | return extract_struct_RB; |
502 | 0 | case FZ_STRUCTURE_RT: |
503 | 0 | return extract_struct_RT; |
504 | 0 | case FZ_STRUCTURE_RP: |
505 | 0 | return extract_struct_RP; |
506 | | |
507 | | /* Warichu inline element (PDF 1.7 - Table 10.26) */ |
508 | 0 | case FZ_STRUCTURE_WARICHU: |
509 | 0 | return extract_struct_WARICHU; |
510 | 0 | case FZ_STRUCTURE_WT: |
511 | 0 | return extract_struct_WT; |
512 | 0 | case FZ_STRUCTURE_WP: |
513 | 0 | return extract_struct_WP; |
514 | | |
515 | | /* Illustration elements (PDF 1.7 - Table 10.27) */ |
516 | 0 | case FZ_STRUCTURE_FIGURE: |
517 | 0 | return extract_struct_FIGURE; |
518 | 0 | case FZ_STRUCTURE_FORMULA: |
519 | 0 | return extract_struct_FORMULA; |
520 | 0 | case FZ_STRUCTURE_FORM: |
521 | 0 | return extract_struct_FORM; |
522 | | |
523 | | /* Artifact structure type (PDF 2.0 - Table 375) */ |
524 | 0 | case FZ_STRUCTURE_ARTIFACT: |
525 | 0 | return extract_struct_ARTIFACT; |
526 | 0 | } |
527 | 0 | } |
528 | | |
529 | | static void |
530 | | dev_begin_structure(fz_context *ctx, fz_device *dev_, fz_structure standard, const char *raw, int idx) |
531 | 0 | { |
532 | 0 | fz_docx_device *dev = (fz_docx_device *)dev_; |
533 | 0 | extract_t *extract = dev->writer->extract; |
534 | |
|
535 | 0 | assert(!dev->writer->ctx); |
536 | 0 | dev->writer->ctx = ctx; |
537 | 0 | fz_try(ctx) |
538 | 0 | { |
539 | 0 | if (extract_begin_struct(extract, fz_struct_to_extract(standard), idx, -1)) |
540 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin struct"); |
541 | 0 | } |
542 | 0 | fz_always(ctx) |
543 | 0 | dev->writer->ctx = NULL; |
544 | 0 | fz_catch(ctx) |
545 | 0 | fz_rethrow(ctx); |
546 | 0 | } |
547 | | |
548 | | static void |
549 | | dev_end_structure(fz_context *ctx, fz_device *dev_) |
550 | 0 | { |
551 | 0 | fz_docx_device *dev = (fz_docx_device *)dev_; |
552 | 0 | extract_t *extract = dev->writer->extract; |
553 | |
|
554 | 0 | assert(!dev->writer->ctx); |
555 | 0 | dev->writer->ctx = ctx; |
556 | 0 | fz_try(ctx) |
557 | 0 | { |
558 | 0 | if (extract_end_struct(extract)) |
559 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end struct"); |
560 | 0 | } |
561 | 0 | fz_always(ctx) |
562 | 0 | dev->writer->ctx = NULL; |
563 | 0 | fz_catch(ctx) |
564 | 0 | fz_rethrow(ctx); |
565 | 0 | } |
566 | | |
567 | | |
568 | | static fz_device *writer_begin_page(fz_context *ctx, fz_document_writer *writer_, fz_rect mediabox) |
569 | 0 | { |
570 | 0 | fz_docx_writer *writer = (fz_docx_writer*) writer_; |
571 | 0 | fz_docx_device *dev; |
572 | 0 | assert(!writer->ctx); |
573 | 0 | writer->ctx = ctx; |
574 | 0 | writer->mediabox = mediabox; |
575 | 0 | fz_var(dev); |
576 | 0 | fz_try(ctx) |
577 | 0 | { |
578 | 0 | if (extract_page_begin(writer->extract, mediabox.x0, mediabox.y0, mediabox.x1, mediabox.y1)) |
579 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin page"); |
580 | 0 | dev = fz_new_derived_device(ctx, fz_docx_device); |
581 | 0 | dev->super.fill_text = dev_fill_text; |
582 | 0 | dev->super.stroke_text = dev_stroke_text; |
583 | 0 | dev->super.clip_text = dev_clip_text; |
584 | 0 | dev->super.clip_stroke_text = dev_clip_stroke_text; |
585 | 0 | dev->super.ignore_text = dev_ignore_text; |
586 | 0 | dev->super.fill_image = dev_fill_image; |
587 | 0 | dev->super.fill_path = dev_fill_path; |
588 | 0 | dev->super.stroke_path = dev_stroke_path; |
589 | 0 | dev->super.begin_structure = dev_begin_structure; |
590 | 0 | dev->super.end_structure = dev_end_structure; |
591 | 0 | dev->writer = writer; |
592 | 0 | } |
593 | 0 | fz_always(ctx) |
594 | 0 | { |
595 | 0 | writer->ctx = NULL; |
596 | 0 | } |
597 | 0 | fz_catch(ctx) |
598 | 0 | { |
599 | 0 | fz_rethrow(ctx); |
600 | 0 | } |
601 | 0 | return &dev->super; |
602 | 0 | } |
603 | | |
604 | | static void writer_end_page(fz_context *ctx, fz_document_writer *writer_, fz_device *dev) |
605 | 0 | { |
606 | 0 | fz_docx_writer *writer = (fz_docx_writer*) writer_; |
607 | 0 | assert(!writer->ctx); |
608 | 0 | writer->ctx = ctx; |
609 | 0 | fz_try(ctx) |
610 | 0 | { |
611 | 0 | fz_close_device(ctx, dev); |
612 | 0 | if (extract_page_end(writer->extract)) |
613 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end page"); |
614 | | |
615 | 0 | if (extract_process(writer->extract, writer->spacing, writer->rotation, writer->images)) |
616 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to process page"); |
617 | 0 | } |
618 | 0 | fz_always(ctx) |
619 | 0 | { |
620 | 0 | writer->ctx = NULL; |
621 | 0 | fz_drop_device(ctx, dev); |
622 | 0 | } |
623 | 0 | fz_catch(ctx) |
624 | 0 | { |
625 | 0 | fz_rethrow(ctx); |
626 | 0 | } |
627 | 0 | } |
628 | | |
629 | | static int buffer_write(void *handle, const void *source, size_t numbytes, size_t *o_actual) |
630 | | /* |
631 | | * extract_buffer_t callback that calls fz_write_data(). <source> will be docx |
632 | | * archive data. |
633 | | */ |
634 | 0 | { |
635 | 0 | int e = 0; |
636 | 0 | fz_docx_writer *writer = handle; |
637 | 0 | fz_var(e); |
638 | 0 | fz_try(writer->ctx) |
639 | 0 | { |
640 | 0 | fz_write_data(writer->ctx, writer->output, source, numbytes); |
641 | 0 | *o_actual = numbytes; |
642 | 0 | } |
643 | 0 | fz_catch(writer->ctx) |
644 | 0 | { |
645 | 0 | errno = EIO; |
646 | 0 | e = -1; |
647 | 0 | } |
648 | 0 | return e; |
649 | 0 | } |
650 | | |
651 | | static int buffer_cache(void *handle, void **o_cache, size_t *o_numbytes) |
652 | | /* |
653 | | * extract_buffer_t cache function. We simply return writer->output_cache. |
654 | | */ |
655 | 0 | { |
656 | 0 | fz_docx_writer *writer = handle; |
657 | 0 | *o_cache = writer->output_cache; |
658 | 0 | *o_numbytes = sizeof(writer->output_cache); |
659 | 0 | return 0; |
660 | 0 | } |
661 | | |
662 | | static void writer_close(fz_context *ctx, fz_document_writer *writer_) |
663 | 0 | { |
664 | 0 | fz_docx_writer *writer = (fz_docx_writer*) writer_; |
665 | 0 | extract_buffer_t *extract_buffer_output = NULL; |
666 | |
|
667 | 0 | fz_var(extract_buffer_output); |
668 | 0 | fz_var(writer); |
669 | 0 | assert(!writer->ctx); |
670 | 0 | writer->ctx = ctx; |
671 | 0 | fz_try(ctx) |
672 | 0 | { |
673 | | /* |
674 | | * Write docx to writer->output. Need to create an |
675 | | * extract_buffer_t that writes to writer->output, for use by |
676 | | * extract_write(). |
677 | | */ |
678 | 0 | if (extract_buffer_open( |
679 | 0 | writer->alloc, |
680 | 0 | writer, |
681 | 0 | NULL /*fn_read*/, |
682 | 0 | buffer_write, |
683 | 0 | buffer_cache, |
684 | 0 | NULL /*fn_close*/, |
685 | 0 | &extract_buffer_output |
686 | 0 | )) |
687 | 0 | { |
688 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract_buffer_output: %s", strerror(errno)); |
689 | 0 | } |
690 | 0 | if (extract_write(writer->extract, extract_buffer_output)) |
691 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to generate docx content: %s", strerror(errno)); |
692 | 0 | if (extract_buffer_close(&extract_buffer_output)) |
693 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to close extract_buffer: %s", strerror(errno)); |
694 | | |
695 | 0 | extract_end(&writer->extract); |
696 | 0 | fz_close_output(ctx, writer->output); |
697 | 0 | writer->ctx = NULL; |
698 | 0 | } |
699 | 0 | fz_catch(ctx) |
700 | 0 | { |
701 | | /* |
702 | | * We don't call fz_close_output() because it can throw and in |
703 | | * this error case we can safely leave cleanup to our s_drop() |
704 | | * function's calls to fz_drop_output(). |
705 | | */ |
706 | 0 | extract_buffer_close(&extract_buffer_output); |
707 | 0 | extract_end(&writer->extract); |
708 | 0 | writer->ctx = NULL; |
709 | 0 | fz_rethrow(ctx); |
710 | 0 | } |
711 | 0 | } |
712 | | |
713 | | static void writer_drop(fz_context *ctx, fz_document_writer *writer_) |
714 | 0 | { |
715 | 0 | fz_docx_writer *writer = (fz_docx_writer*) writer_; |
716 | 0 | fz_drop_output(ctx, writer->output); |
717 | 0 | writer->output = NULL; |
718 | 0 | assert(!writer->ctx); |
719 | 0 | writer->ctx = ctx; |
720 | 0 | extract_end(&writer->extract); |
721 | 0 | extract_alloc_destroy(&writer->alloc); |
722 | 0 | writer->ctx = NULL; |
723 | 0 | } |
724 | | |
725 | | |
726 | | static int get_bool_option(fz_context *ctx, const char *options, const char *name, int default_) |
727 | 0 | { |
728 | 0 | const char *value; |
729 | 0 | if (fz_has_option(ctx, options, name, &value)) |
730 | 0 | { |
731 | 0 | if (fz_option_eq(value, "yes")) return 1; |
732 | 0 | if (fz_option_eq(value, "no")) return 0; |
733 | 0 | else fz_throw(ctx, FZ_ERROR_SYNTAX, "option '%s' should be yes or no in options='%s'", name, options); |
734 | 0 | } |
735 | 0 | else |
736 | 0 | return default_; |
737 | 0 | } |
738 | | |
739 | | static double get_double_option(fz_context *ctx, const char *options, const char *name, double default_) |
740 | 0 | { |
741 | 0 | const char *value; |
742 | 0 | if (fz_has_option(ctx, options, name, &value)) |
743 | 0 | { |
744 | 0 | double ret = atof(value); |
745 | 0 | return ret; |
746 | 0 | } |
747 | 0 | else |
748 | 0 | return default_; |
749 | 0 | } |
750 | | |
751 | | static void *s_realloc_fn(void *state, void *prev, size_t size) |
752 | 0 | { |
753 | 0 | fz_docx_writer *writer = state; |
754 | 0 | assert(writer); |
755 | 0 | assert(writer->ctx); |
756 | 0 | return fz_realloc_no_throw(writer->ctx, prev, size); |
757 | 0 | } |
758 | | |
759 | | /* Will drop <out> if an error occurs. */ |
760 | | static fz_document_writer *fz_new_docx_writer_internal(fz_context *ctx, fz_output *out, |
761 | | const char *options, extract_format_t format) |
762 | 0 | { |
763 | 0 | fz_docx_writer *writer = NULL; |
764 | |
|
765 | 0 | fz_var(writer); |
766 | |
|
767 | 0 | fz_try(ctx) |
768 | 0 | { |
769 | 0 | double space_guess = get_double_option(ctx, options, "space-guess", 0); |
770 | 0 | writer = fz_new_derived_document_writer( |
771 | 0 | ctx, |
772 | 0 | fz_docx_writer, |
773 | 0 | writer_begin_page, |
774 | 0 | writer_end_page, |
775 | 0 | writer_close, |
776 | 0 | writer_drop |
777 | 0 | ); |
778 | 0 | writer->ctx = ctx; |
779 | 0 | writer->output = out; |
780 | 0 | if (get_bool_option(ctx, options, "html", 0)) format = extract_format_HTML; |
781 | 0 | if (get_bool_option(ctx, options, "text", 0)) format = extract_format_TEXT; |
782 | 0 | if (get_bool_option(ctx, options, "json", 0)) format = extract_format_JSON; |
783 | 0 | if (extract_alloc_create(s_realloc_fn, writer, &writer->alloc)) |
784 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract_alloc instance"); |
785 | 0 | if (extract_begin(writer->alloc, format, &writer->extract)) |
786 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract instance"); |
787 | 0 | if (space_guess) |
788 | 0 | extract_set_space_guess(writer->extract, space_guess); |
789 | 0 | writer->spacing = get_bool_option(ctx, options, "spacing", 0); |
790 | 0 | writer->rotation = get_bool_option(ctx, options, "rotation", 1); |
791 | 0 | writer->images = get_bool_option(ctx, options, "images", 1); |
792 | 0 | writer->mediabox_clip = get_bool_option(ctx, options, "mediabox-clip", 1); |
793 | 0 | if (extract_set_layout_analysis(writer->extract, get_bool_option(ctx, options, "analyse", 0))) |
794 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_enable_analysis failed."); |
795 | 0 | { |
796 | 0 | const char* v; |
797 | 0 | if (fz_has_option(ctx, options, "tables-csv-format", &v)) |
798 | 0 | { |
799 | 0 | size_t len = strlen(v) + 1; /* Might include trailing options. */ |
800 | 0 | char* formatbuf = fz_malloc(ctx, len); |
801 | 0 | fz_copy_option(ctx, v, formatbuf, len); |
802 | 0 | fprintf(stderr, "tables-csv-format: %s\n", formatbuf); |
803 | 0 | if (extract_tables_csv_format(writer->extract, formatbuf)) |
804 | 0 | { |
805 | 0 | fz_free(ctx, formatbuf); |
806 | 0 | fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_tables_csv_format() failed."); |
807 | 0 | } |
808 | 0 | fz_free(ctx, formatbuf); |
809 | 0 | } |
810 | 0 | } |
811 | 0 | writer->ctx = NULL; |
812 | 0 | } |
813 | 0 | fz_catch(ctx) |
814 | 0 | { |
815 | | /* fz_drop_document_writer() drops its output so we only need to call |
816 | | fz_drop_output() if we failed before creating the writer. */ |
817 | 0 | if (writer) |
818 | 0 | { |
819 | 0 | writer->ctx = ctx; |
820 | 0 | fz_drop_document_writer(ctx, &writer->super); |
821 | 0 | writer->ctx = NULL; |
822 | 0 | } |
823 | 0 | else |
824 | 0 | fz_drop_output(ctx, out); |
825 | 0 | fz_rethrow(ctx); |
826 | 0 | } |
827 | 0 | return &writer->super; |
828 | 0 | } |
829 | | |
830 | | fz_document_writer *fz_new_docx_writer_with_output(fz_context *ctx, fz_output *out, const char *options) |
831 | 0 | { |
832 | 0 | return fz_new_docx_writer_internal(ctx, out, options, extract_format_DOCX); |
833 | 0 | } |
834 | | |
835 | | fz_document_writer *fz_new_docx_writer(fz_context *ctx, const char *path, const char *options) |
836 | 0 | { |
837 | | /* No need to drop <out> if fz_new_docx_writer_internal() throws, because |
838 | | it always drops <out> if it fails. */ |
839 | 0 | fz_output *out = fz_new_output_with_path(ctx, path, 0 /*append*/); |
840 | 0 | return fz_new_docx_writer_internal(ctx, out, options, extract_format_DOCX); |
841 | 0 | } |
842 | | |
843 | | #if FZ_ENABLE_ODT_OUTPUT |
844 | | |
845 | | fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options) |
846 | 0 | { |
847 | 0 | return fz_new_docx_writer_internal(ctx, out, options, extract_format_ODT); |
848 | 0 | } |
849 | | |
850 | | fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options) |
851 | 0 | { |
852 | | /* No need to drop <out> if fz_new_docx_writer_internal() throws, because |
853 | | it always drops <out> if it fails. */ |
854 | 0 | fz_output *out = fz_new_output_with_path(ctx, path, 0 /*append*/); |
855 | 0 | return fz_new_docx_writer_internal(ctx, out, options, extract_format_ODT); |
856 | 0 | } |
857 | | |
858 | | #else |
859 | | |
860 | | fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options) |
861 | | { |
862 | | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "ODT writer not enabled"); |
863 | | return NULL; |
864 | | } |
865 | | |
866 | | fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options) |
867 | | { |
868 | | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "ODT writer not enabled"); |
869 | | return NULL; |
870 | | } |
871 | | |
872 | | #endif |
873 | | |
874 | | #else |
875 | | |
876 | | fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options) |
877 | | { |
878 | | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX/ODT writer not enabled"); |
879 | | return NULL; |
880 | | } |
881 | | |
882 | | fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options) |
883 | | { |
884 | | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX/ODT writer not enabled"); |
885 | | return NULL; |
886 | | } |
887 | | |
888 | | fz_document_writer *fz_new_docx_writer_with_output(fz_context *ctx, fz_output *out, const char *options) |
889 | | { |
890 | | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX writer not enabled"); |
891 | | return NULL; |
892 | | } |
893 | | |
894 | | fz_document_writer *fz_new_docx_writer(fz_context *ctx, const char *path, const char *options) |
895 | | { |
896 | | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX writer not enabled"); |
897 | | return NULL; |
898 | | } |
899 | | |
900 | | #endif |