/src/mupdf/source/html/office.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2023-2024 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "html-imp.h" |
25 | | |
26 | | #undef DEBUG_OFFICE_TO_HTML |
27 | | |
28 | | /* Defaults are all 0's. FIXME: Very subject to change. Possibly might be removed entirely. */ |
/* Set of integer switches carried inside doc_info (as its 'opts' member).
 * All fields are plain ints. */
typedef struct
{
	int output_page_numbers;
	int output_sheet_names;
	int output_cell_markers;
	int output_cell_row_markers;
	int output_cell_names;
	int output_formatting;
	int output_filenames;
	int output_errors;
} fz_office_to_html_opts;
41 | | |
42 | | typedef struct |
43 | | { |
44 | | fz_office_to_html_opts opts; |
45 | | |
46 | | fz_output *out; |
47 | | |
48 | | int page; |
49 | | |
50 | | /* State for if we are parsing a sheet. */ |
51 | | /* The last column label we have to send. */ |
52 | | char *label; |
53 | | /* Columns are numbered from 1. */ |
54 | | /* The column we are at. */ |
55 | | int col_at; |
56 | | /* The column we last signalled. If this is 0, then we haven't |
57 | | * even started a row yet. */ |
58 | | int col_signalled; |
59 | | |
60 | | /* If we are currently processing a spreadsheet, store the current |
61 | | * sheets name here. */ |
62 | | const char *sheet_name; |
63 | | |
64 | | int shared_string_max; |
65 | | int shared_string_len; |
66 | | char **shared_strings; |
67 | | |
68 | | int footnotes_max; |
69 | | char **footnotes; |
70 | | |
71 | | char *title; |
72 | | } doc_info; |
73 | | |
74 | | static void |
75 | | doc_escape(fz_context *ctx, fz_output *output, const char *str_) |
76 | 0 | { |
77 | 0 | const unsigned char *str = (const unsigned char *)str_; |
78 | 0 | int c; |
79 | |
|
80 | 0 | if (!str) |
81 | 0 | return; |
82 | | |
83 | 0 | while ((c = *str++) != 0) |
84 | 0 | { |
85 | 0 | if (c == '&') |
86 | 0 | { |
87 | 0 | fz_write_string(ctx, output, "&"); |
88 | 0 | } |
89 | 0 | else if (c == '<') |
90 | 0 | { |
91 | 0 | fz_write_string(ctx, output, "<"); |
92 | 0 | } |
93 | 0 | else if (c == '>') |
94 | 0 | { |
95 | 0 | fz_write_string(ctx, output, ">"); |
96 | 0 | } |
97 | 0 | else |
98 | 0 | { |
99 | | /* We get utf-8 in, just parrot it out again. */ |
100 | 0 | fz_write_byte(ctx, output, c); |
101 | 0 | } |
102 | 0 | } |
103 | 0 | } |
104 | | |
105 | | static void |
106 | | show_text(fz_context *ctx, fz_xml *top, doc_info *info) |
107 | 0 | { |
108 | 0 | fz_xml *pos = top; |
109 | 0 | fz_xml *next; |
110 | |
|
111 | 0 | while (pos) |
112 | 0 | { |
113 | 0 | doc_escape(ctx, info->out, fz_xml_text(pos)); |
114 | |
|
115 | 0 | if (fz_xml_is_tag(pos, "lineBreak")) |
116 | 0 | { |
117 | 0 | fz_write_string(ctx, info->out, "\n"); |
118 | 0 | } |
119 | 0 | else if (fz_xml_is_tag(pos, "tab")) |
120 | 0 | { |
121 | 0 | fz_write_string(ctx, info->out, "\t"); |
122 | 0 | } |
123 | 0 | else if (fz_xml_is_tag(pos, "lastRenderedPageBreak")) |
124 | 0 | { |
125 | 0 | info->page++; |
126 | 0 | } |
127 | | |
128 | | /* Always try to move down. */ |
129 | 0 | next = fz_xml_down(pos); |
130 | 0 | if (next) |
131 | 0 | { |
132 | | /* We can move down, easy! */ |
133 | 0 | pos = next; |
134 | 0 | continue; |
135 | 0 | } |
136 | | |
137 | 0 | if (pos == top) |
138 | 0 | break; |
139 | | |
140 | | /* We can't move down, try moving to next. */ |
141 | 0 | next = fz_xml_next(pos); |
142 | 0 | if (next) |
143 | 0 | { |
144 | | /* We can move to next, easy! */ |
145 | 0 | pos = next; |
146 | 0 | continue; |
147 | 0 | } |
148 | | |
149 | | /* If we can't go down, or next, pop up until we |
150 | | * find somewhere we can go next from. */ |
151 | 0 | while (1) |
152 | 0 | { |
153 | | /* OK. So move up. */ |
154 | 0 | pos = fz_xml_up(pos); |
155 | | /* Check for hitting the top. */ |
156 | 0 | if (pos == top) |
157 | 0 | pos = NULL; |
158 | 0 | if (pos == NULL) |
159 | 0 | break; |
160 | | /* We've returned to a node. See if it's a 'p'. */ |
161 | 0 | if (fz_xml_is_tag(pos, "p")) |
162 | 0 | { |
163 | 0 | fz_write_string(ctx, info->out, "\n"); |
164 | 0 | } |
165 | 0 | next = fz_xml_next(pos); |
166 | 0 | if (next) |
167 | 0 | { |
168 | 0 | pos = next; |
169 | 0 | break; |
170 | 0 | } |
171 | 0 | } |
172 | 0 | } |
173 | 0 | } |
174 | | |
175 | | static void |
176 | | show_footnote(fz_context *ctx, fz_xml *v, doc_info *info) |
177 | 0 | { |
178 | 0 | int n = fz_atoi(fz_xml_att(v, "w:id")); |
179 | |
|
180 | 0 | if (n < 0 || n >= info->footnotes_max) |
181 | 0 | return; |
182 | | |
183 | 0 | if (info->footnotes[n] == NULL || |
184 | 0 | info->footnotes[n][0] == 0) |
185 | 0 | return; |
186 | | |
187 | | /* Then send the strings. */ |
188 | 0 | doc_escape(ctx, info->out, info->footnotes[n]); |
189 | 0 | } |
190 | | |
191 | | static void |
192 | | process_doc_stream(fz_context *ctx, fz_xml *xml, doc_info *info, int do_pages) |
193 | 0 | { |
194 | 0 | fz_xml *pos; |
195 | 0 | fz_xml *next; |
196 | 0 | const char *paragraph_style = NULL; |
197 | 0 | const char *inline_style = NULL; |
198 | |
|
199 | | #ifdef DEBUG_OFFICE_TO_HTML |
200 | | fz_write_printf(ctx, fz_stddbg(ctx), "process_doc_stream:\n"); |
201 | | fz_output_xml(ctx, fz_stddbg(ctx), xml, 0); |
202 | | #endif |
203 | | |
204 | | /* First off, see if we can do page numbers. */ |
205 | 0 | if (do_pages) |
206 | 0 | { |
207 | 0 | pos = fz_xml_find_dfs(xml, "lastRenderedPageBreak", NULL, NULL); |
208 | 0 | if (pos) |
209 | 0 | { |
210 | | /* We *can* do page numbers, so start here. */ |
211 | 0 | fz_write_string(ctx, info->out, "<div id=\"page1\">\n"); |
212 | 0 | info->page = 1; |
213 | 0 | } |
214 | 0 | } |
215 | | |
216 | | /* Now walk the tree for real. */ |
217 | 0 | pos = xml; |
218 | 0 | while (pos) |
219 | 0 | { |
220 | | /* When we arrive on a node, check if it's a 't'. */ |
221 | 0 | if (fz_xml_is_tag(pos, "t")) |
222 | 0 | { |
223 | 0 | show_text(ctx, pos, info); |
224 | | /* Do NOT go down, we've already dealt with that. */ |
225 | 0 | } |
226 | 0 | else if (fz_xml_is_tag(pos, "br")) |
227 | 0 | { |
228 | 0 | if (paragraph_style && strcmp(paragraph_style, "pre")) |
229 | 0 | { |
230 | 0 | fz_write_printf(ctx, info->out, "<br/>\n"); |
231 | 0 | } |
232 | 0 | else |
233 | 0 | { |
234 | 0 | fz_write_printf(ctx, info->out, "\n"); |
235 | 0 | } |
236 | 0 | } |
237 | 0 | else if (fz_xml_is_tag(pos, "footnoteReference")) |
238 | 0 | { |
239 | 0 | show_footnote(ctx, pos, info); |
240 | | /* Do NOT go down, we've already dealt with that. */ |
241 | 0 | } |
242 | 0 | else if (fz_xml_is_tag(pos, "tabs")) |
243 | 0 | { |
244 | | /* Don't walk through tabs, or we will hit lots of 'tab' entries and |
245 | | * output incorrect information. */ |
246 | 0 | } |
247 | 0 | else if (fz_xml_is_tag(pos, "pStyle")) |
248 | 0 | { |
249 | | /* Should prob fix fz_xml_*() to strip namespace prefix |
250 | | from attributes, to match what it does for tag names. |
251 | | */ |
252 | 0 | paragraph_style = fz_xml_att(pos, "w:val"); |
253 | 0 | if (paragraph_style) |
254 | 0 | { |
255 | 0 | if (!strcmp(paragraph_style, "BodyText")) |
256 | 0 | paragraph_style = NULL; |
257 | 0 | else if (!strcmp(paragraph_style, "Heading1")) |
258 | 0 | paragraph_style = "h1"; |
259 | 0 | else if (!strcmp(paragraph_style, "Heading2")) |
260 | 0 | paragraph_style = "h2"; |
261 | 0 | else if (!strcmp(paragraph_style, "Heading3")) |
262 | 0 | paragraph_style = "h3"; |
263 | 0 | else if (!strcmp(paragraph_style, "Heading4")) |
264 | 0 | paragraph_style = "h4"; |
265 | 0 | else if (!strcmp(paragraph_style, "Heading5")) |
266 | 0 | paragraph_style = "h5"; |
267 | 0 | else if (!strcmp(paragraph_style, "Heading6")) |
268 | 0 | paragraph_style = "h6"; |
269 | 0 | else if (!strcmp(paragraph_style, "SourceCode")) |
270 | 0 | paragraph_style = "pre"; |
271 | 0 | else |
272 | 0 | paragraph_style = NULL; |
273 | |
|
274 | 0 | if (paragraph_style) |
275 | 0 | fz_write_printf(ctx, info->out, "<%s>", paragraph_style); |
276 | 0 | } |
277 | 0 | } |
278 | 0 | else if (fz_xml_is_tag(pos, "rStyle")) |
279 | 0 | { |
280 | 0 | inline_style = fz_xml_att(pos, "w:val"); |
281 | 0 | if (inline_style) |
282 | 0 | { |
283 | 0 | if (!strcmp(inline_style, "VerbatimChar")) |
284 | 0 | inline_style = "tt"; |
285 | 0 | else |
286 | 0 | { |
287 | 0 | if (0) |
288 | 0 | fz_write_printf(ctx, info->out, "<!-- %s -->", inline_style); |
289 | 0 | inline_style = NULL; |
290 | 0 | } |
291 | 0 | if (inline_style) |
292 | 0 | fz_write_printf(ctx, info->out, "<%s>", inline_style); |
293 | 0 | } |
294 | 0 | } |
295 | 0 | else |
296 | 0 | { |
297 | 0 | fz_xml *down; |
298 | 0 | if (fz_xml_is_tag(pos, "lineBreak")) |
299 | 0 | { |
300 | 0 | fz_write_string(ctx, info->out, "\n"); |
301 | 0 | } |
302 | 0 | else if (fz_xml_is_tag(pos, "p")) |
303 | 0 | { |
304 | 0 | fz_write_string(ctx, info->out, "<p>"); |
305 | 0 | } |
306 | 0 | else if (fz_xml_is_tag(pos, "tab")) |
307 | 0 | { |
308 | 0 | fz_write_string(ctx, info->out, "\t"); |
309 | 0 | } |
310 | 0 | else if (do_pages && fz_xml_is_tag(pos, "lastRenderedPageBreak")) |
311 | 0 | { |
312 | 0 | if (info->page) |
313 | 0 | fz_write_string(ctx, info->out, "\n</div>\n"); |
314 | 0 | info->page++; |
315 | 0 | fz_write_printf(ctx, info->out, "<div id=\"page%d\">\n", info->page); |
316 | 0 | } |
317 | | /* Try to move down. */ |
318 | 0 | down = fz_xml_down(pos); |
319 | 0 | if (down) |
320 | 0 | { |
321 | | /* We can move down, easy! */ |
322 | 0 | pos = down; |
323 | 0 | continue; |
324 | 0 | } |
325 | 0 | } |
326 | | /* Try moving to next. */ |
327 | 0 | next = fz_xml_next(pos); |
328 | 0 | if (next) |
329 | 0 | { |
330 | | /* We can move to next, easy! */ |
331 | 0 | pos = next; |
332 | 0 | continue; |
333 | 0 | } |
334 | | |
335 | | /* If we can't go down, or next, pop up until we |
336 | | * find somewhere we can go next from. */ |
337 | 0 | while (1) |
338 | 0 | { |
339 | | /* OK. So move up. */ |
340 | 0 | pos = fz_xml_up(pos); |
341 | | /* Check for hitting the top. */ |
342 | 0 | if (pos == NULL) |
343 | 0 | break; |
344 | | /* We've returned to a node. See if it's a 'p'. */ |
345 | 0 | if (fz_xml_is_tag(pos, "p")) |
346 | 0 | { |
347 | 0 | if (paragraph_style) |
348 | 0 | { |
349 | 0 | fz_write_printf(ctx, info->out, "</%s>", paragraph_style); |
350 | 0 | paragraph_style = NULL; |
351 | 0 | } |
352 | 0 | fz_write_string(ctx, info->out, "</p>\n"); |
353 | 0 | } |
354 | 0 | else if (fz_xml_is_tag(pos, "r")) |
355 | 0 | { |
356 | | /* Seems to be pseudo-close for rStyle. */ |
357 | 0 | if (inline_style) |
358 | 0 | { |
359 | 0 | fz_write_printf(ctx, info->out, "</%s>", inline_style); |
360 | 0 | inline_style = NULL; |
361 | 0 | } |
362 | 0 | } |
363 | 0 | next = fz_xml_next(pos); |
364 | 0 | if (next) |
365 | 0 | { |
366 | 0 | pos = next; |
367 | 0 | break; |
368 | 0 | } |
369 | 0 | } |
370 | 0 | } |
371 | |
|
372 | 0 | if (do_pages && info->page) |
373 | 0 | fz_write_string(ctx, info->out, "\n</div>\n"); |
374 | 0 | } |
375 | | |
376 | | static void |
377 | | process_item(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info, int do_pages) |
378 | 0 | { |
379 | 0 | fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); |
380 | |
|
381 | 0 | fz_try(ctx) |
382 | 0 | process_doc_stream(ctx, xml, info, do_pages); |
383 | 0 | fz_always(ctx) |
384 | 0 | fz_drop_xml(ctx, xml); |
385 | 0 | fz_catch(ctx) |
386 | 0 | fz_rethrow(ctx); |
387 | 0 | } |
388 | | |
389 | | static void |
390 | | process_rootfile(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) |
391 | 0 | { |
392 | 0 | fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 0); |
393 | |
|
394 | 0 | fz_try(ctx) |
395 | 0 | { |
396 | | /* FIXME: Should really search for these just inside 'spine'. */ |
397 | 0 | fz_xml *pos = fz_xml_find_dfs(xml, "itemref", NULL, NULL); |
398 | 0 | while (pos) |
399 | 0 | { |
400 | 0 | char *idref = fz_xml_att(pos, "idref"); |
401 | 0 | fz_xml *item = fz_xml_find_dfs(xml, "item", "id", idref); |
402 | 0 | while (item) |
403 | 0 | { |
404 | 0 | char *type = fz_xml_att(item, "media-type"); |
405 | 0 | char *href = fz_xml_att(item, "href"); |
406 | 0 | if (type && href && !strcmp(type, "application/xml")) |
407 | 0 | { |
408 | 0 | process_item(ctx, arch, href, info, 1); |
409 | 0 | } |
410 | 0 | item = fz_xml_find_next_dfs(pos, "item", "id", idref); |
411 | 0 | } |
412 | 0 | pos = fz_xml_find_next_dfs(pos, "itemref", NULL, NULL); |
413 | 0 | } |
414 | 0 | } |
415 | 0 | fz_always(ctx) |
416 | 0 | fz_drop_xml(ctx, xml); |
417 | 0 | fz_catch(ctx) |
418 | 0 | fz_rethrow(ctx); |
419 | 0 | } |
420 | | |
421 | | /* XLSX support */ |
422 | | static char * |
423 | | make_rel_name(fz_context *ctx, const char *file) |
424 | 0 | { |
425 | 0 | size_t z = strlen(file); |
426 | 0 | char *s = fz_malloc(ctx, z + 12); |
427 | 0 | char *t; |
428 | 0 | const char *p; |
429 | 0 | const char *slash = file; |
430 | |
|
431 | 0 | for (p = file; *p != 0; p++) |
432 | 0 | if (*p == '/') |
433 | 0 | slash = p+1; |
434 | |
|
435 | 0 | t = s; |
436 | 0 | if (slash != file) |
437 | 0 | { |
438 | 0 | memcpy(t, file, slash - file); |
439 | 0 | t += slash - file; |
440 | 0 | } |
441 | 0 | memcpy(t, "_rels/", 6); |
442 | 0 | t += 6; |
443 | 0 | memcpy(t, file + (slash - file), z - (slash - file)); |
444 | 0 | t += z - (slash - file); |
445 | 0 | memcpy(t, ".rels", 6); |
446 | |
|
447 | 0 | return s; |
448 | 0 | } |
449 | | |
450 | | static char *lookup_rel(fz_context *ctx, fz_xml *rels, const char *id) |
451 | 0 | { |
452 | 0 | fz_xml *pos; |
453 | |
|
454 | 0 | if (id == NULL) |
455 | 0 | return NULL; |
456 | | |
457 | 0 | pos = fz_xml_find_dfs(rels, "Relationship", NULL, NULL); |
458 | 0 | while (pos) |
459 | 0 | { |
460 | 0 | char *id2 = fz_xml_att(pos, "Id"); |
461 | |
|
462 | 0 | if (id2 && !strcmp(id, id2)) |
463 | 0 | return fz_xml_att(pos, "Target"); |
464 | | |
465 | 0 | pos = fz_xml_find_next_dfs(pos, "Relationship", NULL, NULL); |
466 | 0 | } |
467 | | |
468 | 0 | return NULL; |
469 | 0 | } |
470 | | |
471 | | static void |
472 | | send_cell_formatting(fz_context *ctx, doc_info *info) |
473 | 0 | { |
474 | 0 | if (info->col_signalled == 0) |
475 | 0 | { |
476 | 0 | fz_write_string(ctx, info->out, "<tr>\n"); |
477 | 0 | info->col_signalled = 1; |
478 | 0 | if (info->col_at > 1) |
479 | 0 | fz_write_string(ctx, info->out, "<td>"); |
480 | 0 | } |
481 | | |
482 | | /* Send the label */ |
483 | 0 | while (info->col_signalled < info->col_at) |
484 | 0 | { |
485 | 0 | fz_write_string(ctx, info->out, "</td>"); |
486 | 0 | info->col_signalled++; |
487 | 0 | if (info->col_signalled < info->col_at) |
488 | 0 | fz_write_string(ctx, info->out, "<td>"); |
489 | 0 | } |
490 | 0 | if (info->sheet_name && info->sheet_name[0]) |
491 | 0 | fz_write_printf(ctx, info->out, "<td id=\"%s!%s\">", info->sheet_name, info->label); |
492 | 0 | else |
493 | 0 | fz_write_printf(ctx, info->out, "<td id=\"%s\">", info->label); |
494 | 0 | } |
495 | | |
496 | | static void |
497 | | show_shared_string(fz_context *ctx, fz_xml *v, doc_info *info) |
498 | 0 | { |
499 | 0 | const char *t = fz_xml_text(fz_xml_down(v)); |
500 | 0 | int n = fz_atoi(t); |
501 | |
|
502 | 0 | if (n < 0 || n >= info->shared_string_len) |
503 | 0 | return; |
504 | | |
505 | 0 | if (info->shared_strings[n] == NULL || |
506 | 0 | info->shared_strings[n][0] == 0) |
507 | 0 | return; |
508 | | |
509 | 0 | send_cell_formatting(ctx, info); |
510 | | /* Then send the strings. */ |
511 | 0 | doc_escape(ctx, info->out, info->shared_strings[n]); |
512 | 0 | } |
513 | | |
/*
	Convert the column part of a cell label to a 1-based column number.

	The leading run of capital letters is read as a bijective base-26
	number: "A" = 1, "Z" = 26, "AA" = 27, "ZZ" = 702, "AAA" = 703, and
	so on. Anything after the letters (normally the row number) is
	ignored.

	Returns 0 if label is NULL, does not start with 'A'..'Z', or names
	a column too large to represent in an int.
*/
static int
col_from_label(const char *label)
{
	/* Largest int, spelled without relying on <limits.h>. */
	const int int_max = (int)(~0u >> 1);
	int col = 0;

	/* If we can't read the column, return 0. */
	if (label == NULL || *label < 'A' || *label > 'Z')
		return 0;

	do
	{
		/* Each letter is a digit in the range 1..26. */
		int digit = *label++ - 'A' + 1;

		/* Reject labels that would overflow rather than invoke
		 * undefined behaviour on untrusted input. */
		if (col > (int_max - digit) / 26)
			return 0;

		col = 26 * col + digit;
	}
	while (*label >= 'A' && *label <= 'Z');

	return col;
}
543 | | |
544 | | static void |
545 | | show_cell_text(fz_context *ctx, fz_xml *top, doc_info *info) |
546 | 0 | { |
547 | 0 | fz_xml *pos = top; |
548 | 0 | fz_xml *next; |
549 | |
|
550 | 0 | while (pos) |
551 | 0 | { |
552 | 0 | char *text = fz_xml_text(pos); |
553 | |
|
554 | 0 | if (text) |
555 | 0 | { |
556 | 0 | send_cell_formatting(ctx, info); |
557 | 0 | doc_escape(ctx, info->out, text); |
558 | 0 | } |
559 | | |
560 | | /* Always try to move down. */ |
561 | 0 | next = fz_xml_down(pos); |
562 | 0 | if (next) |
563 | 0 | { |
564 | | /* We can move down, easy! */ |
565 | 0 | pos = next; |
566 | 0 | continue; |
567 | 0 | } |
568 | | |
569 | 0 | if (pos == top) |
570 | 0 | break; |
571 | | |
572 | | /* We can't move down, try moving to next. */ |
573 | 0 | next = fz_xml_next(pos); |
574 | 0 | if (next) |
575 | 0 | { |
576 | | /* We can move to next, easy! */ |
577 | 0 | pos = next; |
578 | 0 | continue; |
579 | 0 | } |
580 | | |
581 | | /* If we can't go down, or next, pop up until we |
582 | | * find somewhere we can go next from. */ |
583 | 0 | while (1) |
584 | 0 | { |
585 | | /* OK. So move up. */ |
586 | 0 | pos = fz_xml_up(pos); |
587 | | /* Check for hitting the top. */ |
588 | 0 | if (pos == top) |
589 | 0 | pos = NULL; |
590 | 0 | if (pos == NULL) |
591 | 0 | break; |
592 | 0 | next = fz_xml_next(pos); |
593 | 0 | if (next) |
594 | 0 | { |
595 | 0 | pos = next; |
596 | 0 | break; |
597 | 0 | } |
598 | 0 | } |
599 | 0 | } |
600 | 0 | } |
601 | | |
602 | | static void |
603 | | arrived_at_cell(fz_context *ctx, doc_info *info, const char *label) |
604 | 0 | { |
605 | 0 | int col; |
606 | | |
607 | | /* If we have a label queued, and no label is given here, then we're |
608 | | * processing a 'cell' callback after having had a 'cellname' |
609 | | * callback. So don't signal it twice! */ |
610 | 0 | if (label == NULL && info->label) |
611 | 0 | return; |
612 | | |
613 | 0 | col = label ? col_from_label(label) : 0; |
614 | |
|
615 | 0 | fz_free(ctx, info->label); |
616 | 0 | info->label = NULL; |
617 | 0 | info->label = label ? fz_strdup(ctx, label) : NULL; |
618 | 0 | info->col_at = col; |
619 | 0 | } |
620 | | |
621 | | static void |
622 | | show_cell(fz_context *ctx, fz_xml *cell, doc_info *info) |
623 | 0 | { |
624 | 0 | char *t = fz_xml_att(cell, "t"); |
625 | 0 | fz_xml *v = fz_xml_find_down(cell, "v"); |
626 | 0 | const char *r = fz_xml_att(cell, "r"); |
627 | |
|
628 | 0 | arrived_at_cell(ctx, info, r); |
629 | 0 | if (t && t[0] == 's' && t[1] == 0) |
630 | 0 | show_shared_string(ctx, v, info); |
631 | 0 | else |
632 | 0 | show_cell_text(ctx, v, info); |
633 | 0 | } |
634 | | |
635 | | static void |
636 | | new_row(fz_context *ctx, doc_info *info) |
637 | 0 | { |
638 | 0 | if (info->col_signalled) |
639 | 0 | { |
640 | | /* We've sent at least one cell. So need to close the |
641 | | * td and tr */ |
642 | 0 | fz_write_string(ctx, info->out, "</td>\n</tr>\n"); |
643 | 0 | } |
644 | 0 | else |
645 | 0 | { |
646 | | /* We've not sent anything for this row. Keep the counts |
647 | | * correct. */ |
648 | 0 | fz_write_string(ctx, info->out, "<tr></tr>\n"); |
649 | 0 | } |
650 | 0 | info->col_at = 1; |
651 | 0 | info->col_signalled = 0; |
652 | 0 | fz_free(ctx, info->label); |
653 | 0 | info->label = NULL; |
654 | 0 | } |
655 | | |
656 | | static void |
657 | | process_sheet(fz_context *ctx, fz_archive *arch, const char *name, const char *file, doc_info *info) |
658 | 0 | { |
659 | 0 | fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); |
660 | |
|
661 | | #ifdef DEBUG_OFFICE_TO_HTML |
662 | | fz_write_printf(ctx, fz_stddbg(ctx), "process_sheet:\n"); |
663 | | fz_output_xml(ctx, fz_stddbg(ctx), xml, 0); |
664 | | #endif |
665 | |
|
666 | 0 | fz_write_printf(ctx, info->out, "<table id=\"%s\">\n", name); |
667 | |
|
668 | 0 | info->sheet_name = name; |
669 | 0 | info->col_at = 0; |
670 | 0 | info->col_signalled = 0; |
671 | |
|
672 | 0 | fz_try(ctx) |
673 | 0 | { |
674 | 0 | fz_xml *pos = xml; |
675 | 0 | fz_xml *next; |
676 | |
|
677 | 0 | while (pos) |
678 | 0 | { |
679 | | /* When we arrive on a node, check if it's a cell. */ |
680 | 0 | if (fz_xml_is_tag(pos, "c")) |
681 | 0 | { |
682 | 0 | show_cell(ctx, pos, info); |
683 | | /* Do NOT go down, we've already dealt with that. */ |
684 | 0 | } |
685 | 0 | else |
686 | 0 | { |
687 | | /* Try to move down. */ |
688 | 0 | next = fz_xml_down(pos); |
689 | 0 | if (next) |
690 | 0 | { |
691 | | /* We can move down, easy! */ |
692 | 0 | pos = next; |
693 | 0 | continue; |
694 | 0 | } |
695 | 0 | } |
696 | | /* Try moving to next. */ |
697 | 0 | next = fz_xml_next(pos); |
698 | 0 | if (next) |
699 | 0 | { |
700 | | /* We can move to next, easy! */ |
701 | 0 | pos = next; |
702 | 0 | continue; |
703 | 0 | } |
704 | | |
705 | | /* If we can't go down, or next, pop up until we |
706 | | * find somewhere we can go next from. */ |
707 | 0 | while (1) |
708 | 0 | { |
709 | | /* OK. So move up. */ |
710 | 0 | pos = fz_xml_up(pos); |
711 | | /* Check for hitting the top. */ |
712 | 0 | if (pos == NULL) |
713 | 0 | break; |
714 | | |
715 | | /* We've returned to a node. See if it's a 'row'. */ |
716 | 0 | if (fz_xml_is_tag(pos, "row")) |
717 | 0 | new_row(ctx, info); |
718 | |
|
719 | 0 | next = fz_xml_next(pos); |
720 | 0 | if (next) |
721 | 0 | { |
722 | 0 | pos = next; |
723 | 0 | break; |
724 | 0 | } |
725 | 0 | } |
726 | 0 | } |
727 | 0 | if (info->col_signalled) |
728 | 0 | fz_write_printf(ctx, info->out, "</td>\n</tr>\n"); |
729 | 0 | fz_write_printf(ctx, info->out, "</table>\n"); |
730 | 0 | } |
731 | 0 | fz_always(ctx) |
732 | 0 | fz_drop_xml(ctx, xml); |
733 | 0 | fz_catch(ctx) |
734 | 0 | fz_rethrow(ctx); |
735 | 0 | } |
736 | | |
737 | | static void |
738 | | process_slide(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) |
739 | 0 | { |
740 | 0 | fz_write_printf(ctx, info->out, "<div id=\"slide%d\">\n", info->page++); |
741 | 0 | process_item(ctx, arch, file, info, 0); |
742 | 0 | fz_write_printf(ctx, info->out, "</div>\n"); |
743 | 0 | } |
744 | | |
745 | | static char * |
746 | | make_absolute_path(fz_context *ctx, const char *abs, const char *rel) |
747 | 0 | { |
748 | 0 | const char *a = abs; |
749 | 0 | const char *aslash = a; |
750 | 0 | int up = 0; |
751 | 0 | size_t z1, z2; |
752 | 0 | char *s; |
753 | |
|
754 | 0 | if (rel == NULL) |
755 | 0 | return NULL; |
756 | 0 | if (abs == NULL || *rel == '/') |
757 | 0 | return fz_strdup(ctx, rel); |
758 | | |
759 | 0 | for (a = abs; *a != 0; a++) |
760 | 0 | if (*a == '/') |
761 | 0 | aslash = a+1; |
762 | |
|
763 | 0 | while (rel[0] == '.') |
764 | 0 | { |
765 | 0 | if (rel[1] == '/') |
766 | 0 | rel += 2; |
767 | 0 | else if (rel[1] == '.' && rel[2] == '/') |
768 | 0 | rel += 3, up++; |
769 | 0 | else |
770 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path"); |
771 | 0 | } |
772 | 0 | if (rel[0] == 0) |
773 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path"); |
774 | | |
775 | 0 | while (up) |
776 | 0 | { |
777 | 0 | while (aslash != abs && aslash[-1] != '/') |
778 | 0 | aslash--; |
779 | |
|
780 | 0 | up--; |
781 | 0 | } |
782 | |
|
783 | 0 | z1 = aslash - abs; |
784 | 0 | z2 = strlen(rel); |
785 | 0 | s = fz_malloc(ctx, z1 + z2 + 1); |
786 | 0 | if (z1) |
787 | 0 | memcpy(s, abs, z1); |
788 | 0 | memcpy(s+z1, rel, z2+1); |
789 | |
|
790 | 0 | return s; |
791 | 0 | } |
792 | | |
793 | | static char * |
794 | | collate_t_content(fz_context *ctx, fz_xml *top) |
795 | 0 | { |
796 | 0 | char *val = NULL; |
797 | 0 | fz_xml *next; |
798 | 0 | fz_xml *pos = fz_xml_down(top); |
799 | |
|
800 | 0 | while (pos != top) |
801 | 0 | { |
802 | | /* Capture all the 't' content. */ |
803 | 0 | if (fz_xml_is_tag(pos, "t")) |
804 | 0 | { |
805 | | /* Remember the content. */ |
806 | 0 | char *s = fz_xml_text(fz_xml_down(pos)); |
807 | |
|
808 | 0 | if (s == NULL) |
809 | 0 | { |
810 | | /* Do nothing */ |
811 | 0 | } |
812 | 0 | else if (val == NULL) |
813 | 0 | val = fz_strdup(ctx, s); |
814 | 0 | else |
815 | 0 | { |
816 | 0 | char *val2; |
817 | 0 | size_t z1 = strlen(val); |
818 | 0 | size_t z2 = strlen(s) + 1; |
819 | 0 | fz_try(ctx) |
820 | 0 | { |
821 | 0 | val2 = fz_malloc(ctx, z1 + z2); |
822 | 0 | } |
823 | 0 | fz_catch(ctx) |
824 | 0 | { |
825 | 0 | fz_free(ctx, val); |
826 | 0 | fz_rethrow(ctx); |
827 | 0 | } |
828 | 0 | memcpy(val2, val, z1); |
829 | 0 | memcpy(val2 + z1, s, z2); |
830 | 0 | fz_free(ctx, val); |
831 | 0 | val = val2; |
832 | 0 | } |
833 | | /* Do NOT go down, we've already dealt with that. */ |
834 | 0 | } |
835 | 0 | else if (fz_xml_is_tag(pos, "rPr") || fz_xml_is_tag(pos, "rPh")) |
836 | 0 | { |
837 | | /* We do not want the 't' content from within these. */ |
838 | 0 | } |
839 | 0 | else |
840 | 0 | { |
841 | | /* Try to move down. */ |
842 | 0 | next = fz_xml_down(pos); |
843 | 0 | if (next) |
844 | 0 | { |
845 | | /* We can move down, easy! */ |
846 | 0 | pos = next; |
847 | 0 | continue; |
848 | 0 | } |
849 | 0 | } |
850 | | /* Try moving to next. */ |
851 | 0 | next = fz_xml_next(pos); |
852 | 0 | if (next) |
853 | 0 | { |
854 | | /* We can move to next, easy! */ |
855 | 0 | pos = next; |
856 | 0 | continue; |
857 | 0 | } |
858 | | |
859 | | /* If we can't go down, or next, pop up until we |
860 | | * find somewhere we can go next from. */ |
861 | 0 | while (1) |
862 | 0 | { |
863 | | /* OK. So move up. */ |
864 | 0 | pos = fz_xml_up(pos); |
865 | | /* Check for hitting the top. */ |
866 | 0 | if (pos == top) |
867 | 0 | break; |
868 | 0 | next = fz_xml_next(pos); |
869 | 0 | if (next) |
870 | 0 | { |
871 | 0 | pos = next; |
872 | 0 | break; |
873 | 0 | } |
874 | 0 | } |
875 | 0 | } |
876 | | |
877 | 0 | return val; |
878 | 0 | } |
879 | | |
880 | | static fz_xml * |
881 | | try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white) |
882 | 0 | { |
883 | 0 | if (!fz_has_archive_entry(ctx, arch, filename)) |
884 | 0 | return NULL; |
885 | | |
886 | 0 | return fz_parse_xml_archive_entry(ctx, arch, filename, preserve_white); |
887 | 0 | } |
888 | | |
889 | | static void |
890 | | load_shared_strings(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file) |
891 | 0 | { |
892 | 0 | fz_xml *pos = fz_xml_find_dfs(rels, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"); |
893 | 0 | const char *ss_file = fz_xml_att(pos, "Target"); |
894 | 0 | char *resolved = NULL; |
895 | 0 | fz_xml *xml = NULL; |
896 | 0 | char *str = NULL; |
897 | |
|
898 | 0 | if (ss_file == NULL) |
899 | 0 | return; |
900 | | |
901 | 0 | fz_var(xml); |
902 | 0 | fz_var(str); |
903 | 0 | fz_var(resolved); |
904 | |
|
905 | 0 | fz_try(ctx) |
906 | 0 | { |
907 | 0 | fz_xml *pos; |
908 | |
|
909 | 0 | resolved = make_absolute_path(ctx, file, ss_file); |
910 | 0 | xml = fz_parse_xml_archive_entry(ctx, arch, resolved, 1); |
911 | |
|
912 | 0 | pos = fz_xml_find_dfs(xml, "si", NULL, NULL); |
913 | 0 | while (pos) |
914 | 0 | { |
915 | 0 | int n = info->shared_string_len; |
916 | 0 | str = collate_t_content(ctx, pos); |
917 | |
|
918 | 0 | if (n == info->shared_string_max) |
919 | 0 | { |
920 | 0 | int max = info->shared_string_max; |
921 | 0 | int newmax = max ? max * 2 : 1024; |
922 | 0 | char **arr = fz_realloc(ctx, info->shared_strings, sizeof(*arr) * newmax); |
923 | 0 | memset(&arr[max], 0, sizeof(*arr) * (newmax - max)); |
924 | 0 | info->shared_strings = arr; |
925 | 0 | info->shared_string_max = newmax; |
926 | 0 | } |
927 | |
|
928 | 0 | info->shared_strings[n] = str; |
929 | 0 | str = NULL; |
930 | 0 | info->shared_string_len++; |
931 | 0 | pos = fz_xml_find_next_dfs(pos, "si", NULL, NULL); |
932 | 0 | } |
933 | 0 | } |
934 | 0 | fz_always(ctx) |
935 | 0 | { |
936 | 0 | fz_drop_xml(ctx, xml); |
937 | 0 | fz_free(ctx, resolved); |
938 | 0 | fz_free(ctx, str); |
939 | 0 | } |
940 | 0 | fz_catch(ctx) |
941 | 0 | fz_rethrow(ctx); |
942 | 0 | } |
943 | | |
944 | | static void |
945 | | load_footnotes(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file) |
946 | 0 | { |
947 | 0 | char *resolved = NULL; |
948 | 0 | fz_xml *xml = NULL; |
949 | 0 | char *str = NULL; |
950 | |
|
951 | 0 | fz_var(xml); |
952 | 0 | fz_var(str); |
953 | 0 | fz_var(resolved); |
954 | |
|
955 | 0 | fz_try(ctx) |
956 | 0 | { |
957 | 0 | fz_xml *pos; |
958 | |
|
959 | 0 | resolved = make_absolute_path(ctx, file, "footnotes.xml"); |
960 | 0 | xml = try_parse_xml_archive_entry(ctx, arch, resolved, 1); |
961 | 0 | if (xml == NULL) |
962 | 0 | break; |
963 | | |
964 | 0 | pos = fz_xml_find_dfs(xml, "footnote", NULL, NULL); |
965 | 0 | while (pos) |
966 | 0 | { |
967 | 0 | int n = fz_atoi(fz_xml_att(pos, "w:id")); |
968 | |
|
969 | 0 | str = collate_t_content(ctx, pos); |
970 | |
|
971 | 0 | if (str && n >= 0) |
972 | 0 | { |
973 | 0 | if (n >= info->footnotes_max) |
974 | 0 | { |
975 | 0 | int max = info->footnotes_max; |
976 | 0 | int newmax = max ? max * 2 : 1024; |
977 | 0 | char **arr; |
978 | 0 | if (newmax < n) |
979 | 0 | newmax = n+1; |
980 | 0 | arr = fz_realloc(ctx, info->footnotes, sizeof(*arr) * newmax); |
981 | 0 | memset(&arr[max], 0, sizeof(*arr) * (newmax - max)); |
982 | 0 | info->footnotes = arr; |
983 | 0 | info->footnotes_max = newmax; |
984 | 0 | } |
985 | |
|
986 | 0 | info->footnotes[n] = str; |
987 | 0 | str = NULL; |
988 | 0 | } |
989 | 0 | pos = fz_xml_find_next_dfs(pos, "footnote", NULL, NULL); |
990 | 0 | } |
991 | 0 | } |
992 | 0 | fz_always(ctx) |
993 | 0 | { |
994 | 0 | fz_drop_xml(ctx, xml); |
995 | 0 | fz_free(ctx, resolved); |
996 | 0 | fz_free(ctx, str); |
997 | 0 | } |
998 | 0 | fz_catch(ctx) |
999 | 0 | fz_rethrow(ctx); |
1000 | 0 | } |
1001 | | |
1002 | | static void |
1003 | | process_office_document(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) |
1004 | 0 | { |
1005 | 0 | char *file_rels; |
1006 | 0 | fz_xml *xml = NULL; |
1007 | 0 | fz_xml *rels = NULL; |
1008 | 0 | char *resolved_rel = NULL; |
1009 | |
|
1010 | 0 | if (file == NULL) |
1011 | 0 | return; |
1012 | | |
1013 | 0 | file_rels = make_rel_name(ctx, file); |
1014 | |
|
1015 | 0 | fz_var(resolved_rel); |
1016 | |
|
1017 | 0 | fz_var(rels); |
1018 | 0 | fz_var(xml); |
1019 | |
|
1020 | 0 | fz_try(ctx) |
1021 | 0 | { |
1022 | 0 | fz_xml *pos; |
1023 | |
|
1024 | 0 | rels = fz_parse_xml_archive_entry(ctx, arch, file_rels, 0); |
1025 | 0 | xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); |
1026 | | |
1027 | | /* XLSX */ |
1028 | 0 | pos = fz_xml_find_dfs(xml, "sheet", NULL, NULL); |
1029 | 0 | if (pos) |
1030 | 0 | { |
1031 | 0 | load_shared_strings(ctx, arch, rels, info, file); |
1032 | 0 | while (pos) |
1033 | 0 | { |
1034 | 0 | char *name = fz_xml_att(pos, "name"); |
1035 | 0 | char *id = fz_xml_att(pos, "r:id"); |
1036 | 0 | char *sheet = lookup_rel(ctx, rels, id); |
1037 | |
|
1038 | 0 | if (sheet) |
1039 | 0 | { |
1040 | 0 | resolved_rel = make_absolute_path(ctx, file, sheet); |
1041 | 0 | process_sheet(ctx, arch, name, resolved_rel, info); |
1042 | 0 | fz_free(ctx, resolved_rel); |
1043 | 0 | resolved_rel = NULL; |
1044 | 0 | } |
1045 | 0 | pos = fz_xml_find_next_dfs(pos, "sheet", NULL, NULL); |
1046 | 0 | } |
1047 | 0 | break; |
1048 | 0 | } |
1049 | | |
1050 | | /* Let's try it as a powerpoint */ |
1051 | 0 | pos = fz_xml_find_dfs(xml, "sldId", NULL, NULL); |
1052 | 0 | if (pos) |
1053 | 0 | { |
1054 | 0 | while (pos) |
1055 | 0 | { |
1056 | 0 | char *id = fz_xml_att(pos, "r:id"); |
1057 | 0 | char *sheet = lookup_rel(ctx, rels, id); |
1058 | |
|
1059 | 0 | if (sheet) |
1060 | 0 | { |
1061 | 0 | resolved_rel = make_absolute_path(ctx, file, sheet); |
1062 | 0 | process_slide(ctx, arch, resolved_rel, info); |
1063 | 0 | fz_free(ctx, resolved_rel); |
1064 | 0 | resolved_rel = NULL; |
1065 | 0 | } |
1066 | 0 | pos = fz_xml_find_next_dfs(pos, "sldId", NULL, NULL); |
1067 | 0 | } |
1068 | 0 | break; |
1069 | 0 | } |
1070 | | |
1071 | | /* Let's try it as word. */ |
1072 | 0 | { |
1073 | 0 | load_footnotes(ctx, arch, rels, info, file); |
1074 | 0 | process_doc_stream(ctx, xml, info, 1); |
1075 | 0 | } |
1076 | 0 | } |
1077 | 0 | fz_always(ctx) |
1078 | 0 | { |
1079 | 0 | fz_drop_xml(ctx, xml); |
1080 | 0 | fz_drop_xml(ctx, rels); |
1081 | 0 | fz_free(ctx, resolved_rel); |
1082 | 0 | fz_free(ctx, file_rels); |
1083 | 0 | } |
1084 | 0 | fz_catch(ctx) |
1085 | 0 | fz_rethrow(ctx); |
1086 | 0 | } |
1087 | | |
1088 | | static void |
1089 | | process_office_document_properties(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) |
1090 | 0 | { |
1091 | 0 | fz_xml *xml = NULL; |
1092 | 0 | char *title; |
1093 | |
|
1094 | 0 | fz_var(xml); |
1095 | |
|
1096 | 0 | fz_try(ctx) |
1097 | 0 | { |
1098 | 0 | fz_xml *pos; |
1099 | |
|
1100 | 0 | xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); |
1101 | |
|
1102 | 0 | pos = fz_xml_find_dfs(xml, "title", NULL, NULL); |
1103 | 0 | title = fz_xml_text(fz_xml_down(pos)); |
1104 | 0 | if (title) |
1105 | 0 | { |
1106 | 0 | fz_write_string(ctx, info->out, "<title>"); |
1107 | 0 | doc_escape(ctx, info->out, title); |
1108 | 0 | fz_write_string(ctx, info->out, "</title>"); |
1109 | 0 | } |
1110 | 0 | } |
1111 | 0 | fz_always(ctx) |
1112 | 0 | { |
1113 | 0 | fz_drop_xml(ctx, xml); |
1114 | 0 | } |
1115 | 0 | fz_catch(ctx) |
1116 | 0 | fz_rethrow(ctx); |
1117 | 0 | } |
1118 | | |
1119 | | static fz_buffer * |
1120 | | fz_office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buffer_in, fz_archive *dir, const char *user_css, fz_office_to_html_opts *opts) |
1121 | 0 | { |
1122 | 0 | fz_stream *stream = NULL; |
1123 | 0 | fz_archive *archive = NULL; |
1124 | 0 | fz_buffer *buffer_out = NULL; |
1125 | 0 | fz_xml *xml = NULL; |
1126 | 0 | fz_xml *pos = NULL; |
1127 | 0 | fz_xml *rels = NULL; |
1128 | 0 | const char *schema = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; |
1129 | 0 | const char *schema_props = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"; |
1130 | 0 | doc_info info = { 0 }; |
1131 | 0 | int i; |
1132 | |
|
1133 | 0 | fz_var(archive); |
1134 | 0 | fz_var(stream); |
1135 | 0 | fz_var(buffer_out); |
1136 | 0 | fz_var(xml); |
1137 | 0 | fz_var(rels); |
1138 | |
|
1139 | 0 | if (opts) |
1140 | 0 | info.opts = *opts; |
1141 | |
|
1142 | 0 | fz_try(ctx) |
1143 | 0 | { |
1144 | 0 | if (buffer_in) |
1145 | 0 | { |
1146 | 0 | stream = fz_open_buffer(ctx, buffer_in); |
1147 | 0 | archive = fz_open_archive_with_stream(ctx, stream); |
1148 | 0 | } |
1149 | 0 | else |
1150 | 0 | archive = fz_keep_archive(ctx, dir); |
1151 | 0 | buffer_out = fz_new_buffer(ctx, 1024); |
1152 | 0 | info.out = fz_new_output_with_buffer(ctx, buffer_out); |
1153 | | |
1154 | | /* Is it an HWPX ?*/ |
1155 | 0 | xml = try_parse_xml_archive_entry(ctx, archive, "META-INF/container.xml", 0); |
1156 | 0 | if (xml) |
1157 | 0 | { |
1158 | 0 | pos = fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml"); |
1159 | 0 | if (!pos) |
1160 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not hwpx."); |
1161 | | |
1162 | 0 | while (pos) |
1163 | 0 | { |
1164 | 0 | const char *file = fz_xml_att(pos, "full-path"); |
1165 | 0 | process_rootfile(ctx, archive, file, &info); |
1166 | 0 | pos = fz_xml_find_next_dfs(pos, "rootfile", "media-type", "application/hwpml-package+xml"); |
1167 | 0 | } |
1168 | 0 | break; |
1169 | 0 | } |
1170 | | |
1171 | | /* Try other types */ |
1172 | 0 | { |
1173 | 0 | xml = try_parse_xml_archive_entry(ctx, archive, "_rels/.rels", 0); |
1174 | |
|
1175 | 0 | fz_write_string(ctx, info.out, "<html>\n"); |
1176 | |
|
1177 | 0 | pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema_props); |
1178 | 0 | if (pos) |
1179 | 0 | { |
1180 | 0 | const char *file = fz_xml_att(pos, "Target"); |
1181 | 0 | fz_write_string(ctx, info.out, "<head>\n"); |
1182 | 0 | process_office_document_properties(ctx, archive, file, &info); |
1183 | 0 | fz_write_string(ctx, info.out, "</head>\n"); |
1184 | 0 | } |
1185 | |
|
1186 | 0 | fz_write_string(ctx, info.out, "<body>\n"); |
1187 | 0 | pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema); |
1188 | 0 | if (!pos) |
1189 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not docx."); |
1190 | | |
1191 | 0 | while (pos) |
1192 | 0 | { |
1193 | 0 | const char *file = fz_xml_att(pos, "Target"); |
1194 | 0 | if (file) |
1195 | 0 | process_office_document(ctx, archive, file, &info); |
1196 | 0 | pos = fz_xml_find_next_dfs(pos, "Relationship", "Type", schema); |
1197 | 0 | } |
1198 | 0 | } |
1199 | | |
1200 | 0 | fz_close_output(ctx, info.out); |
1201 | 0 | } |
1202 | 0 | fz_always(ctx) |
1203 | 0 | { |
1204 | 0 | fz_drop_xml(ctx, rels); |
1205 | 0 | fz_drop_xml(ctx, xml); |
1206 | 0 | for (i = 0; i < info.shared_string_len; ++i) |
1207 | 0 | fz_free(ctx, info.shared_strings[i]); |
1208 | 0 | fz_free(ctx, info.shared_strings); |
1209 | 0 | for (i = 0; i < info.footnotes_max; ++i) |
1210 | 0 | fz_free(ctx, info.footnotes[i]); |
1211 | 0 | fz_free(ctx, info.footnotes); |
1212 | 0 | fz_drop_output(ctx, info.out); |
1213 | 0 | fz_drop_archive(ctx, archive); |
1214 | 0 | fz_drop_stream(ctx, stream); |
1215 | 0 | } |
1216 | 0 | fz_catch(ctx) |
1217 | 0 | { |
1218 | 0 | fz_drop_buffer(ctx, buffer_out); |
1219 | 0 | fz_rethrow(ctx); |
1220 | 0 | } |
1221 | | |
1222 | | #ifdef DEBUG_OFFICE_TO_HTML |
1223 | | { |
1224 | | unsigned char *storage; |
1225 | | size_t len = fz_buffer_storage(ctx, buffer_out, &storage); |
1226 | | fz_write_printf(ctx, fz_stddbg(ctx), "fz_office_to_html: Output buffer, len=%zd:\n", len); |
1227 | | fz_write_buffer(ctx, fz_stddbg(ctx), buffer_out); |
1228 | | } |
1229 | | #endif |
1230 | | |
1231 | 0 | return buffer_out; |
1232 | 0 | } |
1233 | | |
1234 | | /* Office document handler */ |
1235 | | |
1236 | | static fz_buffer * |
1237 | | office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css) |
1238 | 0 | { |
1239 | 0 | fz_office_to_html_opts opts = { 0 }; |
1240 | |
|
1241 | 0 | return fz_office_to_html(ctx, set, buf, zip, user_css, &opts); |
1242 | 0 | } |
1243 | | |
1244 | | static const fz_htdoc_format_t fz_htdoc_office = |
1245 | | { |
1246 | | "Office document", |
1247 | | office_to_html, |
1248 | | 0, 1, 0 |
1249 | | }; |
1250 | | |
1251 | | static fz_document * |
1252 | | office_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state) |
1253 | 0 | { |
1254 | 0 | return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_office); |
1255 | 0 | } |
1256 | | |
/* NULL-terminated list of file extensions registered with the office handler. */
static const char *office_extensions[] =
{
	"docx",
	"xlsx",
	"pptx",
	"hwpx",
	NULL
};
1265 | | |
/* NULL-terminated list of MIME types registered with the office handler. */
static const char *office_mimetypes[] =
{
	/* DOCX */
	"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	/* XLSX */
	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	/* PPTX */
	"application/vnd.openxmlformats-officedocument.presentationml.presentation",
	/* HWPX (two spellings) */
	"application/haansofthwpx",
	"application/vnd.hancom.hwpx",
	NULL
};
1279 | | |
1280 | | /* We are only ever 75% sure here, to allow a 'better' handler, such as sodochandler |
1281 | | * to override us by returning 100. */ |
1282 | | static int |
1283 | | office_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *zip, void **state, fz_document_recognize_state_free_fn **free_state) |
1284 | 13.5k | { |
1285 | 13.5k | fz_archive *arch = NULL; |
1286 | 13.5k | int ret = 0; |
1287 | 13.5k | fz_xml *xml = NULL; |
1288 | | |
1289 | 13.5k | if (state) |
1290 | 13.5k | *state = NULL; |
1291 | 13.5k | if (free_state) |
1292 | 13.5k | *free_state = NULL; |
1293 | | |
1294 | 13.5k | fz_var(arch); |
1295 | 13.5k | fz_var(ret); |
1296 | 13.5k | fz_var(xml); |
1297 | | |
1298 | 27.1k | fz_try(ctx) |
1299 | 27.1k | { |
1300 | 13.5k | if (stream) |
1301 | 13.5k | { |
1302 | 13.5k | arch = fz_try_open_archive_with_stream(ctx, stream); |
1303 | 13.5k | if (arch == NULL) |
1304 | 13.3k | break; |
1305 | 13.5k | } |
1306 | 0 | else |
1307 | 0 | arch = fz_keep_archive(ctx, zip); |
1308 | | |
1309 | 182 | xml = fz_try_parse_xml_archive_entry(ctx, arch, "META-INF/container.xml", 0); |
1310 | 182 | if (xml) |
1311 | 0 | { |
1312 | 0 | if (fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml")) |
1313 | 0 | ret = 75; /* HWPX */ |
1314 | 0 | break; |
1315 | 0 | } |
1316 | 182 | xml = fz_try_parse_xml_archive_entry(ctx, arch, "_rels/.rels", 0); |
1317 | 182 | if (xml) |
1318 | 12 | { |
1319 | 12 | if (fz_xml_find_dfs(xml, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument")) |
1320 | 12 | { |
1321 | 12 | ret = 75; /* DOCX | PPTX | XLSX */ |
1322 | 12 | } |
1323 | 12 | break; |
1324 | 12 | } |
1325 | 182 | } |
1326 | 27.1k | fz_always(ctx) |
1327 | 13.5k | { |
1328 | 13.5k | fz_drop_xml(ctx, xml); |
1329 | 13.5k | fz_drop_archive(ctx, arch); |
1330 | 13.5k | } |
1331 | 13.5k | fz_catch(ctx) |
1332 | 155 | fz_rethrow(ctx); |
1333 | | |
1334 | 13.4k | return ret; |
1335 | 13.5k | } |
1336 | | |
1337 | | fz_document_handler office_document_handler = |
1338 | | { |
1339 | | NULL, |
1340 | | office_open_document, |
1341 | | office_extensions, |
1342 | | office_mimetypes, |
1343 | | office_recognize_doc_content |
1344 | | }; |