Coverage Report

Created: 2025-01-11 06:55

/src/mupdf/source/html/office.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2023-2024 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "html-imp.h"
25
26
#undef DEBUG_OFFICE_TO_HTML
27
28
/*
 * Options for the office-to-HTML conversion.
 *
 * Defaults are all 0's. FIXME: Very subject to change. Possibly might be
 * removed entirely.
 *
 * NOTE(review): none of these flags is read by the code visible in this
 * part of the file, so their exact effect cannot be documented here.
 */
typedef struct
{
	int output_page_numbers;
	int output_sheet_names;
	int output_cell_markers;
	int output_cell_row_markers;
	int output_cell_names;
	int output_formatting;
	int output_filenames;
	int output_errors;
} fz_office_to_html_opts;
41
42
typedef struct
43
{
44
  fz_office_to_html_opts opts;
45
46
  fz_output *out;
47
48
  int page;
49
50
  /* State for if we are parsing a sheet. */
51
  /* The last column label we have to send. */
52
  char *label;
53
  /* Columns are numbered from 1. */
54
  /* The column we are at. */
55
  int col_at;
56
  /* The column we last signalled. If this is 0, then we haven't
57
   * even started a row yet. */
58
  int col_signalled;
59
60
  /* If we are currently processing a spreadsheet, store the current
61
   * sheets name here. */
62
  const char *sheet_name;
63
64
  int shared_string_max;
65
  int shared_string_len;
66
  char **shared_strings;
67
68
  int footnotes_max;
69
  char **footnotes;
70
71
  char *title;
72
} doc_info;
73
74
static void
75
doc_escape(fz_context *ctx, fz_output *output, const char *str_)
76
0
{
77
0
  const unsigned char *str = (const unsigned char *)str_;
78
0
  int c;
79
80
0
  if (!str)
81
0
    return;
82
83
0
  while ((c = *str++) != 0)
84
0
  {
85
0
    if (c == '&')
86
0
    {
87
0
      fz_write_string(ctx, output, "&amp;");
88
0
    }
89
0
    else if (c == '<')
90
0
    {
91
0
      fz_write_string(ctx, output, "&lt;");
92
0
    }
93
0
    else if (c == '>')
94
0
    {
95
0
      fz_write_string(ctx, output, "&gt;");
96
0
    }
97
0
    else
98
0
    {
99
      /* We get utf-8 in, just parrot it out again. */
100
0
      fz_write_byte(ctx, output, c);
101
0
    }
102
0
  }
103
0
}
104
105
static void
106
show_text(fz_context *ctx, fz_xml *top, doc_info *info)
107
0
{
108
0
  fz_xml *pos = top;
109
0
  fz_xml *next;
110
111
0
  while (pos)
112
0
  {
113
0
    doc_escape(ctx, info->out, fz_xml_text(pos));
114
115
0
    if (fz_xml_is_tag(pos, "lineBreak"))
116
0
    {
117
0
      fz_write_string(ctx, info->out, "\n");
118
0
    }
119
0
    else if (fz_xml_is_tag(pos, "tab"))
120
0
    {
121
0
      fz_write_string(ctx, info->out, "\t");
122
0
    }
123
0
    else if (fz_xml_is_tag(pos, "lastRenderedPageBreak"))
124
0
    {
125
0
      info->page++;
126
0
    }
127
128
    /* Always try to move down. */
129
0
    next = fz_xml_down(pos);
130
0
    if (next)
131
0
    {
132
      /* We can move down, easy! */
133
0
      pos = next;
134
0
      continue;
135
0
    }
136
137
0
    if (pos == top)
138
0
      break;
139
140
    /* We can't move down, try moving to next. */
141
0
    next = fz_xml_next(pos);
142
0
    if (next)
143
0
    {
144
      /* We can move to next, easy! */
145
0
      pos = next;
146
0
      continue;
147
0
    }
148
149
    /* If we can't go down, or next, pop up until we
150
     * find somewhere we can go next from. */
151
0
    while (1)
152
0
    {
153
      /* OK. So move up. */
154
0
      pos = fz_xml_up(pos);
155
      /* Check for hitting the top. */
156
0
      if (pos == top)
157
0
        pos = NULL;
158
0
      if (pos == NULL)
159
0
        break;
160
      /* We've returned to a node. See if it's a 'p'. */
161
0
      if (fz_xml_is_tag(pos, "p"))
162
0
      {
163
0
        fz_write_string(ctx, info->out, "\n");
164
0
      }
165
0
      next = fz_xml_next(pos);
166
0
      if (next)
167
0
      {
168
0
        pos = next;
169
0
        break;
170
0
      }
171
0
    }
172
0
  }
173
0
}
174
175
static void
176
show_footnote(fz_context *ctx, fz_xml *v, doc_info *info)
177
0
{
178
0
  int n = fz_atoi(fz_xml_att(v, "w:id"));
179
180
0
  if (n < 0 || n >= info->footnotes_max)
181
0
    return;
182
183
0
  if (info->footnotes[n] == NULL ||
184
0
    info->footnotes[n][0] == 0)
185
0
    return;
186
187
  /* Then send the strings. */
188
0
  doc_escape(ctx, info->out, info->footnotes[n]);
189
0
}
190
191
static void
192
process_doc_stream(fz_context *ctx, fz_xml *xml, doc_info *info, int do_pages)
193
0
{
194
0
  fz_xml *pos;
195
0
  fz_xml *next;
196
0
  const char *paragraph_style = NULL;
197
0
  const char *inline_style = NULL;
198
199
#ifdef DEBUG_OFFICE_TO_HTML
200
  fz_write_printf(ctx, fz_stddbg(ctx), "process_doc_stream:\n");
201
  fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
202
#endif
203
204
  /* First off, see if we can do page numbers. */
205
0
  if (do_pages)
206
0
  {
207
0
    pos = fz_xml_find_dfs(xml, "lastRenderedPageBreak", NULL, NULL);
208
0
    if (pos)
209
0
    {
210
      /* We *can* do page numbers, so start here. */
211
0
      fz_write_string(ctx, info->out, "<div id=\"page1\">\n");
212
0
      info->page = 1;
213
0
    }
214
0
  }
215
216
  /* Now walk the tree for real. */
217
0
  pos = xml;
218
0
  while (pos)
219
0
  {
220
    /* When we arrive on a node, check if it's a 't'. */
221
0
    if (fz_xml_is_tag(pos, "t"))
222
0
    {
223
0
      show_text(ctx, pos, info);
224
      /* Do NOT go down, we've already dealt with that. */
225
0
    }
226
0
    else if (fz_xml_is_tag(pos, "br"))
227
0
    {
228
0
      if (paragraph_style && strcmp(paragraph_style, "pre"))
229
0
      {
230
0
        fz_write_printf(ctx, info->out, "<br/>\n");
231
0
      }
232
0
      else
233
0
      {
234
0
        fz_write_printf(ctx, info->out, "\n");
235
0
      }
236
0
    }
237
0
    else if (fz_xml_is_tag(pos, "footnoteReference"))
238
0
    {
239
0
      show_footnote(ctx, pos, info);
240
      /* Do NOT go down, we've already dealt with that. */
241
0
    }
242
0
    else if (fz_xml_is_tag(pos, "tabs"))
243
0
    {
244
      /* Don't walk through tabs, or we will hit lots of 'tab' entries and
245
       * output incorrect information. */
246
0
    }
247
0
    else if (fz_xml_is_tag(pos, "pStyle"))
248
0
    {
249
      /* Should prob fix fz_xml_*() to strip namespace prefix
250
      from attributes, to match what it does for tag names.
251
      */
252
0
      paragraph_style = fz_xml_att(pos, "w:val");
253
0
      if (paragraph_style)
254
0
      {
255
0
        if (!strcmp(paragraph_style, "BodyText"))
256
0
          paragraph_style = NULL;
257
0
        else if (!strcmp(paragraph_style, "Heading1"))
258
0
          paragraph_style = "h1";
259
0
        else if (!strcmp(paragraph_style, "Heading2"))
260
0
          paragraph_style = "h2";
261
0
        else if (!strcmp(paragraph_style, "Heading3"))
262
0
          paragraph_style = "h3";
263
0
        else if (!strcmp(paragraph_style, "Heading4"))
264
0
          paragraph_style = "h4";
265
0
        else if (!strcmp(paragraph_style, "Heading5"))
266
0
          paragraph_style = "h5";
267
0
        else if (!strcmp(paragraph_style, "Heading6"))
268
0
          paragraph_style = "h6";
269
0
        else if (!strcmp(paragraph_style, "SourceCode"))
270
0
          paragraph_style = "pre";
271
0
        else
272
0
          paragraph_style = NULL;
273
274
0
        if (paragraph_style)
275
0
          fz_write_printf(ctx, info->out, "<%s>", paragraph_style);
276
0
      }
277
0
    }
278
0
    else if (fz_xml_is_tag(pos, "rStyle"))
279
0
    {
280
0
      inline_style = fz_xml_att(pos, "w:val");
281
0
      if (inline_style)
282
0
      {
283
0
        if (!strcmp(inline_style, "VerbatimChar"))
284
0
          inline_style = "tt";
285
0
        else
286
0
        {
287
0
          if (0)
288
0
            fz_write_printf(ctx, info->out, "<!-- %s -->", inline_style);
289
0
          inline_style = NULL;
290
0
        }
291
0
        if (inline_style)
292
0
          fz_write_printf(ctx, info->out, "<%s>", inline_style);
293
0
      }
294
0
    }
295
0
    else
296
0
    {
297
0
      fz_xml *down;
298
0
      if (fz_xml_is_tag(pos, "lineBreak"))
299
0
      {
300
0
        fz_write_string(ctx, info->out, "\n");
301
0
      }
302
0
      else if (fz_xml_is_tag(pos, "p"))
303
0
      {
304
0
        fz_write_string(ctx, info->out, "<p>");
305
0
      }
306
0
      else if (fz_xml_is_tag(pos, "tab"))
307
0
      {
308
0
        fz_write_string(ctx, info->out, "\t");
309
0
      }
310
0
      else if (do_pages && fz_xml_is_tag(pos, "lastRenderedPageBreak"))
311
0
      {
312
0
        if (info->page)
313
0
          fz_write_string(ctx, info->out, "\n</div>\n");
314
0
        info->page++;
315
0
        fz_write_printf(ctx, info->out, "<div id=\"page%d\">\n", info->page);
316
0
      }
317
      /* Try to move down. */
318
0
      down = fz_xml_down(pos);
319
0
      if (down)
320
0
      {
321
        /* We can move down, easy! */
322
0
        pos = down;
323
0
        continue;
324
0
      }
325
0
    }
326
    /* Try moving to next. */
327
0
    next = fz_xml_next(pos);
328
0
    if (next)
329
0
    {
330
      /* We can move to next, easy! */
331
0
      pos = next;
332
0
      continue;
333
0
    }
334
335
    /* If we can't go down, or next, pop up until we
336
     * find somewhere we can go next from. */
337
0
    while (1)
338
0
    {
339
      /* OK. So move up. */
340
0
      pos = fz_xml_up(pos);
341
      /* Check for hitting the top. */
342
0
      if (pos == NULL)
343
0
        break;
344
      /* We've returned to a node. See if it's a 'p'. */
345
0
      if (fz_xml_is_tag(pos, "p"))
346
0
      {
347
0
        if (paragraph_style)
348
0
        {
349
0
          fz_write_printf(ctx, info->out, "</%s>", paragraph_style);
350
0
          paragraph_style = NULL;
351
0
        }
352
0
        fz_write_string(ctx, info->out, "</p>\n");
353
0
      }
354
0
      else if (fz_xml_is_tag(pos, "r"))
355
0
      {
356
        /* Seems to be pseudo-close for rStyle. */
357
0
        if (inline_style)
358
0
        {
359
0
          fz_write_printf(ctx, info->out, "</%s>", inline_style);
360
0
          inline_style = NULL;
361
0
        }
362
0
      }
363
0
      next = fz_xml_next(pos);
364
0
      if (next)
365
0
      {
366
0
        pos = next;
367
0
        break;
368
0
      }
369
0
    }
370
0
  }
371
372
0
  if (do_pages && info->page)
373
0
    fz_write_string(ctx, info->out, "\n</div>\n");
374
0
}
375
376
static void
377
process_item(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info, int do_pages)
378
0
{
379
0
  fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
380
381
0
  fz_try(ctx)
382
0
    process_doc_stream(ctx, xml, info, do_pages);
383
0
  fz_always(ctx)
384
0
    fz_drop_xml(ctx, xml);
385
0
  fz_catch(ctx)
386
0
    fz_rethrow(ctx);
387
0
}
388
389
static void
390
process_rootfile(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
391
0
{
392
0
  fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 0);
393
394
0
  fz_try(ctx)
395
0
  {
396
    /* FIXME: Should really search for these just inside 'spine'. */
397
0
    fz_xml *pos = fz_xml_find_dfs(xml, "itemref", NULL, NULL);
398
0
    while (pos)
399
0
    {
400
0
      char *idref = fz_xml_att(pos, "idref");
401
0
      fz_xml *item = fz_xml_find_dfs(xml, "item", "id", idref);
402
0
      while (item)
403
0
      {
404
0
        char *type = fz_xml_att(item, "media-type");
405
0
        char *href = fz_xml_att(item, "href");
406
0
        if (type && href && !strcmp(type, "application/xml"))
407
0
        {
408
0
          process_item(ctx, arch, href, info, 1);
409
0
        }
410
0
        item = fz_xml_find_next_dfs(pos, "item", "id", idref);
411
0
      }
412
0
      pos = fz_xml_find_next_dfs(pos, "itemref", NULL, NULL);
413
0
    }
414
0
  }
415
0
  fz_always(ctx)
416
0
    fz_drop_xml(ctx, xml);
417
0
  fz_catch(ctx)
418
0
    fz_rethrow(ctx);
419
0
}
420
421
/* XLSX support */
422
static char *
423
make_rel_name(fz_context *ctx, const char *file)
424
0
{
425
0
  size_t z = strlen(file);
426
0
  char *s = fz_malloc(ctx, z + 12);
427
0
  char *t;
428
0
  const char *p;
429
0
  const char *slash = file;
430
431
0
  for (p = file; *p != 0; p++)
432
0
    if (*p == '/')
433
0
      slash = p+1;
434
435
0
  t = s;
436
0
  if (slash != file)
437
0
  {
438
0
    memcpy(t, file, slash - file);
439
0
    t += slash - file;
440
0
  }
441
0
  memcpy(t, "_rels/", 6);
442
0
  t += 6;
443
0
  memcpy(t, file + (slash - file), z - (slash - file));
444
0
  t += z - (slash - file);
445
0
  memcpy(t, ".rels", 6);
446
447
0
  return s;
448
0
}
449
450
/*
 * Find the "Relationship" node in 'rels' whose "Id" attribute equals
 * 'id' and return its "Target" attribute. Returns NULL if 'id' is NULL
 * or no relationship matches. The returned string is an attribute value
 * of the 'rels' tree, not a copy.
 */
static char *lookup_rel(fz_context *ctx, fz_xml *rels, const char *id)
{
	fz_xml *rel;

	if (id == NULL)
		return NULL;

	for (rel = fz_xml_find_dfs(rels, "Relationship", NULL, NULL);
		rel != NULL;
		rel = fz_xml_find_next_dfs(rel, "Relationship", NULL, NULL))
	{
		const char *candidate = fz_xml_att(rel, "Id");

		if (candidate != NULL && strcmp(id, candidate) == 0)
			return fz_xml_att(rel, "Target");
	}

	return NULL;
}
470
471
static void
472
send_cell_formatting(fz_context *ctx, doc_info *info)
473
0
{
474
0
  if (info->col_signalled == 0)
475
0
  {
476
0
    fz_write_string(ctx, info->out, "<tr>\n");
477
0
    info->col_signalled = 1;
478
0
    if (info->col_at > 1)
479
0
      fz_write_string(ctx, info->out, "<td>");
480
0
  }
481
482
  /* Send the label */
483
0
  while (info->col_signalled < info->col_at)
484
0
  {
485
0
    fz_write_string(ctx, info->out, "</td>");
486
0
    info->col_signalled++;
487
0
    if (info->col_signalled < info->col_at)
488
0
      fz_write_string(ctx, info->out, "<td>");
489
0
  }
490
0
  if (info->sheet_name && info->sheet_name[0])
491
0
    fz_write_printf(ctx, info->out, "<td id=\"%s!%s\">", info->sheet_name, info->label);
492
0
  else
493
0
    fz_write_printf(ctx, info->out, "<td id=\"%s\">", info->label);
494
0
}
495
496
static void
497
show_shared_string(fz_context *ctx, fz_xml *v, doc_info *info)
498
0
{
499
0
  const char *t = fz_xml_text(fz_xml_down(v));
500
0
  int n = fz_atoi(t);
501
502
0
  if (n < 0 || n >= info->shared_string_len)
503
0
    return;
504
505
0
  if (info->shared_strings[n] == NULL ||
506
0
    info->shared_strings[n][0] == 0)
507
0
    return;
508
509
0
  send_cell_formatting(ctx, info);
510
  /* Then send the strings. */
511
0
  doc_escape(ctx, info->out, info->shared_strings[n]);
512
0
}
513
514
/*
 * Convert the column letters at the start of a cell label such as "A1"
 * or "XFD12" into a column number, counting from 1.
 *
 * The letters form a bijective base-26 number:
 *
 *   A = 1 ... Z = 26, AA = 27 ... ZZ = 702, AAA = 703 ...
 *
 * so each further letter multiplies the running value by 26 and adds
 * the letter's own value (A = 1 ... Z = 26).
 *
 * Returns 0 if we can't read the column: label is NULL, does not start
 * with an upper case letter, or has so many letters that the column
 * number exceeds COL_LIMIT (which also keeps the arithmetic well away
 * from signed overflow on hostile input).
 */
static int
col_from_label(const char *label)
{
	enum { COL_LIMIT = 1 << 20 };
	int col = 0;

	/* If we can't read the column, return 0. */
	if (label == NULL || *label < 'A' || *label > 'Z')
		return 0;

	do
	{
		/* col <= COL_LIMIT here, so this cannot overflow. */
		col = 26 * col + (*label++ - 'A' + 1);
		if (col > COL_LIMIT)
			return 0;
	}
	while (*label >= 'A' && *label <= 'Z');

	return col;
}
543
544
static void
545
show_cell_text(fz_context *ctx, fz_xml *top, doc_info *info)
546
0
{
547
0
  fz_xml *pos = top;
548
0
  fz_xml *next;
549
550
0
  while (pos)
551
0
  {
552
0
    char *text = fz_xml_text(pos);
553
554
0
    if (text)
555
0
    {
556
0
      send_cell_formatting(ctx, info);
557
0
      doc_escape(ctx, info->out, text);
558
0
    }
559
560
    /* Always try to move down. */
561
0
    next = fz_xml_down(pos);
562
0
    if (next)
563
0
    {
564
      /* We can move down, easy! */
565
0
      pos = next;
566
0
      continue;
567
0
    }
568
569
0
    if (pos == top)
570
0
      break;
571
572
    /* We can't move down, try moving to next. */
573
0
    next = fz_xml_next(pos);
574
0
    if (next)
575
0
    {
576
      /* We can move to next, easy! */
577
0
      pos = next;
578
0
      continue;
579
0
    }
580
581
    /* If we can't go down, or next, pop up until we
582
     * find somewhere we can go next from. */
583
0
    while (1)
584
0
    {
585
      /* OK. So move up. */
586
0
      pos = fz_xml_up(pos);
587
      /* Check for hitting the top. */
588
0
      if (pos == top)
589
0
        pos = NULL;
590
0
      if (pos == NULL)
591
0
        break;
592
0
      next = fz_xml_next(pos);
593
0
      if (next)
594
0
      {
595
0
        pos = next;
596
0
        break;
597
0
      }
598
0
    }
599
0
  }
600
0
}
601
602
static void
603
arrived_at_cell(fz_context *ctx, doc_info *info, const char *label)
604
0
{
605
0
  int col;
606
607
  /* If we have a label queued, and no label is given here, then we're
608
   * processing a 'cell' callback after having had a 'cellname'
609
   * callback. So don't signal it twice! */
610
0
  if (label == NULL && info->label)
611
0
    return;
612
613
0
  col = label ? col_from_label(label) : 0;
614
615
0
  fz_free(ctx, info->label);
616
0
  info->label = NULL;
617
0
  info->label = label ? fz_strdup(ctx, label) : NULL;
618
0
  info->col_at = col;
619
0
}
620
621
static void
622
show_cell(fz_context *ctx, fz_xml *cell, doc_info *info)
623
0
{
624
0
  char *t = fz_xml_att(cell, "t");
625
0
  fz_xml *v = fz_xml_find_down(cell, "v");
626
0
  const char *r = fz_xml_att(cell, "r");
627
628
0
  arrived_at_cell(ctx, info, r);
629
0
  if (t && t[0] == 's' && t[1] == 0)
630
0
    show_shared_string(ctx, v, info);
631
0
  else
632
0
    show_cell_text(ctx, v, info);
633
0
}
634
635
static void
636
new_row(fz_context *ctx, doc_info *info)
637
0
{
638
0
  if (info->col_signalled)
639
0
  {
640
    /* We've sent at least one cell. So need to close the
641
     * td and tr */
642
0
    fz_write_string(ctx, info->out, "</td>\n</tr>\n");
643
0
  }
644
0
  else
645
0
  {
646
    /* We've not sent anything for this row. Keep the counts
647
     * correct. */
648
0
    fz_write_string(ctx, info->out, "<tr></tr>\n");
649
0
  }
650
0
  info->col_at = 1;
651
0
  info->col_signalled = 0;
652
0
  fz_free(ctx, info->label);
653
0
  info->label = NULL;
654
0
}
655
656
static void
657
process_sheet(fz_context *ctx, fz_archive *arch, const char *name, const char *file, doc_info *info)
658
0
{
659
0
  fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
660
661
#ifdef DEBUG_OFFICE_TO_HTML
662
  fz_write_printf(ctx, fz_stddbg(ctx), "process_sheet:\n");
663
  fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
664
#endif
665
666
0
  fz_write_printf(ctx, info->out, "<table id=\"%s\">\n", name);
667
668
0
  info->sheet_name = name;
669
0
  info->col_at = 0;
670
0
  info->col_signalled = 0;
671
672
0
  fz_try(ctx)
673
0
  {
674
0
    fz_xml *pos = xml;
675
0
    fz_xml *next;
676
677
0
    while (pos)
678
0
    {
679
      /* When we arrive on a node, check if it's a cell. */
680
0
      if (fz_xml_is_tag(pos, "c"))
681
0
      {
682
0
        show_cell(ctx, pos, info);
683
        /* Do NOT go down, we've already dealt with that. */
684
0
      }
685
0
      else
686
0
      {
687
        /* Try to move down. */
688
0
        next = fz_xml_down(pos);
689
0
        if (next)
690
0
        {
691
          /* We can move down, easy! */
692
0
          pos = next;
693
0
          continue;
694
0
        }
695
0
      }
696
      /* Try moving to next. */
697
0
      next = fz_xml_next(pos);
698
0
      if (next)
699
0
      {
700
        /* We can move to next, easy! */
701
0
        pos = next;
702
0
        continue;
703
0
      }
704
705
      /* If we can't go down, or next, pop up until we
706
       * find somewhere we can go next from. */
707
0
      while (1)
708
0
      {
709
        /* OK. So move up. */
710
0
        pos = fz_xml_up(pos);
711
        /* Check for hitting the top. */
712
0
        if (pos == NULL)
713
0
          break;
714
715
        /* We've returned to a node. See if it's a 'row'. */
716
0
        if (fz_xml_is_tag(pos, "row"))
717
0
          new_row(ctx, info);
718
719
0
        next = fz_xml_next(pos);
720
0
        if (next)
721
0
        {
722
0
          pos = next;
723
0
          break;
724
0
        }
725
0
      }
726
0
    }
727
0
    if (info->col_signalled)
728
0
      fz_write_printf(ctx, info->out, "</td>\n</tr>\n");
729
0
    fz_write_printf(ctx, info->out, "</table>\n");
730
0
  }
731
0
  fz_always(ctx)
732
0
    fz_drop_xml(ctx, xml);
733
0
  fz_catch(ctx)
734
0
    fz_rethrow(ctx);
735
0
}
736
737
static void
738
process_slide(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
739
0
{
740
0
  fz_write_printf(ctx, info->out, "<div id=\"slide%d\">\n", info->page++);
741
0
  process_item(ctx, arch, file, info, 0);
742
0
  fz_write_printf(ctx, info->out, "</div>\n");
743
0
}
744
745
static char *
746
make_absolute_path(fz_context *ctx, const char *abs, const char *rel)
747
0
{
748
0
  const char *a = abs;
749
0
  const char *aslash = a;
750
0
  int up = 0;
751
0
  size_t z1, z2;
752
0
  char *s;
753
754
0
  if (rel == NULL)
755
0
    return NULL;
756
0
  if (abs == NULL || *rel == '/')
757
0
    return fz_strdup(ctx, rel);
758
759
0
  for (a = abs; *a != 0; a++)
760
0
    if (*a == '/')
761
0
      aslash = a+1;
762
763
0
  while (rel[0] == '.')
764
0
  {
765
0
    if (rel[1] == '/')
766
0
      rel += 2;
767
0
    else if (rel[1] == '.' && rel[2] == '/')
768
0
      rel += 3, up++;
769
0
    else
770
0
      fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
771
0
  }
772
0
  if (rel[0] == 0)
773
0
    fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
774
775
0
  while (up)
776
0
  {
777
0
    while (aslash != abs && aslash[-1] != '/')
778
0
      aslash--;
779
780
0
    up--;
781
0
  }
782
783
0
  z1 = aslash - abs;
784
0
  z2 = strlen(rel);
785
0
  s = fz_malloc(ctx, z1 + z2 + 1);
786
0
  if (z1)
787
0
    memcpy(s, abs, z1);
788
0
  memcpy(s+z1, rel, z2+1);
789
790
0
  return s;
791
0
}
792
793
static char *
794
collate_t_content(fz_context *ctx, fz_xml *top)
795
0
{
796
0
  char *val = NULL;
797
0
  fz_xml *next;
798
0
  fz_xml *pos = fz_xml_down(top);
799
800
0
  while (pos != top)
801
0
  {
802
    /* Capture all the 't' content. */
803
0
    if (fz_xml_is_tag(pos, "t"))
804
0
    {
805
      /* Remember the content. */
806
0
      char *s = fz_xml_text(fz_xml_down(pos));
807
808
0
      if (s == NULL)
809
0
      {
810
        /* Do nothing */
811
0
      }
812
0
      else if (val == NULL)
813
0
        val = fz_strdup(ctx, s);
814
0
      else
815
0
      {
816
0
        char *val2;
817
0
        size_t z1 = strlen(val);
818
0
        size_t z2 = strlen(s) + 1;
819
0
        fz_try(ctx)
820
0
        {
821
0
          val2 = fz_malloc(ctx, z1 + z2);
822
0
        }
823
0
        fz_catch(ctx)
824
0
        {
825
0
          fz_free(ctx, val);
826
0
          fz_rethrow(ctx);
827
0
        }
828
0
        memcpy(val2, val, z1);
829
0
        memcpy(val2 + z1, s, z2);
830
0
        fz_free(ctx, val);
831
0
        val = val2;
832
0
      }
833
      /* Do NOT go down, we've already dealt with that. */
834
0
    }
835
0
    else if (fz_xml_is_tag(pos, "rPr") || fz_xml_is_tag(pos, "rPh"))
836
0
    {
837
      /* We do not want the 't' content from within these. */
838
0
    }
839
0
    else
840
0
    {
841
      /* Try to move down. */
842
0
      next = fz_xml_down(pos);
843
0
      if (next)
844
0
      {
845
        /* We can move down, easy! */
846
0
        pos = next;
847
0
        continue;
848
0
      }
849
0
    }
850
    /* Try moving to next. */
851
0
    next = fz_xml_next(pos);
852
0
    if (next)
853
0
    {
854
      /* We can move to next, easy! */
855
0
      pos = next;
856
0
      continue;
857
0
    }
858
859
    /* If we can't go down, or next, pop up until we
860
     * find somewhere we can go next from. */
861
0
    while (1)
862
0
    {
863
      /* OK. So move up. */
864
0
      pos = fz_xml_up(pos);
865
      /* Check for hitting the top. */
866
0
      if (pos == top)
867
0
        break;
868
0
      next = fz_xml_next(pos);
869
0
      if (next)
870
0
      {
871
0
        pos = next;
872
0
        break;
873
0
      }
874
0
    }
875
0
  }
876
877
0
  return val;
878
0
}
879
880
static fz_xml *
881
try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
882
0
{
883
0
  if (!fz_has_archive_entry(ctx, arch, filename))
884
0
    return NULL;
885
886
0
  return fz_parse_xml_archive_entry(ctx, arch, filename, preserve_white);
887
0
}
888
889
static void
890
load_shared_strings(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
891
0
{
892
0
  fz_xml *pos = fz_xml_find_dfs(rels, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings");
893
0
  const char *ss_file = fz_xml_att(pos, "Target");
894
0
  char *resolved = NULL;
895
0
  fz_xml *xml = NULL;
896
0
  char *str = NULL;
897
898
0
  if (ss_file == NULL)
899
0
    return;
900
901
0
  fz_var(xml);
902
0
  fz_var(str);
903
0
  fz_var(resolved);
904
905
0
  fz_try(ctx)
906
0
  {
907
0
    fz_xml *pos;
908
909
0
    resolved = make_absolute_path(ctx, file, ss_file);
910
0
    xml = fz_parse_xml_archive_entry(ctx, arch, resolved, 1);
911
912
0
    pos = fz_xml_find_dfs(xml, "si", NULL, NULL);
913
0
    while (pos)
914
0
    {
915
0
      int n = info->shared_string_len;
916
0
      str = collate_t_content(ctx, pos);
917
918
0
      if (n == info->shared_string_max)
919
0
      {
920
0
        int max = info->shared_string_max;
921
0
        int newmax = max ? max * 2 : 1024;
922
0
        char **arr = fz_realloc(ctx, info->shared_strings, sizeof(*arr) * newmax);
923
0
        memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
924
0
        info->shared_strings = arr;
925
0
        info->shared_string_max = newmax;
926
0
      }
927
928
0
      info->shared_strings[n] = str;
929
0
      str = NULL;
930
0
      info->shared_string_len++;
931
0
      pos = fz_xml_find_next_dfs(pos, "si", NULL, NULL);
932
0
    }
933
0
  }
934
0
  fz_always(ctx)
935
0
  {
936
0
    fz_drop_xml(ctx, xml);
937
0
    fz_free(ctx, resolved);
938
0
    fz_free(ctx, str);
939
0
  }
940
0
  fz_catch(ctx)
941
0
    fz_rethrow(ctx);
942
0
}
943
944
static void
945
load_footnotes(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
946
0
{
947
0
  char *resolved = NULL;
948
0
  fz_xml *xml = NULL;
949
0
  char *str = NULL;
950
951
0
  fz_var(xml);
952
0
  fz_var(str);
953
0
  fz_var(resolved);
954
955
0
  fz_try(ctx)
956
0
  {
957
0
    fz_xml *pos;
958
959
0
    resolved = make_absolute_path(ctx, file, "footnotes.xml");
960
0
    xml = try_parse_xml_archive_entry(ctx, arch, resolved, 1);
961
0
    if (xml == NULL)
962
0
      break;
963
964
0
    pos = fz_xml_find_dfs(xml, "footnote", NULL, NULL);
965
0
    while (pos)
966
0
    {
967
0
      int n = fz_atoi(fz_xml_att(pos, "w:id"));
968
969
0
      str = collate_t_content(ctx, pos);
970
971
0
      if (str && n >= 0)
972
0
      {
973
0
        if (n >= info->footnotes_max)
974
0
        {
975
0
          int max = info->footnotes_max;
976
0
          int newmax = max ? max * 2 : 1024;
977
0
          char **arr;
978
0
          if (newmax < n)
979
0
            newmax = n+1;
980
0
          arr = fz_realloc(ctx, info->footnotes, sizeof(*arr) * newmax);
981
0
          memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
982
0
          info->footnotes = arr;
983
0
          info->footnotes_max = newmax;
984
0
        }
985
986
0
        info->footnotes[n] = str;
987
0
        str = NULL;
988
0
      }
989
0
      pos = fz_xml_find_next_dfs(pos, "footnote", NULL, NULL);
990
0
    }
991
0
  }
992
0
  fz_always(ctx)
993
0
  {
994
0
    fz_drop_xml(ctx, xml);
995
0
    fz_free(ctx, resolved);
996
0
    fz_free(ctx, str);
997
0
  }
998
0
  fz_catch(ctx)
999
0
    fz_rethrow(ctx);
1000
0
}
1001
1002
static void
1003
process_office_document(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
1004
0
{
1005
0
  char *file_rels;
1006
0
  fz_xml *xml = NULL;
1007
0
  fz_xml *rels = NULL;
1008
0
  char *resolved_rel = NULL;
1009
1010
0
  if (file == NULL)
1011
0
    return;
1012
1013
0
  file_rels = make_rel_name(ctx, file);
1014
1015
0
  fz_var(resolved_rel);
1016
1017
0
  fz_var(rels);
1018
0
  fz_var(xml);
1019
1020
0
  fz_try(ctx)
1021
0
  {
1022
0
    fz_xml *pos;
1023
1024
0
    rels = fz_parse_xml_archive_entry(ctx, arch, file_rels, 0);
1025
0
    xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
1026
1027
    /* XLSX */
1028
0
    pos = fz_xml_find_dfs(xml, "sheet", NULL, NULL);
1029
0
    if (pos)
1030
0
    {
1031
0
      load_shared_strings(ctx, arch, rels, info, file);
1032
0
      while (pos)
1033
0
      {
1034
0
        char *name = fz_xml_att(pos, "name");
1035
0
        char *id = fz_xml_att(pos, "r:id");
1036
0
        char *sheet = lookup_rel(ctx, rels, id);
1037
1038
0
        if (sheet)
1039
0
        {
1040
0
          resolved_rel = make_absolute_path(ctx, file, sheet);
1041
0
          process_sheet(ctx, arch, name, resolved_rel, info);
1042
0
          fz_free(ctx, resolved_rel);
1043
0
          resolved_rel = NULL;
1044
0
        }
1045
0
        pos = fz_xml_find_next_dfs(pos, "sheet", NULL, NULL);
1046
0
      }
1047
0
      break;
1048
0
    }
1049
1050
    /* Let's try it as a powerpoint */
1051
0
    pos = fz_xml_find_dfs(xml, "sldId", NULL, NULL);
1052
0
    if (pos)
1053
0
    {
1054
0
      while (pos)
1055
0
      {
1056
0
        char *id = fz_xml_att(pos, "r:id");
1057
0
        char *sheet = lookup_rel(ctx, rels, id);
1058
1059
0
        if (sheet)
1060
0
        {
1061
0
          resolved_rel = make_absolute_path(ctx, file, sheet);
1062
0
          process_slide(ctx, arch, resolved_rel, info);
1063
0
          fz_free(ctx, resolved_rel);
1064
0
          resolved_rel = NULL;
1065
0
        }
1066
0
        pos = fz_xml_find_next_dfs(pos, "sldId", NULL, NULL);
1067
0
      }
1068
0
      break;
1069
0
    }
1070
1071
    /* Let's try it as word. */
1072
0
    {
1073
0
      load_footnotes(ctx, arch, rels, info, file);
1074
0
      process_doc_stream(ctx, xml, info, 1);
1075
0
    }
1076
0
  }
1077
0
  fz_always(ctx)
1078
0
  {
1079
0
    fz_drop_xml(ctx, xml);
1080
0
    fz_drop_xml(ctx, rels);
1081
0
    fz_free(ctx, resolved_rel);
1082
0
    fz_free(ctx, file_rels);
1083
0
  }
1084
0
  fz_catch(ctx)
1085
0
    fz_rethrow(ctx);
1086
0
}
1087
1088
static void
1089
process_office_document_properties(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
1090
0
{
1091
0
  fz_xml *xml = NULL;
1092
0
  char *title;
1093
1094
0
  fz_var(xml);
1095
1096
0
  fz_try(ctx)
1097
0
  {
1098
0
    fz_xml *pos;
1099
1100
0
    xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
1101
1102
0
    pos = fz_xml_find_dfs(xml, "title", NULL, NULL);
1103
0
    title = fz_xml_text(fz_xml_down(pos));
1104
0
    if (title)
1105
0
    {
1106
0
      fz_write_string(ctx, info->out, "<title>");
1107
0
      doc_escape(ctx, info->out, title);
1108
0
      fz_write_string(ctx, info->out, "</title>");
1109
0
    }
1110
0
  }
1111
0
  fz_always(ctx)
1112
0
  {
1113
0
    fz_drop_xml(ctx, xml);
1114
0
  }
1115
0
  fz_catch(ctx)
1116
0
    fz_rethrow(ctx);
1117
0
}
1118
1119
static fz_buffer *
1120
fz_office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buffer_in, fz_archive *dir, const char *user_css, fz_office_to_html_opts *opts)
1121
0
{
1122
0
  fz_stream *stream = NULL;
1123
0
  fz_archive *archive = NULL;
1124
0
  fz_buffer *buffer_out = NULL;
1125
0
  fz_xml *xml = NULL;
1126
0
  fz_xml *pos = NULL;
1127
0
  fz_xml *rels = NULL;
1128
0
  const char *schema = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
1129
0
  const char *schema_props = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties";
1130
0
  doc_info info = { 0 };
1131
0
  int i;
1132
1133
0
  fz_var(archive);
1134
0
  fz_var(stream);
1135
0
  fz_var(buffer_out);
1136
0
  fz_var(xml);
1137
0
  fz_var(rels);
1138
1139
0
  if (opts)
1140
0
    info.opts = *opts;
1141
1142
0
  fz_try(ctx)
1143
0
  {
1144
0
    if (buffer_in)
1145
0
    {
1146
0
      stream = fz_open_buffer(ctx, buffer_in);
1147
0
      archive = fz_open_archive_with_stream(ctx, stream);
1148
0
    }
1149
0
    else
1150
0
      archive = fz_keep_archive(ctx, dir);
1151
0
    buffer_out = fz_new_buffer(ctx, 1024);
1152
0
    info.out = fz_new_output_with_buffer(ctx, buffer_out);
1153
1154
    /* Is it an HWPX ?*/
1155
0
    xml = try_parse_xml_archive_entry(ctx, archive, "META-INF/container.xml", 0);
1156
0
    if (xml)
1157
0
    {
1158
0
      pos = fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml");
1159
0
      if (!pos)
1160
0
        fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not hwpx.");
1161
1162
0
      while (pos)
1163
0
      {
1164
0
        const char *file = fz_xml_att(pos, "full-path");
1165
0
        process_rootfile(ctx, archive, file, &info);
1166
0
        pos = fz_xml_find_next_dfs(pos, "rootfile", "media-type", "application/hwpml-package+xml");
1167
0
      }
1168
0
      break;
1169
0
    }
1170
1171
    /* Try other types */
1172
0
    {
1173
0
      xml = try_parse_xml_archive_entry(ctx, archive, "_rels/.rels", 0);
1174
1175
0
      fz_write_string(ctx, info.out, "<html>\n");
1176
1177
0
      pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema_props);
1178
0
      if (pos)
1179
0
      {
1180
0
        const char *file = fz_xml_att(pos, "Target");
1181
0
        fz_write_string(ctx, info.out, "<head>\n");
1182
0
        process_office_document_properties(ctx, archive, file, &info);
1183
0
        fz_write_string(ctx, info.out, "</head>\n");
1184
0
      }
1185
1186
0
      fz_write_string(ctx, info.out, "<body>\n");
1187
0
      pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema);
1188
0
      if (!pos)
1189
0
        fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not docx.");
1190
1191
0
      while (pos)
1192
0
      {
1193
0
        const char *file = fz_xml_att(pos, "Target");
1194
0
        if (file)
1195
0
          process_office_document(ctx, archive, file, &info);
1196
0
        pos = fz_xml_find_next_dfs(pos, "Relationship", "Type", schema);
1197
0
      }
1198
0
    }
1199
1200
0
    fz_close_output(ctx, info.out);
1201
0
  }
1202
0
  fz_always(ctx)
1203
0
  {
1204
0
    fz_drop_xml(ctx, rels);
1205
0
    fz_drop_xml(ctx, xml);
1206
0
    for (i = 0; i < info.shared_string_len; ++i)
1207
0
      fz_free(ctx, info.shared_strings[i]);
1208
0
    fz_free(ctx, info.shared_strings);
1209
0
    for (i = 0; i < info.footnotes_max; ++i)
1210
0
      fz_free(ctx, info.footnotes[i]);
1211
0
    fz_free(ctx, info.footnotes);
1212
0
    fz_drop_output(ctx, info.out);
1213
0
    fz_drop_archive(ctx, archive);
1214
0
    fz_drop_stream(ctx, stream);
1215
0
  }
1216
0
  fz_catch(ctx)
1217
0
  {
1218
0
    fz_drop_buffer(ctx, buffer_out);
1219
0
    fz_rethrow(ctx);
1220
0
  }
1221
1222
#ifdef DEBUG_OFFICE_TO_HTML
1223
  {
1224
    unsigned char *storage;
1225
    size_t len = fz_buffer_storage(ctx, buffer_out, &storage);
1226
    fz_write_printf(ctx, fz_stddbg(ctx), "fz_office_to_html: Output buffer, len=%zd:\n", len);
1227
    fz_write_buffer(ctx, fz_stddbg(ctx), buffer_out);
1228
  }
1229
#endif
1230
1231
0
  return buffer_out;
1232
0
}
1233
1234
/* Office document handler */
1235
1236
static fz_buffer *
1237
office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
1238
0
{
1239
0
  fz_office_to_html_opts opts = { 0 };
1240
1241
0
  return fz_office_to_html(ctx, set, buf, zip, user_css, &opts);
1242
0
}
1243
1244
static const fz_htdoc_format_t fz_htdoc_office =
1245
{
1246
  "Office document",
1247
  office_to_html,
1248
  0, 1, 0
1249
};
1250
1251
static fz_document *
1252
office_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
1253
0
{
1254
0
  return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_office);
1255
0
}
1256
1257
/* NULL-terminated list of file extensions registered for the office handler. */
static const char *office_extensions[] =
{
  "docx", "xlsx", "pptx", "hwpx",
  NULL
};
1265
1266
/* NULL-terminated list of MIME types registered for the office handler. */
static const char *office_mimetypes[] =
{
  /* DOCX */
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

  /* XLSX */
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",

  /* PPTX */
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",

  /* HWPX (two spellings) */
  "application/haansofthwpx",
  "application/vnd.hancom.hwpx",

  NULL
};
1279
1280
/* We are only ever 75% sure here, to allow a 'better' handler, such as sodochandler
1281
 * to override us by returning 100. */
1282
static int
1283
office_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *zip, void **state, fz_document_recognize_state_free_fn **free_state)
1284
13.5k
{
1285
13.5k
  fz_archive *arch = NULL;
1286
13.5k
  int ret = 0;
1287
13.5k
  fz_xml *xml = NULL;
1288
1289
13.5k
  if (state)
1290
13.5k
    *state = NULL;
1291
13.5k
  if (free_state)
1292
13.5k
    *free_state = NULL;
1293
1294
13.5k
  fz_var(arch);
1295
13.5k
  fz_var(ret);
1296
13.5k
  fz_var(xml);
1297
1298
27.1k
  fz_try(ctx)
1299
27.1k
  {
1300
13.5k
    if (stream)
1301
13.5k
    {
1302
13.5k
      arch = fz_try_open_archive_with_stream(ctx, stream);
1303
13.5k
      if (arch == NULL)
1304
13.3k
        break;
1305
13.5k
    }
1306
0
    else
1307
0
      arch = fz_keep_archive(ctx, zip);
1308
1309
182
    xml = fz_try_parse_xml_archive_entry(ctx, arch, "META-INF/container.xml", 0);
1310
182
    if (xml)
1311
0
    {
1312
0
      if (fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml"))
1313
0
        ret = 75; /* HWPX */
1314
0
      break;
1315
0
    }
1316
182
    xml = fz_try_parse_xml_archive_entry(ctx, arch, "_rels/.rels", 0);
1317
182
    if (xml)
1318
12
    {
1319
12
      if (fz_xml_find_dfs(xml, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"))
1320
12
      {
1321
12
        ret = 75; /* DOCX | PPTX | XLSX */
1322
12
      }
1323
12
      break;
1324
12
    }
1325
182
  }
1326
27.1k
  fz_always(ctx)
1327
13.5k
  {
1328
13.5k
    fz_drop_xml(ctx, xml);
1329
13.5k
    fz_drop_archive(ctx, arch);
1330
13.5k
  }
1331
13.5k
  fz_catch(ctx)
1332
155
    fz_rethrow(ctx);
1333
1334
13.4k
  return ret;
1335
13.5k
}
1336
1337
fz_document_handler office_document_handler =
1338
{
1339
  NULL,
1340
  office_open_document,
1341
  office_extensions,
1342
  office_mimetypes,
1343
  office_recognize_doc_content
1344
};