Coverage Report

Created: 2025-12-03 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/html/txt.c
Line
Count
Source
1
// Copyright (C) 2023-2024 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/html.h"
25
26
/* Text encodings returned by detect_txt_encoding and switched on by fz_txt_buffer_to_html. */
enum { ENCODING_ASCII, ENCODING_UTF8, ENCODING_UTF8_BOM, ENCODING_UTF16_LE, ENCODING_UTF16_BE };
27
28
static int
29
detect_txt_encoding(fz_context *ctx, fz_buffer *buf)
30
0
{
31
0
  const uint8_t *d = buf->data;
32
0
  size_t len = buf->len;
33
0
  const uint8_t *end = buf->data + len;
34
0
  int count_tabs = 0;
35
0
  int count_hi = 0;
36
0
  int count_controls = 0;
37
0
  int plausibly_utf8 = 1;
38
39
  /* If we find a BOM, believe it. */
40
0
  if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xBF)
41
0
    return ENCODING_UTF8_BOM;
42
0
  else if (len >= 2 && d[0] == 0xff && d[1] == 0xfe)
43
0
    return ENCODING_UTF16_LE;
44
0
  else if (len >= 2 && d[0] == 0xfe && d[1] == 0xff)
45
0
    return ENCODING_UTF16_BE;
46
47
0
  while (d < end)
48
0
  {
49
0
    uint8_t c = *d++;
50
0
    if (c == 9)
51
0
      count_tabs++;
52
0
    else if (c == 12)
53
0
    {
54
      /* Form feed. Ignore that. */
55
0
    }
56
0
    else if (c == 10)
57
0
    {
58
0
      if (d < end && d[0] == 13)
59
0
        d++;
60
0
    }
61
0
    else if (c == 13)
62
0
    {
63
0
      if (d < end && d[0] == 10)
64
0
        d++;
65
0
    }
66
0
    else if (c < 32 || c == 0x7f)
67
0
      count_controls++;
68
0
    else if (c < 0x7f)
69
0
    {
70
      /* Reasonable ASCII value */
71
0
    }
72
0
    else
73
0
    {
74
0
      count_hi++;
75
0
      if ((c & 0xf8) == 0xF0)
76
0
      {
77
        /* Could be UTF8 with 3 following bytes */
78
0
        if (d+2 >= end ||
79
0
          (d[0] & 0xC0) != 0x80 ||
80
0
          (d[1] & 0xC0) != 0x80 ||
81
0
          (d[2] & 0xC0) != 0x80)
82
0
          plausibly_utf8 = 0;
83
0
        else
84
0
          d += 3;
85
0
      }
86
0
      else if ((c & 0xf0) == 0xE0)
87
0
      {
88
        /* Could be UTF8 with 2 following bytes */
89
0
        if (d+1 >= end ||
90
0
          (d[0] & 0xC0) != 0x80 ||
91
0
          (d[1] & 0xC0) != 0x80)
92
0
          plausibly_utf8 = 0;
93
0
        else
94
0
          d += 2;
95
0
      }
96
0
      else if ((c & 0xE0) == 0xC0)
97
0
      {
98
        /* Could be UTF8 with 1 following bytes */
99
0
        if (d+1 >= end ||
100
0
          (d[0] & 0xC0) != 0x80)
101
0
          plausibly_utf8 = 0;
102
0
        else
103
0
          d++;
104
0
      }
105
0
      else
106
0
        plausibly_utf8 = 0;
107
0
    }
108
0
  }
109
110
0
  (void)count_tabs;
111
0
  (void)count_hi;
112
0
  (void)count_controls;
113
114
0
  if (plausibly_utf8)
115
0
    return ENCODING_UTF8;
116
0
  return ENCODING_ASCII;
117
0
}
118
119
fz_buffer *
120
fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in)
121
0
{
122
0
  int encoding = detect_txt_encoding(ctx, in);
123
0
  fz_stream *stream = fz_open_buffer(ctx, in);
124
0
  fz_buffer *outbuf = NULL;
125
0
  fz_output *out = NULL;
126
0
  int col = 0;
127
128
0
  fz_var(outbuf);
129
0
  fz_var(out);
130
131
0
  fz_try(ctx)
132
0
  {
133
0
    outbuf = fz_new_buffer(ctx, 1024);
134
0
    out = fz_new_output_with_buffer(ctx, outbuf);
135
136
0
    fz_write_string(ctx, out, "<!doctype html><style>body{margin:0}pre{page-break-before:always;margin:0;white-space:pre-wrap;}</style><pre>");
137
138
0
    if (encoding == ENCODING_UTF16_LE || encoding == ENCODING_UTF16_BE)
139
0
    {
140
0
      fz_read_byte(ctx, stream);
141
0
      fz_read_byte(ctx, stream);
142
0
    }
143
0
    else if (encoding == ENCODING_UTF8_BOM)
144
0
    {
145
0
      fz_read_byte(ctx, stream);
146
0
      fz_read_byte(ctx, stream);
147
0
      fz_read_byte(ctx, stream);
148
0
    }
149
150
0
    while (!fz_is_eof(ctx, stream))
151
0
    {
152
0
      int c;
153
0
      switch (encoding)
154
0
      {
155
0
      default:
156
0
      case ENCODING_ASCII:
157
0
        c = fz_read_byte(ctx, stream);
158
0
        break;
159
0
      case ENCODING_UTF8:
160
0
      case ENCODING_UTF8_BOM:
161
0
        c = fz_read_rune(ctx, stream);
162
0
        break;
163
0
      case ENCODING_UTF16_LE:
164
0
        c = fz_read_utf16_le(ctx, stream);
165
0
        break;
166
0
      case ENCODING_UTF16_BE:
167
0
        c = fz_read_utf16_be(ctx, stream);
168
0
      }
169
170
0
      if (c == 10 || c == 13)
171
0
      {
172
0
        col = -1;
173
0
        fz_write_byte(ctx, out, c);
174
0
      }
175
0
      else if (c == 9)
176
0
      {
177
0
        int n = (8 - col) & 7;
178
0
        if (n == 0)
179
0
          n = 8;
180
0
        col += n-1;
181
0
        while (n--)
182
0
          fz_write_byte(ctx, out, ' ');
183
0
      }
184
0
      else if (c == 12)
185
0
      {
186
0
        col = -1;
187
0
        fz_write_string(ctx, out, "</pre><pre>\n");
188
0
      }
189
0
      else if (c == '<')
190
0
        fz_write_string(ctx, out, "&lt;");
191
0
      else if (c == '>')
192
0
        fz_write_string(ctx, out, "&gt;");
193
0
      else if (c == '"')
194
0
        fz_write_string(ctx, out, "&quot;");
195
0
      else
196
0
        fz_write_rune(ctx, out, c);
197
198
0
      ++col;
199
0
    }
200
201
0
    fz_close_output(ctx, out);
202
0
  }
203
0
  fz_always(ctx)
204
0
  {
205
0
    fz_drop_stream(ctx, stream);
206
0
    fz_drop_output(ctx, out);
207
0
  }
208
0
  fz_catch(ctx)
209
0
  {
210
0
    fz_drop_buffer(ctx, outbuf);
211
0
    fz_rethrow(ctx);
212
0
  }
213
214
0
  return outbuf;
215
0
}
216
217
static fz_buffer *
218
txt_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
219
0
{
220
0
  return fz_txt_buffer_to_html(ctx, buf);
221
0
}
222
223
/*
  Format descriptor handed to fz_htdoc_open_document_with_stream_and_dir
  by txt_open_document: a name string, the function that converts the
  input buffer to HTML, and three integer settings.
  NOTE(review): the meaning of the trailing 0, 1, 0 is fixed by the
  fz_htdoc_format_t declaration, which is not visible in this file --
  confirm against that declaration before changing them.
*/
static const fz_htdoc_format_t fz_htdoc_txt =
224
{
225
  "Text",
226
  txt_to_html,
227
  0, 1, 0
228
};
229
230
static fz_document *
231
txt_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
232
0
{
233
0
  return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_txt);
234
0
}
235
236
/* NULL-terminated list of file extensions, referenced from txt_document_handler. */
static const char *txt_extensions[] =
237
{
238
  "txt",
239
  "text",
240
  "log",
241
  NULL
242
};
243
244
/*
  NULL-terminated list of MIME types, referenced from txt_document_handler.
  "text/plain" is the registered media type for plain text. The older
  "text.plain" spelling is kept as well so that anything that matched it
  before still does.
*/
static const char *txt_mimetypes[] =
{
  "text/plain",
  "text.plain",
  NULL
};
249
250
/*
  Document handler for plain text files. The initializer supplies, in
  order: NULL, the open function, the extension list and the MIME type
  list defined in this file.
  NOTE(review): the leading NULL fills the first member of
  fz_document_handler, presumably an optional callback -- confirm
  against the struct declaration, which is not visible in this file.
*/
fz_document_handler txt_document_handler =
251
{
252
  NULL,
253
  txt_open_document,
254
  txt_extensions,
255
  txt_mimetypes
256
};