/src/dovecot/src/lib-mail/message-decoder.c
Line | Count | Source |
1 | | /* Copyright (c) 2006-2018 Dovecot authors, see the included COPYING file */ |
2 | | |
3 | | #include "lib.h" |
4 | | #include "buffer.h" |
5 | | #include "base64.h" |
6 | | #include "str.h" |
7 | | #include "unichar.h" |
8 | | #include "charset-utf8.h" |
9 | | #include "qp-decoder.h" |
10 | | #include "rfc822-parser.h" |
11 | | #include "rfc2231-parser.h" |
12 | | #include "message-parser.h" |
13 | | #include "message-header-decode.h" |
14 | | #include "message-decoder.h" |
15 | | |
16 | | struct message_decoder_context { |
17 | | enum message_decoder_flags flags; |
18 | | normalizer_func_t *normalizer; |
19 | | struct message_part *prev_part; |
20 | | |
21 | | struct message_header_line hdr; |
22 | | buffer_t *buf, *buf2; |
23 | | |
24 | | char *charset_trans_charset; |
25 | | struct charset_translation *charset_trans; |
26 | | char translation_buf[CHARSET_MAX_PENDING_BUF_SIZE]; |
27 | | size_t translation_size; |
28 | | |
29 | | struct qp_decoder *qp; |
30 | | struct base64_decoder base64_decoder; |
31 | | |
32 | | char *content_type, *content_charset; |
33 | | enum message_cte message_cte; |
34 | | |
35 | | bool binary_input:1; |
36 | | }; |
37 | | |
/* Declared ahead of its definition, because
   message_decoder_set_return_binary() needs to call it. */
static void message_decode_body_init_charset(
	struct message_decoder_context *ctx, struct message_part *part);
41 | | |
42 | | struct message_decoder_context * |
43 | | message_decoder_init(normalizer_func_t *normalizer, |
44 | | enum message_decoder_flags flags) |
45 | 8.45k | { |
46 | 8.45k | struct message_decoder_context *ctx; |
47 | | |
48 | 8.45k | ctx = i_new(struct message_decoder_context, 1); |
49 | 8.45k | ctx->flags = flags; |
50 | 8.45k | ctx->normalizer = normalizer; |
51 | 8.45k | ctx->buf = buffer_create_dynamic(default_pool, 8192); |
52 | 8.45k | ctx->buf2 = buffer_create_dynamic(default_pool, 8192); |
53 | 8.45k | base64_decode_init(&ctx->base64_decoder, &base64_scheme, 0); |
54 | 8.45k | return ctx; |
55 | 8.45k | } |
56 | | |
57 | | void message_decoder_deinit(struct message_decoder_context **_ctx) |
58 | 8.45k | { |
59 | 8.45k | struct message_decoder_context *ctx = *_ctx; |
60 | | |
61 | 8.45k | *_ctx = NULL; |
62 | | |
63 | 8.45k | if (ctx->charset_trans != NULL) |
64 | 8.45k | charset_to_utf8_end(&ctx->charset_trans); |
65 | 8.45k | if (ctx->qp != NULL) |
66 | 609 | qp_decoder_deinit(&ctx->qp); |
67 | | |
68 | 8.45k | buffer_free(&ctx->buf); |
69 | 8.45k | buffer_free(&ctx->buf2); |
70 | 8.45k | i_free(ctx->charset_trans_charset); |
71 | 8.45k | i_free(ctx->content_type); |
72 | 8.45k | i_free(ctx->content_charset); |
73 | 8.45k | i_free(ctx); |
74 | 8.45k | } |
75 | | |
76 | | void message_decoder_set_return_binary(struct message_decoder_context *ctx, |
77 | | bool set) |
78 | 0 | { |
79 | 0 | if (set) |
80 | 0 | ctx->flags |= MESSAGE_DECODER_FLAG_RETURN_BINARY; |
81 | 0 | else |
82 | 0 | ctx->flags &= ENUM_NEGATE(MESSAGE_DECODER_FLAG_RETURN_BINARY); |
83 | 0 | message_decode_body_init_charset(ctx, ctx->prev_part); |
84 | 0 | } |
85 | | |
86 | | enum message_cte message_decoder_parse_cte(const struct message_header_line *hdr) |
87 | 13.5k | { |
88 | 13.5k | struct rfc822_parser_context parser; |
89 | 13.5k | enum message_cte message_cte; |
90 | 13.5k | string_t *value; |
91 | | |
92 | 13.5k | value = t_str_new(64); |
93 | 13.5k | rfc822_parser_init(&parser, hdr->full_value, hdr->full_value_len, NULL); |
94 | | |
95 | 13.5k | rfc822_skip_lwsp(&parser); |
96 | | |
97 | | /* Ensure we do not accidentically accept confused values like |
98 | | 'base64 binary' or embedded NULs */ |
99 | 13.5k | if (rfc822_parse_mime_token(&parser, value) == 1) { |
100 | 2.22k | rfc822_skip_lwsp(&parser); |
101 | | /* RFC 2045 does not permit parameters for CTE, |
102 | | but in case someone uses them, we accept |
103 | | parameter separator ';' to be lenient. */ |
104 | 2.22k | if (*parser.data != ';') |
105 | 943 | return MESSAGE_CTE_UNKNOWN; |
106 | 2.22k | } |
107 | | |
108 | 12.5k | message_cte = MESSAGE_CTE_UNKNOWN; |
109 | 12.5k | switch (str_len(value)) { |
110 | 490 | case 4: |
111 | 490 | if (i_memcasecmp(str_data(value), "7bit", 4) == 0 || |
112 | 490 | i_memcasecmp(str_data(value), "8bit", 4) == 0) |
113 | 0 | message_cte = MESSAGE_CTE_78BIT; |
114 | 490 | break; |
115 | 3.53k | case 6: |
116 | 3.53k | if (i_memcasecmp(str_data(value), "base64", 6) == 0) |
117 | 3.02k | message_cte = MESSAGE_CTE_BASE64; |
118 | 513 | else if (i_memcasecmp(str_data(value), "binary", 6) == 0) |
119 | 0 | message_cte = MESSAGE_CTE_BINARY; |
120 | 3.53k | break; |
121 | 5.06k | case 16: |
122 | 5.06k | if (i_memcasecmp(str_data(value), "quoted-printable", 16) == 0) |
123 | 4.62k | message_cte = MESSAGE_CTE_QP; |
124 | 5.06k | break; |
125 | 12.5k | } |
126 | 12.5k | rfc822_parser_deinit(&parser); |
127 | 12.5k | return message_cte; |
128 | 12.5k | } |
129 | | |
130 | | static void |
131 | | parse_content_type(struct message_decoder_context *ctx, |
132 | | struct message_header_line *hdr) |
133 | 39.4k | { |
134 | 39.4k | struct rfc822_parser_context parser; |
135 | 39.4k | const char *const *results; |
136 | 39.4k | string_t *str; |
137 | 39.4k | int ret; |
138 | | |
139 | 39.4k | if (ctx->content_type != NULL) |
140 | 1.11k | return; |
141 | | |
142 | 38.3k | rfc822_parser_init(&parser, hdr->full_value, hdr->full_value_len, NULL); |
143 | 38.3k | rfc822_skip_lwsp(&parser); |
144 | 38.3k | str = t_str_new(64); |
145 | 38.3k | ret = rfc822_parse_content_type(&parser, str); |
146 | 38.3k | ctx->content_type = i_strdup(str_c(str)); |
147 | 38.3k | if (ret < 0) { |
148 | 8.70k | rfc822_parser_deinit(&parser); |
149 | 8.70k | return; |
150 | 8.70k | } |
151 | | |
152 | 29.6k | rfc2231_parse(&parser, &results); |
153 | 69.5k | for (; *results != NULL; results += 2) { |
154 | 43.0k | if (strcasecmp(results[0], "charset") == 0) { |
155 | 3.17k | ctx->content_charset = i_strdup(results[1]); |
156 | 3.17k | break; |
157 | 3.17k | } |
158 | 43.0k | } |
159 | 29.6k | rfc822_parser_deinit(&parser); |
160 | 29.6k | } |
161 | | |
162 | | static bool message_decode_header(struct message_decoder_context *ctx, |
163 | | struct message_header_line *hdr, |
164 | | struct message_block *output) |
165 | 310k | { |
166 | 310k | size_t value_len; |
167 | | |
168 | 310k | if (hdr->continues) { |
169 | 27.2k | hdr->use_full_value = TRUE; |
170 | 27.2k | return FALSE; |
171 | 27.2k | } |
172 | | |
173 | 310k | T_BEGIN { |
174 | 283k | if (hdr->name_len == 12 && |
175 | 41.1k | strcasecmp(hdr->name, "Content-Type") == 0) |
176 | 39.4k | parse_content_type(ctx, hdr); |
177 | 283k | if (hdr->name_len == 25 && |
178 | 14.2k | strcasecmp(hdr->name, "Content-Transfer-Encoding") == 0) |
179 | 13.5k | ctx->message_cte = message_decoder_parse_cte(hdr); |
180 | 283k | } T_END; |
181 | | |
182 | 283k | buffer_set_used_size(ctx->buf, 0); |
183 | 283k | message_header_decode_utf8(hdr->full_value, hdr->full_value_len, |
184 | 283k | ctx->buf, ctx->normalizer); |
185 | 283k | value_len = ctx->buf->used; |
186 | | |
187 | 283k | if (ctx->normalizer != NULL) { |
188 | 283k | (void)ctx->normalizer(hdr->name, hdr->name_len, ctx->buf); |
189 | 283k | buffer_append_c(ctx->buf, '\0'); |
190 | 283k | } else { |
191 | 0 | if (!uni_utf8_get_valid_data((const unsigned char *)hdr->name, |
192 | 0 | hdr->name_len, ctx->buf)) |
193 | 0 | buffer_append_c(ctx->buf, '\0'); |
194 | 0 | } |
195 | | |
196 | 283k | ctx->hdr = *hdr; |
197 | 283k | ctx->hdr.full_value = ctx->buf->data; |
198 | 283k | ctx->hdr.full_value_len = value_len; |
199 | 283k | ctx->hdr.value_len = 0; |
200 | 283k | if (ctx->buf->used != value_len) { |
201 | 283k | ctx->hdr.name = CONST_PTR_OFFSET(ctx->buf->data, |
202 | 283k | ctx->hdr.full_value_len); |
203 | 283k | ctx->hdr.name_len = ctx->buf->used - 1 - value_len; |
204 | 283k | } |
205 | | |
206 | 283k | output->hdr = &ctx->hdr; |
207 | 283k | return TRUE; |
208 | 283k | } |
209 | | |
210 | | static void translation_buf_decode(struct message_decoder_context *ctx, |
211 | | const unsigned char **data, size_t *size) |
212 | 3.32k | { |
213 | 3.32k | unsigned char trans_buf[CHARSET_MAX_PENDING_BUF_SIZE+1]; |
214 | 3.32k | size_t data_wanted, skip; |
215 | 3.32k | size_t trans_size, orig_size; |
216 | | |
217 | | /* @UNSAFE: move the previously untranslated bytes to trans_buf |
218 | | and see if we have now enough data to get the next character |
219 | | translated */ |
220 | 3.32k | memcpy(trans_buf, ctx->translation_buf, ctx->translation_size); |
221 | 3.32k | data_wanted = sizeof(trans_buf) - ctx->translation_size; |
222 | 3.32k | if (data_wanted > *size) |
223 | 2.30k | data_wanted = *size; |
224 | 3.32k | memcpy(trans_buf + ctx->translation_size, *data, data_wanted); |
225 | | |
226 | 3.32k | orig_size = trans_size = ctx->translation_size + data_wanted; |
227 | 3.32k | (void)charset_to_utf8(ctx->charset_trans, trans_buf, |
228 | 3.32k | &trans_size, ctx->buf2); |
229 | | |
230 | 3.32k | if (trans_size <= ctx->translation_size) { |
231 | | /* need more data to finish the translation. */ |
232 | 666 | i_assert(orig_size < CHARSET_MAX_PENDING_BUF_SIZE); |
233 | 666 | memcpy(ctx->translation_buf, trans_buf, orig_size); |
234 | 666 | ctx->translation_size = orig_size; |
235 | 666 | *data += *size; |
236 | 666 | *size = 0; |
237 | 666 | return; |
238 | 666 | } |
239 | 2.65k | skip = trans_size - ctx->translation_size; |
240 | | |
241 | 2.65k | i_assert(*size >= skip); |
242 | 2.65k | *data += skip; |
243 | 2.65k | *size -= skip; |
244 | | |
245 | 2.65k | ctx->translation_size = 0; |
246 | 2.65k | } |
247 | | |
248 | | static void |
249 | | message_decode_body_init_charset(struct message_decoder_context *ctx, |
250 | | struct message_part *part) |
251 | 265k | { |
252 | 265k | ctx->binary_input = ctx->content_charset == NULL && |
253 | 262k | (ctx->flags & MESSAGE_DECODER_FLAG_RETURN_BINARY) != 0 && |
254 | 0 | (part->flags & (MESSAGE_PART_FLAG_TEXT | |
255 | 0 | MESSAGE_PART_FLAG_MESSAGE_RFC822)) == 0; |
256 | | |
257 | 265k | if (ctx->binary_input) |
258 | 0 | return; |
259 | | |
260 | 265k | if (ctx->charset_trans != NULL && ctx->content_charset != NULL && |
261 | 3.00k | strcasecmp(ctx->content_charset, ctx->charset_trans_charset) == 0) { |
262 | | /* already have the correct translation selected */ |
263 | 1.00k | charset_to_utf8_reset(ctx->charset_trans); |
264 | 1.00k | return; |
265 | 1.00k | } |
266 | | |
267 | 264k | if (ctx->charset_trans != NULL) |
268 | 256k | charset_to_utf8_end(&ctx->charset_trans); |
269 | 264k | i_free_and_null(ctx->charset_trans_charset); |
270 | | |
271 | 264k | ctx->charset_trans_charset = i_strdup(ctx->content_charset != NULL ? |
272 | 262k | ctx->content_charset : "UTF-8"); |
273 | 264k | if (charset_to_utf8_begin(ctx->charset_trans_charset, ctx->normalizer, |
274 | 264k | &ctx->charset_trans) < 0) |
275 | 1.51k | ctx->charset_trans = charset_utf8_to_utf8_begin(ctx->normalizer); |
276 | 264k | } |
277 | | |
278 | | static bool message_decode_body(struct message_decoder_context *ctx, |
279 | | struct message_block *input, |
280 | | struct message_block *output) |
281 | 26.1k | { |
282 | 26.1k | const unsigned char *data = NULL; |
283 | 26.1k | size_t pos, size = 0; |
284 | 26.1k | const char *error; |
285 | | |
286 | 26.1k | switch (ctx->message_cte) { |
287 | 3.69k | case MESSAGE_CTE_UNKNOWN: |
288 | | /* just skip this body */ |
289 | 3.69k | return FALSE; |
290 | | |
291 | 13.5k | case MESSAGE_CTE_78BIT: |
292 | 13.5k | case MESSAGE_CTE_BINARY: |
293 | 13.5k | data = input->data; |
294 | 13.5k | size = input->size; |
295 | 13.5k | break; |
296 | 4.02k | case MESSAGE_CTE_QP: { |
297 | 4.02k | buffer_set_used_size(ctx->buf, 0); |
298 | 4.02k | if (ctx->qp == NULL) |
299 | 609 | ctx->qp = qp_decoder_init(ctx->buf); |
300 | 4.02k | (void)qp_decoder_more(ctx->qp, input->data, input->size, |
301 | 4.02k | &pos, &error); |
302 | 4.02k | data = ctx->buf->data; |
303 | 4.02k | size = ctx->buf->used; |
304 | 4.02k | break; |
305 | 13.5k | } |
306 | 4.87k | case MESSAGE_CTE_BASE64: |
307 | 4.87k | buffer_set_used_size(ctx->buf, 0); |
308 | 4.87k | if (!base64_decode_is_finished(&ctx->base64_decoder)) { |
309 | 4.34k | if (base64_decode_more(&ctx->base64_decoder, |
310 | 4.34k | input->data, input->size, |
311 | 4.34k | &pos, ctx->buf) <= 0) { |
312 | | /* ignore the rest of the input in this |
313 | | MIME part */ |
314 | 1.62k | (void)base64_decode_finish(&ctx->base64_decoder); |
315 | 1.62k | } |
316 | 4.34k | } |
317 | 4.87k | data = ctx->buf->data; |
318 | 4.87k | size = ctx->buf->used; |
319 | 4.87k | break; |
320 | 26.1k | } |
321 | | |
322 | 22.4k | if (ctx->binary_input) { |
323 | 0 | output->data = data; |
324 | 0 | output->size = size; |
325 | 22.4k | } else { |
326 | 22.4k | buffer_set_used_size(ctx->buf2, 0); |
327 | 22.4k | if (ctx->translation_size != 0) |
328 | 3.32k | translation_buf_decode(ctx, &data, &size); |
329 | | |
330 | 22.4k | pos = size; |
331 | 22.4k | (void)charset_to_utf8(ctx->charset_trans, |
332 | 22.4k | data, &pos, ctx->buf2); |
333 | 22.4k | if (pos != size) { |
334 | 2.85k | ctx->translation_size = size - pos; |
335 | 2.85k | i_assert(ctx->translation_size <= |
336 | 2.85k | sizeof(ctx->translation_buf)); |
337 | 2.85k | memcpy(ctx->translation_buf, data + pos, |
338 | 2.85k | ctx->translation_size); |
339 | 2.85k | } |
340 | 22.4k | output->data = ctx->buf2->data; |
341 | 22.4k | output->size = ctx->buf2->used; |
342 | 22.4k | } |
343 | | |
344 | 22.4k | output->hdr = NULL; |
345 | 22.4k | return TRUE; |
346 | 22.4k | } |
347 | | |
348 | | bool message_decoder_decode_next_block(struct message_decoder_context *ctx, |
349 | | struct message_block *input, |
350 | | struct message_block *output) |
351 | 602k | { |
352 | 602k | if (input->part != ctx->prev_part) { |
353 | | /* MIME part changed. */ |
354 | 266k | message_decoder_decode_reset(ctx); |
355 | 266k | } |
356 | | |
357 | 602k | output->part = input->part; |
358 | 602k | ctx->prev_part = input->part; |
359 | | |
360 | 602k | if (input->hdr != NULL) { |
361 | 310k | output->size = 0; |
362 | 310k | return message_decode_header(ctx, input->hdr, output); |
363 | 310k | } else if (input->size != 0) |
364 | 26.1k | return message_decode_body(ctx, input, output); |
365 | 265k | else { |
366 | 265k | output->hdr = NULL; |
367 | 265k | output->size = 0; |
368 | 265k | message_decode_body_init_charset(ctx, input->part); |
369 | 265k | return TRUE; |
370 | 265k | } |
371 | 602k | } |
372 | | |
373 | | const char * |
374 | | message_decoder_current_content_type(struct message_decoder_context *ctx) |
375 | 0 | { |
376 | 0 | return ctx->content_type; |
377 | 0 | } |
378 | | |
379 | | void message_decoder_decode_reset(struct message_decoder_context *ctx) |
380 | 266k | { |
381 | 266k | const char *error; |
382 | | |
383 | 266k | base64_decode_reset(&ctx->base64_decoder); |
384 | | |
385 | 266k | if (ctx->qp != NULL) |
386 | 25.6k | (void)qp_decoder_finish(ctx->qp, &error); |
387 | 266k | i_free_and_null(ctx->content_type); |
388 | 266k | i_free_and_null(ctx->content_charset); |
389 | 266k | ctx->message_cte = MESSAGE_CTE_78BIT; |
390 | 266k | } |