/src/dovecot/src/lib-mail/message-header-parser.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (c) 2002-2018 Dovecot authors, see the included COPYING file */ |
2 | | |
3 | | #include "lib.h" |
4 | | #include "buffer.h" |
5 | | #include "istream.h" |
6 | | #include "str.h" |
7 | | #include "strfuncs.h" |
8 | | #include "unichar.h" |
9 | | #include "message-size.h" |
10 | | #include "message-header-parser.h" |
11 | | |
/* RFC 5322 2.1.1 and 2.2 */
/* Longest header name (in bytes, after dropping LWSP before ':') that is
   still returned as a name. Lines whose name is longer are returned with an
   empty name and the whole line as the value. */
#define MESSAGE_HEADER_NAME_MAX_LEN 1000
14 | | |
/* Parser state kept between message_parse_header_next() calls. */
struct message_header_parser_ctx {
	/* the line handed out to the caller through hdr_r; the same struct
	   is reused for every returned line */
	struct message_header_line line;

	/* stream being parsed; referenced at init, unreferenced at deinit */
	struct istream *input;
	/* optional (may be NULL): zeroed at init and then updated with the
	   line count and physical/virtual sizes of the parsed header */
	struct message_size *hdr_size;

	/* header name, a NUL, and then the "middle" bytes (what lies
	   between the name and the value) */
	string_t *name;
	/* copy of the current header's value; when the caller asks for
	   use_full_value the continuation lines are appended to it */
	buffer_t *value_buf;

	/* maximum number of value bytes saved for the whole header block */
	size_t header_block_max_size;
	/* value bytes saved for the headers preceding the current one */
	size_t header_block_total_size;

	enum message_header_parser_flags flags;
	/* when set, the line being read is dropped instead of returned.
	   NOTE(review): nothing visible in this file sets it - confirm
	   whether it is still needed */
	bool skip_line:1;
	/* a NUL byte was seen in the scanned header data */
	bool has_nuls:1;
};
31 | | |
32 | | struct message_header_parser_ctx * |
33 | | message_parse_header_init(struct istream *input, struct message_size *hdr_size, |
34 | | enum message_header_parser_flags flags) |
35 | 244k | { |
36 | 244k | struct message_header_parser_ctx *ctx; |
37 | | |
38 | 244k | ctx = i_new(struct message_header_parser_ctx, 1); |
39 | 244k | ctx->input = input; |
40 | 244k | ctx->hdr_size = hdr_size; |
41 | 244k | ctx->name = str_new(default_pool, 128); |
42 | 244k | ctx->flags = flags; |
43 | 244k | ctx->value_buf = buffer_create_dynamic(default_pool, 4096); |
44 | 244k | ctx->header_block_max_size = MESSAGE_HEADER_BLOCK_DEFAULT_MAX_SIZE; |
45 | 244k | i_stream_ref(input); |
46 | | |
47 | 244k | if (hdr_size != NULL) |
48 | 244k | i_zero(hdr_size); |
49 | 244k | return ctx; |
50 | 244k | } |
51 | | |
52 | | void |
53 | | message_parse_header_set_limit(struct message_header_parser_ctx *parser, |
54 | | size_t header_block_max_size) |
55 | 0 | { |
56 | 0 | parser->header_block_max_size = header_block_max_size; |
57 | 0 | } |
58 | | |
59 | | void |
60 | | message_parse_header_lower_limit(struct message_header_parser_ctx *parser, |
61 | | size_t header_block_max_size) |
62 | 659k | { |
63 | 659k | if (header_block_max_size < parser->header_block_max_size) |
64 | 0 | message_parse_header_set_limit(parser, header_block_max_size); |
65 | 659k | } |
66 | | |
67 | | void message_parse_header_deinit(struct message_header_parser_ctx **_ctx) |
68 | 244k | { |
69 | 244k | struct message_header_parser_ctx *ctx = *_ctx; |
70 | | |
71 | 244k | i_stream_unref(&ctx->input); |
72 | 244k | buffer_free(&ctx->value_buf); |
73 | 244k | str_free(&ctx->name); |
74 | 244k | i_free(ctx); |
75 | | |
76 | 244k | *_ctx = NULL; |
77 | 244k | } |
78 | | |
/* Read the next header line from ctx->input.

   Returns 1 and points *hdr_r to the parser's line struct when a line, a
   piece of an overlong line, or the end-of-headers marker (hdr->eoh set) is
   available. Returns 0 when the stream gave no more data but isn't at EOF
   yet, so the call should be retried later. Returns -1 when the end of
   headers was already returned by an earlier call, or when the stream read
   returned -1 with nothing left to return. *hdr_r is NULL unless 1 is
   returned.

   For the first line of a header the value is copied to ctx->value_buf and
   hdr->value points to that copy. For continued lines hdr->value points
   directly to the data returned by the stream read. If the caller set
   hdr->use_full_value on the previously returned line, the continuation is
   also appended to the saved value and exposed as hdr->full_value.

   When ctx->hdr_size is set, its lines/physical_size/virtual_size are
   updated for the consumed data; virtual_size is additionally incremented
   for each LF that had no preceding CR. */
int message_parse_header_next(struct message_header_parser_ctx *ctx,
			      struct message_header_line **hdr_r)
{
	struct message_header_line *line = &ctx->line;
	const unsigned char *msg;
	size_t i, size, startpos, colon_pos, parse_size, skip = 0;
	int ret;
	bool continued, continues, last_no_newline, last_crlf;
	bool no_newline, crlf_newline;

	*hdr_r = NULL;
	/* the end of headers was already returned */
	if (line->eoh)
		return -1;

	if (line->continues) {
		/* continuation of the previous line: there is no name to
		   find. any value other than UINT_MAX disables the ':'
		   search below. */
		colon_pos = 0;
	} else {
		/* new header line */
		line->name_offset = ctx->input->v_offset;
		/* UINT_MAX = ':' not found (yet) */
		colon_pos = UINT_MAX;
		/* count the previous header's saved value towards the header
		   block limit before the buffer is reused */
		ctx->header_block_total_size += ctx->value_buf->used;
		buffer_set_used_size(ctx->value_buf, 0);
	}

	no_newline = FALSE;
	crlf_newline = FALSE;
	continued = line->continues;
	continues = FALSE;

	/* startpos = number of bytes at the beginning of msg that were
	   already scanned without finding the end of the line. when the loop
	   is left with break, size is the length of the line's content and
	   skip is the number of bytes to consume from the stream. */
	for (startpos = 0;;) {
		ret = i_stream_read_bytes(ctx->input, &msg, &size, startpos+2);
		if (ret >= 0) {
			/* we want to know one byte in advance to find out
			   if it's multiline header */
			parse_size = size == 0 ? 0 : size-1;
		} else {
			/* no more data is coming to the buffer - everything
			   in it can be scanned */
			parse_size = size;
		}

		/* nothing new to scan since the previous iteration */
		if (ret <= 0 && startpos == parse_size) {
			if (ret == -1) {
				if (startpos > 0) {
					/* header ended unexpectedly. */
					no_newline = TRUE;
					skip = startpos;
					break;
				}
				/* error / EOF with no bytes */
				i_assert(skip == 0);
				return -1;
			}

			if (size > 0 && !ctx->skip_line && !continued &&
			    (msg[0] == '\n' ||
			     (msg[0] == '\r' && size > 1 && msg[1] == '\n'))) {
				/* end of headers - this mostly happens just
				   with mbox where headers are read separately
				   from body */
				size = 0;
				if (ctx->hdr_size != NULL)
					ctx->hdr_size->lines++;
				if (msg[0] == '\r') {
					skip = 2;
					crlf_newline = TRUE;
				} else {
					skip = 1;
					/* bare LF: the virtual size counts
					   the missing CR too */
					if (ctx->hdr_size != NULL)
						ctx->hdr_size->virtual_size++;
				}
				break;
			}
			if (ret == 0 && !ctx->input->eof) {
				/* stream is nonblocking - need more data */
				i_assert(skip == 0);
				return 0;
			}
			i_assert(size > 0);

			/* a) line is larger than input buffer
			   b) header ended unexpectedly */
			if (ret == -2) {
				/* go back to last LWSP if found. */
				/* on the first piece of a header don't go
				   back into the header name */
				size_t min_pos = !continued ? colon_pos : 0;
				for (i = size-1; i > min_pos; i--) {
					if (IS_LWSP(msg[i])) {
						size = i;
						break;
					}
				}
				if (i == min_pos && (msg[size-1] == '\r' ||
						     msg[size-1] == '\n')) {
					/* we may or may not have a full header,
					   but we don't know until we get the
					   next character. leave out the
					   linefeed and finish the header on
					   the next run. */
					size--;
					if (size > 0 && msg[size-1] == '\r')
						size--;
				}
				/* the buffer really has to be more than 2 to
				   avoid CRLF looping forever */
				i_assert(size > 0);

				/* the rest of this line is returned by the
				   following call(s) */
				continues = TRUE;
			}
			no_newline = TRUE;
			skip = size;
			break;
		}

		/* find ':' */
		if (colon_pos == UINT_MAX) {
			for (i = startpos; i < parse_size; i++) {
				/* fast path: ':', LF and NUL are all <= ':' */
				if (msg[i] > ':')
					continue;

				if (msg[i] == ':' && !ctx->skip_line) {
					colon_pos = i;
					line->full_value_offset =
						ctx->input->v_offset + i + 1;
					break;
				}
				if (msg[i] == '\n') {
					/* end of headers, or error */
					break;
				}

				if (msg[i] == '\0')
					ctx->has_nuls = TRUE;
			}
		} else {
			i = startpos;
		}

		/* find '\n' */
		for (; i < parse_size; i++) {
			/* fast path: LF and NUL are both <= LF */
			if (msg[i] <= '\n') {
				if (msg[i] == '\n')
					break;
				if (msg[i] == '\0')
					ctx->has_nuls = TRUE;
			}
		}

		if (i < parse_size && i+1 == size && ret == -2) {
			/* we don't know if the line continues. */
			/* count the LF as scanned; if the next read brings
			   nothing new, the startpos == parse_size branch
			   above takes over */
			i++;
		} else if (i < parse_size) {
			/* got a line */
			if (ctx->skip_line) {
				/* skipping a line with a huge header name */
				if (ctx->hdr_size != NULL) {
					ctx->hdr_size->lines++;
					ctx->hdr_size->physical_size += i + 1;
					ctx->hdr_size->virtual_size += i + 1;
				}
				if (i == 0 || msg[i-1] != '\r') {
					/* missing CR */
					if (ctx->hdr_size != NULL)
						ctx->hdr_size->virtual_size++;
				}

				/* drop the line and start over with the
				   next one */
				i_stream_skip(ctx->input, i + 1);
				startpos = 0;
				ctx->skip_line = FALSE;
				continue;
			}
			/* the header continues if the line after the LF
			   begins with LWSP */
			continues = i+1 < size && IS_LWSP(msg[i+1]);

			if (ctx->hdr_size != NULL)
				ctx->hdr_size->lines++;
			if (i == 0 || msg[i-1] != '\r') {
				/* missing CR */
				if (ctx->hdr_size != NULL)
					ctx->hdr_size->virtual_size++;
				size = i;
			} else {
				size = i-1;
				crlf_newline = TRUE;
			}

			skip = i+1;
			break;
		}

		/* no complete line yet - read more and continue scanning
		   from where this iteration stopped */
		startpos = i;
	}

	/* how the previously returned line ended. these must be read before
	   the line struct is updated below; they decide which newline gets
	   inserted when the full value is being joined. */
	last_crlf = line->crlf_newline &&
		(ctx->flags & MESSAGE_HEADER_PARSER_FLAG_DROP_CR) == 0;
	last_no_newline = line->no_newline ||
		(ctx->flags & MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE) != 0;

	line->continues = continues;
	line->continued = continued;
	line->crlf_newline = crlf_newline;
	line->no_newline = no_newline;
	if (size == 0 && !continued) {
		/* end of headers */
		line->eoh = TRUE;
		line->name_len = line->value_len = line->full_value_len = 0;
		line->name = ""; line->value = line->full_value = NULL;
		line->middle = NULL; line->middle_len = 0;
		line->full_value_offset = line->name_offset;
		line->continues = FALSE;
	} else if (line->continued) {
		/* name and middle are left as they were set for the first
		   line of this header */
		line->value = msg;
		line->value_len = size;
	} else if (colon_pos == UINT_MAX) {
		/* missing ':', assume the whole line is value */
		line->value = msg;
		line->value_len = size;
		line->full_value_offset = line->name_offset;

		line->name = "";
		line->name_len = 0;

		line->middle = uchar_empty_ptr;
		line->middle_len = 0;
	} else {
		size_t pos;

		line->value = msg + colon_pos+1;
		line->value_len = size - colon_pos - 1;
		if ((ctx->flags & MESSAGE_HEADER_PARSER_FLAG_SKIP_INITIAL_LWSP) != 0) {
			/* get value. skip all LWSP after ':'. Note that
			   RFC2822 doesn't say we should, but history behind
			   it..

			   Exception to this is if the value consists only of
			   LWSP, then skip only the one LWSP after ':'. */
			for (pos = 0; pos < line->value_len; pos++) {
				if (!IS_LWSP(line->value[pos]))
					break;
			}

			if (pos == line->value_len) {
				/* everything was LWSP */
				if (line->value_len > 0 &&
				    IS_LWSP(line->value[0]))
					pos = 1;
			}
		} else {
			/* skip only a single LWSP after ':' */
			pos = line->value_len > 0 &&
				IS_LWSP(line->value[0]) ? 1 : 0;
		}

		line->value += pos;
		line->value_len -= pos;
		line->full_value_offset += pos;

		/* get name, skip LWSP before ':' */
		while (colon_pos > 0 && IS_LWSP(msg[colon_pos-1]))
			colon_pos--;

		/* Treat overlong header names as if the full header line was
		   a value. Callers can usually handle large values better than
		   large names. */
		if (colon_pos > MESSAGE_HEADER_NAME_MAX_LEN) {
			line->name = "";
			line->name_len = 0;
			line->middle = uchar_empty_ptr;
			line->middle_len = 0;
			line->value = msg;
			line->value_len = size;
			line->full_value_offset = line->name_offset;
		} else {
			str_truncate(ctx->name, 0);
			/* use buffer_append() so the name won't be truncated if there
			   are NULs. */
			buffer_append(ctx->name, msg, colon_pos);
			str_append_c(ctx->name, '\0');

			/* keep middle stored also in ctx->name so it's available
			   with use_full_value */
			line->middle = msg + colon_pos;
			line->middle_len = (size_t)(line->value - line->middle);
			str_append_data(ctx->name, line->middle, line->middle_len);

			line->name = str_c(ctx->name);
			line->name_len = colon_pos;
			/* repoint middle to the copy that follows the name's
			   NUL in ctx->name */
			line->middle = str_data(ctx->name) + line->name_len + 1;
		}
	}

	/* enforce the header block limit: a value is never longer than the
	   limit, and only as many bytes are saved to value_buf as are still
	   left of the limit for this header block */
	line->value_len = I_MIN(line->value_len, ctx->header_block_max_size);
	size_t line_value_size = line->value_len;
	size_t header_total_used = ctx->header_block_total_size + ctx->value_buf->used;
	size_t line_available = ctx->header_block_max_size <= header_total_used ? 0 :
		ctx->header_block_max_size - header_total_used;
	line_value_size = I_MIN(line_value_size, line_available);

	if (!line->continued) {
		/* first header line. make a copy of the line since we can't
		   really trust input stream not to lose it. */
		buffer_append(ctx->value_buf, line->value, line_value_size);
		line->value = line->full_value = ctx->value_buf->data;
		line->full_value_len = line->value_len = line_value_size;
	} else if (line->use_full_value) {
		/* continue saving the full value. */
		if (last_no_newline) {
			/* line is longer than fit into our buffer, so we
			   were forced to break it into multiple
			   message_header_lines */
		} else if (line_value_size > 1) {
			/* re-insert the newline that separated the previous
			   line from this one */
			if (last_crlf && line_value_size > 2)
				buffer_append_c(ctx->value_buf, '\r');
			buffer_append_c(ctx->value_buf, '\n');
		}
		if ((ctx->flags & MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE) != 0 &&
		    line->value_len > 0 && line->value[0] != ' ' &&
		    IS_LWSP(line->value[0]) &&
		    line_value_size > 0) {
			/* save the leading non-space LWSP as a space */
			buffer_append_c(ctx->value_buf, ' ');
			buffer_append(ctx->value_buf, line->value + 1, line_value_size - 1);
		} else
			buffer_append(ctx->value_buf, line->value, line_value_size);

		line->full_value = ctx->value_buf->data;
		line->full_value_len = ctx->value_buf->used;
	} else {
		/* we didn't want full_value, and this is a continued line. */
		line->full_value = NULL;
		line->full_value_len = 0;
	}

	/* always reset it */
	line->use_full_value = FALSE;

	if (ctx->hdr_size != NULL) {
		ctx->hdr_size->physical_size += skip;
		ctx->hdr_size->virtual_size += skip;
	}
	i_stream_skip(ctx->input, skip);

	*hdr_r = line;
	return 1;
}
418 | | |
419 | | bool message_parse_header_has_nuls(const struct message_header_parser_ctx *ctx) |
420 | 244k | { |
421 | 244k | return ctx->has_nuls; |
422 | 244k | } |
423 | | |
424 | | #undef message_parse_header |
425 | | void message_parse_header(struct istream *input, struct message_size *hdr_size, |
426 | | enum message_header_parser_flags flags, |
427 | | message_header_callback_t *callback, void *context) |
428 | 0 | { |
429 | 0 | struct message_header_parser_ctx *hdr_ctx; |
430 | 0 | struct message_header_line *hdr; |
431 | 0 | int ret; |
432 | |
|
433 | 0 | hdr_ctx = message_parse_header_init(input, hdr_size, flags); |
434 | 0 | while ((ret = message_parse_header_next(hdr_ctx, &hdr)) > 0) T_BEGIN { |
435 | 0 | callback(hdr, context); |
436 | 0 | } T_END; |
437 | 0 | i_assert(ret != 0); |
438 | 0 | message_parse_header_deinit(&hdr_ctx); |
439 | | |
440 | | /* call after the final skipping */ |
441 | 0 | T_BEGIN { |
442 | 0 | callback(NULL, context); |
443 | 0 | } T_END; |
444 | 0 | } |
445 | | |
446 | | void message_header_line_write(buffer_t *output, |
447 | | const struct message_header_line *hdr) |
448 | 0 | { |
449 | 0 | if (!hdr->continued) { |
450 | 0 | buffer_append(output, hdr->name, strlen(hdr->name)); |
451 | 0 | buffer_append(output, hdr->middle, hdr->middle_len); |
452 | 0 | } |
453 | 0 | buffer_append(output, hdr->value, hdr->value_len); |
454 | 0 | if (!hdr->no_newline) { |
455 | 0 | if (hdr->crlf_newline) |
456 | 0 | buffer_append_c(output, '\r'); |
457 | 0 | buffer_append_c(output, '\n'); |
458 | 0 | } |
459 | 0 | } |
460 | | |
461 | | const char * |
462 | | message_header_strdup(pool_t pool, const unsigned char *data, size_t size) |
463 | 0 | { |
464 | 0 | i_assert(data != NULL); |
465 | | |
466 | 0 | if (memchr(data, '\0', size) == NULL) { |
467 | | /* fast path */ |
468 | 0 | char *dest = p_malloc(pool, size+1); |
469 | 0 | memcpy(dest, data, size); |
470 | 0 | return dest; |
471 | 0 | } |
472 | | |
473 | | /* slow path - this could be made faster, but it should be |
474 | | rare so keep it simple */ |
475 | 0 | string_t *str = str_new(pool, size+2); |
476 | 0 | for (size_t i = 0; i < size; i++) { |
477 | 0 | if (data[i] != '\0') |
478 | 0 | str_append_c(str, data[i]); |
479 | 0 | else |
480 | 0 | str_append(str, UNICODE_REPLACEMENT_CHAR_UTF8); |
481 | 0 | } |
482 | 0 | return str_c(str); |
483 | 0 | } |
484 | | |
/* Returns TRUE if every character of name is allowed in a header field
   name:

     field-name = 1*ftext

     ftext      = %d33-57 /    ; Printable US-ASCII
                  %d59-126     ; characters not including
                               ; ":".

   Only the characters are checked: an empty name also returns TRUE. */
bool message_header_name_is_valid(const char *name)
{
	const unsigned char *p;

	for (p = (const unsigned char *)name; *p != '\0'; p++) {
		/* reject controls and space (< 33), ":" (58), and DEL and
		   8bit characters (> 126) */
		if (*p < 33 || *p > 126 || *p == ':')
			return FALSE;
	}
	return TRUE;
}