1"""
2Multi-part parsing for file uploads.
3
4Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
5file upload handlers for processing.
6"""
7
8import base64
9import binascii
10import collections
11import html
12
13from django.conf import settings
14from django.core.exceptions import (
15 RequestDataTooBig,
16 SuspiciousMultipartForm,
17 TooManyFieldsSent,
18 TooManyFilesSent,
19)
20from django.core.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload
21from django.utils.datastructures import MultiValueDict
22from django.utils.encoding import force_str
23from django.utils.http import parse_header_parameters
24from django.utils.regex_helper import _lazy_re_compile
25
26__all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted")
27
28
29class MultiPartParserError(Exception):
30 pass
31
32
33class InputStreamExhausted(Exception):
34 """
35 No more reads are allowed from this device.
36 """
37
38 pass
39
40
41RAW = "raw"
42FILE = "file"
43FIELD = "field"
44FIELD_TYPES = frozenset([FIELD, RAW])
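# A single part's header block must terminate (b"\r\n\r\n") within this many
# bytes; see parse_boundary_stream().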
MAX_TOTAL_HEADER_SIZE = 1024


class MultiPartParser:
    """
    An RFC 7578 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
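
    A rough usage sketch (hypothetical caller code mirroring how
    ``HttpRequest`` drives this parser; not part of this module)::

        parser = MultiPartParser(request.META, request, request.upload_handlers)
        post, files = parser.parse()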
54 """
55
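    # A boundary is 1-201 printable ASCII characters that must not end in
    # whitespace; anything else is rejected in __init__().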
    boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")

    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get("CONTENT_TYPE", "")
        if not content_type.startswith("multipart/"):
            raise MultiPartParserError("Invalid Content-Type: %s" % content_type)

        try:
            content_type.encode("ascii")
        except UnicodeEncodeError:
            raise MultiPartParserError(
                "Invalid non-ASCII Content-Type in multipart: %s"
                % force_str(content_type)
            )

        # Parse the header to get the boundary to split the parts.
        _, opts = parse_header_parameters(content_type)
        boundary = opts.get("boundary")
        if not boundary or not self.boundary_re.fullmatch(boundary):
            raise MultiPartParserError(
                "Invalid boundary in multipart: %s" % force_str(boundary)
            )

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get("CONTENT_LENGTH", 0))
        except (ValueError, TypeError):
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary.encode("ascii")
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)
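        # With Django's stock upload handlers, which advertise a 64 KiB
        # chunk_size, this typically resolves to 65536-byte reads.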

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        # Call the actual parse routine and close all open files in case of
        # errors. This is needed because if exceptions are thrown the
        # MultiPartParser will not be garbage collected immediately and
        # resources would be kept alive. This is only needed for errors because
        # the Request object closes all uploaded files at the end of the
        # request.
        try:
            return self._parse()
        except Exception:
            if hasattr(self, "_files"):
                for _, files in self._files.lists():
                    for fileobj in files:
                        fileobj.close()
            raise

    def _parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # The HTTP spec says that Content-Length >= 0 is valid; handle
        # content-length == 0 before continuing.
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To count the number of files in the request.
        num_files = 0
        # To limit the amount of data read from the request.
        read_size = None
        # Whether a file upload is finished.
        uploaded_file = True

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None
                    uploaded_file = True

                if (
                    item_type in FIELD_TYPES
                    and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                ):
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    # 2 accounts for empty raw fields before and after the
                    # last boundary.
                    if settings.DATA_UPLOAD_MAX_NUMBER_FIELDS + 2 < num_post_keys:
                        raise TooManyFieldsSent(
                            "The number of GET/POST parameters exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
                        )

                try:
                    disposition = meta_data["content-disposition"][1]
                    field_name = disposition["name"].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get("content-transfer-encoding")
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_str(field_name, encoding, errors="replace")

                if item_type == FIELD:
                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = (
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
                        )

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (
                        settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                        and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
                    ):
                        raise RequestDataTooBig(
                            "Request body exceeded "
                            "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
                        )

                    self._post.appendlist(
                        field_name, force_str(data, encoding, errors="replace")
                    )
                elif item_type == FILE:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FILES.
                    num_files += 1
                    if (
                        settings.DATA_UPLOAD_MAX_NUMBER_FILES is not None
                        and num_files > settings.DATA_UPLOAD_MAX_NUMBER_FILES
                    ):
                        raise TooManyFilesSent(
                            "The number of files exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FILES."
                        )
                    # This is a file, use the handler...
                    file_name = disposition.get("filename")
                    if file_name:
                        file_name = force_str(file_name, encoding, errors="replace")
                        file_name = self.sanitize_file_name(file_name)
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        "content-type", ("", {})
                    )
                    content_type = content_type.strip()
                    charset = content_type_extra.get("charset")

                    try:
                        content_length = int(meta_data.get("content-length")[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    uploaded_file = False
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # base64 is the only transfer encoding that is
                                # special-cased: chunks must be decoded in
                                # multiples of four bytes, ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    if not over_chunk:
                                        break
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is
                                    # an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, exhaust the field
                    # stream. Note: There could be an error here at some point,
                    # but there will be at least two RAW types (before and
                    # after the other boundaries). This branch is usually not
                    # reached at all, because a missing content-disposition
                    # header will skip the whole boundary.
                    exhaust(field_stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            if not uploaded_file:
                for handler in handlers:
                    handler.upload_interrupted()
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        # any() shortcircuits if a handler's upload_complete() returns a value.
        any(handler.upload_complete() for handler in handlers)
        self._post._mutable = False
        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signaling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(
                    force_str(old_field_name, self._encoding, errors="replace"),
                    file_obj,
                )
                break

    def sanitize_file_name(self, file_name):
        """
        Sanitize the filename of an upload.

        Remove all possible path separators, even though that might remove more
        than actually required by the target system. Filenames that could
        potentially cause problems (current/parent dir) are also discarded.

        It should be noted that this function could still return a "filepath"
        like "C:some_file.txt" which is handled later on by the storage layer.
        So while this function does sanitize filenames to some extent, the
        resulting filename should still be considered as untrusted user input.
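
        A few illustrative inputs and outputs (examples only, not an
        exhaustive specification)::

            "../../etc/passwd"        -> "passwd"
            "C:\\Documents\\file.txt" -> "file.txt"
            ".."                      -> None (discarded)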
393 """
394 file_name = html.unescape(file_name)
395 file_name = file_name.rsplit("/")[-1]
396 file_name = file_name.rsplit("\\")[-1]
397 # Remove non-printable characters.
398 file_name = "".join([char for char in file_name if char.isprintable()])
399
400 if file_name in {"", ".", ".."}:
401 return None
402 return file_name
403
404 IE_sanitize = sanitize_file_name
405
406 def _close_files(self):
407 # Free up all file handles.
408 # FIXME: this currently assumes that upload handlers store the file as 'file'
409 # We should document that...
410 # (Maybe add handler.free_file to complement new_file)
411 for handler in self._upload_handlers:
412 if hasattr(handler, "file"):
413 handler.file.close()
414
415
416class LazyStream:
417 """
418 The LazyStream wrapper allows one to get and "unget" bytes from a stream.
419
420 Given a producer object (an iterator that yields bytestrings), the
421 LazyStream object will support iteration, reading, and keeping a "look-back"
422 variable in case you need to "unget" some bytes.
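
    A minimal sketch of the read/unget round trip (illustrative only, with an
    in-memory producer)::

        stream = LazyStream(iter([b"hello world"]))
        stream.read(5)          # -> b"hello"
        stream.unget(b"hello")
        stream.read()           # -> b"hello world"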
423 """
424
425 def __init__(self, producer, length=None):
426 """
427 Every LazyStream must have a producer when instantiated.
428
        A producer is an iterator that yields a bytestring each time it is
        advanced.
431 """
432 self._producer = producer
433 self._empty = False
434 self._leftover = b""
435 self.length = length
436 self.position = 0
437 self._remaining = length
438 self._unget_history = []
439
440 def tell(self):
441 return self.position
442
443 def read(self, size=None):
444 def parts():
445 remaining = self._remaining if size is None else size
446 # do the whole thing in one shot if no limit was provided.
447 if remaining is None:
448 yield b"".join(self)
449 return
450
            # otherwise do some bookkeeping to return exactly enough
            # of the stream, stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, "remaining bytes to read should never go negative"

                try:
                    chunk = next(self)
                except StopIteration:
                    return
                else:
                    emitting = chunk[:remaining]
                    self.unget(chunk[remaining:])
                    remaining -= len(emitting)
                    yield emitting

        return b"".join(parts())

    def __next__(self):
        """
        Used when the exact number of bytes to read is unimportant.

        Return whatever chunk is conveniently returned from the iterator.
        Useful to avoid unnecessary bookkeeping if performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = b""
        else:
            output = next(self._producer)
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replace the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Place bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = bytes + self._leftover

    def _update_unget_history(self, num_bytes):
        """
        Update the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len(
            [
                current_number
                for current_number in self._unget_history
                if current_number == num_bytes
            ]
        )

        if number_equal > 40:
            raise SuspiciousMultipartForm(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )


class ChunkIter:
    """
    An iterable that yields chunks of data. Given a file-like object passed to
    the constructor, yield the data returned by successive read operations on
    that object.
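
    Illustrative usage (hypothetical file, not part of this module)::

        for chunk in ChunkIter(open("upload.bin", "rb"), chunk_size=4):
            ...  # each chunk is at most four bytes long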
539 """
540
541 def __init__(self, flo, chunk_size=64 * 1024):
542 self.flo = flo
543 self.chunk_size = chunk_size
544
545 def __next__(self):
546 try:
547 data = self.flo.read(self.chunk_size)
548 except InputStreamExhausted:
549 raise StopIteration()
550 if data:
551 return data
552 else:
553 raise StopIteration()
554
555 def __iter__(self):
556 return self
557
558
559class InterBoundaryIter:
560 """
561 A Producer that will iterate over boundaries.
562 """
563
564 def __init__(self, stream, boundary):
565 self._stream = stream
566 self._boundary = boundary
567
568 def __iter__(self):
569 return self
570
571 def __next__(self):
572 try:
573 return LazyStream(BoundaryIter(self._stream, self._boundary))
574 except InputStreamExhausted:
575 raise StopIteration()
576
577
578class BoundaryIter:
579 """
580 A Producer that is sensitive to boundaries.
581
582 Will happily yield bytes until a boundary is found. Will yield the bytes
583 before the boundary, throw away the boundary bytes themselves, and push the
584 post-boundary bytes back on the stream.
585
    Once the boundary has been located, further calls to next() raise a
    StopIteration exception.
588 """
589
590 def __init__(self, stream, boundary):
591 self._stream = stream
592 self._boundary = boundary
593 self._done = False
594 # rollback an additional six bytes because the format is like
595 # this: CRLF<boundary>[--CRLF]
596 self._rollback = len(boundary) + 6
597
        # Peek at one byte to confirm the stream isn't already exhausted, then
        # push it back so no data is lost.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)

    def __iter__(self):
        return self

    def __next__(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = b"".join(chunks)
        boundary = self._find_boundary(chunk)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data):
        """
        Find a multipart boundary in data.

        Should no boundary exist in the data, return None. Otherwise, return
        a tuple containing the indices of the following:
            * the end of current encapsulation
            * the start of the next encapsulation
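
        Illustration (not executed): with ``self._boundary`` set to
        ``b"--abc"``, ``_find_boundary(b"data\\r\\n--abc--\\r\\n")`` returns
        ``(4, 11)``; the CRLF preceding the boundary is excluded from the
        payload and the next read resumes just after ``b"--abc"``.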
657 """
658 index = data.find(self._boundary)
659 if index < 0:
660 return None
661 else:
662 end = index
663 next = index + len(self._boundary)
664 # backup over CRLF
665 last = max(0, end - 1)
666 if data[last : last + 1] == b"\n":
667 end -= 1
668 last = max(0, end - 1)
669 if data[last : last + 1] == b"\r":
670 end -= 1
671 return end, next
672
673
674def exhaust(stream_or_iterable):
675 """Exhaust an iterator or stream."""
676 try:
677 iterator = iter(stream_or_iterable)
678 except TypeError:
679 iterator = ChunkIter(stream_or_iterable, 16384)
680 collections.deque(iterator, maxlen=0) # consume iterator quickly.
681
682
683def parse_boundary_stream(stream, max_header_size):
684 """
685 Parse one and exactly one stream that encapsulates a boundary.
686 """
687
688 # Look for the end of headers and if not found extend the search to double
689 # the size up to the MAX_TOTAL_HEADER_SIZE.
690 headers_chunk_size = 1024
691 while True:
692 if headers_chunk_size > max_header_size:
693 raise MultiPartParserError("Request max total header size exceeded.")
694
695 # Stream at beginning of header, look for end of header and parse it if
696 # found. The header must fit within one chunk.
697 chunk = stream.read(headers_chunk_size)
        # find() returns the index of the first of these four bytes; they are
        # munched later to prevent them from polluting the payload.
        header_end = chunk.find(b"\r\n\r\n")
        if header_end != -1:
            break

        # No end-of-headers marker found; unget the chunk and pass the stream
        # on verbatim.
        stream.unget(chunk)
        # No more data to read.
        if len(chunk) < headers_chunk_size:
            return (RAW, {}, stream)
        # Double the chunk size.
        headers_chunk_size *= 2

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4 :])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split(b"\r\n"):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            main_value_pair, params = parse_header_parameters(line.decode())
            name, value = main_value_pair.split(":", 1)
            params = {k: v.encode() for k, v in params.items()}
        except ValueError:  # Invalid header.
            continue

        if name == "content-disposition":
            TYPE = FIELD
            if params.get("filename"):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)


class Parser:
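    """
    Iterate over the parts of a multipart body.

    Iterating over ``Parser(stream, boundary)`` yields one
    ``(item_type, meta_data, field_stream)`` triple per part, as produced by
    ``parse_boundary_stream()``.
    """
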
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = b"--" + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, MAX_TOTAL_HEADER_SIZE)