Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/django/http/multipartparser.py: 16%
346 statements
coverage.py v7.0.5, created at 2023-01-17 06:13 +0000
"""
Multi-part parsing for file uploads.

Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
file upload handlers for processing.
"""
import base64
import binascii
import collections
import html

from django.conf import settings
from django.core.exceptions import (
    RequestDataTooBig,
    SuspiciousMultipartForm,
    TooManyFieldsSent,
)
from django.core.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload
from django.utils.datastructures import MultiValueDict
from django.utils.encoding import force_str
from django.utils.http import parse_header_parameters
from django.utils.regex_helper import _lazy_re_compile

__all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted")


class MultiPartParserError(Exception):
    pass


class InputStreamExhausted(Exception):
    """
    No more reads are allowed from this device.
    """

    pass


RAW = "raw"
FILE = "file"
FIELD = "field"


class MultiPartParser:
    """
    An RFC 7578 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """

    boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")

    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get("CONTENT_TYPE", "")
        if not content_type.startswith("multipart/"):
            raise MultiPartParserError("Invalid Content-Type: %s" % content_type)

        try:
            content_type.encode("ascii")
        except UnicodeEncodeError:
            raise MultiPartParserError(
                "Invalid non-ASCII Content-Type in multipart: %s"
                % force_str(content_type)
            )

        # Parse the header to get the boundary to split the parts.
        _, opts = parse_header_parameters(content_type)
        boundary = opts.get("boundary")
        if not boundary or not self.boundary_re.fullmatch(boundary):
            raise MultiPartParserError(
                "Invalid boundary in multipart: %s" % force_str(boundary)
            )

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get("CONTENT_LENGTH", 0))
        except (ValueError, TypeError):
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary.encode("ascii")
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # HTTP spec says that Content-Length >= 0 is valid
        # handling content-length == 0 before continuing
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To limit the amount of data read from the request.
        read_size = None
        # Whether a file upload is finished.
        uploaded_file = True

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None
                    uploaded_file = True

                try:
                    disposition = meta_data["content-disposition"][1]
                    field_name = disposition["name"].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get("content-transfer-encoding")
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_str(field_name, encoding, errors="replace")

                if item_type == FIELD:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    if (
                        settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                        and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS < num_post_keys
                    ):
                        raise TooManyFieldsSent(
                            "The number of GET/POST parameters exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
                        )

                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = (
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
                        )

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (
                        settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                        and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
                    ):
                        raise RequestDataTooBig(
                            "Request body exceeded "
                            "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
                        )

                    self._post.appendlist(
                        field_name, force_str(data, encoding, errors="replace")
                    )
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get("filename")
                    if file_name:
                        file_name = force_str(file_name, encoding, errors="replace")
                        file_name = self.sanitize_file_name(file_name)
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        "content-type", ("", {})
                    )
                    content_type = content_type.strip()
                    charset = content_type_extra.get("charset")

                    try:
                        content_length = int(meta_data.get("content-length")[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    uploaded_file = False
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # We only special-case base64 transfer encoding
                                # We should always decode base64 chunks by
                                # multiple of 4, ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    if not over_chunk:
                                        break
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is
                                    # an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            if not uploaded_file:
                for handler in handlers:
                    handler.upload_interrupted()
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        # any() shortcircuits if a handler's upload_complete() returns a value.
        any(handler.upload_complete() for handler in handlers)
        self._post._mutable = False
        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signaling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(
                    force_str(old_field_name, self._encoding, errors="replace"),
                    file_obj,
                )
                break

    def sanitize_file_name(self, file_name):
        """
        Sanitize the filename of an upload.

        Remove all possible path separators, even though that might remove more
        than actually required by the target system. Filenames that could
        potentially cause problems (current/parent dir) are also discarded.

        It should be noted that this function could still return a "filepath"
        like "C:some_file.txt" which is handled later on by the storage layer.
        So while this function does sanitize filenames to some extent, the
        resulting filename should still be considered as untrusted user input.
        """
        file_name = html.unescape(file_name)
        file_name = file_name.rsplit("/")[-1]
        file_name = file_name.rsplit("\\")[-1]
        # Remove non-printable characters.
        file_name = "".join([char for char in file_name if char.isprintable()])

        if file_name in {"", ".", ".."}:
            return None
        return file_name

    IE_sanitize = sanitize_file_name

    def _close_files(self):
        # Free up all file handles.
        # FIXME: this currently assumes that upload handlers store the file as 'file'
        # We should document that...
        # (Maybe add handler.free_file to complement new_file)
        for handler in self._upload_handlers:
            if hasattr(handler, "file"):
                handler.file.close()
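

# Illustrative usage sketch, not part of Django itself: how MultiPartParser is
# typically driven. The helper name and its arguments are hypothetical; inside
# Django this wiring is done by HttpRequest.parse_file_upload(), and running it
# requires configured settings (DEFAULT_CHARSET, DATA_UPLOAD_MAX_* limits).
def _example_parse_multipart(meta, body_file, upload_handlers, encoding="utf-8"):
    # `meta` is a META-style dict with a multipart CONTENT_TYPE (including the
    # boundary parameter) and a CONTENT_LENGTH; `body_file` is any file-like
    # object exposing the raw request body.
    parser = MultiPartParser(meta, body_file, upload_handlers, encoding)
    post, files = parser.parse()  # (QueryDict of fields, MultiValueDict of files)
    return post, files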


class LazyStream:
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """

    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = b""
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = self._remaining if size is None else size
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield b"".join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream and stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, "remaining bytes to read should never go negative"

                try:
                    chunk = next(self)
                except StopIteration:
                    return
                else:
                    emitting = chunk[:remaining]
                    self.unget(chunk[remaining:])
                    remaining -= len(emitting)
                    yield emitting

        return b"".join(parts())

    def __next__(self):
        """
        Used when the exact number of bytes to read is unimportant.

        Return whatever chunk is conveniently returned from the iterator.
        Useful to avoid unnecessary bookkeeping if performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = b""
        else:
            output = next(self._producer)
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replace the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Place bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = bytes + self._leftover

    def _update_unget_history(self, num_bytes):
        """
        Update the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len(
            [
                current_number
                for current_number in self._unget_history
                if current_number == num_bytes
            ]
        )

        if number_equal > 40:
            raise SuspiciousMultipartForm(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )
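

# Illustrative sketch, not part of Django: the read()/unget() behaviour of
# LazyStream over an in-memory producer. The helper name is hypothetical.
def _example_lazystream_unget():
    stream = LazyStream(iter([b"hello ", b"world"]))
    first = stream.read(5)  # b"hello"; the extra b" " is pushed back internally
    stream.unget(first)     # put the five bytes back; tell() rewinds to 0
    return stream.read()    # b"hello world"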


class ChunkIter:
    """
    An iterable that will yield chunks of data. Given a file-like object passed
    to the constructor, yield chunks of read operations from that object.
    """

    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def __next__(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self
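

# Illustrative sketch, not part of Django: ChunkIter simply re-reads a
# file-like object in fixed-size chunks until read() returns b"". The helper
# name and the io.BytesIO input are illustrative only.
def _example_chunkiter():
    import io

    return list(ChunkIter(io.BytesIO(b"abcdefgh"), chunk_size=3))  # [b"abc", b"def", b"gh"]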


class InterBoundaryIter:
    """
    A Producer that will iterate over boundaries.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()


class BoundaryIter:
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    The future calls to next() after locating the boundary will raise a
    StopIteration exception.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)

    def __iter__(self):
        return self

    def __next__(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = b"".join(chunks)
        boundary = self._find_boundary(chunk)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data):
        """
        Find a multipart boundary in data.

        Should no boundary exist in the data, return None. Otherwise, return
        a tuple containing the indices of the following:
        * the end of current encapsulation
        * the start of the next encapsulation
        """
        index = data.find(self._boundary)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            last = max(0, end - 1)
            if data[last : last + 1] == b"\n":
                end -= 1
                last = max(0, end - 1)
            if data[last : last + 1] == b"\r":
                end -= 1
            return end, next
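

# Illustrative sketch, not part of Django: what _find_boundary() returns for a
# chunk containing one boundary. __new__ is used only to skip __init__'s stream
# probe; the helper name and sample bytes are hypothetical.
def _example_find_boundary():
    it = BoundaryIter.__new__(BoundaryIter)
    it._boundary = b"--frontier"
    data = b"field data\r\n--frontier\r\nnext part"
    end, next_start = it._find_boundary(data)
    # `end` excludes the CRLF before the boundary; `next_start` points just past
    # the boundary bytes themselves.
    return data[:end], data[next_start:]  # (b"field data", b"\r\nnext part")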


def exhaust(stream_or_iterable):
    """Exhaust an iterator or stream."""
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)
    collections.deque(iterator, maxlen=0)  # consume iterator quickly.


def parse_boundary_stream(stream, max_header_size):
    """
    Parse one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the top of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find(b"\r\n\r\n")

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4 :])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split(b"\r\n"):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            main_value_pair, params = parse_header_parameters(line.decode())
            name, value = main_value_pair.split(":", 1)
            params = {k: v.encode() for k, v in params.items()}
        except ValueError:  # Invalid header.
            continue

        if name == "content-disposition":
            TYPE = FIELD
            if params.get("filename"):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)


class Parser:
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = b"--" + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
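

# Illustrative sketch, not part of Django: driving Parser directly over a raw
# multipart body. The sample body and helper name are hypothetical;
# MultiPartParser.parse() does the same thing with the boundary taken from the
# request's Content-Type header.
def _example_iterate_parts():
    import io

    body = (
        b"--frontier\r\n"
        b'Content-Disposition: form-data; name="title"\r\n'
        b"\r\n"
        b"hello\r\n"
        b"--frontier--\r\n"
    )
    stream = LazyStream(ChunkIter(io.BytesIO(body), 1024))
    parts = []
    for item_type, meta_data, part_stream in Parser(stream, b"frontier"):
        # Each part arrives as (RAW/FIELD/FILE, header dict, LazyStream); the
        # "title" field above is yielded with item_type == FIELD.
        parts.append((item_type, meta_data, part_stream.read()))
    return parts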