Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/multipart/multipart.py: 17%
717 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-26 06:12 +0000
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-26 06:12 +0000
1from .decoders import *
2from .exceptions import *
4import os
5import re
6import sys
7import shutil
8import logging
9import tempfile
10from io import BytesIO
11from numbers import Number
13# Unique missing object.
14_missing = object()
16# States for the querystring parser.
17STATE_BEFORE_FIELD = 0
18STATE_FIELD_NAME = 1
19STATE_FIELD_DATA = 2
21# States for the multipart parser
22STATE_START = 0
23STATE_START_BOUNDARY = 1
24STATE_HEADER_FIELD_START = 2
25STATE_HEADER_FIELD = 3
26STATE_HEADER_VALUE_START = 4
27STATE_HEADER_VALUE = 5
28STATE_HEADER_VALUE_ALMOST_DONE = 6
29STATE_HEADERS_ALMOST_DONE = 7
30STATE_PART_DATA_START = 8
31STATE_PART_DATA = 9
32STATE_PART_DATA_END = 10
33STATE_END = 11
35STATES = [
36 "START",
37 "START_BOUNDARY", "HEADER_FIELD_START", "HEADER_FIELD", "HEADER_VALUE_START", "HEADER_VALUE",
38 "HEADER_VALUE_ALMOST_DONE", "HEADRES_ALMOST_DONE", "PART_DATA_START", "PART_DATA", "PART_DATA_END", "END"
39]
42# Flags for the multipart parser.
43FLAG_PART_BOUNDARY = 1
44FLAG_LAST_BOUNDARY = 2
46# Get constants. Since iterating over a str on Python 2 gives you a 1-length
47# string, but iterating over a bytes object on Python 3 gives you an integer,
48# we need to save these constants.
49CR = b'\r'[0]
50LF = b'\n'[0]
51COLON = b':'[0]
52SPACE = b' '[0]
53HYPHEN = b'-'[0]
54AMPERSAND = b'&'[0]
55SEMICOLON = b';'[0]
56LOWER_A = b'a'[0]
57LOWER_Z = b'z'[0]
58NULL = b'\x00'[0]
60# Lower-casing a character is different, because of the difference between
61# str on Py2, and bytes on Py3. Same with getting the ordinal value of a byte,
62# and joining a list of bytes together.
63# These functions abstract that.
64lower_char = lambda c: c | 0x20
65ord_char = lambda c: c
66join_bytes = lambda b: bytes(list(b))
68# These are regexes for parsing header values.
69SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t')
70QUOTED_STR = br'"(?:\\.|[^"])*"'
71VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')'
72OPTION_RE_STR = (
73 br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')'
74)
75OPTION_RE = re.compile(OPTION_RE_STR)
76QUOTE = b'"'[0]
79def parse_options_header(value):
80 """
81 Parses a Content-Type header into a value in the following format:
82 (content_type, {parameters})
83 """
84 if not value:
85 return (b'', {})
87 # If we are passed a string, we assume that it conforms to WSGI and does
88 # not contain any code point that's not in latin-1.
89 if isinstance(value, str): # pragma: no cover
90 value = value.encode('latin-1')
92 # If we have no options, return the string as-is.
93 if b';' not in value:
94 return (value.lower().strip(), {})
96 # Split at the first semicolon, to get our value and then options.
97 ctype, rest = value.split(b';', 1)
98 options = {}
100 # Parse the options.
101 for match in OPTION_RE.finditer(rest):
102 key = match.group(1).lower()
103 value = match.group(2)
104 if value[0] == QUOTE and value[-1] == QUOTE:
105 # Unquote the value.
106 value = value[1:-1]
107 value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
109 # If the value is a filename, we need to fix a bug on IE6 that sends
110 # the full file path instead of the filename.
111 if key == b'filename':
112 if value[1:3] == b':\\' or value[:2] == b'\\\\':
113 value = value.split(b'\\')[-1]
115 options[key] = value
117 return ctype, options
120class Field:
121 """A Field object represents a (parsed) form field. It represents a single
122 field with a corresponding name and value.
124 The name that a :class:`Field` will be instantiated with is the same name
125 that would be found in the following HTML::
127 <input name="name_goes_here" type="text"/>
129 This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
130 will be called when data is written to the Field, and when the Field is
131 finalized, respectively.
133 :param name: the name of the form field
134 """
135 def __init__(self, name):
136 self._name = name
137 self._value = []
139 # We cache the joined version of _value for speed.
140 self._cache = _missing
142 @classmethod
143 def from_value(klass, name, value):
144 """Create an instance of a :class:`Field`, and set the corresponding
145 value - either None or an actual value. This method will also
146 finalize the Field itself.
148 :param name: the name of the form field
149 :param value: the value of the form field - either a bytestring or
150 None
151 """
153 f = klass(name)
154 if value is None:
155 f.set_none()
156 else:
157 f.write(value)
158 f.finalize()
159 return f
161 def write(self, data):
162 """Write some data into the form field.
164 :param data: a bytestring
165 """
166 return self.on_data(data)
168 def on_data(self, data):
169 """This method is a callback that will be called whenever data is
170 written to the Field.
172 :param data: a bytestring
173 """
174 self._value.append(data)
175 self._cache = _missing
176 return len(data)
178 def on_end(self):
179 """This method is called whenever the Field is finalized.
180 """
181 if self._cache is _missing:
182 self._cache = b''.join(self._value)
184 def finalize(self):
185 """Finalize the form field.
186 """
187 self.on_end()
189 def close(self):
190 """Close the Field object. This will free any underlying cache.
191 """
192 # Free our value array.
193 if self._cache is _missing:
194 self._cache = b''.join(self._value)
196 del self._value
198 def set_none(self):
199 """Some fields in a querystring can possibly have a value of None - for
200 example, the string "foo&bar=&baz=asdf" will have a field with the
201 name "foo" and value None, one with name "bar" and value "", and one
202 with name "baz" and value "asdf". Since the write() interface doesn't
203 support writing None, this function will set the field value to None.
204 """
205 self._cache = None
207 @property
208 def field_name(self):
209 """This property returns the name of the field."""
210 return self._name
212 @property
213 def value(self):
214 """This property returns the value of the form field."""
215 if self._cache is _missing:
216 self._cache = b''.join(self._value)
218 return self._cache
220 def __eq__(self, other):
221 if isinstance(other, Field):
222 return (
223 self.field_name == other.field_name and
224 self.value == other.value
225 )
226 else:
227 return NotImplemented
229 def __repr__(self):
230 if len(self.value) > 97:
231 # We get the repr, and then insert three dots before the final
232 # quote.
233 v = repr(self.value[:97])[:-1] + "...'"
234 else:
235 v = repr(self.value)
237 return "{}(field_name={!r}, value={})".format(
238 self.__class__.__name__,
239 self.field_name,
240 v
241 )
244class File:
245 """This class represents an uploaded file. It handles writing file data to
246 either an in-memory file or a temporary file on-disk, if the optional
247 threshold is passed.
249 There are some options that can be passed to the File to change behavior
250 of the class. Valid options are as follows:
252 .. list-table::
253 :widths: 15 5 5 30
254 :header-rows: 1
256 * - Name
257 - Type
258 - Default
259 - Description
260 * - UPLOAD_DIR
261 - `str`
262 - None
263 - The directory to store uploaded files in. If this is None, a
264 temporary file will be created in the system's standard location.
265 * - UPLOAD_DELETE_TMP
266 - `bool`
267 - True
268 - Delete automatically created TMP file
269 * - UPLOAD_KEEP_FILENAME
270 - `bool`
271 - False
272 - Whether or not to keep the filename of the uploaded file. If True,
273 then the filename will be converted to a safe representation (e.g.
274 by removing any invalid path segments), and then saved with the
275 same name). Otherwise, a temporary name will be used.
276 * - UPLOAD_KEEP_EXTENSIONS
277 - `bool`
278 - False
279 - Whether or not to keep the uploaded file's extension. If False, the
280 file will be saved with the default temporary extension (usually
281 ".tmp"). Otherwise, the file's extension will be maintained. Note
282 that this will properly combine with the UPLOAD_KEEP_FILENAME
283 setting.
284 * - MAX_MEMORY_FILE_SIZE
285 - `int`
286 - 1 MiB
287 - The maximum number of bytes of a File to keep in memory. By
288 default, the contents of a File are kept into memory until a certain
289 limit is reached, after which the contents of the File are written
290 to a temporary file. This behavior can be disabled by setting this
291 value to an appropriately large value (or, for example, infinity,
292 such as `float('inf')`.
294 :param file_name: The name of the file that this :class:`File` represents
296 :param field_name: The field name that uploaded this file. Note that this
297 can be None, if, for example, the file was uploaded
298 with Content-Type application/octet-stream
300 :param config: The configuration for this File. See above for valid
301 configuration keys and their corresponding values.
302 """
303 def __init__(self, file_name, field_name=None, config={}):
304 # Save configuration, set other variables default.
305 self.logger = logging.getLogger(__name__)
306 self._config = config
307 self._in_memory = True
308 self._bytes_written = 0
309 self._fileobj = BytesIO()
311 # Save the provided field/file name.
312 self._field_name = field_name
313 self._file_name = file_name
315 # Our actual file name is None by default, since, depending on our
316 # config, we may not actually use the provided name.
317 self._actual_file_name = None
319 # Split the extension from the filename.
320 if file_name is not None:
321 base, ext = os.path.splitext(file_name)
322 self._file_base = base
323 self._ext = ext
325 @property
326 def field_name(self):
327 """The form field associated with this file. May be None if there isn't
328 one, for example when we have an application/octet-stream upload.
329 """
330 return self._field_name
332 @property
333 def file_name(self):
334 """The file name given in the upload request.
335 """
336 return self._file_name
338 @property
339 def actual_file_name(self):
340 """The file name that this file is saved as. Will be None if it's not
341 currently saved on disk.
342 """
343 return self._actual_file_name
345 @property
346 def file_object(self):
347 """The file object that we're currently writing to. Note that this
348 will either be an instance of a :class:`io.BytesIO`, or a regular file
349 object.
350 """
351 return self._fileobj
353 @property
354 def size(self):
355 """The total size of this file, counted as the number of bytes that
356 currently have been written to the file.
357 """
358 return self._bytes_written
360 @property
361 def in_memory(self):
362 """A boolean representing whether or not this file object is currently
363 stored in-memory or on-disk.
364 """
365 return self._in_memory
367 def flush_to_disk(self):
368 """If the file is already on-disk, do nothing. Otherwise, copy from
369 the in-memory buffer to a disk file, and then reassign our internal
370 file object to this new disk file.
372 Note that if you attempt to flush a file that is already on-disk, a
373 warning will be logged to this module's logger.
374 """
375 if not self._in_memory:
376 self.logger.warning(
377 "Trying to flush to disk when we're not in memory"
378 )
379 return
381 # Go back to the start of our file.
382 self._fileobj.seek(0)
384 # Open a new file.
385 new_file = self._get_disk_file()
387 # Copy the file objects.
388 shutil.copyfileobj(self._fileobj, new_file)
390 # Seek to the new position in our new file.
391 new_file.seek(self._bytes_written)
393 # Reassign the fileobject.
394 old_fileobj = self._fileobj
395 self._fileobj = new_file
397 # We're no longer in memory.
398 self._in_memory = False
400 # Close the old file object.
401 old_fileobj.close()
403 def _get_disk_file(self):
404 """This function is responsible for getting a file object on-disk for us.
405 """
406 self.logger.info("Opening a file on disk")
408 file_dir = self._config.get('UPLOAD_DIR')
409 keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False)
410 keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False)
411 delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True)
413 # If we have a directory and are to keep the filename...
414 if file_dir is not None and keep_filename:
415 self.logger.info("Saving with filename in: %r", file_dir)
417 # Build our filename.
418 # TODO: what happens if we don't have a filename?
419 fname = self._file_base
420 if keep_extensions:
421 fname = fname + self._ext
423 path = os.path.join(file_dir, fname)
424 try:
425 self.logger.info("Opening file: %r", path)
426 tmp_file = open(path, 'w+b')
427 except OSError as e:
428 tmp_file = None
430 self.logger.exception("Error opening temporary file")
431 raise FileError("Error opening temporary file: %r" % path)
432 else:
433 # Build options array.
434 # Note that on Python 3, tempfile doesn't support byte names. We
435 # encode our paths using the default filesystem encoding.
436 options = {}
437 if keep_extensions:
438 ext = self._ext
439 if isinstance(ext, bytes):
440 ext = ext.decode(sys.getfilesystemencoding())
442 options['suffix'] = ext
443 if file_dir is not None:
444 d = file_dir
445 if isinstance(d, bytes):
446 d = d.decode(sys.getfilesystemencoding())
448 options['dir'] = d
449 options['delete'] = delete_tmp
451 # Create a temporary (named) file with the appropriate settings.
452 self.logger.info("Creating a temporary file with options: %r",
453 options)
454 try:
455 tmp_file = tempfile.NamedTemporaryFile(**options)
456 except OSError:
457 self.logger.exception("Error creating named temporary file")
458 raise FileError("Error creating named temporary file")
460 fname = tmp_file.name
462 # Encode filename as bytes.
463 if isinstance(fname, str):
464 fname = fname.encode(sys.getfilesystemencoding())
466 self._actual_file_name = fname
467 return tmp_file
469 def write(self, data):
470 """Write some data to the File.
472 :param data: a bytestring
473 """
474 return self.on_data(data)
476 def on_data(self, data):
477 """This method is a callback that will be called whenever data is
478 written to the File.
480 :param data: a bytestring
481 """
482 pos = self._fileobj.tell()
483 bwritten = self._fileobj.write(data)
484 # true file objects write returns None
485 if bwritten is None:
486 bwritten = self._fileobj.tell() - pos
488 # If the bytes written isn't the same as the length, just return.
489 if bwritten != len(data):
490 self.logger.warning("bwritten != len(data) (%d != %d)", bwritten,
491 len(data))
492 return bwritten
494 # Keep track of how many bytes we've written.
495 self._bytes_written += bwritten
497 # If we're in-memory and are over our limit, we create a file.
498 if (self._in_memory and
499 self._config.get('MAX_MEMORY_FILE_SIZE') is not None and
500 (self._bytes_written >
501 self._config.get('MAX_MEMORY_FILE_SIZE'))):
502 self.logger.info("Flushing to disk")
503 self.flush_to_disk()
505 # Return the number of bytes written.
506 return bwritten
508 def on_end(self):
509 """This method is called whenever the Field is finalized.
510 """
511 # Flush the underlying file object
512 self._fileobj.flush()
514 def finalize(self):
515 """Finalize the form file. This will not close the underlying file,
516 but simply signal that we are finished writing to the File.
517 """
518 self.on_end()
520 def close(self):
521 """Close the File object. This will actually close the underlying
522 file object (whether it's a :class:`io.BytesIO` or an actual file
523 object).
524 """
525 self._fileobj.close()
527 def __repr__(self):
528 return "{}(file_name={!r}, field_name={!r})".format(
529 self.__class__.__name__,
530 self.file_name,
531 self.field_name
532 )
535class BaseParser:
536 """This class is the base class for all parsers. It contains the logic for
537 calling and adding callbacks.
539 A callback can be one of two different forms. "Notification callbacks" are
540 callbacks that are called when something happens - for example, when a new
541 part of a multipart message is encountered by the parser. "Data callbacks"
542 are called when we get some sort of data - for example, part of the body of
543 a multipart chunk. Notification callbacks are called with no parameters,
544 whereas data callbacks are called with three, as follows::
546 data_callback(data, start, end)
548 The "data" parameter is a bytestring (i.e. "foo" on Python 2, or b"foo" on
549 Python 3). "start" and "end" are integer indexes into the "data" string
550 that represent the data of interest. Thus, in a data callback, the slice
551 `data[start:end]` represents the data that the callback is "interested in".
552 The callback is not passed a copy of the data, since copying severely hurts
553 performance.
554 """
555 def __init__(self):
556 self.logger = logging.getLogger(__name__)
558 def callback(self, name, data=None, start=None, end=None):
559 """This function calls a provided callback with some data. If the
560 callback is not set, will do nothing.
562 :param name: The name of the callback to call (as a string).
564 :param data: Data to pass to the callback. If None, then it is
565 assumed that the callback is a notification callback,
566 and no parameters are given.
568 :param end: An integer that is passed to the data callback.
570 :param start: An integer that is passed to the data callback.
571 """
572 name = "on_" + name
573 func = self.callbacks.get(name)
574 if func is None:
575 return
577 # Depending on whether we're given a buffer...
578 if data is not None:
579 # Don't do anything if we have start == end.
580 if start is not None and start == end:
581 return
583 self.logger.debug("Calling %s with data[%d:%d]", name, start, end)
584 func(data, start, end)
585 else:
586 self.logger.debug("Calling %s with no data", name)
587 func()
589 def set_callback(self, name, new_func):
590 """Update the function for a callback. Removes from the callbacks dict
591 if new_func is None.
593 :param name: The name of the callback to call (as a string).
595 :param new_func: The new function for the callback. If None, then the
596 callback will be removed (with no error if it does not
597 exist).
598 """
599 if new_func is None:
600 self.callbacks.pop('on_' + name, None)
601 else:
602 self.callbacks['on_' + name] = new_func
604 def close(self):
605 pass # pragma: no cover
607 def finalize(self):
608 pass # pragma: no cover
610 def __repr__(self):
611 return "%s()" % self.__class__.__name__
614class OctetStreamParser(BaseParser):
615 """This parser parses an octet-stream request body and calls callbacks when
616 incoming data is received. Callbacks are as follows:
618 .. list-table::
619 :widths: 15 10 30
620 :header-rows: 1
622 * - Callback Name
623 - Parameters
624 - Description
625 * - on_start
626 - None
627 - Called when the first data is parsed.
628 * - on_data
629 - data, start, end
630 - Called for each data chunk that is parsed.
631 * - on_end
632 - None
633 - Called when the parser is finished parsing all data.
635 :param callbacks: A dictionary of callbacks. See the documentation for
636 :class:`BaseParser`.
638 :param max_size: The maximum size of body to parse. Defaults to infinity -
639 i.e. unbounded.
640 """
641 def __init__(self, callbacks={}, max_size=float('inf')):
642 super().__init__()
643 self.callbacks = callbacks
644 self._started = False
646 if not isinstance(max_size, Number) or max_size < 1:
647 raise ValueError("max_size must be a positive number, not %r" %
648 max_size)
649 self.max_size = max_size
650 self._current_size = 0
652 def write(self, data):
653 """Write some data to the parser, which will perform size verification,
654 and then pass the data to the underlying callback.
656 :param data: a bytestring
657 """
658 if not self._started:
659 self.callback('start')
660 self._started = True
662 # Truncate data length.
663 data_len = len(data)
664 if (self._current_size + data_len) > self.max_size:
665 # We truncate the length of data that we are to process.
666 new_size = int(self.max_size - self._current_size)
667 self.logger.warning("Current size is %d (max %d), so truncating "
668 "data length from %d to %d",
669 self._current_size, self.max_size, data_len,
670 new_size)
671 data_len = new_size
673 # Increment size, then callback, in case there's an exception.
674 self._current_size += data_len
675 self.callback('data', data, 0, data_len)
676 return data_len
678 def finalize(self):
679 """Finalize this parser, which signals to that we are finished parsing,
680 and sends the on_end callback.
681 """
682 self.callback('end')
684 def __repr__(self):
685 return "%s()" % self.__class__.__name__
688class QuerystringParser(BaseParser):
689 """This is a streaming querystring parser. It will consume data, and call
690 the callbacks given when it has data.
692 .. list-table::
693 :widths: 15 10 30
694 :header-rows: 1
696 * - Callback Name
697 - Parameters
698 - Description
699 * - on_field_start
700 - None
701 - Called when a new field is encountered.
702 * - on_field_name
703 - data, start, end
704 - Called when a portion of a field's name is encountered.
705 * - on_field_data
706 - data, start, end
707 - Called when a portion of a field's data is encountered.
708 * - on_field_end
709 - None
710 - Called when the end of a field is encountered.
711 * - on_end
712 - None
713 - Called when the parser is finished parsing all data.
715 :param callbacks: A dictionary of callbacks. See the documentation for
716 :class:`BaseParser`.
718 :param strict_parsing: Whether or not to parse the body strictly. Defaults
719 to False. If this is set to True, then the behavior
720 of the parser changes as the following: if a field
721 has a value with an equal sign (e.g. "foo=bar", or
722 "foo="), it is always included. If a field has no
723 equals sign (e.g. "...&name&..."), it will be
724 treated as an error if 'strict_parsing' is True,
725 otherwise included. If an error is encountered,
726 then a
727 :class:`multipart.exceptions.QuerystringParseError`
728 will be raised.
730 :param max_size: The maximum size of body to parse. Defaults to infinity -
731 i.e. unbounded.
732 """
733 def __init__(self, callbacks={}, strict_parsing=False,
734 max_size=float('inf')):
735 super().__init__()
736 self.state = STATE_BEFORE_FIELD
737 self._found_sep = False
739 self.callbacks = callbacks
741 # Max-size stuff
742 if not isinstance(max_size, Number) or max_size < 1:
743 raise ValueError("max_size must be a positive number, not %r" %
744 max_size)
745 self.max_size = max_size
746 self._current_size = 0
748 # Should parsing be strict?
749 self.strict_parsing = strict_parsing
751 def write(self, data):
752 """Write some data to the parser, which will perform size verification,
753 parse into either a field name or value, and then pass the
754 corresponding data to the underlying callback. If an error is
755 encountered while parsing, a QuerystringParseError will be raised. The
756 "offset" attribute of the raised exception will be set to the offset in
757 the input data chunk (NOT the overall stream) that caused the error.
759 :param data: a bytestring
760 """
761 # Handle sizing.
762 data_len = len(data)
763 if (self._current_size + data_len) > self.max_size:
764 # We truncate the length of data that we are to process.
765 new_size = int(self.max_size - self._current_size)
766 self.logger.warning("Current size is %d (max %d), so truncating "
767 "data length from %d to %d",
768 self._current_size, self.max_size, data_len,
769 new_size)
770 data_len = new_size
772 l = 0
773 try:
774 l = self._internal_write(data, data_len)
775 finally:
776 self._current_size += l
778 return l
780 def _internal_write(self, data, length):
781 state = self.state
782 strict_parsing = self.strict_parsing
783 found_sep = self._found_sep
785 i = 0
786 while i < length:
787 ch = data[i]
789 # Depending on our state...
790 if state == STATE_BEFORE_FIELD:
791 # If the 'found_sep' flag is set, we've already encountered
792 # and skipped a single separator. If so, we check our strict
793 # parsing flag and decide what to do. Otherwise, we haven't
794 # yet reached a separator, and thus, if we do, we need to skip
795 # it as it will be the boundary between fields that's supposed
796 # to be there.
797 if ch == AMPERSAND or ch == SEMICOLON:
798 if found_sep:
799 # If we're parsing strictly, we disallow blank chunks.
800 if strict_parsing:
801 e = QuerystringParseError(
802 "Skipping duplicate ampersand/semicolon at "
803 "%d" % i
804 )
805 e.offset = i
806 raise e
807 else:
808 self.logger.debug("Skipping duplicate ampersand/"
809 "semicolon at %d", i)
810 else:
811 # This case is when we're skipping the (first)
812 # separator between fields, so we just set our flag
813 # and continue on.
814 found_sep = True
815 else:
816 # Emit a field-start event, and go to that state. Also,
817 # reset the "found_sep" flag, for the next time we get to
818 # this state.
819 self.callback('field_start')
820 i -= 1
821 state = STATE_FIELD_NAME
822 found_sep = False
824 elif state == STATE_FIELD_NAME:
825 # Try and find a separator - we ensure that, if we do, we only
826 # look for the equal sign before it.
827 sep_pos = data.find(b'&', i)
828 if sep_pos == -1:
829 sep_pos = data.find(b';', i)
831 # See if we can find an equals sign in the remaining data. If
832 # so, we can immediately emit the field name and jump to the
833 # data state.
834 if sep_pos != -1:
835 equals_pos = data.find(b'=', i, sep_pos)
836 else:
837 equals_pos = data.find(b'=', i)
839 if equals_pos != -1:
840 # Emit this name.
841 self.callback('field_name', data, i, equals_pos)
843 # Jump i to this position. Note that it will then have 1
844 # added to it below, which means the next iteration of this
845 # loop will inspect the character after the equals sign.
846 i = equals_pos
847 state = STATE_FIELD_DATA
848 else:
849 # No equals sign found.
850 if not strict_parsing:
851 # See also comments in the STATE_FIELD_DATA case below.
852 # If we found the separator, we emit the name and just
853 # end - there's no data callback at all (not even with
854 # a blank value).
855 if sep_pos != -1:
856 self.callback('field_name', data, i, sep_pos)
857 self.callback('field_end')
859 i = sep_pos - 1
860 state = STATE_BEFORE_FIELD
861 else:
862 # Otherwise, no separator in this block, so the
863 # rest of this chunk must be a name.
864 self.callback('field_name', data, i, length)
865 i = length
867 else:
868 # We're parsing strictly. If we find a separator,
869 # this is an error - we require an equals sign.
870 if sep_pos != -1:
871 e = QuerystringParseError(
872 "When strict_parsing is True, we require an "
873 "equals sign in all field chunks. Did not "
874 "find one in the chunk that starts at %d" %
875 (i,)
876 )
877 e.offset = i
878 raise e
880 # No separator in the rest of this chunk, so it's just
881 # a field name.
882 self.callback('field_name', data, i, length)
883 i = length
885 elif state == STATE_FIELD_DATA:
886 # Try finding either an ampersand or a semicolon after this
887 # position.
888 sep_pos = data.find(b'&', i)
889 if sep_pos == -1:
890 sep_pos = data.find(b';', i)
892 # If we found it, callback this bit as data and then go back
893 # to expecting to find a field.
894 if sep_pos != -1:
895 self.callback('field_data', data, i, sep_pos)
896 self.callback('field_end')
898 # Note that we go to the separator, which brings us to the
899 # "before field" state. This allows us to properly emit
900 # "field_start" events only when we actually have data for
901 # a field of some sort.
902 i = sep_pos - 1
903 state = STATE_BEFORE_FIELD
905 # Otherwise, emit the rest as data and finish.
906 else:
907 self.callback('field_data', data, i, length)
908 i = length
910 else: # pragma: no cover (error case)
911 msg = "Reached an unknown state %d at %d" % (state, i)
912 self.logger.warning(msg)
913 e = QuerystringParseError(msg)
914 e.offset = i
915 raise e
917 i += 1
919 self.state = state
920 self._found_sep = found_sep
921 return len(data)
923 def finalize(self):
924 """Finalize this parser, which signals to that we are finished parsing,
925 if we're still in the middle of a field, an on_field_end callback, and
926 then the on_end callback.
927 """
928 # If we're currently in the middle of a field, we finish it.
929 if self.state == STATE_FIELD_DATA:
930 self.callback('field_end')
931 self.callback('end')
933 def __repr__(self):
934 return "{}(strict_parsing={!r}, max_size={!r})".format(
935 self.__class__.__name__,
936 self.strict_parsing, self.max_size
937 )
940class MultipartParser(BaseParser):
941 """This class is a streaming multipart/form-data parser.
943 .. list-table::
944 :widths: 15 10 30
945 :header-rows: 1
947 * - Callback Name
948 - Parameters
949 - Description
950 * - on_part_begin
951 - None
952 - Called when a new part of the multipart message is encountered.
953 * - on_part_data
954 - data, start, end
955 - Called when a portion of a part's data is encountered.
956 * - on_part_end
957 - None
958 - Called when the end of a part is reached.
959 * - on_header_begin
960 - None
961 - Called when we've found a new header in a part of a multipart
962 message
963 * - on_header_field
964 - data, start, end
965 - Called each time an additional portion of a header is read (i.e. the
966 part of the header that is before the colon; the "Foo" in
967 "Foo: Bar").
968 * - on_header_value
969 - data, start, end
970 - Called when we get data for a header.
971 * - on_header_end
972 - None
973 - Called when the current header is finished - i.e. we've reached the
974 newline at the end of the header.
975 * - on_headers_finished
976 - None
977 - Called when all headers are finished, and before the part data
978 starts.
979 * - on_end
980 - None
981 - Called when the parser is finished parsing all data.
984 :param boundary: The multipart boundary. This is required, and must match
985 what is given in the HTTP request - usually in the
986 Content-Type header.
988 :param callbacks: A dictionary of callbacks. See the documentation for
989 :class:`BaseParser`.
991 :param max_size: The maximum size of body to parse. Defaults to infinity -
992 i.e. unbounded.
993 """
995 def __init__(self, boundary, callbacks={}, max_size=float('inf')):
996 # Initialize parser state.
997 super().__init__()
998 self.state = STATE_START
999 self.index = self.flags = 0
1001 self.callbacks = callbacks
1003 if not isinstance(max_size, Number) or max_size < 1:
1004 raise ValueError("max_size must be a positive number, not %r" %
1005 max_size)
1006 self.max_size = max_size
1007 self._current_size = 0
1009 # Setup marks. These are used to track the state of data received.
1010 self.marks = {}
1012 # TODO: Actually use this rather than the dumb version we currently use
1013 # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
1014 # skip = [len(boundary) for x in range(256)]
1015 # for i in range(len(boundary) - 1):
1016 # skip[ord_char(boundary[i])] = len(boundary) - i - 1
1017 #
1018 # # We use a tuple since it's a constant, and marginally faster.
1019 # self.skip = tuple(skip)
1021 # Save our boundary.
1022 if isinstance(boundary, str): # pragma: no cover
1023 boundary = boundary.encode('latin-1')
1024 self.boundary = b'\r\n--' + boundary
1026 # Get a set of characters that belong to our boundary.
1027 self.boundary_chars = frozenset(self.boundary)
1029 # We also create a lookbehind list.
1030 # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
1031 # "--\r\n" at the final boundary, and the length of '\r\n--' and
1032 # '--\r\n' is 8 bytes.
1033 self.lookbehind = [NULL for x in range(len(boundary) + 8)]
1035 def write(self, data):
1036 """Write some data to the parser, which will perform size verification,
1037 and then parse the data into the appropriate location (e.g. header,
1038 data, etc.), and pass this on to the underlying callback. If an error
1039 is encountered, a MultipartParseError will be raised. The "offset"
1040 attribute on the raised exception will be set to the offset of the byte
1041 in the input chunk that caused the error.
1043 :param data: a bytestring
1044 """
1045 # Handle sizing.
1046 data_len = len(data)
1047 if (self._current_size + data_len) > self.max_size:
1048 # We truncate the length of data that we are to process.
1049 new_size = int(self.max_size - self._current_size)
1050 self.logger.warning("Current size is %d (max %d), so truncating "
1051 "data length from %d to %d",
1052 self._current_size, self.max_size, data_len,
1053 new_size)
1054 data_len = new_size
1056 l = 0
1057 try:
1058 l = self._internal_write(data, data_len)
1059 finally:
1060 self._current_size += l
1062 return l
1064 def _internal_write(self, data, length):
1065 # Get values from locals.
1066 boundary = self.boundary
1068 # Get our state, flags and index. These are persisted between calls to
1069 # this function.
1070 state = self.state
1071 index = self.index
1072 flags = self.flags
1074 # Our index defaults to 0.
1075 i = 0
1077 # Set a mark.
1078 def set_mark(name):
1079 self.marks[name] = i
1081 # Remove a mark.
1082 def delete_mark(name, reset=False):
1083 self.marks.pop(name, None)
1085 # Helper function that makes calling a callback with data easier. The
1086 # 'remaining' parameter will callback from the marked value until the
1087 # end of the buffer, and reset the mark, instead of deleting it. This
1088 # is used at the end of the function to call our callbacks with any
1089 # remaining data in this chunk.
1090 def data_callback(name, remaining=False):
1091 marked_index = self.marks.get(name)
1092 if marked_index is None:
1093 return
1095 # If we're getting remaining data, we ignore the current i value
1096 # and just call with the remaining data.
1097 if remaining:
1098 self.callback(name, data, marked_index, length)
1099 self.marks[name] = 0
1101 # Otherwise, we call it from the mark to the current byte we're
1102 # processing.
1103 else:
1104 self.callback(name, data, marked_index, i)
1105 self.marks.pop(name, None)
1107 # For each byte...
1108 while i < length:
1109 c = data[i]
1111 if state == STATE_START:
1112 # Skip leading newlines
1113 if c == CR or c == LF:
1114 i += 1
1115 self.logger.debug("Skipping leading CR/LF at %d", i)
1116 continue
1118 # index is used as in index into our boundary. Set to 0.
1119 index = 0
1121 # Move to the next state, but decrement i so that we re-process
1122 # this character.
1123 state = STATE_START_BOUNDARY
1124 i -= 1
1126 elif state == STATE_START_BOUNDARY:
1127 # Check to ensure that the last 2 characters in our boundary
1128 # are CRLF.
1129 if index == len(boundary) - 2:
1130 if c != CR:
1131 # Error!
1132 msg = "Did not find CR at end of boundary (%d)" % (i,)
1133 self.logger.warning(msg)
1134 e = MultipartParseError(msg)
1135 e.offset = i
1136 raise e
1138 index += 1
1140 elif index == len(boundary) - 2 + 1:
1141 if c != LF:
1142 msg = "Did not find LF at end of boundary (%d)" % (i,)
1143 self.logger.warning(msg)
1144 e = MultipartParseError(msg)
1145 e.offset = i
1146 raise e
1148 # The index is now used for indexing into our boundary.
1149 index = 0
1151 # Callback for the start of a part.
1152 self.callback('part_begin')
1154 # Move to the next character and state.
1155 state = STATE_HEADER_FIELD_START
1157 else:
1158 # Check to ensure our boundary matches
1159 if c != boundary[index + 2]:
1160 msg = "Did not find boundary character %r at index " \
1161 "%d" % (c, index + 2)
1162 self.logger.warning(msg)
1163 e = MultipartParseError(msg)
1164 e.offset = i
1165 raise e
1167 # Increment index into boundary and continue.
1168 index += 1
1170 elif state == STATE_HEADER_FIELD_START:
1171 # Mark the start of a header field here, reset the index, and
1172 # continue parsing our header field.
1173 index = 0
1175 # Set a mark of our header field.
1176 set_mark('header_field')
1178 # Move to parsing header fields.
1179 state = STATE_HEADER_FIELD
1180 i -= 1
1182 elif state == STATE_HEADER_FIELD:
1183 # If we've reached a CR at the beginning of a header, it means
1184 # that we've reached the second of 2 newlines, and so there are
1185 # no more headers to parse.
1186 if c == CR:
1187 delete_mark('header_field')
1188 state = STATE_HEADERS_ALMOST_DONE
1189 i += 1
1190 continue
1192 # Increment our index in the header.
1193 index += 1
1195 # Do nothing if we encounter a hyphen.
1196 if c == HYPHEN:
1197 pass
1199 # If we've reached a colon, we're done with this header.
1200 elif c == COLON:
1201 # A 0-length header is an error.
1202 if index == 1:
1203 msg = "Found 0-length header at %d" % (i,)
1204 self.logger.warning(msg)
1205 e = MultipartParseError(msg)
1206 e.offset = i
1207 raise e
1209 # Call our callback with the header field.
1210 data_callback('header_field')
1212 # Move to parsing the header value.
1213 state = STATE_HEADER_VALUE_START
1215 else:
1216 # Lower-case this character, and ensure that it is in fact
1217 # a valid letter. If not, it's an error.
1218 cl = lower_char(c)
1219 if cl < LOWER_A or cl > LOWER_Z:
1220 msg = "Found non-alphanumeric character %r in " \
1221 "header at %d" % (c, i)
1222 self.logger.warning(msg)
1223 e = MultipartParseError(msg)
1224 e.offset = i
1225 raise e
1227 elif state == STATE_HEADER_VALUE_START:
1228 # Skip leading spaces.
1229 if c == SPACE:
1230 i += 1
1231 continue
1233 # Mark the start of the header value.
1234 set_mark('header_value')
1236 # Move to the header-value state, reprocessing this character.
1237 state = STATE_HEADER_VALUE
1238 i -= 1
1240 elif state == STATE_HEADER_VALUE:
1241 # If we've got a CR, we're nearly done our headers. Otherwise,
1242 # we do nothing and just move past this character.
1243 if c == CR:
1244 data_callback('header_value')
1245 self.callback('header_end')
1246 state = STATE_HEADER_VALUE_ALMOST_DONE
1248 elif state == STATE_HEADER_VALUE_ALMOST_DONE:
1249 # The last character should be a LF. If not, it's an error.
1250 if c != LF:
1251 msg = "Did not find LF character at end of header " \
1252 "(found %r)" % (c,)
1253 self.logger.warning(msg)
1254 e = MultipartParseError(msg)
1255 e.offset = i
1256 raise e
1258 # Move back to the start of another header. Note that if that
1259 # state detects ANOTHER newline, it'll trigger the end of our
1260 # headers.
1261 state = STATE_HEADER_FIELD_START
1263 elif state == STATE_HEADERS_ALMOST_DONE:
1264 # We're almost done our headers. This is reached when we parse
1265 # a CR at the beginning of a header, so our next character
1266 # should be a LF, or it's an error.
1267 if c != LF:
1268 msg = f"Did not find LF at end of headers (found {c!r})"
1269 self.logger.warning(msg)
1270 e = MultipartParseError(msg)
1271 e.offset = i
1272 raise e
1274 self.callback('headers_finished')
1275 state = STATE_PART_DATA_START
1277 elif state == STATE_PART_DATA_START:
1278 # Mark the start of our part data.
1279 set_mark('part_data')
1281 # Start processing part data, including this character.
1282 state = STATE_PART_DATA
1283 i -= 1
1285 elif state == STATE_PART_DATA:
1286 # We're processing our part data right now. During this, we
1287 # need to efficiently search for our boundary, since any data
1288 # on any number of lines can be a part of the current data.
1289 # We use the Boyer-Moore-Horspool algorithm to efficiently
1290 # search through the remainder of the buffer looking for our
1291 # boundary.
1293 # Save the current value of our index. We use this in case we
1294 # find part of a boundary, but it doesn't match fully.
1295 prev_index = index
1297 # Set up variables.
1298 boundary_length = len(boundary)
1299 boundary_end = boundary_length - 1
1300 data_length = length
1301 boundary_chars = self.boundary_chars
1303 # If our index is 0, we're starting a new part, so start our
1304 # search.
1305 if index == 0:
1306 # Search forward until we either hit the end of our buffer,
1307 # or reach a character that's in our boundary.
1308 i += boundary_end
1309 while i < data_length - 1 and data[i] not in boundary_chars:
1310 i += boundary_length
1312 # Reset i back the length of our boundary, which is the
1313 # earliest possible location that could be our match (i.e.
1314 # if we've just broken out of our loop since we saw the
1315 # last character in our boundary)
1316 i -= boundary_end
1317 c = data[i]
1319 # Now, we have a couple of cases here. If our index is before
1320 # the end of the boundary...
1321 if index < boundary_length:
1322 # If the character matches...
1323 if boundary[index] == c:
1324 # If we found a match for our boundary, we send the
1325 # existing data.
1326 if index == 0:
1327 data_callback('part_data')
1329 # The current character matches, so continue!
1330 index += 1
1331 else:
1332 index = 0
1334 # Our index is equal to the length of our boundary!
1335 elif index == boundary_length:
1336 # First we increment it.
1337 index += 1
1339 # Now, if we've reached a newline, we need to set this as
1340 # the potential end of our boundary.
1341 if c == CR:
1342 flags |= FLAG_PART_BOUNDARY
1344 # Otherwise, if this is a hyphen, we might be at the last
1345 # of all boundaries.
1346 elif c == HYPHEN:
1347 flags |= FLAG_LAST_BOUNDARY
1349 # Otherwise, we reset our index, since this isn't either a
1350 # newline or a hyphen.
1351 else:
1352 index = 0
1354 # Our index is right after the part boundary, which should be
1355 # a LF.
1356 elif index == boundary_length + 1:
1357 # If we're at a part boundary (i.e. we've seen a CR
1358 # character already)...
1359 if flags & FLAG_PART_BOUNDARY:
1360 # We need a LF character next.
1361 if c == LF:
1362 # Unset the part boundary flag.
1363 flags &= (~FLAG_PART_BOUNDARY)
1365 # Callback indicating that we've reached the end of
1366 # a part, and are starting a new one.
1367 self.callback('part_end')
1368 self.callback('part_begin')
1370 # Move to parsing new headers.
1371 index = 0
1372 state = STATE_HEADER_FIELD_START
1373 i += 1
1374 continue
1376 # We didn't find an LF character, so no match. Reset
1377 # our index and clear our flag.
1378 index = 0
1379 flags &= (~FLAG_PART_BOUNDARY)
1381 # Otherwise, if we're at the last boundary (i.e. we've
1382 # seen a hyphen already)...
1383 elif flags & FLAG_LAST_BOUNDARY:
1384 # We need a second hyphen here.
1385 if c == HYPHEN:
1386 # Callback to end the current part, and then the
1387 # message.
1388 self.callback('part_end')
1389 self.callback('end')
1390 state = STATE_END
1391 else:
1392 # No match, so reset index.
1393 index = 0
1395 # If we have an index, we need to keep this byte for later, in
1396 # case we can't match the full boundary.
1397 if index > 0:
1398 self.lookbehind[index - 1] = c
1400 # Otherwise, our index is 0. If the previous index is not, it
1401 # means we reset something, and we need to take the data we
1402 # thought was part of our boundary and send it along as actual
1403 # data.
1404 elif prev_index > 0:
1405 # Callback to write the saved data.
1406 lb_data = join_bytes(self.lookbehind)
1407 self.callback('part_data', lb_data, 0, prev_index)
1409 # Overwrite our previous index.
1410 prev_index = 0
1412 # Re-set our mark for part data.
1413 set_mark('part_data')
1415 # Re-consider the current character, since this could be
1416 # the start of the boundary itself.
1417 i -= 1
1419 elif state == STATE_END:
1420 # Do nothing and just consume a byte in the end state.
1421 if c not in (CR, LF):
1422 self.logger.warning("Consuming a byte '0x%x' in the end state", c)
1424 else: # pragma: no cover (error case)
1425 # We got into a strange state somehow! Just stop processing.
1426 msg = "Reached an unknown state %d at %d" % (state, i)
1427 self.logger.warning(msg)
1428 e = MultipartParseError(msg)
1429 e.offset = i
1430 raise e
1432 # Move to the next byte.
1433 i += 1
1435 # We call our callbacks with any remaining data. Note that we pass
1436 # the 'remaining' flag, which sets the mark back to 0 instead of
1437 # deleting it, if it's found. This is because, if the mark is found
1438 # at this point, we assume that there's data for one of these things
1439 # that has been parsed, but not yet emitted. And, as such, it implies
1440 # that we haven't yet reached the end of this 'thing'. So, by setting
1441 # the mark to 0, we cause any data callbacks that take place in future
1442 # calls to this function to start from the beginning of that buffer.
1443 data_callback('header_field', True)
1444 data_callback('header_value', True)
1445 data_callback('part_data', True)
1447 # Save values to locals.
1448 self.state = state
1449 self.index = index
1450 self.flags = flags
1452 # Return our data length to indicate no errors, and that we processed
1453 # all of it.
1454 return length
1456 def finalize(self):
1457 """Finalize this parser, which signals to that we are finished parsing.
1459 Note: It does not currently, but in the future, it will verify that we
1460 are in the final state of the parser (i.e. the end of the multipart
1461 message is well-formed), and, if not, throw an error.
1462 """
1463 # TODO: verify that we're in the state STATE_END, otherwise throw an
1464 # error or otherwise state that we're not finished parsing.
1465 pass
1467 def __repr__(self):
1468 return f"{self.__class__.__name__}(boundary={self.boundary!r})"
1471class FormParser:
1472 """This class is the all-in-one form parser. Given all the information
1473 necessary to parse a form, it will instantiate the correct parser, create
1474 the proper :class:`Field` and :class:`File` classes to store the data that
1475 is parsed, and call the two given callbacks with each field and file as
1476 they become available.
1478 :param content_type: The Content-Type of the incoming request. This is
1479 used to select the appropriate parser.
1481 :param on_field: The callback to call when a field has been parsed and is
1482 ready for usage. See above for parameters.
1484 :param on_file: The callback to call when a file has been parsed and is
1485 ready for usage. See above for parameters.
1487 :param on_end: An optional callback to call when all fields and files in a
1488 request has been parsed. Can be None.
1490 :param boundary: If the request is a multipart/form-data request, this
1491 should be the boundary of the request, as given in the
1492 Content-Type header, as a bytestring.
1494 :param file_name: If the request is of type application/octet-stream, then
1495 the body of the request will not contain any information
1496 about the uploaded file. In such cases, you can provide
1497 the file name of the uploaded file manually.
1499 :param FileClass: The class to use for uploaded files. Defaults to
1500 :class:`File`, but you can provide your own class if you
1501 wish to customize behaviour. The class will be
1502 instantiated as FileClass(file_name, field_name), and it
1503 must provide the following functions::
1504 file_instance.write(data)
1505 file_instance.finalize()
1506 file_instance.close()
1508 :param FieldClass: The class to use for uploaded fields. Defaults to
1509 :class:`Field`, but you can provide your own class if
1510 you wish to customize behaviour. The class will be
1511 instantiated as FieldClass(field_name), and it must
1512 provide the following functions::
1513 field_instance.write(data)
1514 field_instance.finalize()
1515 field_instance.close()
1517 :param config: Configuration to use for this FormParser. The default
1518 values are taken from the DEFAULT_CONFIG value, and then
1519 any keys present in this dictionary will overwrite the
1520 default values.
1522 """
1523 #: This is the default configuration for our form parser.
1524 #: Note: all file sizes should be in bytes.
1525 DEFAULT_CONFIG = {
1526 'MAX_BODY_SIZE': float('inf'),
1527 'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024,
1528 'UPLOAD_DIR': None,
1529 'UPLOAD_KEEP_FILENAME': False,
1530 'UPLOAD_KEEP_EXTENSIONS': False,
1532 # Error on invalid Content-Transfer-Encoding?
1533 'UPLOAD_ERROR_ON_BAD_CTE': False,
1534 }
1536 def __init__(self, content_type, on_field, on_file, on_end=None,
1537 boundary=None, file_name=None, FileClass=File,
1538 FieldClass=Field, config={}):
1540 self.logger = logging.getLogger(__name__)
1542 # Save variables.
1543 self.content_type = content_type
1544 self.boundary = boundary
1545 self.bytes_received = 0
1546 self.parser = None
1548 # Save callbacks.
1549 self.on_field = on_field
1550 self.on_file = on_file
1551 self.on_end = on_end
1553 # Save classes.
1554 self.FileClass = File
1555 self.FieldClass = Field
1557 # Set configuration options.
1558 self.config = self.DEFAULT_CONFIG.copy()
1559 self.config.update(config)
1561 # Depending on the Content-Type, we instantiate the correct parser.
1562 if content_type == 'application/octet-stream':
1563 # Work around the lack of 'nonlocal' in Py2
1564 class vars:
1565 f = None
1567 def on_start():
1568 vars.f = FileClass(file_name, None, config=self.config)
1570 def on_data(data, start, end):
1571 vars.f.write(data[start:end])
1573 def on_end():
1574 # Finalize the file itself.
1575 vars.f.finalize()
1577 # Call our callback.
1578 on_file(vars.f)
1580 # Call the on-end callback.
1581 if self.on_end is not None:
1582 self.on_end()
1584 callbacks = {
1585 'on_start': on_start,
1586 'on_data': on_data,
1587 'on_end': on_end,
1588 }
1590 # Instantiate an octet-stream parser
1591 parser = OctetStreamParser(callbacks,
1592 max_size=self.config['MAX_BODY_SIZE'])
1594 elif (content_type == 'application/x-www-form-urlencoded' or
1595 content_type == 'application/x-url-encoded'):
1597 name_buffer = []
1599 class vars:
1600 f = None
1602 def on_field_start():
1603 pass
1605 def on_field_name(data, start, end):
1606 name_buffer.append(data[start:end])
1608 def on_field_data(data, start, end):
1609 if vars.f is None:
1610 vars.f = FieldClass(b''.join(name_buffer))
1611 del name_buffer[:]
1612 vars.f.write(data[start:end])
1614 def on_field_end():
1615 # Finalize and call callback.
1616 if vars.f is None:
1617 # If we get here, it's because there was no field data.
1618 # We create a field, set it to None, and then continue.
1619 vars.f = FieldClass(b''.join(name_buffer))
1620 del name_buffer[:]
1621 vars.f.set_none()
1623 vars.f.finalize()
1624 on_field(vars.f)
1625 vars.f = None
1627 def on_end():
1628 if self.on_end is not None:
1629 self.on_end()
1631 # Setup callbacks.
1632 callbacks = {
1633 'on_field_start': on_field_start,
1634 'on_field_name': on_field_name,
1635 'on_field_data': on_field_data,
1636 'on_field_end': on_field_end,
1637 'on_end': on_end,
1638 }
1640 # Instantiate parser.
1641 parser = QuerystringParser(
1642 callbacks=callbacks,
1643 max_size=self.config['MAX_BODY_SIZE']
1644 )
1646 elif content_type == 'multipart/form-data':
1647 if boundary is None:
1648 self.logger.error("No boundary given")
1649 raise FormParserError("No boundary given")
1651 header_name = []
1652 header_value = []
1653 headers = {}
1655 # No 'nonlocal' on Python 2 :-(
1656 class vars:
1657 f = None
1658 writer = None
1659 is_file = False
1661 def on_part_begin():
1662 pass
1664 def on_part_data(data, start, end):
1665 bytes_processed = vars.writer.write(data[start:end])
1666 # TODO: check for error here.
1667 return bytes_processed
1669 def on_part_end():
1670 vars.f.finalize()
1671 if vars.is_file:
1672 on_file(vars.f)
1673 else:
1674 on_field(vars.f)
1676 def on_header_field(data, start, end):
1677 header_name.append(data[start:end])
1679 def on_header_value(data, start, end):
1680 header_value.append(data[start:end])
1682 def on_header_end():
1683 headers[b''.join(header_name)] = b''.join(header_value)
1684 del header_name[:]
1685 del header_value[:]
1687 def on_headers_finished():
1688 # Reset the 'is file' flag.
1689 vars.is_file = False
1691 # Parse the content-disposition header.
1692 # TODO: handle mixed case
1693 content_disp = headers.get(b'Content-Disposition')
1694 disp, options = parse_options_header(content_disp)
1696 # Get the field and filename.
1697 field_name = options.get(b'name')
1698 file_name = options.get(b'filename')
1699 # TODO: check for errors
1701 # Create the proper class.
1702 if file_name is None:
1703 vars.f = FieldClass(field_name)
1704 else:
1705 vars.f = FileClass(file_name, field_name, config=self.config)
1706 vars.is_file = True
1708 # Parse the given Content-Transfer-Encoding to determine what
1709 # we need to do with the incoming data.
1710 # TODO: check that we properly handle 8bit / 7bit encoding.
1711 transfer_encoding = headers.get(b'Content-Transfer-Encoding',
1712 b'7bit')
1714 if (transfer_encoding == b'binary' or
1715 transfer_encoding == b'8bit' or
1716 transfer_encoding == b'7bit'):
1717 vars.writer = vars.f
1719 elif transfer_encoding == b'base64':
1720 vars.writer = Base64Decoder(vars.f)
1722 elif transfer_encoding == b'quoted-printable':
1723 vars.writer = QuotedPrintableDecoder(vars.f)
1725 else:
1726 self.logger.warning("Unknown Content-Transfer-Encoding: "
1727 "%r", transfer_encoding)
1728 if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
1729 raise FormParserError(
1730 'Unknown Content-Transfer-Encoding "{}"'.format(
1731 transfer_encoding
1732 )
1733 )
1734 else:
1735 # If we aren't erroring, then we just treat this as an
1736 # unencoded Content-Transfer-Encoding.
1737 vars.writer = vars.f
1739 def on_end():
1740 vars.writer.finalize()
1741 if self.on_end is not None:
1742 self.on_end()
1744 # These are our callbacks for the parser.
1745 callbacks = {
1746 'on_part_begin': on_part_begin,
1747 'on_part_data': on_part_data,
1748 'on_part_end': on_part_end,
1749 'on_header_field': on_header_field,
1750 'on_header_value': on_header_value,
1751 'on_header_end': on_header_end,
1752 'on_headers_finished': on_headers_finished,
1753 'on_end': on_end,
1754 }
1756 # Instantiate a multipart parser.
1757 parser = MultipartParser(boundary, callbacks,
1758 max_size=self.config['MAX_BODY_SIZE'])
1760 else:
1761 self.logger.warning("Unknown Content-Type: %r", content_type)
1762 raise FormParserError("Unknown Content-Type: {}".format(
1763 content_type
1764 ))
1766 self.parser = parser
1768 def write(self, data):
1769 """Write some data. The parser will forward this to the appropriate
1770 underlying parser.
1772 :param data: a bytestring
1773 """
1774 self.bytes_received += len(data)
1775 # TODO: check the parser's return value for errors?
1776 return self.parser.write(data)
1778 def finalize(self):
1779 """Finalize the parser."""
1780 if self.parser is not None and hasattr(self.parser, 'finalize'):
1781 self.parser.finalize()
1783 def close(self):
1784 """Close the parser."""
1785 if self.parser is not None and hasattr(self.parser, 'close'):
1786 self.parser.close()
1788 def __repr__(self):
1789 return "{}(content_type={!r}, parser={!r})".format(
1790 self.__class__.__name__,
1791 self.content_type,
1792 self.parser,
1793 )
1796def create_form_parser(headers, on_field, on_file, trust_x_headers=False,
1797 config={}):
1798 """This function is a helper function to aid in creating a FormParser
1799 instances. Given a dictionary-like headers object, it will determine
1800 the correct information needed, instantiate a FormParser with the
1801 appropriate values and given callbacks, and then return the corresponding
1802 parser.
1804 :param headers: A dictionary-like object of HTTP headers. The only
1805 required header is Content-Type.
1807 :param on_field: Callback to call with each parsed field.
1809 :param on_file: Callback to call with each parsed file.
1811 :param trust_x_headers: Whether or not to trust information received from
1812 certain X-Headers - for example, the file name from
1813 X-File-Name.
1815 :param config: Configuration variables to pass to the FormParser.
1816 """
1817 content_type = headers.get('Content-Type')
1818 if content_type is None:
1819 logging.getLogger(__name__).warning("No Content-Type header given")
1820 raise ValueError("No Content-Type header given!")
1822 # Boundaries are optional (the FormParser will raise if one is needed
1823 # but not given).
1824 content_type, params = parse_options_header(content_type)
1825 boundary = params.get(b'boundary')
1827 # We need content_type to be a string, not a bytes object.
1828 content_type = content_type.decode('latin-1')
1830 # File names are optional.
1831 file_name = headers.get('X-File-Name')
1833 # Instantiate a form parser.
1834 form_parser = FormParser(content_type,
1835 on_field,
1836 on_file,
1837 boundary=boundary,
1838 file_name=file_name,
1839 config=config)
1841 # Return our parser.
1842 return form_parser
1845def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576,
1846 **kwargs):
1847 """This function is useful if you just want to parse a request body,
1848 without too much work. Pass it a dictionary-like object of the request's
1849 headers, and a file-like object for the input stream, along with two
1850 callbacks that will get called whenever a field or file is parsed.
1852 :param headers: A dictionary-like object of HTTP headers. The only
1853 required header is Content-Type.
1855 :param input_stream: A file-like object that represents the request body.
1856 The read() method must return bytestrings.
1858 :param on_field: Callback to call with each parsed field.
1860 :param on_file: Callback to call with each parsed file.
1862 :param chunk_size: The maximum size to read from the input stream and write
1863 to the parser at one time. Defaults to 1 MiB.
1864 """
1866 # Create our form parser.
1867 parser = create_form_parser(headers, on_field, on_file)
1869 # Read chunks of 100KiB and write to the parser, but never read more than
1870 # the given Content-Length, if any.
1871 content_length = headers.get('Content-Length')
1872 if content_length is not None:
1873 content_length = int(content_length)
1874 else:
1875 content_length = float('inf')
1876 bytes_read = 0
1878 while True:
1879 # Read only up to the Content-Length given.
1880 max_readable = min(content_length - bytes_read, 1048576)
1881 buff = input_stream.read(max_readable)
1883 # Write to the parser and update our length.
1884 parser.write(buff)
1885 bytes_read += len(buff)
1887 # If we get a buffer that's smaller than the size requested, or if we
1888 # have read up to our content length, we're done.
1889 if len(buff) != max_readable or bytes_read == content_length:
1890 break
1892 # Tell our parser that we're done writing data.
1893 parser.finalize()