1from __future__ import annotations
2
3import logging
4import os
5import shutil
6import sys
7import tempfile
8from email.message import Message
9from enum import IntEnum
10from io import BytesIO
11from numbers import Number
12from typing import TYPE_CHECKING
13
14from .decoders import Base64Decoder, QuotedPrintableDecoder
15from .exceptions import FileError, FormParserError, MultipartParseError, QuerystringParseError
16
if TYPE_CHECKING:  # pragma: no cover
    from typing import Callable, TypedDict

    # Callback signatures accepted by QuerystringParser. Notification
    # callbacks take no arguments; data callbacks take (data, start, end).
    class QuerystringCallbacks(TypedDict, total=False):
        on_field_start: Callable[[], None]
        on_field_name: Callable[[bytes, int, int], None]
        on_field_data: Callable[[bytes, int, int], None]
        on_field_end: Callable[[], None]
        on_end: Callable[[], None]

    # Callback signatures accepted by OctetStreamParser.
    class OctetStreamCallbacks(TypedDict, total=False):
        on_start: Callable[[], None]
        on_data: Callable[[bytes, int, int], None]
        on_end: Callable[[], None]

    # Callback signatures accepted by MultipartParser.
    class MultipartCallbacks(TypedDict, total=False):
        on_part_begin: Callable[[], None]
        on_part_data: Callable[[bytes, int, int], None]
        on_part_end: Callable[[], None]
        on_headers_begin: Callable[[], None]
        on_header_field: Callable[[bytes, int, int], None]
        on_header_value: Callable[[bytes, int, int], None]
        on_header_end: Callable[[], None]
        on_headers_finished: Callable[[], None]
        on_end: Callable[[], None]

    # Configuration keys recognized by the form parser.
    class FormParserConfig(TypedDict, total=False):
        UPLOAD_DIR: str | None
        UPLOAD_KEEP_FILENAME: bool
        UPLOAD_KEEP_EXTENSIONS: bool
        UPLOAD_ERROR_ON_BAD_CTE: bool
        MAX_MEMORY_FILE_SIZE: int
        MAX_BODY_SIZE: float

    # Configuration keys recognized by :class:`File`; see the File class
    # docstring for the meaning and default of each key.
    class FileConfig(TypedDict, total=False):
        UPLOAD_DIR: str | None
        UPLOAD_DELETE_TMP: bool
        UPLOAD_KEEP_FILENAME: bool
        UPLOAD_KEEP_EXTENSIONS: bool
        MAX_MEMORY_FILE_SIZE: int
58
# Unique sentinel marking "no cached value yet" on Field instances. A plain
# `None` would not work, because `None` is itself a legitimate cached value
# (see Field.set_none()).
_missing = object()
61
62
class QuerystringState(IntEnum):
    """Querystring parser states.

    These are used to keep track of the state of the parser, and are used to determine
    what to do when new data is encountered.
    """

    BEFORE_FIELD = 0  # Between fields: skipping "&" / ";" separators.
    FIELD_NAME = 1  # Consuming a field name (up to the "=" or a separator).
    FIELD_DATA = 2  # Consuming a field value (after the "=").
73
74
class MultipartState(IntEnum):
    """Multipart parser states.

    These are used to keep track of the state of the parser, and are used to determine
    what to do when new data is encountered.
    """

    START = 0  # Initial state, before any input is consumed.
    START_BOUNDARY = 1  # Consuming the initial boundary line.
    HEADER_FIELD_START = 2  # At the start of a header field name.
    HEADER_FIELD = 3  # Consuming a header field name.
    HEADER_VALUE_START = 4  # At the start of a header value.
    HEADER_VALUE = 5  # Consuming a header value.
    HEADER_VALUE_ALMOST_DONE = 6  # Header value nearly complete (line ending being consumed).
    HEADERS_ALMOST_DONE = 7  # All headers nearly complete (blank line being consumed).
    PART_DATA_START = 8  # At the start of a part's payload.
    PART_DATA = 9  # Consuming a part's payload.
    PART_DATA_END = 10  # A part's payload is complete.
    END = 11  # Parsing finished.
94
95
# Flags for the multipart parser.
FLAG_PART_BOUNDARY = 1
FLAG_LAST_BOUNDARY = 2

# Byte-ordinal constants. Indexing a bytes object yields an int, so these are
# the integer values the parsers compare individual bytes against (avoids
# re-indexing one-byte literals in hot loops).
CR = b"\r"[0]
LF = b"\n"[0]
COLON = b":"[0]
SPACE = b" "[0]
HYPHEN = b"-"[0]
AMPERSAND = b"&"[0]
SEMICOLON = b";"[0]
LOWER_A = b"a"[0]
LOWER_Z = b"z"[0]
NULL = b"\x00"[0]
113
114
# Small helpers for working with individual byte ordinals and sequences of
# them (lower-casing a byte, getting its ordinal value, and joining a
# sequence of ordinals back into bytes). Kept as functions so all call sites
# share a single, well-defined implementation.
def lower_char(c):
    """Return *c* (a byte ordinal) with the ASCII case bit set.

    For ordinals of 'A'-'Z' this yields the corresponding 'a'-'z' ordinal;
    for any other value it simply sets bit 0x20.
    """
    return 0x20 | c
121
122
def ord_char(c):
    # Identity function: iterating/indexing a bytes object already yields
    # integer ordinals on Python 3, so no conversion is needed.
    return c
125
126
def join_bytes(b):
    """Collapse an iterable of byte ordinals (ints in 0..255) into bytes."""
    return bytes(iter(b))
129
130
def parse_options_header(value: str | bytes) -> tuple[bytes, dict[bytes, bytes]]:
    """Parse a Content-Type style header into ``(content_type, {parameters})``.

    Both the content type and every parameter key/value are returned as
    latin-1 encoded bytes.
    """
    # Uses email.message.Message to parse the header as described in PEP 594.
    # Ref: https://peps.python.org/pep-0594/#cgi
    if not value:
        return (b"", {})

    # If we are passed bytes, we assume that it conforms to WSGI, encoding in latin-1.
    if isinstance(value, bytes):  # pragma: no cover
        value = value.decode("latin-1")

    # For types
    assert isinstance(value, str), "Value should be a string by now"

    # Fast path: no ";" means no parameters at all.
    if ";" not in value:
        return (value.lower().strip().encode("latin-1"), {})

    # Let the email machinery split the value into the content type followed
    # by its parameters.
    message = Message()
    message["content-type"] = value
    params = message.get_params()
    # If there were no parameters, this would have already returned above
    assert params, "At least the content type value should be present"

    # The first entry is the content type itself; the rest are parameters.
    ctype = params.pop(0)[0].encode("latin-1")
    options: dict[bytes, bytes] = {}
    for key, param_value in params:
        # get_params() may return a 3-tuple (charset, language, value) for
        # RFC 2231 encoded parameters; the last element is the value.
        # See: https://docs.python.org/3/library/email.compat32-message.html
        if isinstance(param_value, tuple):
            param_value = param_value[-1]
        # Work around the IE6 bug of sending the full client-side file path
        # ("C:\..." or a UNC "\\..." path) instead of just the filename.
        if key == "filename" and (param_value[1:3] == ":\\" or param_value[:2] == "\\\\"):
            param_value = param_value.split("\\")[-1]
        options[key.encode("latin-1")] = param_value.encode("latin-1")
    return ctype, options
175
176
class Field:
    """A Field object represents a (parsed) form field. It represents a single
    field with a corresponding name and value.

    The name that a :class:`Field` will be instantiated with is the same name
    that would be found in the following HTML::

        <input name="name_goes_here" type="text"/>

    This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
    will be called when data is written to the Field, and when the Field is
    finalized, respectively.

    :param name: the name of the form field
    """

    def __init__(self, name: str):
        self._name = name
        self._value: list[bytes] = []

        # Cached b"".join(self._value). `_missing` means "not computed yet";
        # set_none() may also store None here for valueless fields.
        self._cache = _missing

    @classmethod
    def from_value(cls, name: str, value: bytes | None) -> Field:
        """Create an instance of a :class:`Field`, and set the corresponding
        value - either None or an actual value. This method will also
        finalize the Field itself.

        :param name: the name of the form field
        :param value: the value of the form field - either a bytestring or
                      None
        """

        f = cls(name)
        if value is None:
            f.set_none()
        else:
            f.write(value)
        f.finalize()
        return f

    def write(self, data: bytes) -> int:
        """Write some data into the form field.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data: bytes) -> int:
        """This method is a callback that will be called whenever data is
        written to the Field. Returns the number of bytes accepted.

        :param data: a bytestring
        """
        self._value.append(data)
        # New data invalidates any previously-joined cache.
        self._cache = _missing
        return len(data)

    def on_end(self) -> None:
        """This method is called whenever the Field is finalized."""
        if self._cache is _missing:
            self._cache = b"".join(self._value)

    def finalize(self) -> None:
        """Finalize the form field."""
        self.on_end()

    def close(self) -> None:
        """Close the Field object. This will free any underlying cache."""
        # Make sure the joined value survives before we drop the chunk list.
        if self._cache is _missing:
            self._cache = b"".join(self._value)

        del self._value

    def set_none(self) -> None:
        """Some fields in a querystring can possibly have a value of None - for
        example, the string "foo&bar=&baz=asdf" will have a field with the
        name "foo" and value None, one with name "bar" and value "", and one
        with name "baz" and value "asdf". Since the write() interface doesn't
        support writing None, this function will set the field value to None.
        """
        self._cache = None

    @property
    def field_name(self) -> str:
        """This property returns the name of the field."""
        return self._name

    @property
    def value(self) -> bytes | None:
        """This property returns the value of the form field (None for
        valueless fields created via :meth:`set_none`).
        """
        if self._cache is _missing:
            self._cache = b"".join(self._value)

        return self._cache

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Field):
            return self.field_name == other.field_name and self.value == other.value
        else:
            return NotImplemented

    def __repr__(self) -> str:
        if self.value is None:
            # Valueless fields (set_none) have no length; len(None) would
            # raise a TypeError here.
            v = "None"
        elif len(self.value) > 97:
            # We get the repr, and then insert three dots before the final
            # quote.
            v = repr(self.value[:97])[:-1] + "...'"
        else:
            v = repr(self.value)

        return "{}(field_name={!r}, value={})".format(self.__class__.__name__, self.field_name, v)
290
291
class File:
    """This class represents an uploaded file. It handles writing file data to
    either an in-memory file or a temporary file on-disk, if the optional
    threshold is passed.

    There are some options that can be passed to the File to change behavior
    of the class. Valid options are as follows:

    .. list-table::
       :widths: 15 5 5 30
       :header-rows: 1

       * - Name
         - Type
         - Default
         - Description
       * - UPLOAD_DIR
         - `str`
         - None
         - The directory to store uploaded files in. If this is None, a
           temporary file will be created in the system's standard location.
       * - UPLOAD_DELETE_TMP
         - `bool`
         - True
         - Delete automatically created TMP file
       * - UPLOAD_KEEP_FILENAME
         - `bool`
         - False
         - Whether or not to keep the filename of the uploaded file. If True,
           then the filename will be converted to a safe representation (e.g.
           by removing any invalid path segments), and then saved with the
           same name). Otherwise, a temporary name will be used.
       * - UPLOAD_KEEP_EXTENSIONS
         - `bool`
         - False
         - Whether or not to keep the uploaded file's extension. If False, the
           file will be saved with the default temporary extension (usually
           ".tmp"). Otherwise, the file's extension will be maintained. Note
           that this will properly combine with the UPLOAD_KEEP_FILENAME
           setting.
       * - MAX_MEMORY_FILE_SIZE
         - `int`
         - 1 MiB
         - The maximum number of bytes of a File to keep in memory. By
           default, the contents of a File are kept into memory until a certain
           limit is reached, after which the contents of the File are written
           to a temporary file. This behavior can be disabled by setting this
           value to an appropriately large value (or, for example, infinity,
           such as `float('inf')`.

    :param file_name: The name of the file that this :class:`File` represents

    :param field_name: The field name that uploaded this file. Note that this
                       can be None, if, for example, the file was uploaded
                       with Content-Type application/octet-stream

    :param config: The configuration for this File. See above for valid
                   configuration keys and their corresponding values.
    """

    def __init__(self, file_name: bytes | None, field_name: bytes | None = None, config: FileConfig = {}):
        # Save configuration, set other variables default. Note: `config` is
        # only ever read (via .get), never mutated, so the shared mutable
        # default dict is safe here.
        self.logger = logging.getLogger(__name__)
        self._config = config
        self._in_memory = True
        self._bytes_written = 0
        self._fileobj = BytesIO()

        # Save the provided field/file name.
        self._field_name = field_name
        self._file_name = file_name

        # Our actual file name is None by default, since, depending on our
        # config, we may not actually use the provided name.
        self._actual_file_name: bytes | None = None

        # Split the extension from the filename.
        if file_name is not None:
            base, ext = os.path.splitext(file_name)
            self._file_base = base
            self._ext = ext

    @property
    def field_name(self) -> bytes | None:
        """The form field associated with this file. May be None if there isn't
        one, for example when we have an application/octet-stream upload.
        """
        return self._field_name

    @property
    def file_name(self) -> bytes | None:
        """The file name given in the upload request."""
        return self._file_name

    @property
    def actual_file_name(self) -> bytes | None:
        """The file name that this file is saved as. Will be None if it's not
        currently saved on disk.
        """
        return self._actual_file_name

    @property
    def file_object(self):
        """The file object that we're currently writing to. Note that this
        will either be an instance of a :class:`io.BytesIO`, or a regular file
        object.
        """
        return self._fileobj

    @property
    def size(self) -> int:
        """The total size of this file, counted as the number of bytes that
        currently have been written to the file.
        """
        return self._bytes_written

    @property
    def in_memory(self) -> bool:
        """A boolean representing whether or not this file object is currently
        stored in-memory or on-disk.
        """
        return self._in_memory

    def flush_to_disk(self) -> None:
        """If the file is already on-disk, do nothing. Otherwise, copy from
        the in-memory buffer to a disk file, and then reassign our internal
        file object to this new disk file.

        Note that if you attempt to flush a file that is already on-disk, a
        warning will be logged to this module's logger.
        """
        if not self._in_memory:
            self.logger.warning("Trying to flush to disk when we're not in memory")
            return

        # Go back to the start of our file.
        self._fileobj.seek(0)

        # Open a new file.
        new_file = self._get_disk_file()

        # Copy the file objects.
        shutil.copyfileobj(self._fileobj, new_file)

        # Seek to the new position in our new file.
        new_file.seek(self._bytes_written)

        # Reassign the fileobject.
        old_fileobj = self._fileobj
        self._fileobj = new_file

        # We're no longer in memory.
        self._in_memory = False

        # Close the old file object.
        old_fileobj.close()

    def _get_disk_file(self):
        """Open and return the on-disk file object for this File, honoring the
        UPLOAD_* configuration options, and record the resulting path in
        ``self._actual_file_name`` (as bytes).

        :raises FileError: if the file cannot be opened/created on disk.
        """
        self.logger.info("Opening a file on disk")

        file_dir = self._config.get("UPLOAD_DIR")
        keep_filename = self._config.get("UPLOAD_KEEP_FILENAME", False)
        keep_extensions = self._config.get("UPLOAD_KEEP_EXTENSIONS", False)
        delete_tmp = self._config.get("UPLOAD_DELETE_TMP", True)

        # If we have a directory and are to keep the filename...
        if file_dir is not None and keep_filename:
            self.logger.info("Saving with filename in: %r", file_dir)

            # Build our filename.
            # TODO: what happens if we don't have a filename?
            fname = self._file_base
            if keep_extensions:
                fname = fname + self._ext

            # The parsed filename is usually bytes, while UPLOAD_DIR is
            # normally a str; decode so os.path.join doesn't mix types.
            if isinstance(fname, bytes):
                fname = fname.decode(sys.getfilesystemencoding())

            path = os.path.join(file_dir, fname)
            try:
                self.logger.info("Opening file: %r", path)
                tmp_file = open(path, "w+b")
            except OSError as exc:
                self.logger.exception("Error opening temporary file")
                raise FileError("Error opening temporary file: %r" % path) from exc
        else:
            # Build options array.
            # Note that on Python 3, tempfile doesn't support byte names. We
            # encode our paths using the default filesystem encoding.
            options = {}
            if keep_extensions:
                ext = self._ext
                if isinstance(ext, bytes):
                    ext = ext.decode(sys.getfilesystemencoding())

                options["suffix"] = ext
            if file_dir is not None:
                d = file_dir
                if isinstance(d, bytes):
                    d = d.decode(sys.getfilesystemencoding())

                options["dir"] = d
            options["delete"] = delete_tmp

            # Create a temporary (named) file with the appropriate settings.
            self.logger.info("Creating a temporary file with options: %r", options)
            try:
                tmp_file = tempfile.NamedTemporaryFile(**options)
            except OSError as exc:
                self.logger.exception("Error creating named temporary file")
                raise FileError("Error creating named temporary file") from exc

            fname = tmp_file.name

        # Encode filename as bytes.
        if isinstance(fname, str):
            fname = fname.encode(sys.getfilesystemencoding())

        self._actual_file_name = fname
        return tmp_file

    def write(self, data: bytes) -> int:
        """Write some data to the File.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data: bytes) -> int:
        """This method is a callback that will be called whenever data is
        written to the File. Returns the number of bytes actually written.

        :param data: a bytestring
        """
        pos = self._fileobj.tell()
        bwritten = self._fileobj.write(data)
        # true file objects write returns None
        if bwritten is None:
            bwritten = self._fileobj.tell() - pos

        # If the bytes written isn't the same as the length, just return.
        if bwritten != len(data):
            self.logger.warning("bwritten != len(data) (%d != %d)", bwritten, len(data))
            return bwritten

        # Keep track of how many bytes we've written.
        self._bytes_written += bwritten

        # If we're in-memory and are over our limit, we create a file.
        max_memory_file_size = self._config.get("MAX_MEMORY_FILE_SIZE")
        if self._in_memory and max_memory_file_size is not None and self._bytes_written > max_memory_file_size:
            self.logger.info("Flushing to disk")
            self.flush_to_disk()

        # Return the number of bytes written.
        return bwritten

    def on_end(self) -> None:
        """This method is called whenever the Field is finalized."""
        # Flush the underlying file object
        self._fileobj.flush()

    def finalize(self) -> None:
        """Finalize the form file. This will not close the underlying file,
        but simply signal that we are finished writing to the File.
        """
        self.on_end()

    def close(self) -> None:
        """Close the File object. This will actually close the underlying
        file object (whether it's a :class:`io.BytesIO` or an actual file
        object).
        """
        self._fileobj.close()

    def __repr__(self) -> str:
        return "{}(file_name={!r}, field_name={!r})".format(self.__class__.__name__, self.file_name, self.field_name)
572
573
class BaseParser:
    """This class is the base class for all parsers. It contains the logic for
    calling and adding callbacks.

    A callback can be one of two different forms. "Notification callbacks" are
    callbacks that are called when something happens - for example, when a new
    part of a multipart message is encountered by the parser. "Data callbacks"
    are called when we get some sort of data - for example, part of the body of
    a multipart chunk. Notification callbacks are called with no parameters,
    whereas data callbacks are called with three, as follows::

        data_callback(data, start, end)

    The "data" parameter is a bytestring. "start" and "end" are integer
    indexes into the "data" string that represent the data of interest. Thus,
    in a data callback, the slice `data[start:end]` represents the data that
    the callback is "interested in". The callback is not passed a copy of the
    data, since copying severely hurts performance.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Subclasses replace this with their own callbacks dict; initializing
        # it here keeps callback()/set_callback() usable on a bare BaseParser
        # instead of raising AttributeError.
        self.callbacks: dict = {}

    def callback(self, name: str, data=None, start=None, end=None):
        """This function calls a provided callback with some data. If the
        callback is not set, will do nothing.

        :param name: The name of the callback to call (as a string).

        :param data: Data to pass to the callback. If None, then it is
                     assumed that the callback is a notification callback,
                     and no parameters are given.

        :param end: An integer that is passed to the data callback.

        :param start: An integer that is passed to the data callback.
        """
        name = "on_" + name
        func = self.callbacks.get(name)
        if func is None:
            return

        # Depending on whether we're given a buffer...
        if data is not None:
            # Don't do anything if we have start == end.
            if start is not None and start == end:
                return

            self.logger.debug("Calling %s with data[%d:%d]", name, start, end)
            func(data, start, end)
        else:
            self.logger.debug("Calling %s with no data", name)
            func()

    def set_callback(self, name: str, new_func):
        """Update the function for a callback. Removes from the callbacks dict
        if new_func is None.

        :param name: The name of the callback to call (as a string).

        :param new_func: The new function for the callback. If None, then the
                         callback will be removed (with no error if it does not
                         exist).
        """
        if new_func is None:
            self.callbacks.pop("on_" + name, None)
        else:
            self.callbacks["on_" + name] = new_func

    def close(self):
        pass  # pragma: no cover

    def finalize(self):
        pass  # pragma: no cover

    def __repr__(self):
        return "%s()" % self.__class__.__name__
652
653
class OctetStreamParser(BaseParser):
    """This parser parses an octet-stream request body and calls callbacks when
    incoming data is received. Callbacks are as follows:

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_start
         - None
         - Called when the first data is parsed.
       * - on_data
         - data, start, end
         - Called for each data chunk that is parsed.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(self, callbacks: OctetStreamCallbacks | None = None, max_size=float("inf")):
        super().__init__()
        # Use a fresh dict when no callbacks are given: a shared mutable
        # default argument would leak set_callback() registrations across
        # parser instances.
        self.callbacks = callbacks if callbacks is not None else {}
        self._started = False

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        and then pass the data to the underlying callback. Returns the number
        of bytes actually processed (possibly less than len(data) if the
        max_size limit was hit).

        :param data: a bytestring
        """
        if not self._started:
            self.callback("start")
            self._started = True

        # Truncate data length.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        # Increment size, then callback, in case there's an exception.
        self._current_size += data_len
        self.callback("data", data, 0, data_len)
        return data_len

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing,
        and sends the on_end callback.
        """
        self.callback("end")

    def __repr__(self) -> str:
        return "%s()" % self.__class__.__name__
729
730
class QuerystringParser(BaseParser):
    """This is a streaming querystring parser. It will consume data, and call
    the callbacks given when it has data.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_field_start
         - None
         - Called when a new field is encountered.
       * - on_field_name
         - data, start, end
         - Called when a portion of a field's name is encountered.
       * - on_field_data
         - data, start, end
         - Called when a portion of a field's data is encountered.
       * - on_field_end
         - None
         - Called when the end of a field is encountered.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param strict_parsing: Whether or not to parse the body strictly. Defaults
                           to False. If this is set to True, then the behavior
                           of the parser changes as the following: if a field
                           has a value with an equal sign (e.g. "foo=bar", or
                           "foo="), it is always included. If a field has no
                           equals sign (e.g. "...&name&..."), it will be
                           treated as an error if 'strict_parsing' is True,
                           otherwise included. If an error is encountered,
                           then a
                           :class:`multipart.exceptions.QuerystringParseError`
                           will be raised.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    state: QuerystringState

    def __init__(
        self, callbacks: QuerystringCallbacks | None = None, strict_parsing: bool = False, max_size=float("inf")
    ):
        super().__init__()
        self.state = QuerystringState.BEFORE_FIELD
        self._found_sep = False

        # Use a fresh dict when no callbacks are given: a shared mutable
        # default argument would leak set_callback() registrations across
        # parser instances.
        self.callbacks = callbacks if callbacks is not None else {}

        # Max-size stuff
        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

        # Should parsing be strict?
        self.strict_parsing = strict_parsing

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        parse into either a field name or value, and then pass the
        corresponding data to the underlying callback. If an error is
        encountered while parsing, a QuerystringParseError will be raised. The
        "offset" attribute of the raised exception will be set to the offset in
        the input data chunk (NOT the overall stream) that caused the error.

        :param data: a bytestring
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        processed = 0
        try:
            processed = self._internal_write(data, data_len)
        finally:
            # Account for the processed bytes even if a parse error escaped.
            self._current_size += processed

        return processed

    def _internal_write(self, data: bytes, length: int) -> int:
        state = self.state
        strict_parsing = self.strict_parsing
        found_sep = self._found_sep

        i = 0
        while i < length:
            ch = data[i]

            # Depending on our state...
            if state == QuerystringState.BEFORE_FIELD:
                # If the 'found_sep' flag is set, we've already encountered
                # and skipped a single separator. If so, we check our strict
                # parsing flag and decide what to do. Otherwise, we haven't
                # yet reached a separator, and thus, if we do, we need to skip
                # it as it will be the boundary between fields that's supposed
                # to be there.
                if ch == AMPERSAND or ch == SEMICOLON:
                    if found_sep:
                        # If we're parsing strictly, we disallow blank chunks.
                        if strict_parsing:
                            e = QuerystringParseError("Skipping duplicate ampersand/semicolon at %d" % i)
                            e.offset = i
                            raise e
                        else:
                            self.logger.debug("Skipping duplicate ampersand/semicolon at %d", i)
                    else:
                        # This case is when we're skipping the (first)
                        # separator between fields, so we just set our flag
                        # and continue on.
                        found_sep = True
                else:
                    # Emit a field-start event, and go to that state. Also,
                    # reset the "found_sep" flag, for the next time we get to
                    # this state.
                    self.callback("field_start")
                    i -= 1
                    state = QuerystringState.FIELD_NAME
                    found_sep = False

            elif state == QuerystringState.FIELD_NAME:
                # Try and find a separator - we ensure that, if we do, we only
                # look for the equal sign before it.
                sep_pos = data.find(b"&", i)
                if sep_pos == -1:
                    sep_pos = data.find(b";", i)

                # See if we can find an equals sign in the remaining data. If
                # so, we can immediately emit the field name and jump to the
                # data state.
                if sep_pos != -1:
                    equals_pos = data.find(b"=", i, sep_pos)
                else:
                    equals_pos = data.find(b"=", i)

                if equals_pos != -1:
                    # Emit this name.
                    self.callback("field_name", data, i, equals_pos)

                    # Jump i to this position. Note that it will then have 1
                    # added to it below, which means the next iteration of this
                    # loop will inspect the character after the equals sign.
                    i = equals_pos
                    state = QuerystringState.FIELD_DATA
                else:
                    # No equals sign found.
                    if not strict_parsing:
                        # See also comments in the QuerystringState.FIELD_DATA case below.
                        # If we found the separator, we emit the name and just
                        # end - there's no data callback at all (not even with
                        # a blank value).
                        if sep_pos != -1:
                            self.callback("field_name", data, i, sep_pos)
                            self.callback("field_end")

                            i = sep_pos - 1
                            state = QuerystringState.BEFORE_FIELD
                        else:
                            # Otherwise, no separator in this block, so the
                            # rest of this chunk must be a name.
                            self.callback("field_name", data, i, length)
                            i = length

                    else:
                        # We're parsing strictly. If we find a separator,
                        # this is an error - we require an equals sign.
                        if sep_pos != -1:
                            e = QuerystringParseError(
                                "When strict_parsing is True, we require an "
                                "equals sign in all field chunks. Did not "
                                "find one in the chunk that starts at %d" % (i,)
                            )
                            e.offset = i
                            raise e

                        # No separator in the rest of this chunk, so it's just
                        # a field name.
                        self.callback("field_name", data, i, length)
                        i = length

            elif state == QuerystringState.FIELD_DATA:
                # Try finding either an ampersand or a semicolon after this
                # position.
                sep_pos = data.find(b"&", i)
                if sep_pos == -1:
                    sep_pos = data.find(b";", i)

                # If we found it, callback this bit as data and then go back
                # to expecting to find a field.
                if sep_pos != -1:
                    self.callback("field_data", data, i, sep_pos)
                    self.callback("field_end")

                    # Note that we go to the separator, which brings us to the
                    # "before field" state. This allows us to properly emit
                    # "field_start" events only when we actually have data for
                    # a field of some sort.
                    i = sep_pos - 1
                    state = QuerystringState.BEFORE_FIELD

                # Otherwise, emit the rest as data and finish.
                else:
                    self.callback("field_data", data, i, length)
                    i = length

            else:  # pragma: no cover (error case)
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = QuerystringParseError(msg)
                e.offset = i
                raise e

            i += 1

        self.state = state
        self._found_sep = found_sep
        # NOTE(review): this returns len(data), not `length` - when max_size
        # truncated the chunk, the untruncated size is still reported and
        # counted toward _current_size. Confirm whether that is intended.
        return len(data)

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing,
        if we're still in the middle of a field, an on_field_end callback, and
        then the on_end callback.
        """
        # If we're currently in the middle of a field, we finish it.
        if self.state == QuerystringState.FIELD_DATA:
            self.callback("field_end")
        self.callback("end")

    def __repr__(self) -> str:
        return "{}(strict_parsing={!r}, max_size={!r})".format(
            self.__class__.__name__, self.strict_parsing, self.max_size
        )
979
980
class MultipartParser(BaseParser):
    """This class is a streaming multipart/form-data parser.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_part_begin
         - None
         - Called when a new part of the multipart message is encountered.
       * - on_part_data
         - data, start, end
         - Called when a portion of a part's data is encountered.
       * - on_part_end
         - None
         - Called when the end of a part is reached.
       * - on_header_begin
         - None
         - Called when we've found a new header in a part of a multipart
           message
       * - on_header_field
         - data, start, end
         - Called each time an additional portion of a header is read (i.e. the
           part of the header that is before the colon; the "Foo" in
           "Foo: Bar").
       * - on_header_value
         - data, start, end
         - Called when we get data for a header.
       * - on_header_end
         - None
         - Called when the current header is finished - i.e. we've reached the
           newline at the end of the header.
       * - on_headers_finished
         - None
         - Called when all headers are finished, and before the part data
           starts.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.


    :param boundary: The multipart boundary. This is required, and must match
                     what is given in the HTTP request - usually in the
                     Content-Type header.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(
        self,
        boundary: bytes | str,
        callbacks: MultipartCallbacks | None = None,
        max_size: float = float("inf"),
    ) -> None:
        # Initialize parser state.
        super().__init__()
        self.state = MultipartState.START
        self.index = self.flags = 0

        # NOTE: a `None` sentinel is used instead of a mutable `{}` default
        # argument, so each parser gets its own dict rather than a shared one.
        self.callbacks = callbacks if callbacks is not None else {}

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

        # Setup marks. These are used to track the state of data received.
        self.marks = {}

        # TODO: Actually use this rather than the dumb version we currently use
        # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
        # skip = [len(boundary) for x in range(256)]
        # for i in range(len(boundary) - 1):
        #     skip[ord_char(boundary[i])] = len(boundary) - i - 1
        #
        # # We use a tuple since it's a constant, and marginally faster.
        # self.skip = tuple(skip)

        # Save our boundary.
        if isinstance(boundary, str):  # pragma: no cover
            boundary = boundary.encode("latin-1")
        self.boundary = b"\r\n--" + boundary

        # Get a set of characters that belong to our boundary.
        self.boundary_chars = frozenset(self.boundary)

        # We also create a lookbehind list.
        # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
        # "--\r\n" at the final boundary, and the length of '\r\n--' and
        # '--\r\n' is 8 bytes.
        self.lookbehind = [NULL for x in range(len(boundary) + 8)]

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        and then parse the data into the appropriate location (e.g. header,
        data, etc.), and pass this on to the underlying callback. If an error
        is encountered, a MultipartParseError will be raised. The "offset"
        attribute on the raised exception will be set to the offset of the byte
        in the input chunk that caused the error.

        :param data: a bytestring
        :return: the number of bytes processed
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        # Track how many bytes were actually consumed even if parsing raises
        # part-way through, so the running size stays accurate.
        bytes_processed = 0
        try:
            bytes_processed = self._internal_write(data, data_len)
        finally:
            self._current_size += bytes_processed

        return bytes_processed

    def _internal_write(self, data: bytes, length: int) -> int:
        # Get values from locals.
        boundary = self.boundary

        # Get our state, flags and index. These are persisted between calls to
        # this function.
        state = self.state
        index = self.index
        flags = self.flags

        # Our index defaults to 0.
        i = 0

        # Set a mark.
        def set_mark(name: str) -> None:
            self.marks[name] = i

        # Remove a mark.
        def delete_mark(name: str) -> None:
            self.marks.pop(name, None)

        # Helper function that makes calling a callback with data easier. The
        # 'remaining' parameter will callback from the marked value until the
        # end of the buffer, and reset the mark, instead of deleting it. This
        # is used at the end of the function to call our callbacks with any
        # remaining data in this chunk.
        def data_callback(name: str, remaining: bool = False) -> None:
            marked_index = self.marks.get(name)
            if marked_index is None:
                return

            # If we're getting remaining data, we ignore the current i value
            # and just call with the remaining data.
            if remaining:
                self.callback(name, data, marked_index, length)
                self.marks[name] = 0

            # Otherwise, we call it from the mark to the current byte we're
            # processing.
            else:
                self.callback(name, data, marked_index, i)
                self.marks.pop(name, None)

        # For each byte...
        while i < length:
            c = data[i]

            if state == MultipartState.START:
                # Skip leading newlines
                if c == CR or c == LF:
                    i += 1
                    self.logger.debug("Skipping leading CR/LF at %d", i)
                    continue

                # index is used as in index into our boundary. Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = MultipartState.START_BOUNDARY
                i -= 1

            elif state == MultipartState.START_BOUNDARY:
                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback("part_begin")

                    # Move to the next character and state.
                    state = MultipartState.HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == MultipartState.HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark("header_field")

                # Move to parsing header fields.
                state = MultipartState.HEADER_FIELD
                i -= 1

            elif state == MultipartState.HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark("header_field")
                    state = MultipartState.HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback("header_field")

                    # Move to parsing the header value.
                    state = MultipartState.HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter. If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == MultipartState.HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark("header_value")

                # Move to the header-value state, reprocessing this character.
                state = MultipartState.HEADER_VALUE
                i -= 1

            elif state == MultipartState.HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers. Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback("header_value")
                    self.callback("header_end")
                    state = MultipartState.HEADER_VALUE_ALMOST_DONE

            elif state == MultipartState.HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF. If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header. Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = MultipartState.HEADER_FIELD_START

            elif state == MultipartState.HEADERS_ALMOST_DONE:
                # We're almost done our headers. This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = f"Did not find LF at end of headers (found {c!r})"
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback("headers_finished")
                state = MultipartState.PART_DATA_START

            elif state == MultipartState.PART_DATA_START:
                # Mark the start of our part data.
                set_mark("part_data")

                # Start processing part data, including this character.
                state = MultipartState.PART_DATA
                i -= 1

            elif state == MultipartState.PART_DATA:
                # We're processing our part data right now. During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index. We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here. If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback("part_data")

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= ~FLAG_PART_BOUNDARY

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback("part_end")
                            self.callback("part_begin")

                            # Move to parsing new headers.
                            index = 0
                            state = MultipartState.HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match. Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= ~FLAG_PART_BOUNDARY

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback("part_end")
                            self.callback("end")
                            state = MultipartState.END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0. If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback("part_data", lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark("part_data")

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == MultipartState.END:
                # Do nothing and just consume a byte in the end state.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow! Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data. Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found. This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted. And, as such, it implies
        # that we haven't yet reached the end of this 'thing'. So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback("header_field", True)
        data_callback("header_value", True)
        data_callback("part_data", True)

        # Save values to locals.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing.

        Note: It does not currently, but in the future, it will verify that we
        are in the final state of the parser (i.e. the end of the multipart
        message is well-formed), and, if not, throw an error.
        """
        # TODO: verify that we're in the state MultipartState.END, otherwise throw an
        # error or otherwise state that we're not finished parsing.
        pass

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(boundary={self.boundary!r})"
1509
1510
class FormParser:
    """This class is the all-in-one form parser. Given all the information
    necessary to parse a form, it will instantiate the correct parser, create
    the proper :class:`Field` and :class:`File` classes to store the data that
    is parsed, and call the two given callbacks with each field and file as
    they become available.

    :param content_type: The Content-Type of the incoming request. This is
                         used to select the appropriate parser.

    :param on_field: The callback to call when a field has been parsed and is
                     ready for usage. See above for parameters.

    :param on_file: The callback to call when a file has been parsed and is
                    ready for usage. See above for parameters.

    :param on_end: An optional callback to call when all fields and files in a
                   request has been parsed. Can be None.

    :param boundary: If the request is a multipart/form-data request, this
                     should be the boundary of the request, as given in the
                     Content-Type header, as a bytestring.

    :param file_name: If the request is of type application/octet-stream, then
                      the body of the request will not contain any information
                      about the uploaded file. In such cases, you can provide
                      the file name of the uploaded file manually.

    :param FileClass: The class to use for uploaded files. Defaults to
                      :class:`File`, but you can provide your own class if you
                      wish to customize behaviour. The class will be
                      instantiated as FileClass(file_name, field_name), and it
                      must provide the following functions::
                          file_instance.write(data)
                          file_instance.finalize()
                          file_instance.close()

    :param FieldClass: The class to use for uploaded fields. Defaults to
                       :class:`Field`, but you can provide your own class if
                       you wish to customize behaviour. The class will be
                       instantiated as FieldClass(field_name), and it must
                       provide the following functions::
                           field_instance.write(data)
                           field_instance.finalize()
                           field_instance.close()

    :param config: Configuration to use for this FormParser. The default
                   values are taken from the DEFAULT_CONFIG value, and then
                   any keys present in this dictionary will overwrite the
                   default values.

    """

    #: This is the default configuration for our form parser.
    #: Note: all file sizes should be in bytes.
    DEFAULT_CONFIG: FormParserConfig = {
        "MAX_BODY_SIZE": float("inf"),
        "MAX_MEMORY_FILE_SIZE": 1 * 1024 * 1024,
        "UPLOAD_DIR": None,
        "UPLOAD_KEEP_FILENAME": False,
        "UPLOAD_KEEP_EXTENSIONS": False,
        # Error on invalid Content-Transfer-Encoding?
        "UPLOAD_ERROR_ON_BAD_CTE": False,
    }

    def __init__(
        self,
        content_type,
        on_field,
        on_file,
        on_end=None,
        boundary=None,
        file_name=None,
        FileClass=File,
        FieldClass=Field,
        config: FormParserConfig | None = None,
    ):
        self.logger = logging.getLogger(__name__)

        # Save variables.
        self.content_type = content_type
        self.boundary = boundary
        self.bytes_received = 0
        self.parser = None

        # Save callbacks.
        self.on_field = on_field
        self.on_file = on_file
        self.on_end = on_end

        # Save classes. BUGFIX: the default classes were previously stored
        # here unconditionally, silently discarding caller-supplied
        # FileClass/FieldClass arguments.
        self.FileClass = FileClass
        self.FieldClass = FieldClass

        # Set configuration options. A `None` sentinel replaces the old
        # mutable `{}` default argument.
        self.config: FormParserConfig = self.DEFAULT_CONFIG.copy()
        self.config.update(config or {})

        # Depending on the Content-Type, we instantiate the correct parser.
        if content_type == "application/octet-stream":
            # The whole body is a single unnamed file; closures below share
            # it via `nonlocal` (this file is Python-3-only).
            file = None

            def on_start() -> None:
                nonlocal file
                file = FileClass(file_name, None, config=self.config)

            def on_data(data: bytes, start: int, end: int) -> None:
                file.write(data[start:end])

            def on_end() -> None:
                # Finalize the file itself.
                file.finalize()

                # Call our callback.
                on_file(file)

                # Call the on-end callback.
                if self.on_end is not None:
                    self.on_end()

            # Instantiate an octet-stream parser
            parser = OctetStreamParser(
                callbacks={"on_start": on_start, "on_data": on_data, "on_end": on_end},
                max_size=self.config["MAX_BODY_SIZE"],
            )

        elif content_type == "application/x-www-form-urlencoded" or content_type == "application/x-url-encoded":
            # Buffers the (possibly chunked) field name until data arrives.
            name_buffer: list[bytes] = []

            field = None

            def on_field_start() -> None:
                pass

            def on_field_name(data: bytes, start: int, end: int) -> None:
                name_buffer.append(data[start:end])

            def on_field_data(data: bytes, start: int, end: int) -> None:
                nonlocal field
                if field is None:
                    field = FieldClass(b"".join(name_buffer))
                    del name_buffer[:]
                field.write(data[start:end])

            def on_field_end() -> None:
                nonlocal field
                # Finalize and call callback.
                if field is None:
                    # If we get here, it's because there was no field data.
                    # We create a field, set it to None, and then continue.
                    field = FieldClass(b"".join(name_buffer))
                    del name_buffer[:]
                    field.set_none()

                field.finalize()
                on_field(field)
                field = None

            def on_end() -> None:
                if self.on_end is not None:
                    self.on_end()

            # Instantiate parser.
            parser = QuerystringParser(
                callbacks={
                    "on_field_start": on_field_start,
                    "on_field_name": on_field_name,
                    "on_field_data": on_field_data,
                    "on_field_end": on_field_end,
                    "on_end": on_end,
                },
                max_size=self.config["MAX_BODY_SIZE"],
            )

        elif content_type == "multipart/form-data":
            if boundary is None:
                self.logger.error("No boundary given")
                raise FormParserError("No boundary given")

            header_name: list[bytes] = []
            header_value: list[bytes] = []
            headers: dict[bytes, bytes] = {}

            # State shared between the header/data callbacks for the part
            # currently being parsed.
            part = None
            writer = None
            is_file = False

            def on_part_begin() -> None:
                pass

            def on_part_data(data: bytes, start: int, end: int):
                bytes_processed = writer.write(data[start:end])
                # TODO: check for error here.
                return bytes_processed

            def on_part_end() -> None:
                part.finalize()
                if is_file:
                    on_file(part)
                else:
                    on_field(part)

            def on_header_field(data: bytes, start: int, end: int) -> None:
                header_name.append(data[start:end])

            def on_header_value(data: bytes, start: int, end: int) -> None:
                header_value.append(data[start:end])

            def on_header_end() -> None:
                headers[b"".join(header_name)] = b"".join(header_value)
                del header_name[:]
                del header_value[:]

            def on_headers_finished() -> None:
                nonlocal part, writer, is_file
                # Reset the 'is file' flag.
                is_file = False

                # Parse the content-disposition header.
                # TODO: handle mixed case
                content_disp = headers.get(b"Content-Disposition")
                disp, options = parse_options_header(content_disp)

                # Get the field and filename.
                field_name = options.get(b"name")
                file_name = options.get(b"filename")
                # TODO: check for errors

                # Create the proper class.
                if file_name is None:
                    part = FieldClass(field_name)
                else:
                    part = FileClass(file_name, field_name, config=self.config)
                    is_file = True

                # Parse the given Content-Transfer-Encoding to determine what
                # we need to do with the incoming data.
                # TODO: check that we properly handle 8bit / 7bit encoding.
                transfer_encoding = headers.get(b"Content-Transfer-Encoding", b"7bit")

                if transfer_encoding == b"binary" or transfer_encoding == b"8bit" or transfer_encoding == b"7bit":
                    writer = part

                elif transfer_encoding == b"base64":
                    writer = Base64Decoder(part)

                elif transfer_encoding == b"quoted-printable":
                    writer = QuotedPrintableDecoder(part)

                else:
                    self.logger.warning("Unknown Content-Transfer-Encoding: %r", transfer_encoding)
                    if self.config["UPLOAD_ERROR_ON_BAD_CTE"]:
                        raise FormParserError('Unknown Content-Transfer-Encoding "{}"'.format(transfer_encoding))
                    else:
                        # If we aren't erroring, then we just treat this as an
                        # unencoded Content-Transfer-Encoding.
                        writer = part

            def on_end() -> None:
                writer.finalize()
                if self.on_end is not None:
                    self.on_end()

            # Instantiate a multipart parser.
            parser = MultipartParser(
                boundary,
                callbacks={
                    "on_part_begin": on_part_begin,
                    "on_part_data": on_part_data,
                    "on_part_end": on_part_end,
                    "on_header_field": on_header_field,
                    "on_header_value": on_header_value,
                    "on_header_end": on_header_end,
                    "on_headers_finished": on_headers_finished,
                    "on_end": on_end,
                },
                max_size=self.config["MAX_BODY_SIZE"],
            )

        else:
            self.logger.warning("Unknown Content-Type: %r", content_type)
            raise FormParserError("Unknown Content-Type: {}".format(content_type))

        self.parser = parser

    def write(self, data: bytes):
        """Write some data. The parser will forward this to the appropriate
        underlying parser.

        :param data: a bytestring
        """
        self.bytes_received += len(data)
        # TODO: check the parser's return value for errors?
        return self.parser.write(data)

    def finalize(self) -> None:
        """Finalize the parser."""
        if self.parser is not None and hasattr(self.parser, "finalize"):
            self.parser.finalize()

    def close(self) -> None:
        """Close the parser."""
        if self.parser is not None and hasattr(self.parser, "close"):
            self.parser.close()

    def __repr__(self) -> str:
        return "{}(content_type={!r}, parser={!r})".format(self.__class__.__name__, self.content_type, self.parser)
1819
1820
def create_form_parser(headers, on_field, on_file, trust_x_headers=False, config=None):
    """This function is a helper function to aid in creating a FormParser
    instances. Given a dictionary-like headers object, it will determine
    the correct information needed, instantiate a FormParser with the
    appropriate values and given callbacks, and then return the corresponding
    parser.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param trust_x_headers: Whether or not to trust information received from
                            certain X-Headers - for example, the file name from
                            X-File-Name. Defaults to False.

    :param config: Configuration variables to pass to the FormParser.

    :raises ValueError: If no Content-Type header is present.
    """
    content_type = headers.get("Content-Type")
    if content_type is None:
        logging.getLogger(__name__).warning("No Content-Type header given")
        raise ValueError("No Content-Type header given!")

    # Boundaries are optional (the FormParser will raise if one is needed
    # but not given).
    content_type, params = parse_options_header(content_type)
    boundary = params.get(b"boundary")

    # We need content_type to be a string, not a bytes object.
    content_type = content_type.decode("latin-1")

    # File names are optional. BUGFIX: honour the client-supplied X-File-Name
    # header only when the caller explicitly opted in via trust_x_headers;
    # previously the flag was accepted but ignored.
    file_name = headers.get("X-File-Name") if trust_x_headers else None

    # Instantiate a form parser. A `None` sentinel replaces the old mutable
    # `{}` default for config.
    form_parser = FormParser(
        content_type,
        on_field,
        on_file,
        boundary=boundary,
        file_name=file_name,
        config=config or {},
    )

    # Return our parser.
    return form_parser
1862
1863
def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576, **kwargs):
    """This function is useful if you just want to parse a request body,
    without too much work. Pass it a dictionary-like object of the request's
    headers, and a file-like object for the input stream, along with two
    callbacks that will get called whenever a field or file is parsed.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param input_stream: A file-like object that represents the request body.
                         The read() method must return bytestrings.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param chunk_size: The maximum size to read from the input stream and write
                       to the parser at one time. Defaults to 1 MiB.

    :param kwargs: Additional keyword arguments (e.g. ``trust_x_headers``,
                   ``config``) forwarded to :func:`create_form_parser`.
    """

    # Create our form parser. BUGFIX: previously **kwargs was accepted but
    # silently dropped; it is now forwarded.
    parser = create_form_parser(headers, on_field, on_file, **kwargs)

    # Read chunks of up to chunk_size bytes and write to the parser, but never
    # read more than the given Content-Length, if any.
    content_length = headers.get("Content-Length")
    if content_length is not None:
        content_length = int(content_length)
    else:
        content_length = float("inf")
    bytes_read = 0

    while True:
        # Read only up to the Content-Length given. BUGFIX: the chunk_size
        # parameter was previously ignored in favour of a hard-coded 1048576.
        max_readable = min(content_length - bytes_read, chunk_size)
        buff = input_stream.read(max_readable)

        # Write to the parser and update our length.
        parser.write(buff)
        bytes_read += len(buff)

        # If we get a buffer that's smaller than the size requested, or if we
        # have read up to our content length, we're done.
        if len(buff) != max_readable or bytes_read == content_length:
            break

    # Tell our parser that we're done writing data.
    parser.finalize()