1from __future__ import annotations
2
3import logging
4import os
5import shutil
6import sys
7import tempfile
8from email.message import Message
9from enum import IntEnum
10from io import BytesIO
11from numbers import Number
12from typing import TYPE_CHECKING
13
14from .decoders import Base64Decoder, QuotedPrintableDecoder
15from .exceptions import FileError, FormParserError, MultipartParseError, QuerystringParseError
16
if TYPE_CHECKING:  # pragma: no cover
    from typing import Callable, TypedDict

    # Callback signatures accepted by QuerystringParser. Notification
    # callbacks take no arguments; data callbacks take (data, start, end).
    class QuerystringCallbacks(TypedDict, total=False):
        on_field_start: Callable[[], None]
        on_field_name: Callable[[bytes, int, int], None]
        on_field_data: Callable[[bytes, int, int], None]
        on_field_end: Callable[[], None]
        on_end: Callable[[], None]

    # Callback signatures accepted by OctetStreamParser.
    class OctetStreamCallbacks(TypedDict, total=False):
        on_start: Callable[[], None]
        on_data: Callable[[bytes, int, int], None]
        on_end: Callable[[], None]

    # Callback signatures accepted by MultipartParser.
    class MultipartCallbacks(TypedDict, total=False):
        on_part_begin: Callable[[], None]
        on_part_data: Callable[[bytes, int, int], None]
        on_part_end: Callable[[], None]
        on_headers_begin: Callable[[], None]
        on_header_field: Callable[[bytes, int, int], None]
        on_header_value: Callable[[bytes, int, int], None]
        on_header_end: Callable[[], None]
        on_headers_finished: Callable[[], None]
        on_end: Callable[[], None]

    # Configuration keys recognized by the form parser.
    class FormParserConfig(TypedDict, total=False):
        UPLOAD_DIR: str | None
        UPLOAD_KEEP_FILENAME: bool
        UPLOAD_KEEP_EXTENSIONS: bool
        UPLOAD_ERROR_ON_BAD_CTE: bool
        MAX_MEMORY_FILE_SIZE: int
        MAX_BODY_SIZE: float

    # Configuration keys recognized by :class:`File`; see the File class
    # docstring for the meaning and default of each key.
    class FileConfig(TypedDict, total=False):
        UPLOAD_DIR: str | None
        UPLOAD_DELETE_TMP: bool
        UPLOAD_KEEP_FILENAME: bool
        UPLOAD_KEEP_EXTENSIONS: bool
        MAX_MEMORY_FILE_SIZE: int
58
# Unique sentinel marking "no cached value yet" on Field instances. A plain
# `None` would not work, because `None` is itself a legitimate cached value
# (see Field.set_none()).
_missing = object()
61
62
class QuerystringState(IntEnum):
    """Querystring parser states.

    These are used to keep track of the state of the parser, and are used to determine
    what to do when new data is encountered.
    """

    BEFORE_FIELD = 0  # Between fields: skipping "&" / ";" separators.
    FIELD_NAME = 1  # Consuming a field name (up to the "=" or a separator).
    FIELD_DATA = 2  # Consuming a field value (after the "=").
73
74
class MultipartState(IntEnum):
    """Multipart parser states.

    These are used to keep track of the state of the parser, and are used to determine
    what to do when new data is encountered.
    """

    START = 0  # Initial state, before any input is consumed.
    START_BOUNDARY = 1  # Consuming the initial boundary line.
    HEADER_FIELD_START = 2  # At the start of a header field name.
    HEADER_FIELD = 3  # Consuming a header field name.
    HEADER_VALUE_START = 4  # At the start of a header value.
    HEADER_VALUE = 5  # Consuming a header value.
    HEADER_VALUE_ALMOST_DONE = 6  # Header value nearly complete (line ending being consumed).
    HEADERS_ALMOST_DONE = 7  # All headers nearly complete (blank line being consumed).
    PART_DATA_START = 8  # At the start of a part's payload.
    PART_DATA = 9  # Consuming a part's payload.
    PART_DATA_END = 10  # A part's payload is complete.
    END = 11  # Parsing finished.
94
95
# Flags for the multipart parser.
FLAG_PART_BOUNDARY = 1
FLAG_LAST_BOUNDARY = 2

# Byte-ordinal constants. Indexing a bytes object yields an int, so these are
# the integer values the parsers compare individual bytes against (avoids
# re-indexing one-byte literals in hot loops).
CR = b"\r"[0]
LF = b"\n"[0]
COLON = b":"[0]
SPACE = b" "[0]
HYPHEN = b"-"[0]
AMPERSAND = b"&"[0]
SEMICOLON = b";"[0]
LOWER_A = b"a"[0]
LOWER_Z = b"z"[0]
NULL = b"\x00"[0]
113
114
# Small helpers for working with individual byte ordinals and sequences of
# them (lower-casing a byte, getting its ordinal value, and joining a
# sequence of ordinals back into bytes). Kept as functions so all call sites
# share a single, well-defined implementation.
def lower_char(c):
    """Return *c* (a byte ordinal) with the ASCII case bit set.

    For ordinals of 'A'-'Z' this yields the corresponding 'a'-'z' ordinal;
    for any other value it simply sets bit 0x20.
    """
    return 0x20 | c
121
122
def ord_char(c):
    # Identity function: iterating/indexing a bytes object already yields
    # integer ordinals on Python 3, so no conversion is needed.
    return c
125
126
def join_bytes(b):
    """Collapse an iterable of byte ordinals (ints in 0..255) into bytes."""
    return bytes(iter(b))
129
130
def parse_options_header(value: str | bytes) -> tuple[bytes, dict[bytes, bytes]]:
    """Parse a Content-Type style header into ``(content_type, {parameters})``.

    Both the content type and every parameter key/value are returned as
    latin-1 encoded bytes.
    """
    # Uses email.message.Message to parse the header as described in PEP 594.
    # Ref: https://peps.python.org/pep-0594/#cgi
    if not value:
        return (b"", {})

    # If we are passed bytes, we assume that it conforms to WSGI, encoding in latin-1.
    if isinstance(value, bytes):  # pragma: no cover
        value = value.decode("latin-1")

    # For types
    assert isinstance(value, str), "Value should be a string by now"

    # Fast path: no ";" means no parameters at all.
    if ";" not in value:
        return (value.lower().strip().encode("latin-1"), {})

    # Let the email machinery split the value into the content type followed
    # by its parameters.
    message = Message()
    message["content-type"] = value
    params = message.get_params()
    # If there were no parameters, this would have already returned above
    assert params, "At least the content type value should be present"

    # The first entry is the content type itself; the rest are parameters.
    ctype = params.pop(0)[0].encode("latin-1")
    options: dict[bytes, bytes] = {}
    for key, param_value in params:
        # get_params() may return a 3-tuple (charset, language, value) for
        # RFC 2231 encoded parameters; the last element is the value.
        # See: https://docs.python.org/3/library/email.compat32-message.html
        if isinstance(param_value, tuple):
            param_value = param_value[-1]
        # Work around the IE6 bug of sending the full client-side file path
        # ("C:\..." or a UNC "\\..." path) instead of just the filename.
        if key == "filename" and (param_value[1:3] == ":\\" or param_value[:2] == "\\\\"):
            param_value = param_value.split("\\")[-1]
        options[key.encode("latin-1")] = param_value.encode("latin-1")
    return ctype, options
175
176
class Field:
    """A Field object represents a (parsed) form field. It represents a single
    field with a corresponding name and value.

    The name that a :class:`Field` will be instantiated with is the same name
    that would be found in the following HTML::

        <input name="name_goes_here" type="text"/>

    This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
    will be called when data is written to the Field, and when the Field is
    finalized, respectively.

    :param name: the name of the form field
    """

    def __init__(self, name: str):
        self._name = name
        self._value: list[bytes] = []

        # Cached b"".join(self._value). `_missing` means "not computed yet";
        # set_none() may also store None here for valueless fields.
        self._cache = _missing

    @classmethod
    def from_value(cls, name: str, value: bytes | None) -> Field:
        """Create an instance of a :class:`Field`, and set the corresponding
        value - either None or an actual value. This method will also
        finalize the Field itself.

        :param name: the name of the form field
        :param value: the value of the form field - either a bytestring or
                      None
        """

        f = cls(name)
        if value is None:
            f.set_none()
        else:
            f.write(value)
        f.finalize()
        return f

    def write(self, data: bytes) -> int:
        """Write some data into the form field.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data: bytes) -> int:
        """This method is a callback that will be called whenever data is
        written to the Field. Returns the number of bytes accepted.

        :param data: a bytestring
        """
        self._value.append(data)
        # New data invalidates any previously-joined cache.
        self._cache = _missing
        return len(data)

    def on_end(self) -> None:
        """This method is called whenever the Field is finalized."""
        if self._cache is _missing:
            self._cache = b"".join(self._value)

    def finalize(self) -> None:
        """Finalize the form field."""
        self.on_end()

    def close(self) -> None:
        """Close the Field object. This will free any underlying cache."""
        # Make sure the joined value survives before we drop the chunk list.
        if self._cache is _missing:
            self._cache = b"".join(self._value)

        del self._value

    def set_none(self) -> None:
        """Some fields in a querystring can possibly have a value of None - for
        example, the string "foo&bar=&baz=asdf" will have a field with the
        name "foo" and value None, one with name "bar" and value "", and one
        with name "baz" and value "asdf". Since the write() interface doesn't
        support writing None, this function will set the field value to None.
        """
        self._cache = None

    @property
    def field_name(self) -> str:
        """This property returns the name of the field."""
        return self._name

    @property
    def value(self) -> bytes | None:
        """This property returns the value of the form field (None for
        valueless fields created via :meth:`set_none`).
        """
        if self._cache is _missing:
            self._cache = b"".join(self._value)

        return self._cache

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Field):
            return self.field_name == other.field_name and self.value == other.value
        else:
            return NotImplemented

    def __repr__(self) -> str:
        if self.value is None:
            # Valueless fields (set_none) have no length; len(None) would
            # raise a TypeError here.
            v = "None"
        elif len(self.value) > 97:
            # We get the repr, and then insert three dots before the final
            # quote.
            v = repr(self.value[:97])[:-1] + "...'"
        else:
            v = repr(self.value)

        return "{}(field_name={!r}, value={})".format(self.__class__.__name__, self.field_name, v)
290
291
class File:
    """This class represents an uploaded file. It handles writing file data to
    either an in-memory file or a temporary file on-disk, if the optional
    threshold is passed.

    There are some options that can be passed to the File to change behavior
    of the class. Valid options are as follows:

    .. list-table::
       :widths: 15 5 5 30
       :header-rows: 1

       * - Name
         - Type
         - Default
         - Description
       * - UPLOAD_DIR
         - `str`
         - None
         - The directory to store uploaded files in. If this is None, a
           temporary file will be created in the system's standard location.
       * - UPLOAD_DELETE_TMP
         - `bool`
         - True
         - Delete automatically created TMP file
       * - UPLOAD_KEEP_FILENAME
         - `bool`
         - False
         - Whether or not to keep the filename of the uploaded file. If True,
           then the filename will be converted to a safe representation (e.g.
           by removing any invalid path segments), and then saved with the
           same name). Otherwise, a temporary name will be used.
       * - UPLOAD_KEEP_EXTENSIONS
         - `bool`
         - False
         - Whether or not to keep the uploaded file's extension. If False, the
           file will be saved with the default temporary extension (usually
           ".tmp"). Otherwise, the file's extension will be maintained. Note
           that this will properly combine with the UPLOAD_KEEP_FILENAME
           setting.
       * - MAX_MEMORY_FILE_SIZE
         - `int`
         - 1 MiB
         - The maximum number of bytes of a File to keep in memory. By
           default, the contents of a File are kept into memory until a certain
           limit is reached, after which the contents of the File are written
           to a temporary file. This behavior can be disabled by setting this
           value to an appropriately large value (or, for example, infinity,
           such as `float('inf')`.

    :param file_name: The name of the file that this :class:`File` represents

    :param field_name: The field name that uploaded this file. Note that this
                       can be None, if, for example, the file was uploaded
                       with Content-Type application/octet-stream

    :param config: The configuration for this File. See above for valid
                   configuration keys and their corresponding values.
    """

    def __init__(self, file_name: bytes | None, field_name: bytes | None = None, config: FileConfig = {}):
        # Save configuration, set other variables default. Note: `config` is
        # only ever read (via .get), never mutated, so the shared mutable
        # default dict is safe here.
        self.logger = logging.getLogger(__name__)
        self._config = config
        self._in_memory = True
        self._bytes_written = 0
        self._fileobj = BytesIO()

        # Save the provided field/file name.
        self._field_name = field_name
        self._file_name = file_name

        # Our actual file name is None by default, since, depending on our
        # config, we may not actually use the provided name.
        self._actual_file_name: bytes | None = None

        # Split the extension from the filename.
        if file_name is not None:
            base, ext = os.path.splitext(file_name)
            self._file_base = base
            self._ext = ext

    @property
    def field_name(self) -> bytes | None:
        """The form field associated with this file. May be None if there isn't
        one, for example when we have an application/octet-stream upload.
        """
        return self._field_name

    @property
    def file_name(self) -> bytes | None:
        """The file name given in the upload request."""
        return self._file_name

    @property
    def actual_file_name(self) -> bytes | None:
        """The file name that this file is saved as. Will be None if it's not
        currently saved on disk.
        """
        return self._actual_file_name

    @property
    def file_object(self):
        """The file object that we're currently writing to. Note that this
        will either be an instance of a :class:`io.BytesIO`, or a regular file
        object.
        """
        return self._fileobj

    @property
    def size(self) -> int:
        """The total size of this file, counted as the number of bytes that
        currently have been written to the file.
        """
        return self._bytes_written

    @property
    def in_memory(self) -> bool:
        """A boolean representing whether or not this file object is currently
        stored in-memory or on-disk.
        """
        return self._in_memory

    def flush_to_disk(self) -> None:
        """If the file is already on-disk, do nothing. Otherwise, copy from
        the in-memory buffer to a disk file, and then reassign our internal
        file object to this new disk file.

        Note that if you attempt to flush a file that is already on-disk, a
        warning will be logged to this module's logger.
        """
        if not self._in_memory:
            self.logger.warning("Trying to flush to disk when we're not in memory")
            return

        # Go back to the start of our file.
        self._fileobj.seek(0)

        # Open a new file.
        new_file = self._get_disk_file()

        # Copy the file objects.
        shutil.copyfileobj(self._fileobj, new_file)

        # Seek to the new position in our new file.
        new_file.seek(self._bytes_written)

        # Reassign the fileobject.
        old_fileobj = self._fileobj
        self._fileobj = new_file

        # We're no longer in memory.
        self._in_memory = False

        # Close the old file object.
        old_fileobj.close()

    def _get_disk_file(self):
        """Open and return the on-disk file object for this File, honoring the
        UPLOAD_* configuration options, and record the resulting path in
        ``self._actual_file_name`` (as bytes).

        :raises FileError: if the file cannot be opened/created on disk.
        """
        self.logger.info("Opening a file on disk")

        file_dir = self._config.get("UPLOAD_DIR")
        keep_filename = self._config.get("UPLOAD_KEEP_FILENAME", False)
        keep_extensions = self._config.get("UPLOAD_KEEP_EXTENSIONS", False)
        delete_tmp = self._config.get("UPLOAD_DELETE_TMP", True)

        # If we have a directory and are to keep the filename...
        if file_dir is not None and keep_filename:
            self.logger.info("Saving with filename in: %r", file_dir)

            # Build our filename.
            # TODO: what happens if we don't have a filename?
            fname = self._file_base
            if keep_extensions:
                fname = fname + self._ext

            # The parsed filename is usually bytes, while UPLOAD_DIR is
            # normally a str; decode so os.path.join doesn't mix types.
            if isinstance(fname, bytes):
                fname = fname.decode(sys.getfilesystemencoding())

            path = os.path.join(file_dir, fname)
            try:
                self.logger.info("Opening file: %r", path)
                tmp_file = open(path, "w+b")
            except OSError as exc:
                self.logger.exception("Error opening temporary file")
                raise FileError("Error opening temporary file: %r" % path) from exc
        else:
            # Build options array.
            # Note that on Python 3, tempfile doesn't support byte names. We
            # encode our paths using the default filesystem encoding.
            options = {}
            if keep_extensions:
                ext = self._ext
                if isinstance(ext, bytes):
                    ext = ext.decode(sys.getfilesystemencoding())

                options["suffix"] = ext
            if file_dir is not None:
                d = file_dir
                if isinstance(d, bytes):
                    d = d.decode(sys.getfilesystemencoding())

                options["dir"] = d
            options["delete"] = delete_tmp

            # Create a temporary (named) file with the appropriate settings.
            self.logger.info("Creating a temporary file with options: %r", options)
            try:
                tmp_file = tempfile.NamedTemporaryFile(**options)
            except OSError as exc:
                self.logger.exception("Error creating named temporary file")
                raise FileError("Error creating named temporary file") from exc

            fname = tmp_file.name

        # Encode filename as bytes.
        if isinstance(fname, str):
            fname = fname.encode(sys.getfilesystemencoding())

        self._actual_file_name = fname
        return tmp_file

    def write(self, data: bytes) -> int:
        """Write some data to the File.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data: bytes) -> int:
        """This method is a callback that will be called whenever data is
        written to the File. Returns the number of bytes actually written.

        :param data: a bytestring
        """
        pos = self._fileobj.tell()
        bwritten = self._fileobj.write(data)
        # true file objects write returns None
        if bwritten is None:
            bwritten = self._fileobj.tell() - pos

        # If the bytes written isn't the same as the length, just return.
        if bwritten != len(data):
            self.logger.warning("bwritten != len(data) (%d != %d)", bwritten, len(data))
            return bwritten

        # Keep track of how many bytes we've written.
        self._bytes_written += bwritten

        # If we're in-memory and are over our limit, we create a file.
        max_memory_file_size = self._config.get("MAX_MEMORY_FILE_SIZE")
        if self._in_memory and max_memory_file_size is not None and self._bytes_written > max_memory_file_size:
            self.logger.info("Flushing to disk")
            self.flush_to_disk()

        # Return the number of bytes written.
        return bwritten

    def on_end(self) -> None:
        """This method is called whenever the Field is finalized."""
        # Flush the underlying file object
        self._fileobj.flush()

    def finalize(self) -> None:
        """Finalize the form file. This will not close the underlying file,
        but simply signal that we are finished writing to the File.
        """
        self.on_end()

    def close(self) -> None:
        """Close the File object. This will actually close the underlying
        file object (whether it's a :class:`io.BytesIO` or an actual file
        object).
        """
        self._fileobj.close()

    def __repr__(self) -> str:
        return "{}(file_name={!r}, field_name={!r})".format(self.__class__.__name__, self.file_name, self.field_name)
572
573
class BaseParser:
    """This class is the base class for all parsers. It contains the logic for
    calling and adding callbacks.

    A callback can be one of two different forms. "Notification callbacks" are
    callbacks that are called when something happens - for example, when a new
    part of a multipart message is encountered by the parser. "Data callbacks"
    are called when we get some sort of data - for example, part of the body of
    a multipart chunk. Notification callbacks are called with no parameters,
    whereas data callbacks are called with three, as follows::

        data_callback(data, start, end)

    The "data" parameter is a bytestring. "start" and "end" are integer
    indexes into the "data" string that represent the data of interest. Thus,
    in a data callback, the slice `data[start:end]` represents the data that
    the callback is "interested in". The callback is not passed a copy of the
    data, since copying severely hurts performance.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Subclasses replace this with their own callbacks dict; initializing
        # it here keeps callback()/set_callback() usable on a bare BaseParser
        # instead of raising AttributeError.
        self.callbacks: dict = {}

    def callback(self, name: str, data=None, start=None, end=None):
        """This function calls a provided callback with some data. If the
        callback is not set, will do nothing.

        :param name: The name of the callback to call (as a string).

        :param data: Data to pass to the callback. If None, then it is
                     assumed that the callback is a notification callback,
                     and no parameters are given.

        :param end: An integer that is passed to the data callback.

        :param start: An integer that is passed to the data callback.
        """
        name = "on_" + name
        func = self.callbacks.get(name)
        if func is None:
            return

        # Depending on whether we're given a buffer...
        if data is not None:
            # Don't do anything if we have start == end.
            if start is not None and start == end:
                return

            self.logger.debug("Calling %s with data[%d:%d]", name, start, end)
            func(data, start, end)
        else:
            self.logger.debug("Calling %s with no data", name)
            func()

    def set_callback(self, name: str, new_func):
        """Update the function for a callback. Removes from the callbacks dict
        if new_func is None.

        :param name: The name of the callback to call (as a string).

        :param new_func: The new function for the callback. If None, then the
                         callback will be removed (with no error if it does not
                         exist).
        """
        if new_func is None:
            self.callbacks.pop("on_" + name, None)
        else:
            self.callbacks["on_" + name] = new_func

    def close(self):
        pass  # pragma: no cover

    def finalize(self):
        pass  # pragma: no cover

    def __repr__(self):
        return "%s()" % self.__class__.__name__
652
653
class OctetStreamParser(BaseParser):
    """This parser parses an octet-stream request body and calls callbacks when
    incoming data is received. Callbacks are as follows:

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_start
         - None
         - Called when the first data is parsed.
       * - on_data
         - data, start, end
         - Called for each data chunk that is parsed.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(self, callbacks: OctetStreamCallbacks | None = None, max_size=float("inf")):
        super().__init__()
        # Use a fresh dict when no callbacks are given: a shared mutable
        # default argument would leak set_callback() registrations across
        # parser instances.
        self.callbacks = callbacks if callbacks is not None else {}
        self._started = False

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        and then pass the data to the underlying callback. Returns the number
        of bytes actually processed (possibly less than len(data) if the
        max_size limit was hit).

        :param data: a bytestring
        """
        if not self._started:
            self.callback("start")
            self._started = True

        # Truncate data length.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        # Increment size, then callback, in case there's an exception.
        self._current_size += data_len
        self.callback("data", data, 0, data_len)
        return data_len

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing,
        and sends the on_end callback.
        """
        self.callback("end")

    def __repr__(self) -> str:
        return "%s()" % self.__class__.__name__
729
730
class QuerystringParser(BaseParser):
    """This is a streaming querystring parser. It will consume data, and call
    the callbacks given when it has data.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_field_start
         - None
         - Called when a new field is encountered.
       * - on_field_name
         - data, start, end
         - Called when a portion of a field's name is encountered.
       * - on_field_data
         - data, start, end
         - Called when a portion of a field's data is encountered.
       * - on_field_end
         - None
         - Called when the end of a field is encountered.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param strict_parsing: Whether or not to parse the body strictly. Defaults
                           to False. If this is set to True, then the behavior
                           of the parser changes as the following: if a field
                           has a value with an equal sign (e.g. "foo=bar", or
                           "foo="), it is always included. If a field has no
                           equals sign (e.g. "...&name&..."), it will be
                           treated as an error if 'strict_parsing' is True,
                           otherwise included. If an error is encountered,
                           then a
                           :class:`multipart.exceptions.QuerystringParseError`
                           will be raised.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    state: QuerystringState

    def __init__(
        self, callbacks: QuerystringCallbacks | None = None, strict_parsing: bool = False, max_size=float("inf")
    ):
        super().__init__()
        self.state = QuerystringState.BEFORE_FIELD
        self._found_sep = False

        # Use a fresh dict when no callbacks are given: a shared mutable
        # default argument would leak set_callback() registrations across
        # parser instances.
        self.callbacks = callbacks if callbacks is not None else {}

        # Max-size stuff
        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

        # Should parsing be strict?
        self.strict_parsing = strict_parsing

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        parse into either a field name or value, and then pass the
        corresponding data to the underlying callback. If an error is
        encountered while parsing, a QuerystringParseError will be raised. The
        "offset" attribute of the raised exception will be set to the offset in
        the input data chunk (NOT the overall stream) that caused the error.

        :param data: a bytestring
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        processed = 0
        try:
            processed = self._internal_write(data, data_len)
        finally:
            # Account for the processed bytes even if a parse error escaped.
            self._current_size += processed

        return processed

    def _internal_write(self, data: bytes, length: int) -> int:
        state = self.state
        strict_parsing = self.strict_parsing
        found_sep = self._found_sep

        i = 0
        while i < length:
            ch = data[i]

            # Depending on our state...
            if state == QuerystringState.BEFORE_FIELD:
                # If the 'found_sep' flag is set, we've already encountered
                # and skipped a single separator. If so, we check our strict
                # parsing flag and decide what to do. Otherwise, we haven't
                # yet reached a separator, and thus, if we do, we need to skip
                # it as it will be the boundary between fields that's supposed
                # to be there.
                if ch == AMPERSAND or ch == SEMICOLON:
                    if found_sep:
                        # If we're parsing strictly, we disallow blank chunks.
                        if strict_parsing:
                            e = QuerystringParseError("Skipping duplicate ampersand/semicolon at %d" % i)
                            e.offset = i
                            raise e
                        else:
                            self.logger.debug("Skipping duplicate ampersand/semicolon at %d", i)
                    else:
                        # This case is when we're skipping the (first)
                        # separator between fields, so we just set our flag
                        # and continue on.
                        found_sep = True
                else:
                    # Emit a field-start event, and go to that state. Also,
                    # reset the "found_sep" flag, for the next time we get to
                    # this state.
                    self.callback("field_start")
                    i -= 1
                    state = QuerystringState.FIELD_NAME
                    found_sep = False

            elif state == QuerystringState.FIELD_NAME:
                # Try and find a separator - we ensure that, if we do, we only
                # look for the equal sign before it.
                sep_pos = data.find(b"&", i)
                if sep_pos == -1:
                    sep_pos = data.find(b";", i)

                # See if we can find an equals sign in the remaining data. If
                # so, we can immediately emit the field name and jump to the
                # data state.
                if sep_pos != -1:
                    equals_pos = data.find(b"=", i, sep_pos)
                else:
                    equals_pos = data.find(b"=", i)

                if equals_pos != -1:
                    # Emit this name.
                    self.callback("field_name", data, i, equals_pos)

                    # Jump i to this position. Note that it will then have 1
                    # added to it below, which means the next iteration of this
                    # loop will inspect the character after the equals sign.
                    i = equals_pos
                    state = QuerystringState.FIELD_DATA
                else:
                    # No equals sign found.
                    if not strict_parsing:
                        # See also comments in the QuerystringState.FIELD_DATA case below.
                        # If we found the separator, we emit the name and just
                        # end - there's no data callback at all (not even with
                        # a blank value).
                        if sep_pos != -1:
                            self.callback("field_name", data, i, sep_pos)
                            self.callback("field_end")

                            i = sep_pos - 1
                            state = QuerystringState.BEFORE_FIELD
                        else:
                            # Otherwise, no separator in this block, so the
                            # rest of this chunk must be a name.
                            self.callback("field_name", data, i, length)
                            i = length

                    else:
                        # We're parsing strictly. If we find a separator,
                        # this is an error - we require an equals sign.
                        if sep_pos != -1:
                            e = QuerystringParseError(
                                "When strict_parsing is True, we require an "
                                "equals sign in all field chunks. Did not "
                                "find one in the chunk that starts at %d" % (i,)
                            )
                            e.offset = i
                            raise e

                        # No separator in the rest of this chunk, so it's just
                        # a field name.
                        self.callback("field_name", data, i, length)
                        i = length

            elif state == QuerystringState.FIELD_DATA:
                # Try finding either an ampersand or a semicolon after this
                # position.
                sep_pos = data.find(b"&", i)
                if sep_pos == -1:
                    sep_pos = data.find(b";", i)

                # If we found it, callback this bit as data and then go back
                # to expecting to find a field.
                if sep_pos != -1:
                    self.callback("field_data", data, i, sep_pos)
                    self.callback("field_end")

                    # Note that we go to the separator, which brings us to the
                    # "before field" state. This allows us to properly emit
                    # "field_start" events only when we actually have data for
                    # a field of some sort.
                    i = sep_pos - 1
                    state = QuerystringState.BEFORE_FIELD

                # Otherwise, emit the rest as data and finish.
                else:
                    self.callback("field_data", data, i, length)
                    i = length

            else:  # pragma: no cover (error case)
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = QuerystringParseError(msg)
                e.offset = i
                raise e

            i += 1

        self.state = state
        self._found_sep = found_sep
        # NOTE(review): this returns len(data), not `length` - when max_size
        # truncated the chunk, the untruncated size is still reported and
        # counted toward _current_size. Confirm whether that is intended.
        return len(data)

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing,
        if we're still in the middle of a field, an on_field_end callback, and
        then the on_end callback.
        """
        # If we're currently in the middle of a field, we finish it.
        if self.state == QuerystringState.FIELD_DATA:
            self.callback("field_end")
        self.callback("end")

    def __repr__(self) -> str:
        return "{}(strict_parsing={!r}, max_size={!r})".format(
            self.__class__.__name__, self.strict_parsing, self.max_size
        )
979
980
class MultipartParser(BaseParser):
    """This class is a streaming multipart/form-data parser.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_part_begin
         - None
         - Called when a new part of the multipart message is encountered.
       * - on_part_data
         - data, start, end
         - Called when a portion of a part's data is encountered.
       * - on_part_end
         - None
         - Called when the end of a part is reached.
       * - on_header_begin
         - None
         - Called when we've found a new header in a part of a multipart
           message
       * - on_header_field
         - data, start, end
         - Called each time an additional portion of a header is read (i.e. the
           part of the header that is before the colon; the "Foo" in
           "Foo: Bar").
       * - on_header_value
         - data, start, end
         - Called when we get data for a header.
       * - on_header_end
         - None
         - Called when the current header is finished - i.e. we've reached the
           newline at the end of the header.
       * - on_headers_finished
         - None
         - Called when all headers are finished, and before the part data
           starts.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.


    :param boundary: The multipart boundary. This is required, and must match
                     what is given in the HTTP request - usually in the
                     Content-Type header.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(
        self,
        boundary: bytes | str,
        callbacks: MultipartCallbacks | None = None,
        max_size: float = float("inf"),
    ) -> None:
        # Initialize parser state.
        super().__init__()
        self.state = MultipartState.START
        self.index = self.flags = 0

        # NOTE: a `None` sentinel is used instead of a mutable `{}` default
        # argument, so each parser gets its own dict rather than a shared one.
        self.callbacks = callbacks if callbacks is not None else {}

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

        # Setup marks. These are used to track the state of data received.
        self.marks = {}

        # TODO: Actually use this rather than the dumb version we currently use
        # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
        # skip = [len(boundary) for x in range(256)]
        # for i in range(len(boundary) - 1):
        #     skip[ord_char(boundary[i])] = len(boundary) - i - 1
        #
        # # We use a tuple since it's a constant, and marginally faster.
        # self.skip = tuple(skip)

        # Save our boundary.
        if isinstance(boundary, str):  # pragma: no cover
            boundary = boundary.encode("latin-1")
        self.boundary = b"\r\n--" + boundary

        # Get a set of characters that belong to our boundary.
        self.boundary_chars = frozenset(self.boundary)

        # We also create a lookbehind list.
        # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
        # "--\r\n" at the final boundary, and the length of '\r\n--' and
        # '--\r\n' is 8 bytes.
        self.lookbehind = [NULL for x in range(len(boundary) + 8)]

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        and then parse the data into the appropriate location (e.g. header,
        data, etc.), and pass this on to the underlying callback. If an error
        is encountered, a MultipartParseError will be raised. The "offset"
        attribute on the raised exception will be set to the offset of the byte
        in the input chunk that caused the error.

        :param data: a bytestring
        :return: the number of bytes processed
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        # Track how many bytes were actually consumed even if parsing raises
        # part-way through, so the running size stays accurate.
        bytes_processed = 0
        try:
            bytes_processed = self._internal_write(data, data_len)
        finally:
            self._current_size += bytes_processed

        return bytes_processed

    def _internal_write(self, data: bytes, length: int) -> int:
        # Get values from locals.
        boundary = self.boundary

        # Get our state, flags and index. These are persisted between calls to
        # this function.
        state = self.state
        index = self.index
        flags = self.flags

        # Our index defaults to 0.
        i = 0

        # Set a mark.
        def set_mark(name: str) -> None:
            self.marks[name] = i

        # Remove a mark.
        def delete_mark(name: str) -> None:
            self.marks.pop(name, None)

        # Helper function that makes calling a callback with data easier. The
        # 'remaining' parameter will callback from the marked value until the
        # end of the buffer, and reset the mark, instead of deleting it. This
        # is used at the end of the function to call our callbacks with any
        # remaining data in this chunk.
        def data_callback(name: str, remaining: bool = False) -> None:
            marked_index = self.marks.get(name)
            if marked_index is None:
                return

            # If we're getting remaining data, we ignore the current i value
            # and just call with the remaining data.
            if remaining:
                self.callback(name, data, marked_index, length)
                self.marks[name] = 0

            # Otherwise, we call it from the mark to the current byte we're
            # processing.
            else:
                self.callback(name, data, marked_index, i)
                self.marks.pop(name, None)

        # For each byte...
        while i < length:
            c = data[i]

            if state == MultipartState.START:
                # Skip leading newlines
                if c == CR or c == LF:
                    i += 1
                    self.logger.debug("Skipping leading CR/LF at %d", i)
                    continue

                # index is used as in index into our boundary. Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = MultipartState.START_BOUNDARY
                i -= 1

            elif state == MultipartState.START_BOUNDARY:
                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback("part_begin")

                    # Move to the next character and state.
                    state = MultipartState.HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == MultipartState.HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark("header_field")

                # Move to parsing header fields.
                state = MultipartState.HEADER_FIELD
                i -= 1

            elif state == MultipartState.HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark("header_field")
                    state = MultipartState.HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback("header_field")

                    # Move to parsing the header value.
                    state = MultipartState.HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter. If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == MultipartState.HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark("header_value")

                # Move to the header-value state, reprocessing this character.
                state = MultipartState.HEADER_VALUE
                i -= 1

            elif state == MultipartState.HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers. Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback("header_value")
                    self.callback("header_end")
                    state = MultipartState.HEADER_VALUE_ALMOST_DONE

            elif state == MultipartState.HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF. If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header. Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = MultipartState.HEADER_FIELD_START

            elif state == MultipartState.HEADERS_ALMOST_DONE:
                # We're almost done our headers. This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = f"Did not find LF at end of headers (found {c!r})"
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback("headers_finished")
                state = MultipartState.PART_DATA_START

            elif state == MultipartState.PART_DATA_START:
                # Mark the start of our part data.
                set_mark("part_data")

                # Start processing part data, including this character.
                state = MultipartState.PART_DATA
                i -= 1

            elif state == MultipartState.PART_DATA:
                # We're processing our part data right now. During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index. We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here. If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback("part_data")

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= ~FLAG_PART_BOUNDARY

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback("part_end")
                            self.callback("part_begin")

                            # Move to parsing new headers.
                            index = 0
                            state = MultipartState.HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match. Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= ~FLAG_PART_BOUNDARY

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback("part_end")
                            self.callback("end")
                            state = MultipartState.END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0. If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback("part_data", lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark("part_data")

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == MultipartState.END:
                # Do nothing and just consume a byte in the end state.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow! Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data. Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found. This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted. And, as such, it implies
        # that we haven't yet reached the end of this 'thing'. So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback("header_field", True)
        data_callback("header_value", True)
        data_callback("part_data", True)

        # Save values to locals.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing.

        Note: It does not currently, but in the future, it will verify that we
        are in the final state of the parser (i.e. the end of the multipart
        message is well-formed), and, if not, throw an error.
        """
        # TODO: verify that we're in the state MultipartState.END, otherwise throw an
        # error or otherwise state that we're not finished parsing.
        pass

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(boundary={self.boundary!r})"
1509
1510
class FormParser:
    """This class is the all-in-one form parser. Given all the information
    necessary to parse a form, it will instantiate the correct parser, create
    the proper :class:`Field` and :class:`File` classes to store the data that
    is parsed, and call the two given callbacks with each field and file as
    they become available.

    :param content_type: The Content-Type of the incoming request. This is
                         used to select the appropriate parser.

    :param on_field: The callback to call when a field has been parsed and is
                     ready for usage. See above for parameters.

    :param on_file: The callback to call when a file has been parsed and is
                    ready for usage. See above for parameters.

    :param on_end: An optional callback to call when all fields and files in a
                   request has been parsed. Can be None.

    :param boundary: If the request is a multipart/form-data request, this
                     should be the boundary of the request, as given in the
                     Content-Type header, as a bytestring.

    :param file_name: If the request is of type application/octet-stream, then
                      the body of the request will not contain any information
                      about the uploaded file. In such cases, you can provide
                      the file name of the uploaded file manually.

    :param FileClass: The class to use for uploaded files. Defaults to
                      :class:`File`, but you can provide your own class if you
                      wish to customize behaviour. The class will be
                      instantiated as FileClass(file_name, field_name), and it
                      must provide the following functions::
                          file_instance.write(data)
                          file_instance.finalize()
                          file_instance.close()

    :param FieldClass: The class to use for uploaded fields. Defaults to
                       :class:`Field`, but you can provide your own class if
                       you wish to customize behaviour. The class will be
                       instantiated as FieldClass(field_name), and it must
                       provide the following functions::
                           field_instance.write(data)
                           field_instance.finalize()
                           field_instance.close()

    :param config: Configuration to use for this FormParser. The default
                   values are taken from the DEFAULT_CONFIG value, and then
                   any keys present in this dictionary will overwrite the
                   default values.

    """

    #: This is the default configuration for our form parser.
    #: Note: all file sizes should be in bytes.
    DEFAULT_CONFIG: FormParserConfig = {
        "MAX_BODY_SIZE": float("inf"),
        "MAX_MEMORY_FILE_SIZE": 1 * 1024 * 1024,
        "UPLOAD_DIR": None,
        "UPLOAD_KEEP_FILENAME": False,
        "UPLOAD_KEEP_EXTENSIONS": False,
        # Error on invalid Content-Transfer-Encoding?
        "UPLOAD_ERROR_ON_BAD_CTE": False,
    }

    def __init__(
        self,
        content_type,
        on_field,
        on_file,
        on_end=None,
        boundary=None,
        file_name=None,
        FileClass=File,
        FieldClass=Field,
        config: FormParserConfig | None = None,
    ):
        self.logger = logging.getLogger(__name__)

        # Save variables.
        self.content_type = content_type
        self.boundary = boundary
        self.bytes_received = 0
        self.parser = None

        # Save callbacks.
        self.on_field = on_field
        self.on_file = on_file
        self.on_end = on_end

        # Save classes. BUGFIX: the default classes were previously stored
        # here unconditionally, silently discarding caller-supplied
        # FileClass/FieldClass arguments.
        self.FileClass = FileClass
        self.FieldClass = FieldClass

        # Set configuration options. A `None` sentinel replaces the old
        # mutable `{}` default argument.
        self.config: FormParserConfig = self.DEFAULT_CONFIG.copy()
        self.config.update(config or {})

        # Depending on the Content-Type, we instantiate the correct parser.
        if content_type == "application/octet-stream":
            # The whole body is a single unnamed file; closures below share
            # it via `nonlocal` (this file is Python-3-only).
            file = None

            def on_start() -> None:
                nonlocal file
                file = FileClass(file_name, None, config=self.config)

            def on_data(data: bytes, start: int, end: int) -> None:
                file.write(data[start:end])

            def on_end() -> None:
                # Finalize the file itself.
                file.finalize()

                # Call our callback.
                on_file(file)

                # Call the on-end callback.
                if self.on_end is not None:
                    self.on_end()

            # Instantiate an octet-stream parser
            parser = OctetStreamParser(
                callbacks={"on_start": on_start, "on_data": on_data, "on_end": on_end},
                max_size=self.config["MAX_BODY_SIZE"],
            )

        elif content_type == "application/x-www-form-urlencoded" or content_type == "application/x-url-encoded":
            # Buffers the (possibly chunked) field name until data arrives.
            name_buffer: list[bytes] = []

            field = None

            def on_field_start() -> None:
                pass

            def on_field_name(data: bytes, start: int, end: int) -> None:
                name_buffer.append(data[start:end])

            def on_field_data(data: bytes, start: int, end: int) -> None:
                nonlocal field
                if field is None:
                    field = FieldClass(b"".join(name_buffer))
                    del name_buffer[:]
                field.write(data[start:end])

            def on_field_end() -> None:
                nonlocal field
                # Finalize and call callback.
                if field is None:
                    # If we get here, it's because there was no field data.
                    # We create a field, set it to None, and then continue.
                    field = FieldClass(b"".join(name_buffer))
                    del name_buffer[:]
                    field.set_none()

                field.finalize()
                on_field(field)
                field = None

            def on_end() -> None:
                if self.on_end is not None:
                    self.on_end()

            # Instantiate parser.
            parser = QuerystringParser(
                callbacks={
                    "on_field_start": on_field_start,
                    "on_field_name": on_field_name,
                    "on_field_data": on_field_data,
                    "on_field_end": on_field_end,
                    "on_end": on_end,
                },
                max_size=self.config["MAX_BODY_SIZE"],
            )

        elif content_type == "multipart/form-data":
            if boundary is None:
                self.logger.error("No boundary given")
                raise FormParserError("No boundary given")

            header_name: list[bytes] = []
            header_value: list[bytes] = []
            headers: dict[bytes, bytes] = {}

            # State shared between the header/data callbacks for the part
            # currently being parsed.
            part = None
            writer = None
            is_file = False

            def on_part_begin() -> None:
                pass

            def on_part_data(data: bytes, start: int, end: int):
                bytes_processed = writer.write(data[start:end])
                # TODO: check for error here.
                return bytes_processed

            def on_part_end() -> None:
                part.finalize()
                if is_file:
                    on_file(part)
                else:
                    on_field(part)

            def on_header_field(data: bytes, start: int, end: int) -> None:
                header_name.append(data[start:end])

            def on_header_value(data: bytes, start: int, end: int) -> None:
                header_value.append(data[start:end])

            def on_header_end() -> None:
                headers[b"".join(header_name)] = b"".join(header_value)
                del header_name[:]
                del header_value[:]

            def on_headers_finished() -> None:
                nonlocal part, writer, is_file
                # Reset the 'is file' flag.
                is_file = False

                # Parse the content-disposition header.
                # TODO: handle mixed case
                content_disp = headers.get(b"Content-Disposition")
                disp, options = parse_options_header(content_disp)

                # Get the field and filename.
                field_name = options.get(b"name")
                file_name = options.get(b"filename")
                # TODO: check for errors

                # Create the proper class.
                if file_name is None:
                    part = FieldClass(field_name)
                else:
                    part = FileClass(file_name, field_name, config=self.config)
                    is_file = True

                # Parse the given Content-Transfer-Encoding to determine what
                # we need to do with the incoming data.
                # TODO: check that we properly handle 8bit / 7bit encoding.
                transfer_encoding = headers.get(b"Content-Transfer-Encoding", b"7bit")

                if transfer_encoding == b"binary" or transfer_encoding == b"8bit" or transfer_encoding == b"7bit":
                    writer = part

                elif transfer_encoding == b"base64":
                    writer = Base64Decoder(part)

                elif transfer_encoding == b"quoted-printable":
                    writer = QuotedPrintableDecoder(part)

                else:
                    self.logger.warning("Unknown Content-Transfer-Encoding: %r", transfer_encoding)
                    if self.config["UPLOAD_ERROR_ON_BAD_CTE"]:
                        raise FormParserError('Unknown Content-Transfer-Encoding "{}"'.format(transfer_encoding))
                    else:
                        # If we aren't erroring, then we just treat this as an
                        # unencoded Content-Transfer-Encoding.
                        writer = part

            def on_end() -> None:
                writer.finalize()
                if self.on_end is not None:
                    self.on_end()

            # Instantiate a multipart parser.
            parser = MultipartParser(
                boundary,
                callbacks={
                    "on_part_begin": on_part_begin,
                    "on_part_data": on_part_data,
                    "on_part_end": on_part_end,
                    "on_header_field": on_header_field,
                    "on_header_value": on_header_value,
                    "on_header_end": on_header_end,
                    "on_headers_finished": on_headers_finished,
                    "on_end": on_end,
                },
                max_size=self.config["MAX_BODY_SIZE"],
            )

        else:
            self.logger.warning("Unknown Content-Type: %r", content_type)
            raise FormParserError("Unknown Content-Type: {}".format(content_type))

        self.parser = parser

    def write(self, data: bytes):
        """Write some data. The parser will forward this to the appropriate
        underlying parser.

        :param data: a bytestring
        """
        self.bytes_received += len(data)
        # TODO: check the parser's return value for errors?
        return self.parser.write(data)

    def finalize(self) -> None:
        """Finalize the parser."""
        if self.parser is not None and hasattr(self.parser, "finalize"):
            self.parser.finalize()

    def close(self) -> None:
        """Close the parser."""
        if self.parser is not None and hasattr(self.parser, "close"):
            self.parser.close()

    def __repr__(self) -> str:
        return "{}(content_type={!r}, parser={!r})".format(self.__class__.__name__, self.content_type, self.parser)
1819
1820
def create_form_parser(headers, on_field, on_file, trust_x_headers=False, config=None):
    """This function is a helper function to aid in creating a FormParser
    instances. Given a dictionary-like headers object, it will determine
    the correct information needed, instantiate a FormParser with the
    appropriate values and given callbacks, and then return the corresponding
    parser.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param trust_x_headers: Whether or not to trust information received from
                            certain X-Headers - for example, the file name from
                            X-File-Name. Defaults to False.

    :param config: Configuration variables to pass to the FormParser.

    :raises ValueError: If no Content-Type header is present.
    """
    content_type = headers.get("Content-Type")
    if content_type is None:
        logging.getLogger(__name__).warning("No Content-Type header given")
        raise ValueError("No Content-Type header given!")

    # Boundaries are optional (the FormParser will raise if one is needed
    # but not given).
    content_type, params = parse_options_header(content_type)
    boundary = params.get(b"boundary")

    # We need content_type to be a string, not a bytes object.
    content_type = content_type.decode("latin-1")

    # File names are optional. BUGFIX: honour the client-supplied X-File-Name
    # header only when the caller explicitly opted in via trust_x_headers;
    # previously the flag was accepted but ignored.
    file_name = headers.get("X-File-Name") if trust_x_headers else None

    # Instantiate a form parser. A `None` sentinel replaces the old mutable
    # `{}` default for config.
    form_parser = FormParser(
        content_type,
        on_field,
        on_file,
        boundary=boundary,
        file_name=file_name,
        config=config or {},
    )

    # Return our parser.
    return form_parser
1862
1863
def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576, **kwargs):
    """This function is useful if you just want to parse a request body,
    without too much work. Pass it a dictionary-like object of the request's
    headers, and a file-like object for the input stream, along with two
    callbacks that will get called whenever a field or file is parsed.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param input_stream: A file-like object that represents the request body.
                         The read() method must return bytestrings.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param chunk_size: The maximum size to read from the input stream and write
                       to the parser at one time. Defaults to 1 MiB.

    :param kwargs: Additional keyword arguments (e.g. ``trust_x_headers``,
                   ``config``) forwarded to :func:`create_form_parser`.
    """

    # Create our form parser. BUGFIX: previously **kwargs was accepted but
    # silently dropped; it is now forwarded.
    parser = create_form_parser(headers, on_field, on_file, **kwargs)

    # Read chunks of up to chunk_size bytes and write to the parser, but never
    # read more than the given Content-Length, if any.
    content_length = headers.get("Content-Length")
    if content_length is not None:
        content_length = int(content_length)
    else:
        content_length = float("inf")
    bytes_read = 0

    while True:
        # Read only up to the Content-Length given. BUGFIX: the chunk_size
        # parameter was previously ignored in favour of a hard-coded 1048576.
        max_readable = min(content_length - bytes_read, chunk_size)
        buff = input_stream.read(max_readable)

        # Write to the parser and update our length.
        parser.write(buff)
        bytes_read += len(buff)

        # If we get a buffer that's smaller than the size requested, or if we
        # have read up to our content length, we're done.
        if len(buff) != max_readable or bytes_read == content_length:
            break

    # Tell our parser that we're done writing data.
    parser.finalize()