Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/multipart/multipart.py: 17%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

719 statements  

1from __future__ import annotations 

2 

3import logging 

4import os 

5import shutil 

6import sys 

7import tempfile 

8from email.message import Message 

9from enum import IntEnum 

10from io import BytesIO 

11from numbers import Number 

12from typing import TYPE_CHECKING 

13 

14from .decoders import Base64Decoder, QuotedPrintableDecoder 

15from .exceptions import FileError, FormParserError, MultipartParseError, QuerystringParseError 

16 

if TYPE_CHECKING:  # pragma: no cover
    from typing import Callable, TypedDict

    # The TypedDicts below exist only for static type checking: they describe
    # the callback dictionaries and configuration dictionaries accepted by the
    # parsers and the File class in this module. ``total=False`` makes every
    # key optional, so callers may supply any subset.

    class QuerystringCallbacks(TypedDict, total=False):
        # Callbacks invoked by QuerystringParser. "Data" callbacks receive
        # (data, start, end); notification callbacks receive no arguments.
        on_field_start: Callable[[], None]
        on_field_name: Callable[[bytes, int, int], None]
        on_field_data: Callable[[bytes, int, int], None]
        on_field_end: Callable[[], None]
        on_end: Callable[[], None]

    class OctetStreamCallbacks(TypedDict, total=False):
        # Callbacks invoked by OctetStreamParser.
        on_start: Callable[[], None]
        on_data: Callable[[bytes, int, int], None]
        on_end: Callable[[], None]

    class MultipartCallbacks(TypedDict, total=False):
        # Callbacks invoked by MultipartParser.
        on_part_begin: Callable[[], None]
        on_part_data: Callable[[bytes, int, int], None]
        on_part_end: Callable[[], None]
        on_headers_begin: Callable[[], None]
        on_header_field: Callable[[bytes, int, int], None]
        on_header_value: Callable[[bytes, int, int], None]
        on_header_end: Callable[[], None]
        on_headers_finished: Callable[[], None]
        on_end: Callable[[], None]

    class FormParserConfig(TypedDict, total=False):
        # Configuration keys accepted by the form parser.
        UPLOAD_DIR: str | None
        UPLOAD_KEEP_FILENAME: bool
        UPLOAD_KEEP_EXTENSIONS: bool
        UPLOAD_ERROR_ON_BAD_CTE: bool
        MAX_MEMORY_FILE_SIZE: int
        MAX_BODY_SIZE: float

    class FileConfig(TypedDict, total=False):
        # Configuration keys accepted by the File class (see the File
        # docstring for the meaning and defaults of each key).
        UPLOAD_DIR: str | None
        UPLOAD_DELETE_TMP: bool
        UPLOAD_KEEP_FILENAME: bool
        UPLOAD_KEEP_EXTENSIONS: bool
        MAX_MEMORY_FILE_SIZE: int

57 

58 

# Unique sentinel marking "no value cached yet". A distinct object (rather
# than None) is required because None is itself a legitimate cached field
# value (see Field.set_none()).
_missing = object()

61 

62 

class QuerystringState(IntEnum):
    """Querystring parser states.

    These are used to keep track of the state of the parser, and are used to determine
    what to do when new data is encountered.
    """

    BEFORE_FIELD = 0  # expecting a new field; '&'/';' separators are skipped here
    FIELD_NAME = 1  # reading the field name, up to '=' or a separator
    FIELD_DATA = 2  # reading the field value, up to the next separator

73 

74 

class MultipartState(IntEnum):
    """Multipart parser states.

    These are used to keep track of the state of the parser, and are used to determine
    what to do when new data is encountered.
    """

    # The states follow the structure of a multipart message: the initial
    # boundary, then for each part its headers and data, and finally the
    # terminating boundary.
    START = 0
    START_BOUNDARY = 1
    HEADER_FIELD_START = 2
    HEADER_FIELD = 3
    HEADER_VALUE_START = 4
    HEADER_VALUE = 5
    HEADER_VALUE_ALMOST_DONE = 6
    HEADERS_ALMOST_DONE = 7
    PART_DATA_START = 8
    PART_DATA = 9
    PART_DATA_END = 10
    END = 11

94 

95 

# Bit flags used by the multipart parser while matching boundaries.
FLAG_PART_BOUNDARY = 1
FLAG_LAST_BOUNDARY = 2

# Get constants. Since iterating over a str on Python 2 gives you a 1-length
# string, but iterating over a bytes object on Python 3 gives you an integer,
# we need to save these constants (as integer byte values).
CR = b"\r"[0]
LF = b"\n"[0]
COLON = b":"[0]
SPACE = b" "[0]
HYPHEN = b"-"[0]
AMPERSAND = b"&"[0]
SEMICOLON = b";"[0]
LOWER_A = b"a"[0]
LOWER_Z = b"z"[0]
NULL = b"\x00"[0]

113 

114 

# Lower-casing a character is different, because of the difference between
# str on Py2, and bytes on Py3. Same with getting the ordinal value of a byte,
# and joining a list of bytes together.
# These functions abstract that.

def lower_char(c):
    """Return the ASCII lower-cased version of the integer byte value *c*.

    Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z'; other values are passed
    through with that bit set.
    """
    return 0x20 | c

121 

122 

def ord_char(c):
    """Identity helper: on Python 3, iterating bytes already yields ints."""
    return c

125 

126 

def join_bytes(b):
    """Join an iterable of integer byte values into a single bytes object.

    :param b: any iterable of ints in range(256) (e.g. a list, a bytes
              object, or a generator)
    """
    # bytes() accepts any iterable of ints directly; materializing an
    # intermediate list (the previous implementation) was unnecessary.
    return bytes(b)

129 

130 

def parse_options_header(value: str | bytes) -> tuple[bytes, dict[bytes, bytes]]:
    """
    Parses a Content-Type header into a value in the following format:
    (content_type, {parameters})
    """
    # Uses email.message.Message to parse the header as described in PEP 594.
    # Ref: https://peps.python.org/pep-0594/#cgi
    if not value:
        return b"", {}

    # If we are passed bytes, we assume that it conforms to WSGI, encoding in latin-1.
    if isinstance(value, bytes):  # pragma: no cover
        value = value.decode("latin-1")

    # For types
    assert isinstance(value, str), "Value should be a string by now"

    # With no parameters present, the whole (normalized) string is the type.
    if ";" not in value:
        return value.lower().strip().encode("latin-1"), {}

    # Let the email machinery split the value from its parameters.
    message = Message()
    message["content-type"] = value
    params = message.get_params()
    # If there were no parameters, this would have already returned above
    assert params, "At least the content type value should be present"

    content_type = params.pop(0)[0].encode("latin-1")
    options: dict[bytes, bytes] = {}
    for name, raw in params:
        # get_params() may return a 3-tuple (charset, language, value); the
        # last element is the actual value.
        # See: https://docs.python.org/3/library/email.compat32-message.html
        if isinstance(raw, tuple):
            raw = raw[-1]
        # IE6 workaround: it sends the full client-side path ("C:\..." or a
        # UNC "\\server\..." path) instead of just the filename.
        if name == "filename" and (raw[1:3] == ":\\" or raw[:2] == "\\\\"):
            raw = raw.rsplit("\\", 1)[-1]
        options[name.encode("latin-1")] = raw.encode("latin-1")
    return content_type, options

175 

176 

class Field:
    """A Field object represents a (parsed) form field. It represents a single
    field with a corresponding name and value.

    The name that a :class:`Field` will be instantiated with is the same name
    that would be found in the following HTML::

        <input name="name_goes_here" type="text"/>

    This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
    will be called when data is written to the Field, and when the Field is
    finalized, respectively.

    :param name: the name of the form field
    """

    def __init__(self, name: str):
        self._name = name
        self._value: list[bytes] = []

        # Cached joined version of _value, kept for speed. The module-level
        # sentinel `_missing` means "not computed yet"; None is a legitimate
        # cached value (see set_none()).
        self._cache = _missing

    @classmethod
    def from_value(cls, name: str, value: bytes | None) -> Field:
        """Create an instance of a :class:`Field`, and set the corresponding
        value - either None or an actual value. This method will also
        finalize the Field itself.

        :param name: the name of the form field
        :param value: the value of the form field - either a bytestring or
                      None
        """
        f = cls(name)
        if value is None:
            f.set_none()
        else:
            f.write(value)
        f.finalize()
        return f

    def write(self, data: bytes) -> int:
        """Write some data into the form field.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data: bytes) -> int:
        """This method is a callback that will be called whenever data is
        written to the Field.

        :param data: a bytestring
        """
        self._value.append(data)
        # New data invalidates any previously joined value.
        self._cache = _missing
        return len(data)

    def _join_cache(self) -> None:
        """Join the accumulated chunks into the cache, if not already done."""
        # This logic is shared by on_end(), close() and the value property.
        if self._cache is _missing:
            self._cache = b"".join(self._value)

    def on_end(self) -> None:
        """This method is called whenever the Field is finalized."""
        self._join_cache()

    def finalize(self) -> None:
        """Finalize the form field."""
        self.on_end()

    def close(self) -> None:
        """Close the Field object. This will free any underlying cache."""
        self._join_cache()
        # Free our value array.
        del self._value

    def set_none(self) -> None:
        """Some fields in a querystring can possibly have a value of None - for
        example, the string "foo&bar=&baz=asdf" will have a field with the
        name "foo" and value None, one with name "bar" and value "", and one
        with name "baz" and value "asdf". Since the write() interface doesn't
        support writing None, this function will set the field value to None.
        """
        self._cache = None

    @property
    def field_name(self) -> str:
        """This property returns the name of the field."""
        return self._name

    @property
    def value(self):
        """This property returns the value of the form field."""
        self._join_cache()
        return self._cache

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Field):
            return self.field_name == other.field_name and self.value == other.value
        else:
            return NotImplemented

    def __repr__(self) -> str:
        if len(self.value) > 97:
            # We get the repr, and then insert three dots before the final
            # quote.
            v = repr(self.value[:97])[:-1] + "...'"
        else:
            v = repr(self.value)

        return "{}(field_name={!r}, value={})".format(self.__class__.__name__, self.field_name, v)

290 

291 

class File:
    """This class represents an uploaded file. It handles writing file data to
    either an in-memory file or a temporary file on-disk, if the optional
    threshold is passed.

    There are some options that can be passed to the File to change behavior
    of the class. Valid options are as follows:

    .. list-table::
       :widths: 15 5 5 30
       :header-rows: 1

       * - Name
         - Type
         - Default
         - Description
       * - UPLOAD_DIR
         - `str`
         - None
         - The directory to store uploaded files in. If this is None, a
           temporary file will be created in the system's standard location.
       * - UPLOAD_DELETE_TMP
         - `bool`
         - True
         - Delete automatically created TMP file
       * - UPLOAD_KEEP_FILENAME
         - `bool`
         - False
         - Whether or not to keep the filename of the uploaded file. If True,
           then the filename will be converted to a safe representation (e.g.
           by removing any invalid path segments), and then saved with the
           same name). Otherwise, a temporary name will be used.
       * - UPLOAD_KEEP_EXTENSIONS
         - `bool`
         - False
         - Whether or not to keep the uploaded file's extension. If False, the
           file will be saved with the default temporary extension (usually
           ".tmp"). Otherwise, the file's extension will be maintained. Note
           that this will properly combine with the UPLOAD_KEEP_FILENAME
           setting.
       * - MAX_MEMORY_FILE_SIZE
         - `int`
         - 1 MiB
         - The maximum number of bytes of a File to keep in memory. By
           default, the contents of a File are kept into memory until a certain
           limit is reached, after which the contents of the File are written
           to a temporary file. This behavior can be disabled by setting this
           value to an appropriately large value (or, for example, infinity,
           such as `float('inf')`.

    :param file_name: The name of the file that this :class:`File` represents

    :param field_name: The field name that uploaded this file. Note that this
                       can be None, if, for example, the file was uploaded
                       with Content-Type application/octet-stream

    :param config: The configuration for this File. See above for valid
                   configuration keys and their corresponding values.
    """

    def __init__(self, file_name: bytes | None, field_name: bytes | None = None, config: FileConfig = {}):
        # Save configuration, set other variables default.
        # NOTE(review): `config` has a mutable `{}` default; this class only
        # ever reads it via .get(), so the shared default is harmless here.
        self.logger = logging.getLogger(__name__)
        self._config = config
        self._in_memory = True
        self._bytes_written = 0
        self._fileobj = BytesIO()

        # Save the provided field/file name.
        self._field_name = field_name
        self._file_name = file_name

        # Our actual file name is None by default, since, depending on our
        # config, we may not actually use the provided name.
        self._actual_file_name = None

        # Split the extension from the filename.
        if file_name is not None:
            base, ext = os.path.splitext(file_name)
            self._file_base = base
            self._ext = ext

    @property
    def field_name(self) -> bytes | None:
        """The form field associated with this file. May be None if there isn't
        one, for example when we have an application/octet-stream upload.
        """
        return self._field_name

    @property
    def file_name(self) -> bytes | None:
        """The file name given in the upload request."""
        return self._file_name

    @property
    def actual_file_name(self):
        """The file name that this file is saved as. Will be None if it's not
        currently saved on disk.
        """
        return self._actual_file_name

    @property
    def file_object(self):
        """The file object that we're currently writing to. Note that this
        will either be an instance of a :class:`io.BytesIO`, or a regular file
        object.
        """
        return self._fileobj

    @property
    def size(self):
        """The total size of this file, counted as the number of bytes that
        currently have been written to the file.
        """
        return self._bytes_written

    @property
    def in_memory(self) -> bool:
        """A boolean representing whether or not this file object is currently
        stored in-memory or on-disk.
        """
        return self._in_memory

    def flush_to_disk(self) -> None:
        """If the file is already on-disk, do nothing. Otherwise, copy from
        the in-memory buffer to a disk file, and then reassign our internal
        file object to this new disk file.

        Note that if you attempt to flush a file that is already on-disk, a
        warning will be logged to this module's logger.
        """
        if not self._in_memory:
            self.logger.warning("Trying to flush to disk when we're not in memory")
            return

        # Go back to the start of our file.
        self._fileobj.seek(0)

        # Open a new file.
        new_file = self._get_disk_file()

        # Copy the file objects.
        shutil.copyfileobj(self._fileobj, new_file)

        # Seek to the new position in our new file.
        new_file.seek(self._bytes_written)

        # Reassign the fileobject.
        old_fileobj = self._fileobj
        self._fileobj = new_file

        # We're no longer in memory.
        self._in_memory = False

        # Close the old file object.
        old_fileobj.close()

    def _get_disk_file(self):
        """This function is responsible for getting a file object on-disk for us."""
        self.logger.info("Opening a file on disk")

        file_dir = self._config.get("UPLOAD_DIR")
        keep_filename = self._config.get("UPLOAD_KEEP_FILENAME", False)
        keep_extensions = self._config.get("UPLOAD_KEEP_EXTENSIONS", False)
        delete_tmp = self._config.get("UPLOAD_DELETE_TMP", True)

        # If we have a directory and are to keep the filename...
        if file_dir is not None and keep_filename:
            self.logger.info("Saving with filename in: %r", file_dir)

            # Build our filename.
            # TODO: what happens if we don't have a filename?
            # NOTE(review): self._file_base comes from a bytes file_name;
            # os.path.join with a str UPLOAD_DIR would then raise TypeError
            # (mixed str/bytes paths) — confirm callers' types.
            fname = self._file_base
            if keep_extensions:
                fname = fname + self._ext

            path = os.path.join(file_dir, fname)
            try:
                self.logger.info("Opening file: %r", path)
                tmp_file = open(path, "w+b")
            except OSError:
                # The assignment below is vestigial: we raise immediately, so
                # tmp_file is never read after this point.
                tmp_file = None

                self.logger.exception("Error opening temporary file")
                raise FileError("Error opening temporary file: %r" % path)
        else:
            # Build options array.
            # Note that on Python 3, tempfile doesn't support byte names. We
            # encode our paths using the default filesystem encoding.
            options = {}
            if keep_extensions:
                ext = self._ext
                if isinstance(ext, bytes):
                    ext = ext.decode(sys.getfilesystemencoding())

                options["suffix"] = ext
            if file_dir is not None:
                d = file_dir
                if isinstance(d, bytes):
                    d = d.decode(sys.getfilesystemencoding())

                options["dir"] = d
            options["delete"] = delete_tmp

            # Create a temporary (named) file with the appropriate settings.
            self.logger.info("Creating a temporary file with options: %r", options)
            try:
                tmp_file = tempfile.NamedTemporaryFile(**options)
            except OSError:
                self.logger.exception("Error creating named temporary file")
                raise FileError("Error creating named temporary file")

            fname = tmp_file.name

            # Encode filename as bytes.
            if isinstance(fname, str):
                fname = fname.encode(sys.getfilesystemencoding())

        self._actual_file_name = fname
        return tmp_file

    def write(self, data: bytes):
        """Write some data to the File.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data: bytes):
        """This method is a callback that will be called whenever data is
        written to the File.

        :param data: a bytestring
        """
        pos = self._fileobj.tell()
        bwritten = self._fileobj.write(data)
        # true file objects write returns None
        if bwritten is None:
            bwritten = self._fileobj.tell() - pos

        # If the bytes written isn't the same as the length, just return.
        # NOTE(review): this early return deliberately skips updating
        # _bytes_written on a short write, so `size` only counts fully
        # written chunks.
        if bwritten != len(data):
            self.logger.warning("bwritten != len(data) (%d != %d)", bwritten, len(data))
            return bwritten

        # Keep track of how many bytes we've written.
        self._bytes_written += bwritten

        # If we're in-memory and are over our limit, we create a file.
        if (
            self._in_memory
            and self._config.get("MAX_MEMORY_FILE_SIZE") is not None
            and (self._bytes_written > self._config.get("MAX_MEMORY_FILE_SIZE"))
        ):
            self.logger.info("Flushing to disk")
            self.flush_to_disk()

        # Return the number of bytes written.
        return bwritten

    def on_end(self) -> None:
        """This method is called whenever the Field is finalized."""
        # Flush the underlying file object
        self._fileobj.flush()

    def finalize(self) -> None:
        """Finalize the form file. This will not close the underlying file,
        but simply signal that we are finished writing to the File.
        """
        self.on_end()

    def close(self) -> None:
        """Close the File object. This will actually close the underlying
        file object (whether it's a :class:`io.BytesIO` or an actual file
        object).
        """
        self._fileobj.close()

    def __repr__(self) -> str:
        return "{}(file_name={!r}, field_name={!r})".format(self.__class__.__name__, self.file_name, self.field_name)

572 

573 

class BaseParser:
    """This class is the base class for all parsers. It contains the logic for
    calling and adding callbacks.

    A callback can be one of two different forms. "Notification callbacks" are
    callbacks that are called when something happens - for example, when a new
    part of a multipart message is encountered by the parser. "Data callbacks"
    are called when we get some sort of data - for example, part of the body of
    a multipart chunk. Notification callbacks are called with no parameters,
    whereas data callbacks are called with three, as follows::

        data_callback(data, start, end)

    The "data" parameter is a bytestring (i.e. "foo" on Python 2, or b"foo" on
    Python 3). "start" and "end" are integer indexes into the "data" string
    that represent the data of interest. Thus, in a data callback, the slice
    `data[start:end]` represents the data that the callback is "interested in".
    The callback is not passed a copy of the data, since copying severely hurts
    performance.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def callback(self, name: str, data=None, start=None, end=None):
        """This function calls a provided callback with some data. If the
        callback is not set, will do nothing.

        :param name: The name of the callback to call (as a string).

        :param data: Data to pass to the callback. If None, then it is
                     assumed that the callback is a notification callback,
                     and no parameters are given.

        :param end: An integer that is passed to the data callback.

        :param start: An integer that is passed to the data callback.
        """
        full_name = "on_" + name
        func = self.callbacks.get(full_name)
        if func is None:
            return

        # Notification callbacks carry no payload at all.
        if data is None:
            self.logger.debug("Calling %s with no data", full_name)
            func()
            return

        # Zero-length data ranges are suppressed entirely.
        if start is not None and start == end:
            return

        self.logger.debug("Calling %s with data[%d:%d]", full_name, start, end)
        func(data, start, end)

    def set_callback(self, name: str, new_func):
        """Update the function for a callback. Removes from the callbacks dict
        if new_func is None.

        :param name: The name of the callback to call (as a string).

        :param new_func: The new function for the callback. If None, then the
                         callback will be removed (with no error if it does not
                         exist).
        """
        key = "on_" + name
        if new_func is None:
            self.callbacks.pop(key, None)
        else:
            self.callbacks[key] = new_func

    def close(self):
        pass  # pragma: no cover

    def finalize(self):
        pass  # pragma: no cover

    def __repr__(self):
        return "%s()" % self.__class__.__name__

652 

653 

class OctetStreamParser(BaseParser):
    """This parser parses an octet-stream request body and calls callbacks when
    incoming data is received. Callbacks are as follows:

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_start
         - None
         - Called when the first data is parsed.
       * - on_data
         - data, start, end
         - Called for each data chunk that is parsed.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(self, callbacks: OctetStreamCallbacks | None = None, max_size=float("inf")):
        super().__init__()
        # Use a fresh dict per instance: a mutable `{}` default argument would
        # be shared across all instances and mutated by set_callback().
        self.callbacks = callbacks if callbacks is not None else {}
        self._started = False

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

    def write(self, data: bytes):
        """Write some data to the parser, which will perform size verification,
        and then pass the data to the underlying callback.

        :param data: a bytestring
        :return: the number of bytes actually processed (may be less than
                 ``len(data)`` if the max_size limit was hit)
        """
        # Emit on_start exactly once, before the first chunk.
        if not self._started:
            self.callback("start")
            self._started = True

        # Truncate data length so we never process past max_size.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        # Increment size, then callback, in case there's an exception.
        self._current_size += data_len
        self.callback("data", data, 0, data_len)
        return data_len

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing,
        and sends the on_end callback.
        """
        self.callback("end")

    def __repr__(self) -> str:
        return "%s()" % self.__class__.__name__

729 

730 

class QuerystringParser(BaseParser):
    """This is a streaming querystring parser. It will consume data, and call
    the callbacks given when it has data.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_field_start
         - None
         - Called when a new field is encountered.
       * - on_field_name
         - data, start, end
         - Called when a portion of a field's name is encountered.
       * - on_field_data
         - data, start, end
         - Called when a portion of a field's data is encountered.
       * - on_field_end
         - None
         - Called when the end of a field is encountered.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param strict_parsing: Whether or not to parse the body strictly. Defaults
                           to False. If this is set to True, then the behavior
                           of the parser changes as the following: if a field
                           has a value with an equal sign (e.g. "foo=bar", or
                           "foo="), it is always included. If a field has no
                           equals sign (e.g. "...&name&..."), it will be
                           treated as an error if 'strict_parsing' is True,
                           otherwise included. If an error is encountered,
                           then a
                           :class:`multipart.exceptions.QuerystringParseError`
                           will be raised.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    state: QuerystringState

    def __init__(self, callbacks: QuerystringCallbacks | None = None, strict_parsing: bool = False, max_size=float("inf")):
        super().__init__()
        self.state = QuerystringState.BEFORE_FIELD
        self._found_sep = False

        # Use a fresh dict per instance: a mutable `{}` default argument would
        # be shared across all instances and mutated by set_callback().
        self.callbacks = callbacks if callbacks is not None else {}

        # Max-size stuff
        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

        # Should parsing be strict?
        self.strict_parsing = strict_parsing

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        parse into either a field name or value, and then pass the
        corresponding data to the underlying callback. If an error is
        encountered while parsing, a QuerystringParseError will be raised. The
        "offset" attribute of the raised exception will be set to the offset in
        the input data chunk (NOT the overall stream) that caused the error.

        :param data: a bytestring
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        # Track the processed count in a local so _current_size is updated
        # even if _internal_write raises a QuerystringParseError.
        bytes_processed = 0
        try:
            bytes_processed = self._internal_write(data, data_len)
        finally:
            self._current_size += bytes_processed

        return bytes_processed

    def _internal_write(self, data: bytes, length: int) -> int:
        """Run the state machine over ``data[:length]`` and fire callbacks.

        NOTE(review): this returns ``len(data)`` (the full chunk length) even
        when ``length`` was truncated by the max_size check — confirm this
        asymmetry with OctetStreamParser.write() is intended upstream.
        """
        state = self.state
        strict_parsing = self.strict_parsing
        found_sep = self._found_sep

        i = 0
        while i < length:
            ch = data[i]

            # Depending on our state...
            if state == QuerystringState.BEFORE_FIELD:
                # If the 'found_sep' flag is set, we've already encountered
                # and skipped a single separator. If so, we check our strict
                # parsing flag and decide what to do. Otherwise, we haven't
                # yet reached a separator, and thus, if we do, we need to skip
                # it as it will be the boundary between fields that's supposed
                # to be there.
                if ch == AMPERSAND or ch == SEMICOLON:
                    if found_sep:
                        # If we're parsing strictly, we disallow blank chunks.
                        if strict_parsing:
                            e = QuerystringParseError("Skipping duplicate ampersand/semicolon at %d" % i)
                            e.offset = i
                            raise e
                        else:
                            self.logger.debug("Skipping duplicate ampersand/semicolon at %d", i)
                    else:
                        # This case is when we're skipping the (first)
                        # separator between fields, so we just set our flag
                        # and continue on.
                        found_sep = True
                else:
                    # Emit a field-start event, and go to that state. Also,
                    # reset the "found_sep" flag, for the next time we get to
                    # this state.
                    self.callback("field_start")
                    i -= 1
                    state = QuerystringState.FIELD_NAME
                    found_sep = False

            elif state == QuerystringState.FIELD_NAME:
                # Try and find a separator - we ensure that, if we do, we only
                # look for the equal sign before it.
                sep_pos = data.find(b"&", i)
                if sep_pos == -1:
                    sep_pos = data.find(b";", i)

                # See if we can find an equals sign in the remaining data. If
                # so, we can immediately emit the field name and jump to the
                # data state.
                if sep_pos != -1:
                    equals_pos = data.find(b"=", i, sep_pos)
                else:
                    equals_pos = data.find(b"=", i)

                if equals_pos != -1:
                    # Emit this name.
                    self.callback("field_name", data, i, equals_pos)

                    # Jump i to this position. Note that it will then have 1
                    # added to it below, which means the next iteration of this
                    # loop will inspect the character after the equals sign.
                    i = equals_pos
                    state = QuerystringState.FIELD_DATA
                else:
                    # No equals sign found.
                    if not strict_parsing:
                        # See also comments in the QuerystringState.FIELD_DATA case below.
                        # If we found the separator, we emit the name and just
                        # end - there's no data callback at all (not even with
                        # a blank value).
                        if sep_pos != -1:
                            self.callback("field_name", data, i, sep_pos)
                            self.callback("field_end")

                            i = sep_pos - 1
                            state = QuerystringState.BEFORE_FIELD
                        else:
                            # Otherwise, no separator in this block, so the
                            # rest of this chunk must be a name.
                            self.callback("field_name", data, i, length)
                            i = length

                    else:
                        # We're parsing strictly. If we find a separator,
                        # this is an error - we require an equals sign.
                        if sep_pos != -1:
                            e = QuerystringParseError(
                                "When strict_parsing is True, we require an "
                                "equals sign in all field chunks. Did not "
                                "find one in the chunk that starts at %d" % (i,)
                            )
                            e.offset = i
                            raise e

                        # No separator in the rest of this chunk, so it's just
                        # a field name.
                        self.callback("field_name", data, i, length)
                        i = length

            elif state == QuerystringState.FIELD_DATA:
                # Try finding either an ampersand or a semicolon after this
                # position.
                sep_pos = data.find(b"&", i)
                if sep_pos == -1:
                    sep_pos = data.find(b";", i)

                # If we found it, callback this bit as data and then go back
                # to expecting to find a field.
                if sep_pos != -1:
                    self.callback("field_data", data, i, sep_pos)
                    self.callback("field_end")

                    # Note that we go to the separator, which brings us to the
                    # "before field" state. This allows us to properly emit
                    # "field_start" events only when we actually have data for
                    # a field of some sort.
                    i = sep_pos - 1
                    state = QuerystringState.BEFORE_FIELD

                # Otherwise, emit the rest as data and finish.
                else:
                    self.callback("field_data", data, i, length)
                    i = length

            else:  # pragma: no cover (error case)
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = QuerystringParseError(msg)
                e.offset = i
                raise e

            i += 1

        self.state = state
        self._found_sep = found_sep
        return len(data)

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing,
        if we're still in the middle of a field, an on_field_end callback, and
        then the on_end callback.
        """
        # If we're currently in the middle of a field, we finish it.
        if self.state == QuerystringState.FIELD_DATA:
            self.callback("field_end")
        self.callback("end")

    def __repr__(self) -> str:
        return "{}(strict_parsing={!r}, max_size={!r})".format(
            self.__class__.__name__, self.strict_parsing, self.max_size
        )

979 

980 

class MultipartParser(BaseParser):
    """This class is a streaming multipart/form-data parser.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_part_begin
         - None
         - Called when a new part of the multipart message is encountered.
       * - on_part_data
         - data, start, end
         - Called when a portion of a part's data is encountered.
       * - on_part_end
         - None
         - Called when the end of a part is reached.
       * - on_header_begin
         - None
         - Called when we've found a new header in a part of a multipart
           message
       * - on_header_field
         - data, start, end
         - Called each time an additional portion of a header is read (i.e. the
           part of the header that is before the colon; the "Foo" in
           "Foo: Bar").
       * - on_header_value
         - data, start, end
         - Called when we get data for a header.
       * - on_header_end
         - None
         - Called when the current header is finished - i.e. we've reached the
           newline at the end of the header.
       * - on_headers_finished
         - None
         - Called when all headers are finished, and before the part data
           starts.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.


    :param boundary: The multipart boundary. This is required, and must match
                     what is given in the HTTP request - usually in the
                     Content-Type header.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(
        self,
        boundary: bytes | str,
        callbacks: MultipartCallbacks | None = None,
        max_size=float("inf"),
    ):
        # Initialize parser state.
        super().__init__()
        self.state = MultipartState.START
        self.index = self.flags = 0

        # NOTE: `callbacks` previously used a mutable `{}` default, which is
        # shared between every call; a None sentinel avoids that pitfall.
        self.callbacks = callbacks if callbacks is not None else {}

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" % max_size)
        self.max_size = max_size
        self._current_size = 0

        # Setup marks.  These are used to track the state of data received.
        self.marks: dict[str, int] = {}

        # TODO: Actually use this rather than the dumb version we currently use
        # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
        # skip = [len(boundary) for x in range(256)]
        # for i in range(len(boundary) - 1):
        #     skip[ord_char(boundary[i])] = len(boundary) - i - 1
        #
        # # We use a tuple since it's a constant, and marginally faster.
        # self.skip = tuple(skip)

        # Save our boundary.
        if isinstance(boundary, str):  # pragma: no cover
            boundary = boundary.encode("latin-1")
        self.boundary = b"\r\n--" + boundary

        # Get a set of characters that belong to our boundary.
        self.boundary_chars = frozenset(self.boundary)

        # We also create a lookbehind list.
        # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
        # "--\r\n" at the final boundary, and the length of '\r\n--' and
        # '--\r\n' is 8 bytes.
        self.lookbehind = [NULL for x in range(len(boundary) + 8)]

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        and then parse the data into the appropriate location (e.g. header,
        data, etc.), and pass this on to the underlying callback.  If an error
        is encountered, a MultipartParseError will be raised.  The "offset"
        attribute on the raised exception will be set to the offset of the byte
        in the input chunk that caused the error.

        :param data: a bytestring
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning(
                "Current size is %d (max %d), so truncating data length from %d to %d",
                self._current_size,
                self.max_size,
                data_len,
                new_size,
            )
            data_len = new_size

        # Track bytes consumed even when _internal_write raises, so that
        # _current_size stays accurate across a partial parse.
        bytes_processed = 0
        try:
            bytes_processed = self._internal_write(data, data_len)
        finally:
            self._current_size += bytes_processed

        return bytes_processed

    def _internal_write(self, data: bytes, length: int) -> int:
        # Get values from locals.
        boundary = self.boundary

        # Get our state, flags and index.  These are persisted between calls
        # to this function.
        state = self.state
        index = self.index
        flags = self.flags

        # Our index defaults to 0.
        i = 0

        # Set a mark.
        def set_mark(name: str) -> None:
            self.marks[name] = i

        # Remove a mark.
        def delete_mark(name: str) -> None:
            self.marks.pop(name, None)

        # Helper function that makes calling a callback with data easier. The
        # 'remaining' parameter will callback from the marked value until the
        # end of the buffer, and reset the mark, instead of deleting it.  This
        # is used at the end of the function to call our callbacks with any
        # remaining data in this chunk.
        def data_callback(name: str, remaining: bool = False) -> None:
            marked_index = self.marks.get(name)
            if marked_index is None:
                return

            # If we're getting remaining data, we ignore the current i value
            # and just call with the remaining data.
            if remaining:
                self.callback(name, data, marked_index, length)
                self.marks[name] = 0

            # Otherwise, we call it from the mark to the current byte we're
            # processing.
            else:
                self.callback(name, data, marked_index, i)
                self.marks.pop(name, None)

        # For each byte...
        while i < length:
            c = data[i]

            if state == MultipartState.START:
                # Skip leading newlines.  (No per-byte logging here: a long
                # CR/LF preamble would otherwise generate one log record per
                # byte.)
                if c == CR or c == LF:
                    i += 1
                    continue

                # index is used as in index into our boundary.  Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = MultipartState.START_BOUNDARY
                i -= 1

            elif state == MultipartState.START_BOUNDARY:
                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback("part_begin")

                    # Move to the next character and state.
                    state = MultipartState.HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == MultipartState.HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark("header_field")

                # Move to parsing header fields.
                state = MultipartState.HEADER_FIELD
                i -= 1

            elif state == MultipartState.HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark("header_field")
                    state = MultipartState.HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback("header_field")

                    # Move to parsing the header value.
                    state = MultipartState.HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter.  If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == MultipartState.HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark("header_value")

                # Move to the header-value state, reprocessing this character.
                state = MultipartState.HEADER_VALUE
                i -= 1

            elif state == MultipartState.HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers.  Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback("header_value")
                    self.callback("header_end")
                    state = MultipartState.HEADER_VALUE_ALMOST_DONE

            elif state == MultipartState.HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF.  If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header.  Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = MultipartState.HEADER_FIELD_START

            elif state == MultipartState.HEADERS_ALMOST_DONE:
                # We're almost done our headers.  This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = f"Did not find LF at end of headers (found {c!r})"
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback("headers_finished")
                state = MultipartState.PART_DATA_START

            elif state == MultipartState.PART_DATA_START:
                # Mark the start of our part data.
                set_mark("part_data")

                # Start processing part data, including this character.
                state = MultipartState.PART_DATA
                i -= 1

            elif state == MultipartState.PART_DATA:
                # We're processing our part data right now.  During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index.  We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here.  If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback("part_data")

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= ~FLAG_PART_BOUNDARY

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback("part_end")
                            self.callback("part_begin")

                            # Move to parsing new headers.
                            index = 0
                            state = MultipartState.HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match.  Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= ~FLAG_PART_BOUNDARY

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback("part_end")
                            self.callback("end")
                            state = MultipartState.END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0.  If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback("part_data", lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark("part_data")

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == MultipartState.END:
                # Everything after the final boundary is consumed with no
                # other effect, so log at most once per chunk and then skip
                # ahead; logging every trailing byte would let a malformed
                # (or malicious) trailer flood the logs.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)
                    # Fast-forward: the rest of this chunk would be consumed
                    # one byte at a time with no callbacks either way.
                    i = length - 1

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow!  Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data.  Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found.  This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted.  And, as such, it implies
        # that we haven't yet reached the end of this 'thing'.  So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback("header_field", True)
        data_callback("header_value", True)
        data_callback("part_data", True)

        # Save values to locals.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing.

        Note: It does not currently, but in the future, it will verify that we
        are in the final state of the parser (i.e. the end of the multipart
        message is well-formed), and, if not, throw an error.
        """
        # TODO: verify that we're in the state MultipartState.END, otherwise throw an
        # error or otherwise state that we're not finished parsing.
        pass

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(boundary={self.boundary!r})"

1509 

1510 

class FormParser:
    """This class is the all-in-one form parser.  Given all the information
    necessary to parse a form, it will instantiate the correct parser, create
    the proper :class:`Field` and :class:`File` classes to store the data that
    is parsed, and call the two given callbacks with each field and file as
    they become available.

    :param content_type: The Content-Type of the incoming request.  This is
                         used to select the appropriate parser.

    :param on_field: The callback to call when a field has been parsed and is
                     ready for usage.  See above for parameters.

    :param on_file: The callback to call when a file has been parsed and is
                    ready for usage.  See above for parameters.

    :param on_end: An optional callback to call when all fields and files in a
                   request has been parsed.  Can be None.

    :param boundary: If the request is a multipart/form-data request, this
                     should be the boundary of the request, as given in the
                     Content-Type header, as a bytestring.

    :param file_name: If the request is of type application/octet-stream, then
                      the body of the request will not contain any information
                      about the uploaded file.  In such cases, you can provide
                      the file name of the uploaded file manually.

    :param FileClass: The class to use for uploaded files.  Defaults to
                      :class:`File`, but you can provide your own class if you
                      wish to customize behaviour.  The class will be
                      instantiated as FileClass(file_name, field_name), and it
                      must provide the following functions::
                          file_instance.write(data)
                          file_instance.finalize()
                          file_instance.close()

    :param FieldClass: The class to use for uploaded fields.  Defaults to
                       :class:`Field`, but you can provide your own class if
                       you wish to customize behaviour.  The class will be
                       instantiated as FieldClass(field_name), and it must
                       provide the following functions::
                           field_instance.write(data)
                           field_instance.finalize()
                           field_instance.close()

    :param config: Configuration to use for this FormParser.  The default
                   values are taken from the DEFAULT_CONFIG value, and then
                   any keys present in this dictionary will overwrite the
                   default values.

    """

    #: This is the default configuration for our form parser.
    #: Note: all file sizes should be in bytes.
    DEFAULT_CONFIG: FormParserConfig = {
        "MAX_BODY_SIZE": float("inf"),
        "MAX_MEMORY_FILE_SIZE": 1 * 1024 * 1024,
        "UPLOAD_DIR": None,
        "UPLOAD_KEEP_FILENAME": False,
        "UPLOAD_KEEP_EXTENSIONS": False,
        # Error on invalid Content-Transfer-Encoding?
        "UPLOAD_ERROR_ON_BAD_CTE": False,
    }

    def __init__(
        self,
        content_type,
        on_field,
        on_file,
        on_end=None,
        boundary=None,
        file_name=None,
        FileClass=File,
        FieldClass=Field,
        config: FormParserConfig | None = None,
    ):
        self.logger = logging.getLogger(__name__)

        # Save variables.
        self.content_type = content_type
        self.boundary = boundary
        self.bytes_received = 0
        self.parser = None

        # Save callbacks.
        self.on_field = on_field
        self.on_file = on_file
        self.on_end = on_end

        # Save classes.  Bug fix: these previously stored the module-level
        # defaults (File / Field) unconditionally, silently discarding any
        # custom FileClass / FieldClass passed by the caller.
        self.FileClass = FileClass
        self.FieldClass = FieldClass

        # Set configuration options.  `config` uses a None sentinel rather
        # than a shared mutable `{}` default; a per-instance copy of the
        # defaults is updated with the caller's overrides.
        self.config = self.DEFAULT_CONFIG.copy()
        self.config.update(config or {})

        # Depending on the Content-Type, we instantiate the correct parser.
        if content_type == "application/octet-stream":
            # Mutable holder shared by the callback closures below (a
            # historical stand-in for 'nonlocal').
            class vars:
                f = None

            def on_start() -> None:
                vars.f = FileClass(file_name, None, config=self.config)

            def on_data(data: bytes, start: int, end: int) -> None:
                vars.f.write(data[start:end])

            def on_end() -> None:
                # Finalize the file itself.
                vars.f.finalize()

                # Call our callback.
                on_file(vars.f)

                # Call the on-end callback.
                if self.on_end is not None:
                    self.on_end()

            # Instantiate an octet-stream parser
            parser = OctetStreamParser(
                callbacks={"on_start": on_start, "on_data": on_data, "on_end": on_end},
                max_size=self.config["MAX_BODY_SIZE"],
            )

        elif content_type == "application/x-www-form-urlencoded" or content_type == "application/x-url-encoded":
            name_buffer: list[bytes] = []

            class vars:
                f = None

            def on_field_start() -> None:
                pass

            def on_field_name(data: bytes, start: int, end: int) -> None:
                name_buffer.append(data[start:end])

            def on_field_data(data: bytes, start: int, end: int) -> None:
                if vars.f is None:
                    vars.f = FieldClass(b"".join(name_buffer))
                    del name_buffer[:]
                vars.f.write(data[start:end])

            def on_field_end() -> None:
                # Finalize and call callback.
                if vars.f is None:
                    # If we get here, it's because there was no field data.
                    # We create a field, set it to None, and then continue.
                    vars.f = FieldClass(b"".join(name_buffer))
                    del name_buffer[:]
                    vars.f.set_none()

                vars.f.finalize()
                on_field(vars.f)
                vars.f = None

            def on_end() -> None:
                if self.on_end is not None:
                    self.on_end()

            # Instantiate parser.
            parser = QuerystringParser(
                callbacks={
                    "on_field_start": on_field_start,
                    "on_field_name": on_field_name,
                    "on_field_data": on_field_data,
                    "on_field_end": on_field_end,
                    "on_end": on_end,
                },
                max_size=self.config["MAX_BODY_SIZE"],
            )

        elif content_type == "multipart/form-data":
            if boundary is None:
                self.logger.error("No boundary given")
                raise FormParserError("No boundary given")

            header_name: list[bytes] = []
            header_value: list[bytes] = []
            headers = {}

            # Mutable holder shared by the callback closures below.
            class vars:
                f = None
                writer = None
                is_file = False

            def on_part_begin():
                pass

            def on_part_data(data: bytes, start: int, end: int):
                bytes_processed = vars.writer.write(data[start:end])
                # TODO: check for error here.
                return bytes_processed

            def on_part_end() -> None:
                vars.f.finalize()
                if vars.is_file:
                    on_file(vars.f)
                else:
                    on_field(vars.f)

            def on_header_field(data: bytes, start: int, end: int):
                header_name.append(data[start:end])

            def on_header_value(data: bytes, start: int, end: int):
                header_value.append(data[start:end])

            def on_header_end():
                headers[b"".join(header_name)] = b"".join(header_value)
                del header_name[:]
                del header_value[:]

            def on_headers_finished() -> None:
                # Reset the 'is file' flag.
                vars.is_file = False

                # Parse the content-disposition header.
                # TODO: handle mixed case
                content_disp = headers.get(b"Content-Disposition")
                disp, options = parse_options_header(content_disp)

                # Get the field and filename.
                field_name = options.get(b"name")
                file_name = options.get(b"filename")
                # TODO: check for errors

                # Create the proper class.
                if file_name is None:
                    vars.f = FieldClass(field_name)
                else:
                    vars.f = FileClass(file_name, field_name, config=self.config)
                    vars.is_file = True

                # Parse the given Content-Transfer-Encoding to determine what
                # we need to do with the incoming data.
                # TODO: check that we properly handle 8bit / 7bit encoding.
                transfer_encoding = headers.get(b"Content-Transfer-Encoding", b"7bit")

                if transfer_encoding == b"binary" or transfer_encoding == b"8bit" or transfer_encoding == b"7bit":
                    vars.writer = vars.f

                elif transfer_encoding == b"base64":
                    vars.writer = Base64Decoder(vars.f)

                elif transfer_encoding == b"quoted-printable":
                    vars.writer = QuotedPrintableDecoder(vars.f)

                else:
                    self.logger.warning("Unknown Content-Transfer-Encoding: %r", transfer_encoding)
                    if self.config["UPLOAD_ERROR_ON_BAD_CTE"]:
                        raise FormParserError('Unknown Content-Transfer-Encoding "{}"'.format(transfer_encoding))
                    else:
                        # If we aren't erroring, then we just treat this as an
                        # unencoded Content-Transfer-Encoding.
                        vars.writer = vars.f

            def on_end() -> None:
                vars.writer.finalize()
                if self.on_end is not None:
                    self.on_end()

            # Instantiate a multipart parser.
            parser = MultipartParser(
                boundary,
                callbacks={
                    "on_part_begin": on_part_begin,
                    "on_part_data": on_part_data,
                    "on_part_end": on_part_end,
                    "on_header_field": on_header_field,
                    "on_header_value": on_header_value,
                    "on_header_end": on_header_end,
                    "on_headers_finished": on_headers_finished,
                    "on_end": on_end,
                },
                max_size=self.config["MAX_BODY_SIZE"],
            )

        else:
            self.logger.warning("Unknown Content-Type: %r", content_type)
            raise FormParserError("Unknown Content-Type: {}".format(content_type))

        self.parser = parser

    def write(self, data: bytes):
        """Write some data.  The parser will forward this to the appropriate
        underlying parser.

        :param data: a bytestring
        """
        self.bytes_received += len(data)
        # TODO: check the parser's return value for errors?
        return self.parser.write(data)

    def finalize(self) -> None:
        """Finalize the parser."""
        if self.parser is not None and hasattr(self.parser, "finalize"):
            self.parser.finalize()

    def close(self) -> None:
        """Close the parser."""
        if self.parser is not None and hasattr(self.parser, "close"):
            self.parser.close()

    def __repr__(self) -> str:
        return "{}(content_type={!r}, parser={!r})".format(self.__class__.__name__, self.content_type, self.parser)

1819 

1820 

def create_form_parser(headers, on_field, on_file, trust_x_headers=False, config=None):
    """This function is a helper function to aid in creating a FormParser
    instances.  Given a dictionary-like headers object, it will determine
    the correct information needed, instantiate a FormParser with the
    appropriate values and given callbacks, and then return the corresponding
    parser.

    :param headers: A dictionary-like object of HTTP headers.  The only
                    required header is Content-Type.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param trust_x_headers: Whether or not to trust information received from
                            certain X-Headers - for example, the file name from
                            X-File-Name.

    :param config: Configuration variables to pass to the FormParser.

    :raises ValueError: If no Content-Type header is present.
    """
    content_type = headers.get("Content-Type")
    if content_type is None:
        logging.getLogger(__name__).warning("No Content-Type header given")
        raise ValueError("No Content-Type header given!")

    # Boundaries are optional (the FormParser will raise if one is needed
    # but not given).
    content_type, params = parse_options_header(content_type)
    boundary = params.get(b"boundary")

    # We need content_type to be a string, not a bytes object.
    content_type = content_type.decode("latin-1")

    # File names are optional, and - per the documented contract - the
    # client-supplied X-File-Name header is only honoured when the caller
    # explicitly opts in.  (Bug fix: `trust_x_headers` was previously
    # accepted but never consulted.)
    file_name = headers.get("X-File-Name") if trust_x_headers else None

    # Instantiate a form parser.  `config` uses a None sentinel instead of a
    # shared mutable `{}` default.
    form_parser = FormParser(
        content_type,
        on_field,
        on_file,
        boundary=boundary,
        file_name=file_name,
        config=config if config is not None else {},
    )

    # Return our parser.
    return form_parser

1862 

1863 

def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576, **kwargs):
    """Parse a request body in one call.

    Pass a dictionary-like object of the request's headers and a file-like
    object for the input stream, along with two callbacks that will get
    called whenever a field or file is parsed.

    :param headers: A dictionary-like object of HTTP headers.  The only
                    required header is Content-Type.
    :param input_stream: A file-like object that represents the request body.
                         The read() method must return bytestrings.
    :param on_field: Callback to call with each parsed field.
    :param on_file: Callback to call with each parsed file.
    :param chunk_size: The maximum size to read from the input stream and
                       write to the parser at one time.  Defaults to 1 MiB.
    """
    # NOTE(review): **kwargs is accepted but not forwarded anywhere in the
    # visible implementation; confirm intent with callers.

    # Create our form parser.
    parser = create_form_parser(headers, on_field, on_file)

    # Read chunks of up to ``chunk_size`` bytes and write to the parser, but
    # never read more than the given Content-Length, if any.
    content_length = headers.get("Content-Length")
    if content_length is not None:
        content_length = int(content_length)
    else:
        content_length = float("inf")
    bytes_read = 0

    while True:
        # Read only up to the Content-Length given.  The hardcoded 1048576
        # here previously ignored the chunk_size parameter; use it instead.
        max_readable = min(content_length - bytes_read, chunk_size)
        buff = input_stream.read(max_readable)

        # Write to the parser and update our length.
        parser.write(buff)
        bytes_read += len(buff)

        # If we get a buffer that's smaller than the size requested, or if we
        # have read up to our content length, we're done.
        if len(buff) != max_readable or bytes_read == content_length:
            break

    # Tell our parser that we're done writing data.
    parser.finalize()