Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/multipart/multipart.py: 17%

717 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-26 06:12 +0000

1from .decoders import * 

2from .exceptions import * 

3 

4import os 

5import re 

6import sys 

7import shutil 

8import logging 

9import tempfile 

10from io import BytesIO 

11from numbers import Number 

12 

# Unique missing object.
_missing = object()

# States for the querystring parser.
STATE_BEFORE_FIELD = 0
STATE_FIELD_NAME = 1
STATE_FIELD_DATA = 2

# States for the multipart parser
STATE_START = 0
STATE_START_BOUNDARY = 1
STATE_HEADER_FIELD_START = 2
STATE_HEADER_FIELD = 3
STATE_HEADER_VALUE_START = 4
STATE_HEADER_VALUE = 5
STATE_HEADER_VALUE_ALMOST_DONE = 6
STATE_HEADERS_ALMOST_DONE = 7
STATE_PART_DATA_START = 8
STATE_PART_DATA = 9
STATE_PART_DATA_END = 10
STATE_END = 11

# Human-readable names for the multipart parser states, indexed by the
# STATE_* constants above (used for debugging/logging).
# NOTE: index 7 previously read "HEADRES_ALMOST_DONE" (typo); fixed to match
# STATE_HEADERS_ALMOST_DONE.
STATES = [
    "START",
    "START_BOUNDARY", "HEADER_FIELD_START", "HEADER_FIELD", "HEADER_VALUE_START", "HEADER_VALUE",
    "HEADER_VALUE_ALMOST_DONE", "HEADERS_ALMOST_DONE", "PART_DATA_START", "PART_DATA", "PART_DATA_END", "END"
]


# Flags for the multipart parser.
FLAG_PART_BOUNDARY = 1
FLAG_LAST_BOUNDARY = 2

# Get constants. Since iterating over a str on Python 2 gives you a 1-length
# string, but iterating over a bytes object on Python 3 gives you an integer,
# we need to save these constants.
CR = b'\r'[0]
LF = b'\n'[0]
COLON = b':'[0]
SPACE = b' '[0]
HYPHEN = b'-'[0]
AMPERSAND = b'&'[0]
SEMICOLON = b';'[0]
LOWER_A = b'a'[0]
LOWER_Z = b'z'[0]
NULL = b'\x00'[0]

# Lower-casing a character is different, because of the difference between
# str on Py2, and bytes on Py3. Same with getting the ordinal value of a byte,
# and joining a list of bytes together.
# These functions abstract that.
lower_char = lambda c: c | 0x20
ord_char = lambda c: c
join_bytes = lambda b: bytes(list(b))

# These are regexes for parsing header values.
SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t')
QUOTED_STR = br'"(?:\\.|[^"])*"'
VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')'
OPTION_RE_STR = (
    br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')'
)
OPTION_RE = re.compile(OPTION_RE_STR)
QUOTE = b'"'[0]

77 

78 

def parse_options_header(value):
    """Parse a Content-Type-style header into ``(content_type, {parameters})``.

    :param value: the raw header value, as ``bytes`` (or a ``str`` that is
                  latin-1 encodable, per WSGI)
    :return: a 2-tuple of the lower-cased, stripped content type (bytes) and
             a dict mapping lower-cased parameter names to their (unquoted)
             values, both as bytes
    """
    if not value:
        return (b'', {})

    # If we are passed a string, we assume that it conforms to WSGI and does
    # not contain any code point that's not in latin-1.
    if isinstance(value, str):  # pragma: no cover
        value = value.encode('latin-1')

    # If we have no options, return the string as-is.
    if b';' not in value:
        return (value.lower().strip(), {})

    # Split at the first semicolon, to get our value and then options.
    ctype, rest = value.split(b';', 1)
    options = {}

    # Parse the options.
    for match in OPTION_RE.finditer(rest):
        key = match.group(1).lower()
        value = match.group(2)
        if value[0] == QUOTE and value[-1] == QUOTE:
            # Unquote the value, collapsing backslash escapes.
            value = value[1:-1]
            value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')

        # If the value is a filename, we need to fix a bug on IE6 that sends
        # the full file path instead of the filename.
        if key == b'filename':
            if value[1:3] == b':\\' or value[:2] == b'\\\\':
                value = value.split(b'\\')[-1]

        options[key] = value

    # Normalize the content type the same way as the no-options path above,
    # so e.g. b'TEXT/html; charset=utf-8' and b'text/html' agree.
    return ctype.lower().strip(), options

118 

119 

class Field:
    """A Field object represents a (parsed) form field. It represents a single
    field with a corresponding name and value.

    The name that a :class:`Field` will be instantiated with is the same name
    that would be found in the following HTML::

        <input name="name_goes_here" type="text"/>

    This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
    will be called when data is written to the Field, and when the Field is
    finalized, respectively.

    :param name: the name of the form field
    """
    def __init__(self, name):
        self._name = name
        self._value = []

        # We cache the joined version of _value for speed.
        self._cache = _missing

    @classmethod
    def from_value(klass, name, value):
        """Create an instance of a :class:`Field`, and set the corresponding
        value - either None or an actual value. This method will also
        finalize the Field itself.

        :param name: the name of the form field
        :param value: the value of the form field - either a bytestring or
                      None
        """

        f = klass(name)
        if value is None:
            f.set_none()
        else:
            f.write(value)
        f.finalize()
        return f

    def write(self, data):
        """Write some data into the form field.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data):
        """This method is a callback that will be called whenever data is
        written to the Field.

        :param data: a bytestring
        :return: the number of bytes accepted
        """
        self._value.append(data)
        # Invalidate the cached joined value.
        self._cache = _missing
        return len(data)

    def on_end(self):
        """This method is called whenever the Field is finalized.
        """
        if self._cache is _missing:
            self._cache = b''.join(self._value)

    def finalize(self):
        """Finalize the form field.
        """
        self.on_end()

    def close(self):
        """Close the Field object. This will free any underlying cache.
        """
        # Ensure the joined value is cached before freeing the chunk list.
        if self._cache is _missing:
            self._cache = b''.join(self._value)

        del self._value

    def set_none(self):
        """Some fields in a querystring can possibly have a value of None - for
        example, the string "foo&bar=&baz=asdf" will have a field with the
        name "foo" and value None, one with name "bar" and value "", and one
        with name "baz" and value "asdf". Since the write() interface doesn't
        support writing None, this function will set the field value to None.
        """
        self._cache = None

    @property
    def field_name(self):
        """This property returns the name of the field."""
        return self._name

    @property
    def value(self):
        """This property returns the value of the form field."""
        if self._cache is _missing:
            self._cache = b''.join(self._value)

        return self._cache

    def __eq__(self, other):
        if isinstance(other, Field):
            return (
                self.field_name == other.field_name and
                self.value == other.value
            )
        else:
            return NotImplemented

    def __repr__(self):
        if self.value is None:
            # A field created via set_none() has no value; len()/slicing
            # would raise a TypeError, so handle this case explicitly.
            v = 'None'
        elif len(self.value) > 97:
            # We get the repr, and then insert three dots before the final
            # quote.
            v = repr(self.value[:97])[:-1] + "...'"
        else:
            v = repr(self.value)

        return "{}(field_name={!r}, value={})".format(
            self.__class__.__name__,
            self.field_name,
            v
        )

242 

243 

class File:
    """This class represents an uploaded file. It handles writing file data to
    either an in-memory file or a temporary file on-disk, if the optional
    threshold is passed.

    There are some options that can be passed to the File to change behavior
    of the class. Valid options are as follows:

    .. list-table::
       :widths: 15 5 5 30
       :header-rows: 1

       * - Name
         - Type
         - Default
         - Description
       * - UPLOAD_DIR
         - `str`
         - None
         - The directory to store uploaded files in. If this is None, a
           temporary file will be created in the system's standard location.
       * - UPLOAD_DELETE_TMP
         - `bool`
         - True
         - Delete automatically created TMP file
       * - UPLOAD_KEEP_FILENAME
         - `bool`
         - False
         - Whether or not to keep the filename of the uploaded file. If True,
           then the filename will be converted to a safe representation (e.g.
           by removing any invalid path segments), and then saved with the
           same name). Otherwise, a temporary name will be used.
       * - UPLOAD_KEEP_EXTENSIONS
         - `bool`
         - False
         - Whether or not to keep the uploaded file's extension. If False, the
           file will be saved with the default temporary extension (usually
           ".tmp"). Otherwise, the file's extension will be maintained. Note
           that this will properly combine with the UPLOAD_KEEP_FILENAME
           setting.
       * - MAX_MEMORY_FILE_SIZE
         - `int`
         - 1 MiB
         - The maximum number of bytes of a File to keep in memory. By
           default, the contents of a File are kept into memory until a certain
           limit is reached, after which the contents of the File are written
           to a temporary file. This behavior can be disabled by setting this
           value to an appropriately large value (or, for example, infinity,
           such as `float('inf')`.

    :param file_name: The name of the file that this :class:`File` represents

    :param field_name: The field name that uploaded this file. Note that this
                       can be None, if, for example, the file was uploaded
                       with Content-Type application/octet-stream

    :param config: The configuration for this File. See above for valid
                   configuration keys and their corresponding values.
    """
    def __init__(self, file_name, field_name=None, config=None):
        # NOTE: ``config`` previously defaulted to a shared mutable dict
        # (``config={}``); use None as the default so instances never share
        # (and can never accidentally mutate) a common configuration object.
        if config is None:
            config = {}

        # Save configuration, set other variables default.
        self.logger = logging.getLogger(__name__)
        self._config = config
        self._in_memory = True
        self._bytes_written = 0
        self._fileobj = BytesIO()

        # Save the provided field/file name.
        self._field_name = field_name
        self._file_name = file_name

        # Our actual file name is None by default, since, depending on our
        # config, we may not actually use the provided name.
        self._actual_file_name = None

        # Split the extension from the filename.
        if file_name is not None:
            base, ext = os.path.splitext(file_name)
            self._file_base = base
            self._ext = ext

    @property
    def field_name(self):
        """The form field associated with this file. May be None if there isn't
        one, for example when we have an application/octet-stream upload.
        """
        return self._field_name

    @property
    def file_name(self):
        """The file name given in the upload request.
        """
        return self._file_name

    @property
    def actual_file_name(self):
        """The file name that this file is saved as. Will be None if it's not
        currently saved on disk.
        """
        return self._actual_file_name

    @property
    def file_object(self):
        """The file object that we're currently writing to. Note that this
        will either be an instance of a :class:`io.BytesIO`, or a regular file
        object.
        """
        return self._fileobj

    @property
    def size(self):
        """The total size of this file, counted as the number of bytes that
        currently have been written to the file.
        """
        return self._bytes_written

    @property
    def in_memory(self):
        """A boolean representing whether or not this file object is currently
        stored in-memory or on-disk.
        """
        return self._in_memory

    def flush_to_disk(self):
        """If the file is already on-disk, do nothing. Otherwise, copy from
        the in-memory buffer to a disk file, and then reassign our internal
        file object to this new disk file.

        Note that if you attempt to flush a file that is already on-disk, a
        warning will be logged to this module's logger.
        """
        if not self._in_memory:
            self.logger.warning(
                "Trying to flush to disk when we're not in memory"
            )
            return

        # Go back to the start of our file.
        self._fileobj.seek(0)

        # Open a new file.
        new_file = self._get_disk_file()

        # Copy the file objects.
        shutil.copyfileobj(self._fileobj, new_file)

        # Seek to the new position in our new file.
        new_file.seek(self._bytes_written)

        # Reassign the fileobject.
        old_fileobj = self._fileobj
        self._fileobj = new_file

        # We're no longer in memory.
        self._in_memory = False

        # Close the old file object.
        old_fileobj.close()

    def _get_disk_file(self):
        """This function is responsible for getting a file object on-disk for us.

        :raises FileError: if the on-disk file cannot be opened/created
        """
        self.logger.info("Opening a file on disk")

        file_dir = self._config.get('UPLOAD_DIR')
        keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False)
        keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False)
        delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True)

        # If we have a directory and are to keep the filename...
        if file_dir is not None and keep_filename:
            self.logger.info("Saving with filename in: %r", file_dir)

            # Build our filename.
            # TODO: what happens if we don't have a filename?
            # NOTE(review): _file_base/_ext only exist when file_name was not
            # None; this branch assumes they are set - confirm with callers.
            fname = self._file_base
            if keep_extensions:
                fname = fname + self._ext

            path = os.path.join(file_dir, fname)
            try:
                self.logger.info("Opening file: %r", path)
                tmp_file = open(path, 'w+b')
            except OSError:
                self.logger.exception("Error opening temporary file")
                raise FileError("Error opening temporary file: %r" % path)
        else:
            # Build options array.
            # Note that on Python 3, tempfile doesn't support byte names. We
            # encode our paths using the default filesystem encoding.
            options = {}
            if keep_extensions:
                ext = self._ext
                if isinstance(ext, bytes):
                    ext = ext.decode(sys.getfilesystemencoding())

                options['suffix'] = ext
            if file_dir is not None:
                d = file_dir
                if isinstance(d, bytes):
                    d = d.decode(sys.getfilesystemencoding())

                options['dir'] = d
            options['delete'] = delete_tmp

            # Create a temporary (named) file with the appropriate settings.
            self.logger.info("Creating a temporary file with options: %r",
                             options)
            try:
                tmp_file = tempfile.NamedTemporaryFile(**options)
            except OSError:
                self.logger.exception("Error creating named temporary file")
                raise FileError("Error creating named temporary file")

            fname = tmp_file.name

            # Encode filename as bytes.
            if isinstance(fname, str):
                fname = fname.encode(sys.getfilesystemencoding())

        self._actual_file_name = fname
        return tmp_file

    def write(self, data):
        """Write some data to the File.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data):
        """This method is a callback that will be called whenever data is
        written to the File.

        :param data: a bytestring
        :return: the number of bytes written to the underlying file object
        """
        pos = self._fileobj.tell()
        bwritten = self._fileobj.write(data)
        # true file objects write returns None
        if bwritten is None:
            bwritten = self._fileobj.tell() - pos

        # If the bytes written isn't the same as the length, just return.
        # NOTE(review): a short write is not added to _bytes_written even
        # though those bytes are in the file object - confirm this is
        # intentional.
        if bwritten != len(data):
            self.logger.warning("bwritten != len(data) (%d != %d)", bwritten,
                                len(data))
            return bwritten

        # Keep track of how many bytes we've written.
        self._bytes_written += bwritten

        # If we're in-memory and are over our limit, we create a file.
        if (self._in_memory and
                self._config.get('MAX_MEMORY_FILE_SIZE') is not None and
                (self._bytes_written >
                 self._config.get('MAX_MEMORY_FILE_SIZE'))):
            self.logger.info("Flushing to disk")
            self.flush_to_disk()

        # Return the number of bytes written.
        return bwritten

    def on_end(self):
        """This method is called whenever the Field is finalized.
        """
        # Flush the underlying file object
        self._fileobj.flush()

    def finalize(self):
        """Finalize the form file. This will not close the underlying file,
        but simply signal that we are finished writing to the File.
        """
        self.on_end()

    def close(self):
        """Close the File object. This will actually close the underlying
        file object (whether it's a :class:`io.BytesIO` or an actual file
        object).
        """
        self._fileobj.close()

    def __repr__(self):
        return "{}(file_name={!r}, field_name={!r})".format(
            self.__class__.__name__,
            self.file_name,
            self.field_name
        )

533 

534 

class BaseParser:
    """Common base class for all parsers in this module. It implements the
    machinery for registering and invoking callbacks.

    Callbacks come in two flavors. "Notification callbacks" fire when
    something happens - for example, when a new part of a multipart message
    is encountered - and take no arguments. "Data callbacks" fire when some
    data arrives - for example, part of the body of a multipart chunk - and
    are invoked as::

        data_callback(data, start, end)

    Here ``data`` is a bytestring, and ``start``/``end`` are integer indexes
    into it; the slice ``data[start:end]`` is the portion the callback should
    consume. The callback receives the original buffer (not a copy), since
    copying severely hurts performance.
    """
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def callback(self, name, data=None, start=None, end=None):
        """Invoke the named callback, if one is registered; otherwise do
        nothing.

        :param name: The name of the callback to call (as a string).

        :param data: Data to pass to the callback. If None, then it is
                     assumed that the callback is a notification callback,
                     and no parameters are given.

        :param end: An integer that is passed to the data callback.

        :param start: An integer that is passed to the data callback.
        """
        key = "on_" + name
        func = self.callbacks.get(key)
        if func is None:
            return

        if data is None:
            # Notification callback: no arguments.
            self.logger.debug("Calling %s with no data", key)
            func()
            return

        # Data callback: skip entirely when the slice is empty.
        if start is not None and start == end:
            return

        self.logger.debug("Calling %s with data[%d:%d]", key, start, end)
        func(data, start, end)

    def set_callback(self, name, new_func):
        """Install or remove a callback.

        :param name: The name of the callback to call (as a string).

        :param new_func: The new function for the callback. If None, then the
                         callback will be removed (with no error if it does
                         not exist).
        """
        key = 'on_' + name
        if new_func is None:
            self.callbacks.pop(key, None)
        else:
            self.callbacks[key] = new_func

    def close(self):
        pass  # pragma: no cover

    def finalize(self):
        pass  # pragma: no cover

    def __repr__(self):
        return "%s()" % self.__class__.__name__

612 

613 

class OctetStreamParser(BaseParser):
    """This parser parses an octet-stream request body and calls callbacks when
    incoming data is received. Callbacks are as follows:

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_start
         - None
         - Called when the first data is parsed.
       * - on_data
         - data, start, end
         - Called for each data chunk that is parsed.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.

    :raises ValueError: if ``max_size`` is not a positive number
    """
    def __init__(self, callbacks=None, max_size=float('inf')):
        super().__init__()
        # NOTE: ``callbacks`` previously defaulted to a shared mutable dict
        # (``callbacks={}``); use None so each parser gets its own dict and
        # set_callback() on one instance can't leak into another.
        self.callbacks = {} if callbacks is None else callbacks
        self._started = False

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

    def write(self, data):
        """Write some data to the parser, which will perform size verification,
        and then pass the data to the underlying callback.

        :param data: a bytestring
        :return: the number of bytes actually processed (may be less than
                 ``len(data)`` if ``max_size`` was reached)
        """
        if not self._started:
            self.callback('start')
            self._started = True

        # Truncate data length.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        # Increment size, then callback, in case there's an exception.
        self._current_size += data_len
        self.callback('data', data, 0, data_len)
        return data_len

    def finalize(self):
        """Finalize this parser, which signals to that we are finished parsing,
        and sends the on_end callback.
        """
        self.callback('end')

    def __repr__(self):
        return "%s()" % self.__class__.__name__

686 

687 

class QuerystringParser(BaseParser):
    """This is a streaming querystring parser. It will consume data, and call
    the callbacks given when it has data.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_field_start
         - None
         - Called when a new field is encountered.
       * - on_field_name
         - data, start, end
         - Called when a portion of a field's name is encountered.
       * - on_field_data
         - data, start, end
         - Called when a portion of a field's data is encountered.
       * - on_field_end
         - None
         - Called when the end of a field is encountered.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param strict_parsing: Whether or not to parse the body strictly. Defaults
                           to False. If this is set to True, then the behavior
                           of the parser changes as the following: if a field
                           has a value with an equal sign (e.g. "foo=bar", or
                           "foo="), it is always included. If a field has no
                           equals sign (e.g. "...&name&..."), it will be
                           treated as an error if 'strict_parsing' is True,
                           otherwise included. If an error is encountered,
                           then a
                           :class:`multipart.exceptions.QuerystringParseError`
                           will be raised.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.

    :raises ValueError: if ``max_size`` is not a positive number
    """
    def __init__(self, callbacks=None, strict_parsing=False,
                 max_size=float('inf')):
        super().__init__()
        self.state = STATE_BEFORE_FIELD
        self._found_sep = False

        # NOTE: ``callbacks`` previously defaulted to a shared mutable dict
        # (``callbacks={}``); use None so each parser gets its own dict and
        # set_callback() on one instance can't leak into another.
        self.callbacks = {} if callbacks is None else callbacks

        # Max-size stuff
        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

        # Should parsing be strict?
        self.strict_parsing = strict_parsing

    def write(self, data):
        """Write some data to the parser, which will perform size verification,
        parse into either a field name or value, and then pass the
        corresponding data to the underlying callback. If an error is
        encountered while parsing, a QuerystringParseError will be raised. The
        "offset" attribute of the raised exception will be set to the offset in
        the input data chunk (NOT the overall stream) that caused the error.

        :param data: a bytestring
        :return: the number of bytes actually processed (may be less than
                 ``len(data)`` if ``max_size`` was reached)
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        consumed = 0
        try:
            consumed = self._internal_write(data, data_len)
        finally:
            # Account for the processed bytes even if parsing raised, so a
            # retried/continued stream doesn't double-count.
            self._current_size += consumed

        return consumed

    def _internal_write(self, data, length):
        """Run the state machine over ``data[:length]``.

        :param data: the full bytestring chunk
        :param length: the number of bytes of ``data`` to process
        :return: the number of bytes processed (``length``)
        """
        state = self.state
        strict_parsing = self.strict_parsing
        found_sep = self._found_sep

        i = 0
        while i < length:
            ch = data[i]

            # Depending on our state...
            if state == STATE_BEFORE_FIELD:
                # If the 'found_sep' flag is set, we've already encountered
                # and skipped a single separator. If so, we check our strict
                # parsing flag and decide what to do. Otherwise, we haven't
                # yet reached a separator, and thus, if we do, we need to skip
                # it as it will be the boundary between fields that's supposed
                # to be there.
                if ch == AMPERSAND or ch == SEMICOLON:
                    if found_sep:
                        # If we're parsing strictly, we disallow blank chunks.
                        if strict_parsing:
                            e = QuerystringParseError(
                                "Skipping duplicate ampersand/semicolon at "
                                "%d" % i
                            )
                            e.offset = i
                            raise e
                        else:
                            self.logger.debug("Skipping duplicate ampersand/"
                                              "semicolon at %d", i)
                    else:
                        # This case is when we're skipping the (first)
                        # separator between fields, so we just set our flag
                        # and continue on.
                        found_sep = True
                else:
                    # Emit a field-start event, and go to that state. Also,
                    # reset the "found_sep" flag, for the next time we get to
                    # this state.
                    self.callback('field_start')
                    i -= 1
                    state = STATE_FIELD_NAME
                    found_sep = False

            elif state == STATE_FIELD_NAME:
                # Try and find a separator - we ensure that, if we do, we only
                # look for the equal sign before it.
                sep_pos = data.find(b'&', i)
                if sep_pos == -1:
                    sep_pos = data.find(b';', i)

                # See if we can find an equals sign in the remaining data. If
                # so, we can immediately emit the field name and jump to the
                # data state.
                if sep_pos != -1:
                    equals_pos = data.find(b'=', i, sep_pos)
                else:
                    equals_pos = data.find(b'=', i)

                if equals_pos != -1:
                    # Emit this name.
                    self.callback('field_name', data, i, equals_pos)

                    # Jump i to this position. Note that it will then have 1
                    # added to it below, which means the next iteration of this
                    # loop will inspect the character after the equals sign.
                    i = equals_pos
                    state = STATE_FIELD_DATA
                else:
                    # No equals sign found.
                    if not strict_parsing:
                        # See also comments in the STATE_FIELD_DATA case below.
                        # If we found the separator, we emit the name and just
                        # end - there's no data callback at all (not even with
                        # a blank value).
                        if sep_pos != -1:
                            self.callback('field_name', data, i, sep_pos)
                            self.callback('field_end')

                            i = sep_pos - 1
                            state = STATE_BEFORE_FIELD
                        else:
                            # Otherwise, no separator in this block, so the
                            # rest of this chunk must be a name.
                            self.callback('field_name', data, i, length)
                            i = length

                    else:
                        # We're parsing strictly. If we find a separator,
                        # this is an error - we require an equals sign.
                        if sep_pos != -1:
                            e = QuerystringParseError(
                                "When strict_parsing is True, we require an "
                                "equals sign in all field chunks. Did not "
                                "find one in the chunk that starts at %d" %
                                (i,)
                            )
                            e.offset = i
                            raise e

                        # No separator in the rest of this chunk, so it's just
                        # a field name.
                        self.callback('field_name', data, i, length)
                        i = length

            elif state == STATE_FIELD_DATA:
                # Try finding either an ampersand or a semicolon after this
                # position.
                sep_pos = data.find(b'&', i)
                if sep_pos == -1:
                    sep_pos = data.find(b';', i)

                # If we found it, callback this bit as data and then go back
                # to expecting to find a field.
                if sep_pos != -1:
                    self.callback('field_data', data, i, sep_pos)
                    self.callback('field_end')

                    # Note that we go to the separator, which brings us to the
                    # "before field" state. This allows us to properly emit
                    # "field_start" events only when we actually have data for
                    # a field of some sort.
                    i = sep_pos - 1
                    state = STATE_BEFORE_FIELD

                # Otherwise, emit the rest as data and finish.
                else:
                    self.callback('field_data', data, i, length)
                    i = length

            else:  # pragma: no cover (error case)
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = QuerystringParseError(msg)
                e.offset = i
                raise e

            i += 1

        self.state = state
        self._found_sep = found_sep
        # Report only the bytes we were asked to process. Previously this
        # returned ``len(data)``, which over-counted _current_size whenever
        # the input was truncated by max_size (and disagreed with
        # OctetStreamParser, which reports the truncated length).
        return length

    def finalize(self):
        """Finalize this parser, which signals to that we are finished parsing,
        if we're still in the middle of a field, an on_field_end callback, and
        then the on_end callback.
        """
        # If we're currently in the middle of a field, we finish it.
        if self.state == STATE_FIELD_DATA:
            self.callback('field_end')
        self.callback('end')

    def __repr__(self):
        return "{}(strict_parsing={!r}, max_size={!r})".format(
            self.__class__.__name__,
            self.strict_parsing, self.max_size
        )

938 

939 

class MultipartParser(BaseParser):
    """This class is a streaming multipart/form-data parser.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_part_begin
         - None
         - Called when a new part of the multipart message is encountered.
       * - on_part_data
         - data, start, end
         - Called when a portion of a part's data is encountered.
       * - on_part_end
         - None
         - Called when the end of a part is reached.
       * - on_header_begin
         - None
         - Called when we've found a new header in a part of a multipart
           message
       * - on_header_field
         - data, start, end
         - Called each time an additional portion of a header is read (i.e. the
           part of the header that is before the colon; the "Foo" in
           "Foo: Bar").
       * - on_header_value
         - data, start, end
         - Called when we get data for a header.
       * - on_header_end
         - None
         - Called when the current header is finished - i.e. we've reached the
           newline at the end of the header.
       * - on_headers_finished
         - None
         - Called when all headers are finished, and before the part data
           starts.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.


    :param boundary: The multipart boundary. This is required, and must match
                     what is given in the HTTP request - usually in the
                     Content-Type header.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(self, boundary, callbacks={}, max_size=float('inf')):
        """Set up parser state, validate max_size, and precompute the
        boundary-search helpers (boundary byte set and lookbehind buffer).

        :raises ValueError: if max_size is not a positive number.
        """
        # Initialize parser state.
        super().__init__()
        self.state = STATE_START
        # `index` tracks our position within the boundary (or header) being
        # matched; `flags` records whether a part/last boundary was seen.
        self.index = self.flags = 0

        self.callbacks = callbacks

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

        # Setup marks. These are used to track the state of data received.
        # Maps a callback-name prefix ('header_field', 'header_value',
        # 'part_data') to the offset in the current chunk where that datum
        # started.
        self.marks = {}

        # TODO: Actually use this rather than the dumb version we currently use
        # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
        # skip = [len(boundary) for x in range(256)]
        # for i in range(len(boundary) - 1):
        #     skip[ord_char(boundary[i])] = len(boundary) - i - 1
        #
        # # We use a tuple since it's a constant, and marginally faster.
        # self.skip = tuple(skip)

        # Save our boundary.
        if isinstance(boundary, str):  # pragma: no cover
            boundary = boundary.encode('latin-1')
        # Prefix with CRLF + "--": every boundary in the body (after the
        # first) is preceded by a line break per RFC 2046.
        self.boundary = b'\r\n--' + boundary

        # Get a set of characters that belong to our boundary.
        self.boundary_chars = frozenset(self.boundary)

        # We also create a lookbehind list.
        # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
        # "--\r\n" at the final boundary, and the length of '\r\n--' and
        # '--\r\n' is 8 bytes.
        self.lookbehind = [NULL for x in range(len(boundary) + 8)]

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        and then parse the data into the appropriate location (e.g. header,
        data, etc.), and pass this on to the underlying callback. If an error
        is encountered, a MultipartParseError will be raised. The "offset"
        attribute on the raised exception will be set to the offset of the byte
        in the input chunk that caused the error.

        :param data: a bytestring
        :return: the number of bytes processed (may be less than len(data)
                 when max_size truncates the input).
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        l = 0
        try:
            l = self._internal_write(data, data_len)
        finally:
            # Account for processed bytes even if parsing raised mid-chunk.
            self._current_size += l

        return l

    def _internal_write(self, data: bytes, length: int) -> int:
        """Run the byte-at-a-time state machine over data[:length].

        State, boundary index and flags persist across calls on the instance
        so a boundary split across chunks is still detected.
        """
        # Get values from locals.
        boundary = self.boundary

        # Get our state, flags and index. These are persisted between calls to
        # this function.
        state = self.state
        index = self.index
        flags = self.flags

        # Our index defaults to 0.
        i = 0

        # Set a mark.
        def set_mark(name):
            self.marks[name] = i

        # Remove a mark.
        def delete_mark(name, reset=False):
            self.marks.pop(name, None)

        # Helper function that makes calling a callback with data easier. The
        # 'remaining' parameter will callback from the marked value until the
        # end of the buffer, and reset the mark, instead of deleting it. This
        # is used at the end of the function to call our callbacks with any
        # remaining data in this chunk.
        def data_callback(name, remaining=False):
            marked_index = self.marks.get(name)
            if marked_index is None:
                return

            # If we're getting remaining data, we ignore the current i value
            # and just call with the remaining data.
            if remaining:
                self.callback(name, data, marked_index, length)
                self.marks[name] = 0

            # Otherwise, we call it from the mark to the current byte we're
            # processing.
            else:
                self.callback(name, data, marked_index, i)
                self.marks.pop(name, None)

        # For each byte...
        while i < length:
            c = data[i]

            if state == STATE_START:
                # Skip leading newlines
                if c == CR or c == LF:
                    i += 1
                    self.logger.debug("Skipping leading CR/LF at %d", i)
                    continue

                # index is used as in index into our boundary. Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = STATE_START_BOUNDARY
                i -= 1

            elif state == STATE_START_BOUNDARY:
                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback('part_begin')

                    # Move to the next character and state.
                    state = STATE_HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches.
                    # Note: index + 2 skips the leading "\r\n" of
                    # self.boundary, since the very first boundary in a body
                    # is not preceded by a CRLF.
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " \
                              "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == STATE_HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark('header_field')

                # Move to parsing header fields.
                state = STATE_HEADER_FIELD
                i -= 1

            elif state == STATE_HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark('header_field')
                    state = STATE_HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback('header_field')

                    # Move to parsing the header value.
                    state = STATE_HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter. If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " \
                              "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == STATE_HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark('header_value')

                # Move to the header-value state, reprocessing this character.
                state = STATE_HEADER_VALUE
                i -= 1

            elif state == STATE_HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers. Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback('header_value')
                    self.callback('header_end')
                    state = STATE_HEADER_VALUE_ALMOST_DONE

            elif state == STATE_HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF. If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " \
                          "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header. Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = STATE_HEADER_FIELD_START

            elif state == STATE_HEADERS_ALMOST_DONE:
                # We're almost done our headers. This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = f"Did not find LF at end of headers (found {c!r})"
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback('headers_finished')
                state = STATE_PART_DATA_START

            elif state == STATE_PART_DATA_START:
                # Mark the start of our part data.
                set_mark('part_data')

                # Start processing part data, including this character.
                state = STATE_PART_DATA
                i -= 1

            elif state == STATE_PART_DATA:
                # We're processing our part data right now. During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index. We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    # Jumping by boundary_length is safe: if a full boundary
                    # were present, at least one of its bytes would land under
                    # each probe position.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here. If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback('part_data')

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= (~FLAG_PART_BOUNDARY)

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback('part_end')
                            self.callback('part_begin')

                            # Move to parsing new headers.
                            index = 0
                            state = STATE_HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match. Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= (~FLAG_PART_BOUNDARY)

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback('part_end')
                            self.callback('end')
                            state = STATE_END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0. If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback('part_data', lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark('part_data')

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == STATE_END:
                # Do nothing and just consume a byte in the end state.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow! Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data. Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found. This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted. And, as such, it implies
        # that we haven't yet reached the end of this 'thing'. So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback('header_field', True)
        data_callback('header_value', True)
        data_callback('part_data', True)

        # Save values to locals.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing.

        Note: It does not currently, but in the future, it will verify that we
        are in the final state of the parser (i.e. the end of the multipart
        message is well-formed), and, if not, throw an error.
        """
        # TODO: verify that we're in the state STATE_END, otherwise throw an
        # error or otherwise state that we're not finished parsing.
        pass

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(boundary={self.boundary!r})"

1469 

1470 

class FormParser:
    """This class is the all-in-one form parser. Given all the information
    necessary to parse a form, it will instantiate the correct parser, create
    the proper :class:`Field` and :class:`File` classes to store the data that
    is parsed, and call the two given callbacks with each field and file as
    they become available.

    :param content_type: The Content-Type of the incoming request. This is
                         used to select the appropriate parser.

    :param on_field: The callback to call when a field has been parsed and is
                     ready for usage. See above for parameters.

    :param on_file: The callback to call when a file has been parsed and is
                    ready for usage. See above for parameters.

    :param on_end: An optional callback to call when all fields and files in a
                   request has been parsed. Can be None.

    :param boundary: If the request is a multipart/form-data request, this
                     should be the boundary of the request, as given in the
                     Content-Type header, as a bytestring.

    :param file_name: If the request is of type application/octet-stream, then
                      the body of the request will not contain any information
                      about the uploaded file. In such cases, you can provide
                      the file name of the uploaded file manually.

    :param FileClass: The class to use for uploaded files. Defaults to
                      :class:`File`, but you can provide your own class if you
                      wish to customize behaviour. The class will be
                      instantiated as FileClass(file_name, field_name), and it
                      must provide the following functions::
                          file_instance.write(data)
                          file_instance.finalize()
                          file_instance.close()

    :param FieldClass: The class to use for uploaded fields. Defaults to
                       :class:`Field`, but you can provide your own class if
                       you wish to customize behaviour. The class will be
                       instantiated as FieldClass(field_name), and it must
                       provide the following functions::
                           field_instance.write(data)
                           field_instance.finalize()
                           field_instance.close()

    :param config: Configuration to use for this FormParser. The default
                   values are taken from the DEFAULT_CONFIG value, and then
                   any keys present in this dictionary will overwrite the
                   default values.

    :raises FormParserError: if the Content-Type is unknown, or a
                             multipart/form-data request has no boundary.
    """
    #: This is the default configuration for our form parser.
    #: Note: all file sizes should be in bytes.
    DEFAULT_CONFIG = {
        'MAX_BODY_SIZE': float('inf'),
        'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024,
        'UPLOAD_DIR': None,
        'UPLOAD_KEEP_FILENAME': False,
        'UPLOAD_KEEP_EXTENSIONS': False,

        # Error on invalid Content-Transfer-Encoding?
        'UPLOAD_ERROR_ON_BAD_CTE': False,
    }

    def __init__(self, content_type, on_field, on_file, on_end=None,
                 boundary=None, file_name=None, FileClass=File,
                 FieldClass=Field, config={}):

        self.logger = logging.getLogger(__name__)

        # Save variables.
        self.content_type = content_type
        self.boundary = boundary
        self.bytes_received = 0
        self.parser = None

        # Save callbacks.
        self.on_field = on_field
        self.on_file = on_file
        self.on_end = on_end

        # Save classes.
        # BUGFIX: previously these stored the module-level File/Field classes
        # unconditionally, ignoring the FileClass/FieldClass arguments. The
        # closures below already used the arguments, so parsing behaved
        # correctly, but these attributes misreported the classes in use.
        self.FileClass = FileClass
        self.FieldClass = FieldClass

        # Set configuration options. Defaults first, then caller overrides.
        self.config = self.DEFAULT_CONFIG.copy()
        self.config.update(config)

        # Depending on the Content-Type, we instantiate the correct parser.
        if content_type == 'application/octet-stream':
            # Work around the lack of 'nonlocal' in Py2
            class vars:
                f = None

            def on_start():
                vars.f = FileClass(file_name, None, config=self.config)

            def on_data(data, start, end):
                vars.f.write(data[start:end])

            def on_end():
                # Finalize the file itself.
                vars.f.finalize()

                # Call our callback.
                on_file(vars.f)

                # Call the on-end callback.
                if self.on_end is not None:
                    self.on_end()

            callbacks = {
                'on_start': on_start,
                'on_data': on_data,
                'on_end': on_end,
            }

            # Instantiate an octet-stream parser
            parser = OctetStreamParser(callbacks,
                                       max_size=self.config['MAX_BODY_SIZE'])

        elif (content_type == 'application/x-www-form-urlencoded' or
                content_type == 'application/x-url-encoded'):

            name_buffer = []

            class vars:
                f = None

            def on_field_start():
                pass

            def on_field_name(data, start, end):
                name_buffer.append(data[start:end])

            def on_field_data(data, start, end):
                if vars.f is None:
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                vars.f.write(data[start:end])

            def on_field_end():
                # Finalize and call callback.
                if vars.f is None:
                    # If we get here, it's because there was no field data.
                    # We create a field, set it to None, and then continue.
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                    vars.f.set_none()

                vars.f.finalize()
                on_field(vars.f)
                vars.f = None

            def on_end():
                if self.on_end is not None:
                    self.on_end()

            # Setup callbacks.
            callbacks = {
                'on_field_start': on_field_start,
                'on_field_name': on_field_name,
                'on_field_data': on_field_data,
                'on_field_end': on_field_end,
                'on_end': on_end,
            }

            # Instantiate parser.
            parser = QuerystringParser(
                callbacks=callbacks,
                max_size=self.config['MAX_BODY_SIZE']
            )

        elif content_type == 'multipart/form-data':
            if boundary is None:
                self.logger.error("No boundary given")
                raise FormParserError("No boundary given")

            header_name = []
            header_value = []
            headers = {}

            # No 'nonlocal' on Python 2 :-(
            class vars:
                f = None
                writer = None
                is_file = False

            def on_part_begin():
                pass

            def on_part_data(data, start, end):
                bytes_processed = vars.writer.write(data[start:end])
                # TODO: check for error here.
                return bytes_processed

            def on_part_end():
                vars.f.finalize()
                if vars.is_file:
                    on_file(vars.f)
                else:
                    on_field(vars.f)

            def on_header_field(data, start, end):
                header_name.append(data[start:end])

            def on_header_value(data, start, end):
                header_value.append(data[start:end])

            def on_header_end():
                headers[b''.join(header_name)] = b''.join(header_value)
                del header_name[:]
                del header_value[:]

            def on_headers_finished():
                # Reset the 'is file' flag.
                vars.is_file = False

                # Parse the content-disposition header.
                # TODO: handle mixed case
                content_disp = headers.get(b'Content-Disposition')
                disp, options = parse_options_header(content_disp)

                # Get the field and filename.
                field_name = options.get(b'name')
                file_name = options.get(b'filename')
                # TODO: check for errors

                # Create the proper class.
                if file_name is None:
                    vars.f = FieldClass(field_name)
                else:
                    vars.f = FileClass(file_name, field_name, config=self.config)
                    vars.is_file = True

                # Parse the given Content-Transfer-Encoding to determine what
                # we need to do with the incoming data.
                # TODO: check that we properly handle 8bit / 7bit encoding.
                transfer_encoding = headers.get(b'Content-Transfer-Encoding',
                                                b'7bit')

                if (transfer_encoding == b'binary' or
                        transfer_encoding == b'8bit' or
                        transfer_encoding == b'7bit'):
                    vars.writer = vars.f

                elif transfer_encoding == b'base64':
                    vars.writer = Base64Decoder(vars.f)

                elif transfer_encoding == b'quoted-printable':
                    vars.writer = QuotedPrintableDecoder(vars.f)

                else:
                    self.logger.warning("Unknown Content-Transfer-Encoding: "
                                        "%r", transfer_encoding)
                    if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
                        raise FormParserError(
                            'Unknown Content-Transfer-Encoding "{}"'.format(
                                transfer_encoding
                            )
                        )
                    else:
                        # If we aren't erroring, then we just treat this as an
                        # unencoded Content-Transfer-Encoding.
                        vars.writer = vars.f

            def on_end():
                vars.writer.finalize()
                if self.on_end is not None:
                    self.on_end()

            # These are our callbacks for the parser.
            callbacks = {
                'on_part_begin': on_part_begin,
                'on_part_data': on_part_data,
                'on_part_end': on_part_end,
                'on_header_field': on_header_field,
                'on_header_value': on_header_value,
                'on_header_end': on_header_end,
                'on_headers_finished': on_headers_finished,
                'on_end': on_end,
            }

            # Instantiate a multipart parser.
            parser = MultipartParser(boundary, callbacks,
                                     max_size=self.config['MAX_BODY_SIZE'])

        else:
            self.logger.warning("Unknown Content-Type: %r", content_type)
            raise FormParserError("Unknown Content-Type: {}".format(
                content_type
            ))

        self.parser = parser

    def write(self, data):
        """Write some data. The parser will forward this to the appropriate
        underlying parser.

        :param data: a bytestring
        :return: number of bytes processed by the underlying parser.
        """
        self.bytes_received += len(data)
        # TODO: check the parser's return value for errors?
        return self.parser.write(data)

    def finalize(self):
        """Finalize the parser."""
        if self.parser is not None and hasattr(self.parser, 'finalize'):
            self.parser.finalize()

    def close(self):
        """Close the parser."""
        if self.parser is not None and hasattr(self.parser, 'close'):
            self.parser.close()

    def __repr__(self):
        return "{}(content_type={!r}, parser={!r})".format(
            self.__class__.__name__,
            self.content_type,
            self.parser,
        )

1794 

1795 

def create_form_parser(headers, on_field, on_file, trust_x_headers=False,
                       config={}):
    """This function is a helper function to aid in creating a FormParser
    instances. Given a dictionary-like headers object, it will determine
    the correct information needed, instantiate a FormParser with the
    appropriate values and given callbacks, and then return the corresponding
    parser.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param trust_x_headers: Whether or not to trust information received from
                            certain X-Headers - for example, the file name from
                            X-File-Name.

    :param config: Configuration variables to pass to the FormParser.

    :raises ValueError: if no Content-Type header is present.
    """
    content_type = headers.get('Content-Type')
    if content_type is None:
        logging.getLogger(__name__).warning("No Content-Type header given")
        raise ValueError("No Content-Type header given!")

    # Boundaries are optional (the FormParser will raise if one is needed
    # but not given).
    content_type, params = parse_options_header(content_type)
    boundary = params.get(b'boundary')

    # We need content_type to be a string, not a bytes object.
    content_type = content_type.decode('latin-1')

    # File names are optional. BUGFIX: X-File-Name is client-controlled data,
    # and the documented contract is that it is only used when the caller
    # explicitly opts in via trust_x_headers; previously it was read
    # unconditionally.
    file_name = headers.get('X-File-Name') if trust_x_headers else None

    # Instantiate a form parser.
    form_parser = FormParser(content_type,
                             on_field,
                             on_file,
                             boundary=boundary,
                             file_name=file_name,
                             config=config)

    # Return our parser.
    return form_parser

1843 

1844 

def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576,
               **kwargs):
    """This function is useful if you just want to parse a request body,
    without too much work. Pass it a dictionary-like object of the request's
    headers, and a file-like object for the input stream, along with two
    callbacks that will get called whenever a field or file is parsed.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param input_stream: A file-like object that represents the request body.
                         The read() method must return bytestrings.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param chunk_size: The maximum size to read from the input stream and write
                       to the parser at one time. Defaults to 1 MiB.
    """

    # Create our form parser.
    parser = create_form_parser(headers, on_field, on_file)

    # Read chunks of up to chunk_size bytes and write them to the parser, but
    # never read more than the given Content-Length, if any.
    content_length = headers.get('Content-Length')
    if content_length is not None:
        content_length = int(content_length)
    else:
        content_length = float('inf')
    bytes_read = 0

    while True:
        # Read only up to the Content-Length given.
        # BUGFIX: the read size was hard-coded to 1048576, silently ignoring
        # the chunk_size parameter; use the parameter instead.
        max_readable = min(content_length - bytes_read, chunk_size)
        buff = input_stream.read(max_readable)

        # Write to the parser and update our length.
        parser.write(buff)
        bytes_read += len(buff)

        # If we get a buffer that's smaller than the size requested, or if we
        # have read up to our content length, we're done.
        if len(buff) != max_readable or bytes_read == content_length:
            break

    # Tell our parser that we're done writing data.
    parser.finalize()