Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/multipart/multipart.py: 17%

717 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-26 06:12 +0000

1from .decoders import * 

2from .exceptions import * 

3 

4import os 

5import re 

6import sys 

7import shutil 

8import logging 

9import tempfile 

10from io import BytesIO 

11from numbers import Number 

12 

# Unique missing object.
_missing = object()

# States for the querystring parser.
STATE_BEFORE_FIELD = 0
STATE_FIELD_NAME = 1
STATE_FIELD_DATA = 2

# States for the multipart parser
STATE_START = 0
STATE_START_BOUNDARY = 1
STATE_HEADER_FIELD_START = 2
STATE_HEADER_FIELD = 3
STATE_HEADER_VALUE_START = 4
STATE_HEADER_VALUE = 5
STATE_HEADER_VALUE_ALMOST_DONE = 6
STATE_HEADERS_ALMOST_DONE = 7
STATE_PART_DATA_START = 8
STATE_PART_DATA = 9
STATE_PART_DATA_END = 10
STATE_END = 11

# Human-readable names for the multipart parser states, indexed by the
# STATE_* constants above (used for debugging/logging).
# NOTE: index 7 previously read "HEADRES_ALMOST_DONE" (typo); fixed to match
# STATE_HEADERS_ALMOST_DONE.
STATES = [
    "START",
    "START_BOUNDARY", "HEADER_FIELD_START", "HEADER_FIELD", "HEADER_VALUE_START", "HEADER_VALUE",
    "HEADER_VALUE_ALMOST_DONE", "HEADERS_ALMOST_DONE", "PART_DATA_START", "PART_DATA", "PART_DATA_END", "END"
]


# Flags for the multipart parser.
FLAG_PART_BOUNDARY = 1
FLAG_LAST_BOUNDARY = 2

# Get constants. Since iterating over a str on Python 2 gives you a 1-length
# string, but iterating over a bytes object on Python 3 gives you an integer,
# we need to save these constants.
CR = b'\r'[0]
LF = b'\n'[0]
COLON = b':'[0]
SPACE = b' '[0]
HYPHEN = b'-'[0]
AMPERSAND = b'&'[0]
SEMICOLON = b';'[0]
LOWER_A = b'a'[0]
LOWER_Z = b'z'[0]
NULL = b'\x00'[0]

# Lower-casing a character is different, because of the difference between
# str on Py2, and bytes on Py3. Same with getting the ordinal value of a byte,
# and joining a list of bytes together.
# These functions abstract that.
lower_char = lambda c: c | 0x20
ord_char = lambda c: c
join_bytes = lambda b: bytes(list(b))

# These are regexes for parsing header values.
SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t')
QUOTED_STR = br'"(?:\\.|[^"])*"'
VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')'
OPTION_RE_STR = (
    br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')'
)
OPTION_RE = re.compile(OPTION_RE_STR)
QUOTE = b'"'[0]

77 

78 

def parse_options_header(value):
    """Parse a Content-Type-style header into ``(content_type, {parameters})``.

    :param value: the raw header value, as ``bytes`` (or a ``str`` that is
                  latin-1 encodable, per WSGI)
    :return: a 2-tuple of the lower-cased, stripped content type (bytes) and
             a dict mapping lower-cased parameter names to their (unquoted)
             values, both as bytes
    """
    if not value:
        return (b'', {})

    # If we are passed a string, we assume that it conforms to WSGI and does
    # not contain any code point that's not in latin-1.
    if isinstance(value, str):  # pragma: no cover
        value = value.encode('latin-1')

    # If we have no options, return the string as-is.
    if b';' not in value:
        return (value.lower().strip(), {})

    # Split at the first semicolon, to get our value and then options.
    ctype, rest = value.split(b';', 1)
    options = {}

    # Parse the options.
    for match in OPTION_RE.finditer(rest):
        key = match.group(1).lower()
        value = match.group(2)
        if value[0] == QUOTE and value[-1] == QUOTE:
            # Unquote the value, collapsing backslash escapes.
            value = value[1:-1]
            value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')

        # If the value is a filename, we need to fix a bug on IE6 that sends
        # the full file path instead of the filename.
        if key == b'filename':
            if value[1:3] == b':\\' or value[:2] == b'\\\\':
                value = value.split(b'\\')[-1]

        options[key] = value

    # Normalize the content type the same way as the no-options path above,
    # so e.g. b'TEXT/html; charset=utf-8' and b'text/html' agree.
    return ctype.lower().strip(), options

118 

119 

class Field:
    """A Field object represents a (parsed) form field. It represents a single
    field with a corresponding name and value.

    The name that a :class:`Field` will be instantiated with is the same name
    that would be found in the following HTML::

        <input name="name_goes_here" type="text"/>

    This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
    will be called when data is written to the Field, and when the Field is
    finalized, respectively.

    :param name: the name of the form field
    """
    def __init__(self, name):
        self._name = name
        self._value = []

        # We cache the joined version of _value for speed.
        self._cache = _missing

    @classmethod
    def from_value(klass, name, value):
        """Create an instance of a :class:`Field`, and set the corresponding
        value - either None or an actual value. This method will also
        finalize the Field itself.

        :param name: the name of the form field
        :param value: the value of the form field - either a bytestring or
                      None
        """

        f = klass(name)
        if value is None:
            f.set_none()
        else:
            f.write(value)
        f.finalize()
        return f

    def write(self, data):
        """Write some data into the form field.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data):
        """This method is a callback that will be called whenever data is
        written to the Field.

        :param data: a bytestring
        :return: the number of bytes accepted
        """
        self._value.append(data)
        # Invalidate the cached joined value.
        self._cache = _missing
        return len(data)

    def on_end(self):
        """This method is called whenever the Field is finalized.
        """
        if self._cache is _missing:
            self._cache = b''.join(self._value)

    def finalize(self):
        """Finalize the form field.
        """
        self.on_end()

    def close(self):
        """Close the Field object. This will free any underlying cache.
        """
        # Ensure the joined value is cached before freeing the chunk list.
        if self._cache is _missing:
            self._cache = b''.join(self._value)

        del self._value

    def set_none(self):
        """Some fields in a querystring can possibly have a value of None - for
        example, the string "foo&bar=&baz=asdf" will have a field with the
        name "foo" and value None, one with name "bar" and value "", and one
        with name "baz" and value "asdf". Since the write() interface doesn't
        support writing None, this function will set the field value to None.
        """
        self._cache = None

    @property
    def field_name(self):
        """This property returns the name of the field."""
        return self._name

    @property
    def value(self):
        """This property returns the value of the form field."""
        if self._cache is _missing:
            self._cache = b''.join(self._value)

        return self._cache

    def __eq__(self, other):
        if isinstance(other, Field):
            return (
                self.field_name == other.field_name and
                self.value == other.value
            )
        else:
            return NotImplemented

    def __repr__(self):
        if self.value is None:
            # A field created via set_none() has no value; len()/slicing
            # would raise a TypeError, so handle this case explicitly.
            v = 'None'
        elif len(self.value) > 97:
            # We get the repr, and then insert three dots before the final
            # quote.
            v = repr(self.value[:97])[:-1] + "...'"
        else:
            v = repr(self.value)

        return "{}(field_name={!r}, value={})".format(
            self.__class__.__name__,
            self.field_name,
            v
        )

242 

243 

class File:
    """This class represents an uploaded file. It handles writing file data to
    either an in-memory file or a temporary file on-disk, if the optional
    threshold is passed.

    There are some options that can be passed to the File to change behavior
    of the class. Valid options are as follows:

    .. list-table::
       :widths: 15 5 5 30
       :header-rows: 1

       * - Name
         - Type
         - Default
         - Description
       * - UPLOAD_DIR
         - `str`
         - None
         - The directory to store uploaded files in. If this is None, a
           temporary file will be created in the system's standard location.
       * - UPLOAD_DELETE_TMP
         - `bool`
         - True
         - Delete automatically created TMP file
       * - UPLOAD_KEEP_FILENAME
         - `bool`
         - False
         - Whether or not to keep the filename of the uploaded file. If True,
           then the filename will be converted to a safe representation (e.g.
           by removing any invalid path segments), and then saved with the
           same name). Otherwise, a temporary name will be used.
       * - UPLOAD_KEEP_EXTENSIONS
         - `bool`
         - False
         - Whether or not to keep the uploaded file's extension. If False, the
           file will be saved with the default temporary extension (usually
           ".tmp"). Otherwise, the file's extension will be maintained. Note
           that this will properly combine with the UPLOAD_KEEP_FILENAME
           setting.
       * - MAX_MEMORY_FILE_SIZE
         - `int`
         - 1 MiB
         - The maximum number of bytes of a File to keep in memory. By
           default, the contents of a File are kept into memory until a certain
           limit is reached, after which the contents of the File are written
           to a temporary file. This behavior can be disabled by setting this
           value to an appropriately large value (or, for example, infinity,
           such as `float('inf')`.

    :param file_name: The name of the file that this :class:`File` represents

    :param field_name: The field name that uploaded this file. Note that this
                       can be None, if, for example, the file was uploaded
                       with Content-Type application/octet-stream

    :param config: The configuration for this File. See above for valid
                   configuration keys and their corresponding values.
    """
    def __init__(self, file_name, field_name=None, config=None):
        # NOTE: ``config`` previously defaulted to a shared mutable dict
        # (``config={}``); use None as the default so instances never share
        # (and can never accidentally mutate) a common configuration object.
        if config is None:
            config = {}

        # Save configuration, set other variables default.
        self.logger = logging.getLogger(__name__)
        self._config = config
        self._in_memory = True
        self._bytes_written = 0
        self._fileobj = BytesIO()

        # Save the provided field/file name.
        self._field_name = field_name
        self._file_name = file_name

        # Our actual file name is None by default, since, depending on our
        # config, we may not actually use the provided name.
        self._actual_file_name = None

        # Split the extension from the filename.
        if file_name is not None:
            base, ext = os.path.splitext(file_name)
            self._file_base = base
            self._ext = ext

    @property
    def field_name(self):
        """The form field associated with this file. May be None if there isn't
        one, for example when we have an application/octet-stream upload.
        """
        return self._field_name

    @property
    def file_name(self):
        """The file name given in the upload request.
        """
        return self._file_name

    @property
    def actual_file_name(self):
        """The file name that this file is saved as. Will be None if it's not
        currently saved on disk.
        """
        return self._actual_file_name

    @property
    def file_object(self):
        """The file object that we're currently writing to. Note that this
        will either be an instance of a :class:`io.BytesIO`, or a regular file
        object.
        """
        return self._fileobj

    @property
    def size(self):
        """The total size of this file, counted as the number of bytes that
        currently have been written to the file.
        """
        return self._bytes_written

    @property
    def in_memory(self):
        """A boolean representing whether or not this file object is currently
        stored in-memory or on-disk.
        """
        return self._in_memory

    def flush_to_disk(self):
        """If the file is already on-disk, do nothing. Otherwise, copy from
        the in-memory buffer to a disk file, and then reassign our internal
        file object to this new disk file.

        Note that if you attempt to flush a file that is already on-disk, a
        warning will be logged to this module's logger.
        """
        if not self._in_memory:
            self.logger.warning(
                "Trying to flush to disk when we're not in memory"
            )
            return

        # Go back to the start of our file.
        self._fileobj.seek(0)

        # Open a new file.
        new_file = self._get_disk_file()

        # Copy the file objects.
        shutil.copyfileobj(self._fileobj, new_file)

        # Seek to the new position in our new file.
        new_file.seek(self._bytes_written)

        # Reassign the fileobject.
        old_fileobj = self._fileobj
        self._fileobj = new_file

        # We're no longer in memory.
        self._in_memory = False

        # Close the old file object.
        old_fileobj.close()

    def _get_disk_file(self):
        """This function is responsible for getting a file object on-disk for us.

        :raises FileError: if the on-disk file cannot be opened/created
        """
        self.logger.info("Opening a file on disk")

        file_dir = self._config.get('UPLOAD_DIR')
        keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False)
        keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False)
        delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True)

        # If we have a directory and are to keep the filename...
        if file_dir is not None and keep_filename:
            self.logger.info("Saving with filename in: %r", file_dir)

            # Build our filename.
            # TODO: what happens if we don't have a filename?
            # NOTE(review): _file_base/_ext only exist when file_name was not
            # None; this branch assumes they are set - confirm with callers.
            fname = self._file_base
            if keep_extensions:
                fname = fname + self._ext

            path = os.path.join(file_dir, fname)
            try:
                self.logger.info("Opening file: %r", path)
                tmp_file = open(path, 'w+b')
            except OSError:
                self.logger.exception("Error opening temporary file")
                raise FileError("Error opening temporary file: %r" % path)
        else:
            # Build options array.
            # Note that on Python 3, tempfile doesn't support byte names. We
            # encode our paths using the default filesystem encoding.
            options = {}
            if keep_extensions:
                ext = self._ext
                if isinstance(ext, bytes):
                    ext = ext.decode(sys.getfilesystemencoding())

                options['suffix'] = ext
            if file_dir is not None:
                d = file_dir
                if isinstance(d, bytes):
                    d = d.decode(sys.getfilesystemencoding())

                options['dir'] = d
            options['delete'] = delete_tmp

            # Create a temporary (named) file with the appropriate settings.
            self.logger.info("Creating a temporary file with options: %r",
                             options)
            try:
                tmp_file = tempfile.NamedTemporaryFile(**options)
            except OSError:
                self.logger.exception("Error creating named temporary file")
                raise FileError("Error creating named temporary file")

            fname = tmp_file.name

            # Encode filename as bytes.
            if isinstance(fname, str):
                fname = fname.encode(sys.getfilesystemencoding())

        self._actual_file_name = fname
        return tmp_file

    def write(self, data):
        """Write some data to the File.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data):
        """This method is a callback that will be called whenever data is
        written to the File.

        :param data: a bytestring
        :return: the number of bytes written to the underlying file object
        """
        pos = self._fileobj.tell()
        bwritten = self._fileobj.write(data)
        # true file objects write returns None
        if bwritten is None:
            bwritten = self._fileobj.tell() - pos

        # If the bytes written isn't the same as the length, just return.
        # NOTE(review): a short write is not added to _bytes_written even
        # though those bytes are in the file object - confirm this is
        # intentional.
        if bwritten != len(data):
            self.logger.warning("bwritten != len(data) (%d != %d)", bwritten,
                                len(data))
            return bwritten

        # Keep track of how many bytes we've written.
        self._bytes_written += bwritten

        # If we're in-memory and are over our limit, we create a file.
        if (self._in_memory and
                self._config.get('MAX_MEMORY_FILE_SIZE') is not None and
                (self._bytes_written >
                 self._config.get('MAX_MEMORY_FILE_SIZE'))):
            self.logger.info("Flushing to disk")
            self.flush_to_disk()

        # Return the number of bytes written.
        return bwritten

    def on_end(self):
        """This method is called whenever the Field is finalized.
        """
        # Flush the underlying file object
        self._fileobj.flush()

    def finalize(self):
        """Finalize the form file. This will not close the underlying file,
        but simply signal that we are finished writing to the File.
        """
        self.on_end()

    def close(self):
        """Close the File object. This will actually close the underlying
        file object (whether it's a :class:`io.BytesIO` or an actual file
        object).
        """
        self._fileobj.close()

    def __repr__(self):
        return "{}(file_name={!r}, field_name={!r})".format(
            self.__class__.__name__,
            self.file_name,
            self.field_name
        )

533 

534 

class BaseParser:
    """Common base class for all parsers in this module. It implements the
    machinery for registering and invoking callbacks.

    Callbacks come in two flavors. "Notification callbacks" fire when
    something happens - for example, when a new part of a multipart message
    is encountered - and take no arguments. "Data callbacks" fire when some
    data arrives - for example, part of the body of a multipart chunk - and
    are invoked as::

        data_callback(data, start, end)

    Here ``data`` is a bytestring, and ``start``/``end`` are integer indexes
    into it; the slice ``data[start:end]`` is the portion the callback should
    consume. The callback receives the original buffer (not a copy), since
    copying severely hurts performance.
    """
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def callback(self, name, data=None, start=None, end=None):
        """Invoke the named callback, if one is registered; otherwise do
        nothing.

        :param name: The name of the callback to call (as a string).

        :param data: Data to pass to the callback. If None, then it is
                     assumed that the callback is a notification callback,
                     and no parameters are given.

        :param end: An integer that is passed to the data callback.

        :param start: An integer that is passed to the data callback.
        """
        key = "on_" + name
        func = self.callbacks.get(key)
        if func is None:
            return

        if data is None:
            # Notification callback: no arguments.
            self.logger.debug("Calling %s with no data", key)
            func()
            return

        # Data callback: skip entirely when the slice is empty.
        if start is not None and start == end:
            return

        self.logger.debug("Calling %s with data[%d:%d]", key, start, end)
        func(data, start, end)

    def set_callback(self, name, new_func):
        """Install or remove a callback.

        :param name: The name of the callback to call (as a string).

        :param new_func: The new function for the callback. If None, then the
                         callback will be removed (with no error if it does
                         not exist).
        """
        key = 'on_' + name
        if new_func is None:
            self.callbacks.pop(key, None)
        else:
            self.callbacks[key] = new_func

    def close(self):
        pass  # pragma: no cover

    def finalize(self):
        pass  # pragma: no cover

    def __repr__(self):
        return "%s()" % self.__class__.__name__

612 

613 

class OctetStreamParser(BaseParser):
    """This parser parses an octet-stream request body and calls callbacks when
    incoming data is received. Callbacks are as follows:

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_start
         - None
         - Called when the first data is parsed.
       * - on_data
         - data, start, end
         - Called for each data chunk that is parsed.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.

    :raises ValueError: if ``max_size`` is not a positive number
    """
    def __init__(self, callbacks=None, max_size=float('inf')):
        super().__init__()
        # NOTE: ``callbacks`` previously defaulted to a shared mutable dict
        # (``callbacks={}``); use None so each parser gets its own dict and
        # set_callback() on one instance can't leak into another.
        self.callbacks = {} if callbacks is None else callbacks
        self._started = False

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

    def write(self, data):
        """Write some data to the parser, which will perform size verification,
        and then pass the data to the underlying callback.

        :param data: a bytestring
        :return: the number of bytes actually processed (may be less than
                 ``len(data)`` if ``max_size`` was reached)
        """
        if not self._started:
            self.callback('start')
            self._started = True

        # Truncate data length.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        # Increment size, then callback, in case there's an exception.
        self._current_size += data_len
        self.callback('data', data, 0, data_len)
        return data_len

    def finalize(self):
        """Finalize this parser, which signals to that we are finished parsing,
        and sends the on_end callback.
        """
        self.callback('end')

    def __repr__(self):
        return "%s()" % self.__class__.__name__

686 

687 

class QuerystringParser(BaseParser):
    """This is a streaming querystring parser. It will consume data, and call
    the callbacks given when it has data.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_field_start
         - None
         - Called when a new field is encountered.
       * - on_field_name
         - data, start, end
         - Called when a portion of a field's name is encountered.
       * - on_field_data
         - data, start, end
         - Called when a portion of a field's data is encountered.
       * - on_field_end
         - None
         - Called when the end of a field is encountered.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param strict_parsing: Whether or not to parse the body strictly. Defaults
                           to False. If this is set to True, then the behavior
                           of the parser changes as the following: if a field
                           has a value with an equal sign (e.g. "foo=bar", or
                           "foo="), it is always included. If a field has no
                           equals sign (e.g. "...&name&..."), it will be
                           treated as an error if 'strict_parsing' is True,
                           otherwise included. If an error is encountered,
                           then a
                           :class:`multipart.exceptions.QuerystringParseError`
                           will be raised.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.

    :raises ValueError: if ``max_size`` is not a positive number
    """
    def __init__(self, callbacks=None, strict_parsing=False,
                 max_size=float('inf')):
        super().__init__()
        self.state = STATE_BEFORE_FIELD
        self._found_sep = False

        # NOTE: ``callbacks`` previously defaulted to a shared mutable dict
        # (``callbacks={}``); use None so each parser gets its own dict and
        # set_callback() on one instance can't leak into another.
        self.callbacks = {} if callbacks is None else callbacks

        # Max-size stuff
        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

        # Should parsing be strict?
        self.strict_parsing = strict_parsing

    def write(self, data):
        """Write some data to the parser, which will perform size verification,
        parse into either a field name or value, and then pass the
        corresponding data to the underlying callback. If an error is
        encountered while parsing, a QuerystringParseError will be raised. The
        "offset" attribute of the raised exception will be set to the offset in
        the input data chunk (NOT the overall stream) that caused the error.

        :param data: a bytestring
        :return: the number of bytes actually processed (may be less than
                 ``len(data)`` if ``max_size`` was reached)
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        consumed = 0
        try:
            consumed = self._internal_write(data, data_len)
        finally:
            # Account for the processed bytes even if parsing raised, so a
            # retried/continued stream doesn't double-count.
            self._current_size += consumed

        return consumed

    def _internal_write(self, data, length):
        """Run the state machine over ``data[:length]``.

        :param data: the full bytestring chunk
        :param length: the number of bytes of ``data`` to process
        :return: the number of bytes processed (``length``)
        """
        state = self.state
        strict_parsing = self.strict_parsing
        found_sep = self._found_sep

        i = 0
        while i < length:
            ch = data[i]

            # Depending on our state...
            if state == STATE_BEFORE_FIELD:
                # If the 'found_sep' flag is set, we've already encountered
                # and skipped a single separator. If so, we check our strict
                # parsing flag and decide what to do. Otherwise, we haven't
                # yet reached a separator, and thus, if we do, we need to skip
                # it as it will be the boundary between fields that's supposed
                # to be there.
                if ch == AMPERSAND or ch == SEMICOLON:
                    if found_sep:
                        # If we're parsing strictly, we disallow blank chunks.
                        if strict_parsing:
                            e = QuerystringParseError(
                                "Skipping duplicate ampersand/semicolon at "
                                "%d" % i
                            )
                            e.offset = i
                            raise e
                        else:
                            self.logger.debug("Skipping duplicate ampersand/"
                                              "semicolon at %d", i)
                    else:
                        # This case is when we're skipping the (first)
                        # separator between fields, so we just set our flag
                        # and continue on.
                        found_sep = True
                else:
                    # Emit a field-start event, and go to that state. Also,
                    # reset the "found_sep" flag, for the next time we get to
                    # this state.
                    self.callback('field_start')
                    i -= 1
                    state = STATE_FIELD_NAME
                    found_sep = False

            elif state == STATE_FIELD_NAME:
                # Try and find a separator - we ensure that, if we do, we only
                # look for the equal sign before it.
                sep_pos = data.find(b'&', i)
                if sep_pos == -1:
                    sep_pos = data.find(b';', i)

                # See if we can find an equals sign in the remaining data. If
                # so, we can immediately emit the field name and jump to the
                # data state.
                if sep_pos != -1:
                    equals_pos = data.find(b'=', i, sep_pos)
                else:
                    equals_pos = data.find(b'=', i)

                if equals_pos != -1:
                    # Emit this name.
                    self.callback('field_name', data, i, equals_pos)

                    # Jump i to this position. Note that it will then have 1
                    # added to it below, which means the next iteration of this
                    # loop will inspect the character after the equals sign.
                    i = equals_pos
                    state = STATE_FIELD_DATA
                else:
                    # No equals sign found.
                    if not strict_parsing:
                        # See also comments in the STATE_FIELD_DATA case below.
                        # If we found the separator, we emit the name and just
                        # end - there's no data callback at all (not even with
                        # a blank value).
                        if sep_pos != -1:
                            self.callback('field_name', data, i, sep_pos)
                            self.callback('field_end')

                            i = sep_pos - 1
                            state = STATE_BEFORE_FIELD
                        else:
                            # Otherwise, no separator in this block, so the
                            # rest of this chunk must be a name.
                            self.callback('field_name', data, i, length)
                            i = length

                    else:
                        # We're parsing strictly. If we find a separator,
                        # this is an error - we require an equals sign.
                        if sep_pos != -1:
                            e = QuerystringParseError(
                                "When strict_parsing is True, we require an "
                                "equals sign in all field chunks. Did not "
                                "find one in the chunk that starts at %d" %
                                (i,)
                            )
                            e.offset = i
                            raise e

                        # No separator in the rest of this chunk, so it's just
                        # a field name.
                        self.callback('field_name', data, i, length)
                        i = length

            elif state == STATE_FIELD_DATA:
                # Try finding either an ampersand or a semicolon after this
                # position.
                sep_pos = data.find(b'&', i)
                if sep_pos == -1:
                    sep_pos = data.find(b';', i)

                # If we found it, callback this bit as data and then go back
                # to expecting to find a field.
                if sep_pos != -1:
                    self.callback('field_data', data, i, sep_pos)
                    self.callback('field_end')

                    # Note that we go to the separator, which brings us to the
                    # "before field" state. This allows us to properly emit
                    # "field_start" events only when we actually have data for
                    # a field of some sort.
                    i = sep_pos - 1
                    state = STATE_BEFORE_FIELD

                # Otherwise, emit the rest as data and finish.
                else:
                    self.callback('field_data', data, i, length)
                    i = length

            else:  # pragma: no cover (error case)
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = QuerystringParseError(msg)
                e.offset = i
                raise e

            i += 1

        self.state = state
        self._found_sep = found_sep
        # Report only the bytes we were asked to process. Previously this
        # returned ``len(data)``, which over-counted _current_size whenever
        # the input was truncated by max_size (and disagreed with
        # OctetStreamParser, which reports the truncated length).
        return length

    def finalize(self):
        """Finalize this parser, which signals to that we are finished parsing,
        if we're still in the middle of a field, an on_field_end callback, and
        then the on_end callback.
        """
        # If we're currently in the middle of a field, we finish it.
        if self.state == STATE_FIELD_DATA:
            self.callback('field_end')
        self.callback('end')

    def __repr__(self):
        return "{}(strict_parsing={!r}, max_size={!r})".format(
            self.__class__.__name__,
            self.strict_parsing, self.max_size
        )

938 

939 

class MultipartParser(BaseParser):
    """This class is a streaming multipart/form-data parser.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_part_begin
         - None
         - Called when a new part of the multipart message is encountered.
       * - on_part_data
         - data, start, end
         - Called when a portion of a part's data is encountered.
       * - on_part_end
         - None
         - Called when the end of a part is reached.
       * - on_header_begin
         - None
         - Called when we've found a new header in a part of a multipart
           message
       * - on_header_field
         - data, start, end
         - Called each time an additional portion of a header is read (i.e. the
           part of the header that is before the colon; the "Foo" in
           "Foo: Bar").
       * - on_header_value
         - data, start, end
         - Called when we get data for a header.
       * - on_header_end
         - None
         - Called when the current header is finished - i.e. we've reached the
           newline at the end of the header.
       * - on_headers_finished
         - None
         - Called when all headers are finished, and before the part data
           starts.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.


    :param boundary: The multipart boundary. This is required, and must match
                     what is given in the HTTP request - usually in the
                     Content-Type header.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(self, boundary, callbacks={}, max_size=float('inf')):
        """Set up parser state, validate max_size, and precompute the
        boundary-search helpers (boundary byte set and lookbehind buffer).

        :raises ValueError: if max_size is not a positive number.
        """
        # Initialize parser state.
        super().__init__()
        self.state = STATE_START
        # `index` tracks our position within the boundary (or header) being
        # matched; `flags` records whether a part/last boundary was seen.
        self.index = self.flags = 0

        self.callbacks = callbacks

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

        # Setup marks. These are used to track the state of data received.
        # Maps a callback-name prefix ('header_field', 'header_value',
        # 'part_data') to the offset in the current chunk where that datum
        # started.
        self.marks = {}

        # TODO: Actually use this rather than the dumb version we currently use
        # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
        # skip = [len(boundary) for x in range(256)]
        # for i in range(len(boundary) - 1):
        #     skip[ord_char(boundary[i])] = len(boundary) - i - 1
        #
        # # We use a tuple since it's a constant, and marginally faster.
        # self.skip = tuple(skip)

        # Save our boundary.
        if isinstance(boundary, str):  # pragma: no cover
            boundary = boundary.encode('latin-1')
        # Prefix with CRLF + "--": every boundary in the body (after the
        # first) is preceded by a line break per RFC 2046.
        self.boundary = b'\r\n--' + boundary

        # Get a set of characters that belong to our boundary.
        self.boundary_chars = frozenset(self.boundary)

        # We also create a lookbehind list.
        # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
        # "--\r\n" at the final boundary, and the length of '\r\n--' and
        # '--\r\n' is 8 bytes.
        self.lookbehind = [NULL for x in range(len(boundary) + 8)]

    def write(self, data: bytes) -> int:
        """Write some data to the parser, which will perform size verification,
        and then parse the data into the appropriate location (e.g. header,
        data, etc.), and pass this on to the underlying callback. If an error
        is encountered, a MultipartParseError will be raised. The "offset"
        attribute on the raised exception will be set to the offset of the byte
        in the input chunk that caused the error.

        :param data: a bytestring
        :return: the number of bytes processed (may be less than len(data)
                 when max_size truncates the input).
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        l = 0
        try:
            l = self._internal_write(data, data_len)
        finally:
            # Account for processed bytes even if parsing raised mid-chunk.
            self._current_size += l

        return l

    def _internal_write(self, data: bytes, length: int) -> int:
        """Run the byte-at-a-time state machine over data[:length].

        State, boundary index and flags persist across calls on the instance
        so a boundary split across chunks is still detected.
        """
        # Get values from locals.
        boundary = self.boundary

        # Get our state, flags and index. These are persisted between calls to
        # this function.
        state = self.state
        index = self.index
        flags = self.flags

        # Our index defaults to 0.
        i = 0

        # Set a mark.
        def set_mark(name):
            self.marks[name] = i

        # Remove a mark.
        def delete_mark(name, reset=False):
            self.marks.pop(name, None)

        # Helper function that makes calling a callback with data easier. The
        # 'remaining' parameter will callback from the marked value until the
        # end of the buffer, and reset the mark, instead of deleting it. This
        # is used at the end of the function to call our callbacks with any
        # remaining data in this chunk.
        def data_callback(name, remaining=False):
            marked_index = self.marks.get(name)
            if marked_index is None:
                return

            # If we're getting remaining data, we ignore the current i value
            # and just call with the remaining data.
            if remaining:
                self.callback(name, data, marked_index, length)
                self.marks[name] = 0

            # Otherwise, we call it from the mark to the current byte we're
            # processing.
            else:
                self.callback(name, data, marked_index, i)
                self.marks.pop(name, None)

        # For each byte...
        while i < length:
            c = data[i]

            if state == STATE_START:
                # Skip leading newlines
                if c == CR or c == LF:
                    i += 1
                    self.logger.debug("Skipping leading CR/LF at %d", i)
                    continue

                # index is used as in index into our boundary. Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = STATE_START_BOUNDARY
                i -= 1

            elif state == STATE_START_BOUNDARY:
                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback('part_begin')

                    # Move to the next character and state.
                    state = STATE_HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches.
                    # Note: index + 2 skips the leading "\r\n" of
                    # self.boundary, since the very first boundary in a body
                    # is not preceded by a CRLF.
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " \
                              "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == STATE_HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark('header_field')

                # Move to parsing header fields.
                state = STATE_HEADER_FIELD
                i -= 1

            elif state == STATE_HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark('header_field')
                    state = STATE_HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback('header_field')

                    # Move to parsing the header value.
                    state = STATE_HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter. If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " \
                              "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == STATE_HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark('header_value')

                # Move to the header-value state, reprocessing this character.
                state = STATE_HEADER_VALUE
                i -= 1

            elif state == STATE_HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers. Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback('header_value')
                    self.callback('header_end')
                    state = STATE_HEADER_VALUE_ALMOST_DONE

            elif state == STATE_HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF. If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " \
                          "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header. Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = STATE_HEADER_FIELD_START

            elif state == STATE_HEADERS_ALMOST_DONE:
                # We're almost done our headers. This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = f"Did not find LF at end of headers (found {c!r})"
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback('headers_finished')
                state = STATE_PART_DATA_START

            elif state == STATE_PART_DATA_START:
                # Mark the start of our part data.
                set_mark('part_data')

                # Start processing part data, including this character.
                state = STATE_PART_DATA
                i -= 1

            elif state == STATE_PART_DATA:
                # We're processing our part data right now. During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index. We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    # Jumping by boundary_length is safe: if a full boundary
                    # were present, at least one of its bytes would land under
                    # each probe position.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here. If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback('part_data')

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= (~FLAG_PART_BOUNDARY)

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback('part_end')
                            self.callback('part_begin')

                            # Move to parsing new headers.
                            index = 0
                            state = STATE_HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match. Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= (~FLAG_PART_BOUNDARY)

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback('part_end')
                            self.callback('end')
                            state = STATE_END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0. If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback('part_data', lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark('part_data')

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == STATE_END:
                # Do nothing and just consume a byte in the end state.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow! Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data. Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found. This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted. And, as such, it implies
        # that we haven't yet reached the end of this 'thing'. So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback('header_field', True)
        data_callback('header_value', True)
        data_callback('part_data', True)

        # Save values to locals.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length

    def finalize(self) -> None:
        """Finalize this parser, which signals to that we are finished parsing.

        Note: It does not currently, but in the future, it will verify that we
        are in the final state of the parser (i.e. the end of the multipart
        message is well-formed), and, if not, throw an error.
        """
        # TODO: verify that we're in the state STATE_END, otherwise throw an
        # error or otherwise state that we're not finished parsing.
        pass

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(boundary={self.boundary!r})"

1469 

1470 

class FormParser:
    """This class is the all-in-one form parser. Given all the information
    necessary to parse a form, it will instantiate the correct parser, create
    the proper :class:`Field` and :class:`File` classes to store the data that
    is parsed, and call the two given callbacks with each field and file as
    they become available.

    :param content_type: The Content-Type of the incoming request. This is
                         used to select the appropriate parser.

    :param on_field: The callback to call when a field has been parsed and is
                     ready for usage. See above for parameters.

    :param on_file: The callback to call when a file has been parsed and is
                    ready for usage. See above for parameters.

    :param on_end: An optional callback to call when all fields and files in a
                   request has been parsed. Can be None.

    :param boundary: If the request is a multipart/form-data request, this
                     should be the boundary of the request, as given in the
                     Content-Type header, as a bytestring.

    :param file_name: If the request is of type application/octet-stream, then
                      the body of the request will not contain any information
                      about the uploaded file. In such cases, you can provide
                      the file name of the uploaded file manually.

    :param FileClass: The class to use for uploaded files. Defaults to
                      :class:`File`, but you can provide your own class if you
                      wish to customize behaviour. The class will be
                      instantiated as FileClass(file_name, field_name), and it
                      must provide the following functions::
                          file_instance.write(data)
                          file_instance.finalize()
                          file_instance.close()

    :param FieldClass: The class to use for uploaded fields. Defaults to
                       :class:`Field`, but you can provide your own class if
                       you wish to customize behaviour. The class will be
                       instantiated as FieldClass(field_name), and it must
                       provide the following functions::
                           field_instance.write(data)
                           field_instance.finalize()
                           field_instance.close()

    :param config: Configuration to use for this FormParser. The default
                   values are taken from the DEFAULT_CONFIG value, and then
                   any keys present in this dictionary will overwrite the
                   default values.

    :raises FormParserError: if the Content-Type is unknown, or a
                             multipart/form-data request has no boundary.
    """
    #: This is the default configuration for our form parser.
    #: Note: all file sizes should be in bytes.
    DEFAULT_CONFIG = {
        'MAX_BODY_SIZE': float('inf'),
        'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024,
        'UPLOAD_DIR': None,
        'UPLOAD_KEEP_FILENAME': False,
        'UPLOAD_KEEP_EXTENSIONS': False,

        # Error on invalid Content-Transfer-Encoding?
        'UPLOAD_ERROR_ON_BAD_CTE': False,
    }

    def __init__(self, content_type, on_field, on_file, on_end=None,
                 boundary=None, file_name=None, FileClass=File,
                 FieldClass=Field, config={}):

        self.logger = logging.getLogger(__name__)

        # Save variables.
        self.content_type = content_type
        self.boundary = boundary
        self.bytes_received = 0
        self.parser = None

        # Save callbacks.
        self.on_field = on_field
        self.on_file = on_file
        self.on_end = on_end

        # Save classes.
        # BUGFIX: previously these stored the module-level File/Field classes
        # unconditionally, ignoring the FileClass/FieldClass arguments. The
        # closures below already used the arguments, so parsing behaved
        # correctly, but these attributes misreported the classes in use.
        self.FileClass = FileClass
        self.FieldClass = FieldClass

        # Set configuration options. Defaults first, then caller overrides.
        self.config = self.DEFAULT_CONFIG.copy()
        self.config.update(config)

        # Depending on the Content-Type, we instantiate the correct parser.
        if content_type == 'application/octet-stream':
            # Work around the lack of 'nonlocal' in Py2
            class vars:
                f = None

            def on_start():
                vars.f = FileClass(file_name, None, config=self.config)

            def on_data(data, start, end):
                vars.f.write(data[start:end])

            def on_end():
                # Finalize the file itself.
                vars.f.finalize()

                # Call our callback.
                on_file(vars.f)

                # Call the on-end callback.
                if self.on_end is not None:
                    self.on_end()

            callbacks = {
                'on_start': on_start,
                'on_data': on_data,
                'on_end': on_end,
            }

            # Instantiate an octet-stream parser
            parser = OctetStreamParser(callbacks,
                                       max_size=self.config['MAX_BODY_SIZE'])

        elif (content_type == 'application/x-www-form-urlencoded' or
                content_type == 'application/x-url-encoded'):

            name_buffer = []

            class vars:
                f = None

            def on_field_start():
                pass

            def on_field_name(data, start, end):
                name_buffer.append(data[start:end])

            def on_field_data(data, start, end):
                if vars.f is None:
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                vars.f.write(data[start:end])

            def on_field_end():
                # Finalize and call callback.
                if vars.f is None:
                    # If we get here, it's because there was no field data.
                    # We create a field, set it to None, and then continue.
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                    vars.f.set_none()

                vars.f.finalize()
                on_field(vars.f)
                vars.f = None

            def on_end():
                if self.on_end is not None:
                    self.on_end()

            # Setup callbacks.
            callbacks = {
                'on_field_start': on_field_start,
                'on_field_name': on_field_name,
                'on_field_data': on_field_data,
                'on_field_end': on_field_end,
                'on_end': on_end,
            }

            # Instantiate parser.
            parser = QuerystringParser(
                callbacks=callbacks,
                max_size=self.config['MAX_BODY_SIZE']
            )

        elif content_type == 'multipart/form-data':
            if boundary is None:
                self.logger.error("No boundary given")
                raise FormParserError("No boundary given")

            header_name = []
            header_value = []
            headers = {}

            # No 'nonlocal' on Python 2 :-(
            class vars:
                f = None
                writer = None
                is_file = False

            def on_part_begin():
                pass

            def on_part_data(data, start, end):
                bytes_processed = vars.writer.write(data[start:end])
                # TODO: check for error here.
                return bytes_processed

            def on_part_end():
                vars.f.finalize()
                if vars.is_file:
                    on_file(vars.f)
                else:
                    on_field(vars.f)

            def on_header_field(data, start, end):
                header_name.append(data[start:end])

            def on_header_value(data, start, end):
                header_value.append(data[start:end])

            def on_header_end():
                headers[b''.join(header_name)] = b''.join(header_value)
                del header_name[:]
                del header_value[:]

            def on_headers_finished():
                # Reset the 'is file' flag.
                vars.is_file = False

                # Parse the content-disposition header.
                # TODO: handle mixed case
                content_disp = headers.get(b'Content-Disposition')
                disp, options = parse_options_header(content_disp)

                # Get the field and filename.
                field_name = options.get(b'name')
                file_name = options.get(b'filename')
                # TODO: check for errors

                # Create the proper class.
                if file_name is None:
                    vars.f = FieldClass(field_name)
                else:
                    vars.f = FileClass(file_name, field_name, config=self.config)
                    vars.is_file = True

                # Parse the given Content-Transfer-Encoding to determine what
                # we need to do with the incoming data.
                # TODO: check that we properly handle 8bit / 7bit encoding.
                transfer_encoding = headers.get(b'Content-Transfer-Encoding',
                                                b'7bit')

                if (transfer_encoding == b'binary' or
                        transfer_encoding == b'8bit' or
                        transfer_encoding == b'7bit'):
                    vars.writer = vars.f

                elif transfer_encoding == b'base64':
                    vars.writer = Base64Decoder(vars.f)

                elif transfer_encoding == b'quoted-printable':
                    vars.writer = QuotedPrintableDecoder(vars.f)

                else:
                    self.logger.warning("Unknown Content-Transfer-Encoding: "
                                        "%r", transfer_encoding)
                    if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
                        raise FormParserError(
                            'Unknown Content-Transfer-Encoding "{}"'.format(
                                transfer_encoding
                            )
                        )
                    else:
                        # If we aren't erroring, then we just treat this as an
                        # unencoded Content-Transfer-Encoding.
                        vars.writer = vars.f

            def on_end():
                vars.writer.finalize()
                if self.on_end is not None:
                    self.on_end()

            # These are our callbacks for the parser.
            callbacks = {
                'on_part_begin': on_part_begin,
                'on_part_data': on_part_data,
                'on_part_end': on_part_end,
                'on_header_field': on_header_field,
                'on_header_value': on_header_value,
                'on_header_end': on_header_end,
                'on_headers_finished': on_headers_finished,
                'on_end': on_end,
            }

            # Instantiate a multipart parser.
            parser = MultipartParser(boundary, callbacks,
                                     max_size=self.config['MAX_BODY_SIZE'])

        else:
            self.logger.warning("Unknown Content-Type: %r", content_type)
            raise FormParserError("Unknown Content-Type: {}".format(
                content_type
            ))

        self.parser = parser

    def write(self, data):
        """Write some data. The parser will forward this to the appropriate
        underlying parser.

        :param data: a bytestring
        :return: number of bytes processed by the underlying parser.
        """
        self.bytes_received += len(data)
        # TODO: check the parser's return value for errors?
        return self.parser.write(data)

    def finalize(self):
        """Finalize the parser."""
        if self.parser is not None and hasattr(self.parser, 'finalize'):
            self.parser.finalize()

    def close(self):
        """Close the parser."""
        if self.parser is not None and hasattr(self.parser, 'close'):
            self.parser.close()

    def __repr__(self):
        return "{}(content_type={!r}, parser={!r})".format(
            self.__class__.__name__,
            self.content_type,
            self.parser,
        )

1794 

1795 

def create_form_parser(headers, on_field, on_file, trust_x_headers=False,
                       config={}):
    """This function is a helper function to aid in creating a FormParser
    instances. Given a dictionary-like headers object, it will determine
    the correct information needed, instantiate a FormParser with the
    appropriate values and given callbacks, and then return the corresponding
    parser.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param trust_x_headers: Whether or not to trust information received from
                            certain X-Headers - for example, the file name from
                            X-File-Name.

    :param config: Configuration variables to pass to the FormParser.

    :raises ValueError: if no Content-Type header is present.
    """
    content_type = headers.get('Content-Type')
    if content_type is None:
        logging.getLogger(__name__).warning("No Content-Type header given")
        raise ValueError("No Content-Type header given!")

    # Boundaries are optional (the FormParser will raise if one is needed
    # but not given).
    content_type, params = parse_options_header(content_type)
    boundary = params.get(b'boundary')

    # We need content_type to be a string, not a bytes object.
    content_type = content_type.decode('latin-1')

    # File names are optional. BUGFIX: X-File-Name is client-controlled data,
    # and the documented contract is that it is only used when the caller
    # explicitly opts in via trust_x_headers; previously it was read
    # unconditionally.
    file_name = headers.get('X-File-Name') if trust_x_headers else None

    # Instantiate a form parser.
    form_parser = FormParser(content_type,
                             on_field,
                             on_file,
                             boundary=boundary,
                             file_name=file_name,
                             config=config)

    # Return our parser.
    return form_parser

1843 

1844 

def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576,
               **kwargs):
    """This function is useful if you just want to parse a request body,
    without too much work. Pass it a dictionary-like object of the request's
    headers, and a file-like object for the input stream, along with two
    callbacks that will get called whenever a field or file is parsed.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param input_stream: A file-like object that represents the request body.
                         The read() method must return bytestrings.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param chunk_size: The maximum size to read from the input stream and write
                       to the parser at one time. Defaults to 1 MiB.
    """

    # Create our form parser.
    parser = create_form_parser(headers, on_field, on_file)

    # Read chunks of up to chunk_size bytes and write them to the parser, but
    # never read more than the given Content-Length, if any.
    content_length = headers.get('Content-Length')
    if content_length is not None:
        content_length = int(content_length)
    else:
        content_length = float('inf')
    bytes_read = 0

    while True:
        # Read only up to the Content-Length given.
        # BUGFIX: the read size was hard-coded to 1048576, silently ignoring
        # the chunk_size parameter; use the parameter instead.
        max_readable = min(content_length - bytes_read, chunk_size)
        buff = input_stream.read(max_readable)

        # Write to the parser and update our length.
        parser.write(buff)
        bytes_read += len(buff)

        # If we get a buffer that's smaller than the size requested, or if we
        # have read up to our content length, we're done.
        if len(buff) != max_readable or bytes_read == content_length:
            break

    # Tell our parser that we're done writing data.
    parser.finalize()