1"""
2Multi-part parsing for file uploads.
3
4Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
5file upload handlers for processing.
6"""
7
8import base64
9import binascii
10import collections
11import html
12
13from django.conf import settings
14from django.core.exceptions import (
15 RequestDataTooBig,
16 SuspiciousMultipartForm,
17 TooManyFieldsSent,
18 TooManyFilesSent,
19)
20from django.core.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload
21from django.utils.datastructures import MultiValueDict
22from django.utils.encoding import force_str
23from django.utils.http import parse_header_parameters
24from django.utils.regex_helper import _lazy_re_compile
25
26__all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted")
27
28
29class MultiPartParserError(Exception):
30 pass
31
32
33class InputStreamExhausted(Exception):
34 """
35 No more reads are allowed from this device.
36 """
37
38 pass
39
40
41RAW = "raw"
42FILE = "file"
43FIELD = "field"
44FIELD_TYPES = frozenset([FIELD, RAW])
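# A single part's header block must terminate (b"\r\n\r\n") within this many
# bytes; see parse_boundary_stream().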
MAX_TOTAL_HEADER_SIZE = 1024


class MultiPartParser:
    """
    An RFC 7578 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
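
    A rough usage sketch (hypothetical caller code mirroring how
    ``HttpRequest`` drives this parser; not part of this module)::

        parser = MultiPartParser(request.META, request, request.upload_handlers)
        post, files = parser.parse()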
54 """
55
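    # A boundary is 1-201 printable ASCII characters that must not end in
    # whitespace; anything else is rejected in __init__().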
    boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")

    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get("CONTENT_TYPE", "")
        if not content_type.startswith("multipart/"):
            raise MultiPartParserError("Invalid Content-Type: %s" % content_type)

        try:
            content_type.encode("ascii")
        except UnicodeEncodeError:
            raise MultiPartParserError(
                "Invalid non-ASCII Content-Type in multipart: %s"
                % force_str(content_type)
            )

        # Parse the header to get the boundary to split the parts.
        _, opts = parse_header_parameters(content_type)
        boundary = opts.get("boundary")
        if not boundary or not self.boundary_re.fullmatch(boundary):
            raise MultiPartParserError(
                "Invalid boundary in multipart: %s" % force_str(boundary)
            )

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get("CONTENT_LENGTH", 0))
        except (ValueError, TypeError):
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary.encode("ascii")
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)
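        # With Django's stock upload handlers, which advertise a 64 KiB
        # chunk_size, this typically resolves to 65536-byte reads.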

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        # Call the actual parse routine and close all open files in case of
        # errors. This is needed because if exceptions are thrown the
        # MultiPartParser will not be garbage collected immediately and
        # resources would be kept alive. This is only needed for errors because
        # the Request object closes all uploaded files at the end of the
        # request.
        try:
            return self._parse()
        except Exception:
            if hasattr(self, "_files"):
                for _, files in self._files.lists():
                    for fileobj in files:
                        fileobj.close()
            raise

    def _parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # The HTTP spec says that Content-Length >= 0 is valid; handle
        # content-length == 0 before continuing.
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To count the number of files in the request.
        num_files = 0
        # To limit the amount of data read from the request.
        read_size = None
        # Whether a file upload is finished.
        uploaded_file = True

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None
                    uploaded_file = True

                if (
                    item_type in FIELD_TYPES
                    and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                ):
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    # 2 accounts for empty raw fields before and after the
                    # last boundary.
                    if settings.DATA_UPLOAD_MAX_NUMBER_FIELDS + 2 < num_post_keys:
                        raise TooManyFieldsSent(
                            "The number of GET/POST parameters exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
                        )

                try:
                    disposition = meta_data["content-disposition"][1]
                    field_name = disposition["name"].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get("content-transfer-encoding")
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_str(field_name, encoding, errors="replace")

                if item_type == FIELD:
                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = (
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
                        )

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (
                        settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                        and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
                    ):
                        raise RequestDataTooBig(
                            "Request body exceeded "
                            "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
                        )

                    self._post.appendlist(
                        field_name, force_str(data, encoding, errors="replace")
                    )
                elif item_type == FILE:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FILES.
                    num_files += 1
                    if (
                        settings.DATA_UPLOAD_MAX_NUMBER_FILES is not None
                        and num_files > settings.DATA_UPLOAD_MAX_NUMBER_FILES
                    ):
                        raise TooManyFilesSent(
                            "The number of files exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FILES."
                        )
                    # This is a file, use the handler...
                    file_name = disposition.get("filename")
                    if file_name:
                        file_name = force_str(file_name, encoding, errors="replace")
                        file_name = self.sanitize_file_name(file_name)
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        "content-type", ("", {})
                    )
                    content_type = content_type.strip()
                    charset = content_type_extra.get("charset")

                    try:
                        content_length = int(meta_data.get("content-length")[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    uploaded_file = False
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # base64 is the only transfer encoding that is
                                # special-cased: chunks must be decoded in
                                # multiples of four bytes, ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    if not over_chunk:
                                        break
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is
                                    # an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, exhaust the field
                    # stream. Note: There could be an error here at some point,
                    # but there will be at least two RAW types (before and
                    # after the other boundaries). This branch is usually not
                    # reached at all, because a missing content-disposition
                    # header will skip the whole boundary.
                    exhaust(field_stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            if not uploaded_file:
                for handler in handlers:
                    handler.upload_interrupted()
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        # any() shortcircuits if a handler's upload_complete() returns a value.
        any(handler.upload_complete() for handler in handlers)
        self._post._mutable = False
        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signaling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(
                    force_str(old_field_name, self._encoding, errors="replace"),
                    file_obj,
                )
                break

    def sanitize_file_name(self, file_name):
        """
        Sanitize the filename of an upload.

        Remove all possible path separators, even though that might remove more
        than actually required by the target system. Filenames that could
        potentially cause problems (current/parent dir) are also discarded.

        It should be noted that this function could still return a "filepath"
        like "C:some_file.txt" which is handled later on by the storage layer.
        So while this function does sanitize filenames to some extent, the
        resulting filename should still be considered as untrusted user input.
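
        A few illustrative inputs and outputs (examples only, not an
        exhaustive specification)::

            "../../etc/passwd"        -> "passwd"
            "C:\\Documents\\file.txt" -> "file.txt"
            ".."                      -> None (discarded)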
393 """
394 file_name = html.unescape(file_name)
395 file_name = file_name.rsplit("/")[-1]
396 file_name = file_name.rsplit("\\")[-1]
397 # Remove non-printable characters.
398 file_name = "".join([char for char in file_name if char.isprintable()])
399
400 if file_name in {"", ".", ".."}:
401 return None
402 return file_name
403
404 IE_sanitize = sanitize_file_name
405
406 def _close_files(self):
407 # Free up all file handles.
408 # FIXME: this currently assumes that upload handlers store the file as 'file'
409 # We should document that...
410 # (Maybe add handler.free_file to complement new_file)
411 for handler in self._upload_handlers:
412 if hasattr(handler, "file"):
413 handler.file.close()
414
415
416class LazyStream:
417 """
418 The LazyStream wrapper allows one to get and "unget" bytes from a stream.
419
420 Given a producer object (an iterator that yields bytestrings), the
421 LazyStream object will support iteration, reading, and keeping a "look-back"
422 variable in case you need to "unget" some bytes.
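
    A minimal sketch of the read/unget round trip (illustrative only, with an
    in-memory producer)::

        stream = LazyStream(iter([b"hello world"]))
        stream.read(5)          # -> b"hello"
        stream.unget(b"hello")
        stream.read()           # -> b"hello world"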
423 """
424
425 def __init__(self, producer, length=None):
426 """
427 Every LazyStream must have a producer when instantiated.
428
        A producer is an iterator that yields a bytestring each time it is
        advanced.
431 """
432 self._producer = producer
433 self._empty = False
434 self._leftover = b""
435 self.length = length
436 self.position = 0
437 self._remaining = length
438 self._unget_history = []
439
440 def tell(self):
441 return self.position
442
443 def read(self, size=None):
444 def parts():
445 remaining = self._remaining if size is None else size
446 # do the whole thing in one shot if no limit was provided.
447 if remaining is None:
448 yield b"".join(self)
449 return
450
            # otherwise do some bookkeeping to return exactly enough
            # of the stream, stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, "remaining bytes to read should never go negative"

                try:
                    chunk = next(self)
                except StopIteration:
                    return
                else:
                    emitting = chunk[:remaining]
                    self.unget(chunk[remaining:])
                    remaining -= len(emitting)
                    yield emitting

        return b"".join(parts())

    def __next__(self):
        """
        Used when the exact number of bytes to read is unimportant.

        Return whatever chunk is conveniently returned from the iterator.
        Useful to avoid unnecessary bookkeeping if performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = b""
        else:
            output = next(self._producer)
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replace the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Place bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = bytes + self._leftover

    def _update_unget_history(self, num_bytes):
        """
        Update the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len(
            [
                current_number
                for current_number in self._unget_history
                if current_number == num_bytes
            ]
        )

        if number_equal > 40:
            raise SuspiciousMultipartForm(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )


class ChunkIter:
    """
    An iterable that yields chunks of data. Given a file-like object passed to
    the constructor, yield the data returned by successive read operations on
    that object.
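
    Illustrative usage (hypothetical file, not part of this module)::

        for chunk in ChunkIter(open("upload.bin", "rb"), chunk_size=4):
            ...  # each chunk is at most four bytes long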
539 """
540
541 def __init__(self, flo, chunk_size=64 * 1024):
542 self.flo = flo
543 self.chunk_size = chunk_size
544
545 def __next__(self):
546 try:
547 data = self.flo.read(self.chunk_size)
548 except InputStreamExhausted:
549 raise StopIteration()
550 if data:
551 return data
552 else:
553 raise StopIteration()
554
555 def __iter__(self):
556 return self
557
558
559class InterBoundaryIter:
560 """
561 A Producer that will iterate over boundaries.
562 """
563
564 def __init__(self, stream, boundary):
565 self._stream = stream
566 self._boundary = boundary
567
568 def __iter__(self):
569 return self
570
571 def __next__(self):
572 try:
573 return LazyStream(BoundaryIter(self._stream, self._boundary))
574 except InputStreamExhausted:
575 raise StopIteration()
576
577
578class BoundaryIter:
579 """
580 A Producer that is sensitive to boundaries.
581
582 Will happily yield bytes until a boundary is found. Will yield the bytes
583 before the boundary, throw away the boundary bytes themselves, and push the
584 post-boundary bytes back on the stream.
585
    Once the boundary has been located, further calls to next() raise a
    StopIteration exception.
588 """
589
590 def __init__(self, stream, boundary):
591 self._stream = stream
592 self._boundary = boundary
593 self._done = False
594 # rollback an additional six bytes because the format is like
595 # this: CRLF<boundary>[--CRLF]
596 self._rollback = len(boundary) + 6
597
        # Peek at one byte to confirm the stream isn't already exhausted, then
        # push it back so no data is lost.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)

    def __iter__(self):
        return self

    def __next__(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = b"".join(chunks)
        boundary = self._find_boundary(chunk)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data):
        """
        Find a multipart boundary in data.

        Should no boundary exist in the data, return None. Otherwise, return
        a tuple containing the indices of the following:
            * the end of current encapsulation
            * the start of the next encapsulation
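
        Illustration (not executed): with ``self._boundary`` set to
        ``b"--abc"``, ``_find_boundary(b"data\\r\\n--abc--\\r\\n")`` returns
        ``(4, 11)``; the CRLF preceding the boundary is excluded from the
        payload and the next read resumes just after ``b"--abc"``.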
657 """
658 index = data.find(self._boundary)
659 if index < 0:
660 return None
661 else:
662 end = index
663 next = index + len(self._boundary)
664 # backup over CRLF
665 last = max(0, end - 1)
666 if data[last : last + 1] == b"\n":
667 end -= 1
668 last = max(0, end - 1)
669 if data[last : last + 1] == b"\r":
670 end -= 1
671 return end, next
672
673
674def exhaust(stream_or_iterable):
675 """Exhaust an iterator or stream."""
676 try:
677 iterator = iter(stream_or_iterable)
678 except TypeError:
679 iterator = ChunkIter(stream_or_iterable, 16384)
680 collections.deque(iterator, maxlen=0) # consume iterator quickly.
681
682
683def parse_boundary_stream(stream, max_header_size):
684 """
685 Parse one and exactly one stream that encapsulates a boundary.
686 """
687
688 # Look for the end of headers and if not found extend the search to double
689 # the size up to the MAX_TOTAL_HEADER_SIZE.
690 headers_chunk_size = 1024
691 while True:
692 if headers_chunk_size > max_header_size:
693 raise MultiPartParserError("Request max total header size exceeded.")
694
695 # Stream at beginning of header, look for end of header and parse it if
696 # found. The header must fit within one chunk.
697 chunk = stream.read(headers_chunk_size)
        # find() returns the index of the first of these four bytes; they are
        # munched later to prevent them from polluting the payload.
        header_end = chunk.find(b"\r\n\r\n")
        if header_end != -1:
            break

        # No end-of-headers marker found; unget the chunk and pass the stream
        # on verbatim.
        stream.unget(chunk)
        # No more data to read.
        if len(chunk) < headers_chunk_size:
            return (RAW, {}, stream)
        # Double the chunk size.
        headers_chunk_size *= 2

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4 :])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split(b"\r\n"):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            main_value_pair, params = parse_header_parameters(line.decode())
            name, value = main_value_pair.split(":", 1)
            params = {k: v.encode() for k, v in params.items()}
        except ValueError:  # Invalid header.
            continue

        if name == "content-disposition":
            TYPE = FIELD
            if params.get("filename"):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)


class Parser:
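    """
    Iterate over the parts of a multipart body.

    Iterating over ``Parser(stream, boundary)`` yields one
    ``(item_type, meta_data, field_stream)`` triple per part, as produced by
    ``parse_boundary_stream()``.
    """
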
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = b"--" + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, MAX_TOTAL_HEADER_SIZE)