Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/django/http/multipartparser.py: 16%

346 statements  

coverage.py v7.0.5, created at 2023-01-17 06:13 +0000

1""" 

2Multi-part parsing for file uploads. 

3 

4Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to 

5file upload handlers for processing. 

6""" 

7import base64 

8import binascii 

9import collections 

10import html 

11 

12from django.conf import settings 

13from django.core.exceptions import ( 

14 RequestDataTooBig, 

15 SuspiciousMultipartForm, 

16 TooManyFieldsSent, 

17) 

18from django.core.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload 

19from django.utils.datastructures import MultiValueDict 

20from django.utils.encoding import force_str 

21from django.utils.http import parse_header_parameters 

22from django.utils.regex_helper import _lazy_re_compile 

23 

24__all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted") 

25 

26 

27class MultiPartParserError(Exception): 

28 pass 

29 

30 

31class InputStreamExhausted(Exception): 

32 """ 

33 No more reads are allowed from this device. 

34 """ 

35 

36 pass 

37 

38 

39RAW = "raw" 

40FILE = "file" 

41FIELD = "field" 

42 

43 

class MultiPartParser:
    """
    An RFC 7578 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """

    boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")


    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get("CONTENT_TYPE", "")
        if not content_type.startswith("multipart/"):
            raise MultiPartParserError("Invalid Content-Type: %s" % content_type)

        try:
            content_type.encode("ascii")
        except UnicodeEncodeError:
            raise MultiPartParserError(
                "Invalid non-ASCII Content-Type in multipart: %s"
                % force_str(content_type)
            )

        # Parse the header to get the boundary to split the parts.
        _, opts = parse_header_parameters(content_type)
        boundary = opts.get("boundary")
        if not boundary or not self.boundary_re.fullmatch(boundary):
            raise MultiPartParserError(
                "Invalid boundary in multipart: %s" % force_str(boundary)
            )

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get("CONTENT_LENGTH", 0))
        except (ValueError, TypeError):
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary.encode("ascii")
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers
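To make the boundary handling above concrete, here is a minimal sketch of the same extraction and validation steps performed by ``__init__`` (the Content-Type value and boundary string are invented for illustration):

# Hypothetical header value; only illustrates the extraction/validation steps above.
from django.utils.http import parse_header_parameters
from django.utils.regex_helper import _lazy_re_compile

boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")

content_type = 'multipart/form-data; boundary="BoUnDaRy123"'
_, opts = parse_header_parameters(content_type)
boundary = opts.get("boundary")            # 'BoUnDaRy123' (quotes are stripped)
assert boundary_re.fullmatch(boundary)     # printable ASCII, ending in a non-space char
boundary_bytes = boundary.encode("ascii")  # what the parser later splits the body on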

    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # HTTP spec says that Content-Length >= 0 is valid
        # handling content-length == 0 before continuing
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To limit the amount of data read from the request.
        read_size = None
        # Whether a file upload is finished.
        uploaded_file = True

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None
                    uploaded_file = True

                try:
                    disposition = meta_data["content-disposition"][1]
                    field_name = disposition["name"].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get("content-transfer-encoding")
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_str(field_name, encoding, errors="replace")

                if item_type == FIELD:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    if (
                        settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                        and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS < num_post_keys
                    ):
                        raise TooManyFieldsSent(
                            "The number of GET/POST parameters exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
                        )

                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = (
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
                        )

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (
                        settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                        and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
                    ):
                        raise RequestDataTooBig(
                            "Request body exceeded "
                            "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
                        )

                    self._post.appendlist(
                        field_name, force_str(data, encoding, errors="replace")
                    )

                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get("filename")
                    if file_name:
                        file_name = force_str(file_name, encoding, errors="replace")
                        file_name = self.sanitize_file_name(file_name)
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        "content-type", ("", {})
                    )
                    content_type = content_type.strip()
                    charset = content_type_extra.get("charset")

                    try:
                        content_length = int(meta_data.get("content-length")[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    uploaded_file = False
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # We only special-case base64 transfer encoding.
                                # We should always decode base64 chunks by
                                # multiple of 4, ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    if not over_chunk:
                                        break
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is
                                    # an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            if not uploaded_file:
                for handler in handlers:
                    handler.upload_interrupted()
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        # any() short-circuits if a handler's upload_complete() returns a value.
        any(handler.upload_complete() for handler in handlers)
        self._post._mutable = False
        return self._post, self._files
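A minimal end-to-end sketch of ``parse()`` follows. It assumes a standalone script (settings configured by hand with Django's defaults) and an invented two-part body; in a Django project the request machinery drives all of this for you.

# Hedged usage sketch; the body, boundary, and field names are invented.
import io

from django.conf import settings

settings.configure()  # rely on default DATA_UPLOAD_* limits, charset, etc.

from django.core.files.uploadhandler import MemoryFileUploadHandler
from django.http.multipartparser import MultiPartParser

body = (
    b"--b0undary\r\n"
    b'Content-Disposition: form-data; name="title"\r\n\r\n'
    b"hello\r\n"
    b"--b0undary\r\n"
    b'Content-Disposition: form-data; name="doc"; filename="doc.txt"\r\n'
    b"Content-Type: text/plain\r\n\r\n"
    b"file contents\r\n"
    b"--b0undary--\r\n"
)
META = {
    "CONTENT_TYPE": "multipart/form-data; boundary=b0undary",
    "CONTENT_LENGTH": str(len(body)),
}
post, files = MultiPartParser(META, io.BytesIO(body), [MemoryFileUploadHandler()]).parse()
# post["title"] == "hello"; files["doc"] is an InMemoryUploadedFile holding b"file contents"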

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signaling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(
                    force_str(old_field_name, self._encoding, errors="replace"),
                    file_obj,
                )
                break

    def sanitize_file_name(self, file_name):
        """
        Sanitize the filename of an upload.

        Remove all possible path separators, even though that might remove more
        than actually required by the target system. Filenames that could
        potentially cause problems (current/parent dir) are also discarded.

        It should be noted that this function could still return a "filepath"
        like "C:some_file.txt" which is handled later on by the storage layer.
        So while this function does sanitize filenames to some extent, the
        resulting filename should still be considered as untrusted user input.
        """
        file_name = html.unescape(file_name)
        file_name = file_name.rsplit("/")[-1]
        file_name = file_name.rsplit("\\")[-1]
        # Remove non-printable characters.
        file_name = "".join([char for char in file_name if char.isprintable()])

        if file_name in {"", ".", ".."}:
            return None
        return file_name

    IE_sanitize = sanitize_file_name
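A few illustrative calls (inputs invented for the sketch; since ``self`` is unused, the method can be exercised directly on the class):

# Hedged sketch of sanitize_file_name() behavior on made-up inputs.
assert MultiPartParser.sanitize_file_name(None, "../../etc/passwd") == "passwd"
assert MultiPartParser.sanitize_file_name(None, r"C:\Users\bob\notes.txt") == "notes.txt"
assert MultiPartParser.sanitize_file_name(None, "&#46;&#46;") is None  # HTML-unescapes to ".."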

    def _close_files(self):
        # Free up all file handles.
        # FIXME: this currently assumes that upload handlers store the file as 'file'
        # We should document that...
        # (Maybe add handler.free_file to complement new_file)
        for handler in self._upload_handlers:
            if hasattr(handler, "file"):
                handler.file.close()

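The handler override hook checked at the top of ``parse()`` can be illustrated with a hedged sketch: a hypothetical handler subclassing Django's ``FileUploadHandler`` whose ``handle_raw_input()`` returns a ``(POST, FILES)`` pair, which makes ``parse()`` return immediately without walking the multipart body.

# Hypothetical handler (name and limit invented); not part of this module.
from django.core.files.uploadhandler import FileUploadHandler
from django.http import QueryDict
from django.utils.datastructures import MultiValueDict


class RejectOversizedBodies(FileUploadHandler):
    max_bytes = 10 * 1024 * 1024  # illustrative limit

    def handle_raw_input(self, input_data, META, content_length, boundary, encoding=None):
        if content_length > self.max_bytes:
            # Short-circuit: hand back empty POST/FILES without reading the body.
            return QueryDict(encoding=encoding), MultiValueDict()
        return None  # defer to the normal parsing loop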

class LazyStream:
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """

    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = b""
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = self._remaining if size is None else size
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield b"".join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream and stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, "remaining bytes to read should never go negative"

                try:
                    chunk = next(self)
                except StopIteration:
                    return
                else:
                    emitting = chunk[:remaining]
                    self.unget(chunk[remaining:])
                    remaining -= len(emitting)
                    yield emitting

        return b"".join(parts())

    def __next__(self):
        """
        Used when the exact number of bytes to read is unimportant.

        Return whatever chunk is conveniently returned from the iterator.
        Useful to avoid unnecessary bookkeeping if performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = b""
        else:
            output = next(self._producer)
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replace the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Place bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = bytes + self._leftover

    def _update_unget_history(self, num_bytes):
        """
        Update the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len(
            [
                current_number
                for current_number in self._unget_history
                if current_number == num_bytes
            ]
        )

        if number_equal > 40:
            raise SuspiciousMultipartForm(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )

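A short behavioral sketch of ``LazyStream`` (values invented for illustration): a bounded ``read()`` stashes the unread remainder as leftover, and ``unget()`` pushes bytes back so later reads see them first.

# Hedged sketch of LazyStream semantics.
stream = LazyStream(iter([b"hello ", b"world"]))
assert stream.read(4) == b"hell"   # the unread b"o " is kept as leftover
assert stream.tell() == 4
stream.unget(b"HE")                # rewinds tell() by two bytes
assert stream.read() == b"HEo world"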

class ChunkIter:
    """
    An iterable that will yield chunks of data. Given a file-like object to the
    constructor, yield chunks of read operations from that object.
    """

    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def __next__(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self

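``ChunkIter`` in isolation (the in-memory data and chunk size are chosen only for the sketch):

# Hedged sketch: ChunkIter yields fixed-size reads until the file-like object is empty.
import io

chunks = list(ChunkIter(io.BytesIO(b"abcdefghij"), chunk_size=4))
assert chunks == [b"abcd", b"efgh", b"ij"]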

class InterBoundaryIter:
    """
    A Producer that will iterate over boundaries.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()


class BoundaryIter:
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to next() after locating the boundary will raise
    StopIteration.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Peek at the first byte to make sure the stream isn't already
        # exhausted; put it back so it stays available to __next__().
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)


    def __iter__(self):
        return self

    def __next__(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = b"".join(chunks)
        boundary = self._find_boundary(chunk)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data):
        """
        Find a multipart boundary in data.

        Should no boundary exist in the data, return None. Otherwise, return
        a tuple containing the indices of the following:
            * the end of current encapsulation
            * the start of the next encapsulation
        """
        index = data.find(self._boundary)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            last = max(0, end - 1)
            if data[last : last + 1] == b"\n":
                end -= 1
                last = max(0, end - 1)
            if data[last : last + 1] == b"\r":
                end -= 1
            return end, next

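A small sketch of ``BoundaryIter`` (boundary and data invented): it yields the bytes before the boundary, drops the boundary itself, and ungets everything after it back onto the stream.

# Hedged sketch of BoundaryIter; the trailing CRLF before the boundary is trimmed
# by _find_boundary().
stream = LazyStream(iter([b"field data\r\n--b0undary\r\ntrailing part"]))
part = b"".join(BoundaryIter(stream, b"--b0undary"))
assert part == b"field data"
assert stream.read() == b"\r\ntrailing part"  # pushed back for the next consumer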

def exhaust(stream_or_iterable):
    """Exhaust an iterator or stream."""
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)
    collections.deque(iterator, maxlen=0)  # consume iterator quickly.


def parse_boundary_stream(stream, max_header_size):
    """
    Parse one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the top of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find(b"\r\n\r\n")

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4 :])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split(b"\r\n"):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            main_value_pair, params = parse_header_parameters(line.decode())
            name, value = main_value_pair.split(":", 1)
            params = {k: v.encode() for k, v in params.items()}
        except ValueError:  # Invalid header.
            continue

        if name == "content-disposition":
            TYPE = FIELD
            if params.get("filename"):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)

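One part in isolation (the header and payload are invented for the sketch): ``parse_boundary_stream()`` splits the header block from the payload at the blank line and fills the meta dictionary via ``parse_header_parameters``.

# Hedged sketch; note the main header value keeps its leading space and the
# parameter values are bytes.
part = LazyStream(iter([b'Content-Disposition: form-data; name="title"\r\n\r\nhello']))
item_type, meta, payload = parse_boundary_stream(part, 1024)
assert item_type == FIELD  # a content-disposition without a filename
assert meta["content-disposition"] == (" form-data", {"name": b"title"})
assert payload.read() == b"hello"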

class Parser:
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = b"--" + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
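Finally, the low-level ``Parser`` yields one ``(item_type, meta_data, stream)`` triple per part; this is exactly the iteration that ``MultiPartParser.parse()`` consumes. A hedged sketch with an invented single-field body:

# Hedged sketch of Parser; the boundary is passed without the leading "--",
# which Parser.__init__ prepends itself.
import io

body = (
    b"--b0undary\r\n"
    b'Content-Disposition: form-data; name="title"\r\n\r\n'
    b"hello\r\n"
    b"--b0undary--\r\n"
)
stream = LazyStream(ChunkIter(io.BytesIO(body), 1024))
for item_type, meta_data, part_stream in Parser(stream, b"b0undary"):
    if item_type == FIELD:
        print(meta_data["content-disposition"][1]["name"], part_stream.read())
# prints: b'title' b'hello'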