Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/django/http/multipartparser.py: 16%

346 statements  

coverage.py v7.0.5, created at 2023-01-17 06:13 +0000

1""" 

2Multi-part parsing for file uploads. 

3 

4Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to 

5file upload handlers for processing. 

6""" 

7import base64 

8import binascii 

9import collections 

10import html 

11 

12from django.conf import settings 

13from django.core.exceptions import ( 

14 RequestDataTooBig, 

15 SuspiciousMultipartForm, 

16 TooManyFieldsSent, 

17) 

18from django.core.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload 

19from django.utils.datastructures import MultiValueDict 

20from django.utils.encoding import force_str 

21from django.utils.http import parse_header_parameters 

22from django.utils.regex_helper import _lazy_re_compile 

23 

24__all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted") 

25 

26 

27class MultiPartParserError(Exception): 

28 pass 

29 

30 

31class InputStreamExhausted(Exception): 

32 """ 

33 No more reads are allowed from this device. 

34 """ 

35 

36 pass 

37 

38 

39RAW = "raw" 

40FILE = "file" 

41FIELD = "field" 

42 

43 

class MultiPartParser:
    """
    An RFC 7578 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """

    boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")


    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get("CONTENT_TYPE", "")
        if not content_type.startswith("multipart/"):
            raise MultiPartParserError("Invalid Content-Type: %s" % content_type)

        try:
            content_type.encode("ascii")
        except UnicodeEncodeError:
            raise MultiPartParserError(
                "Invalid non-ASCII Content-Type in multipart: %s"
                % force_str(content_type)
            )

        # Parse the header to get the boundary to split the parts.
        _, opts = parse_header_parameters(content_type)
        boundary = opts.get("boundary")
        if not boundary or not self.boundary_re.fullmatch(boundary):
            raise MultiPartParserError(
                "Invalid boundary in multipart: %s" % force_str(boundary)
            )

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get("CONTENT_LENGTH", 0))
        except (ValueError, TypeError):
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary.encode("ascii")
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers
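To make the boundary handling above concrete, here is a minimal sketch of the same extraction and validation steps performed by ``__init__`` (the Content-Type value and boundary string are invented for illustration):

# Hypothetical header value; only illustrates the extraction/validation steps above.
from django.utils.http import parse_header_parameters
from django.utils.regex_helper import _lazy_re_compile

boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")

content_type = 'multipart/form-data; boundary="BoUnDaRy123"'
_, opts = parse_header_parameters(content_type)
boundary = opts.get("boundary")            # 'BoUnDaRy123' (quotes are stripped)
assert boundary_re.fullmatch(boundary)     # printable ASCII, ending in a non-space char
boundary_bytes = boundary.encode("ascii")  # what the parser later splits the body on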

    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # HTTP spec says that Content-Length >= 0 is valid
        # handling content-length == 0 before continuing
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To limit the amount of data read from the request.
        read_size = None
        # Whether a file upload is finished.
        uploaded_file = True

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None
                    uploaded_file = True

                try:
                    disposition = meta_data["content-disposition"][1]
                    field_name = disposition["name"].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get("content-transfer-encoding")
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_str(field_name, encoding, errors="replace")

                if item_type == FIELD:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    if (
                        settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                        and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS < num_post_keys
                    ):
                        raise TooManyFieldsSent(
                            "The number of GET/POST parameters exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
                        )

                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = (
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
                        )

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (
                        settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                        and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
                    ):
                        raise RequestDataTooBig(
                            "Request body exceeded "
                            "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
                        )

                    self._post.appendlist(
                        field_name, force_str(data, encoding, errors="replace")
                    )

                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get("filename")
                    if file_name:
                        file_name = force_str(file_name, encoding, errors="replace")
                        file_name = self.sanitize_file_name(file_name)
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        "content-type", ("", {})
                    )
                    content_type = content_type.strip()
                    charset = content_type_extra.get("charset")

                    try:
                        content_length = int(meta_data.get("content-length")[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    uploaded_file = False
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # We only special-case base64 transfer encoding.
                                # We should always decode base64 chunks by
                                # multiple of 4, ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    if not over_chunk:
                                        break
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is
                                    # an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            if not uploaded_file:
                for handler in handlers:
                    handler.upload_interrupted()
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        # any() short-circuits if a handler's upload_complete() returns a value.
        any(handler.upload_complete() for handler in handlers)
        self._post._mutable = False
        return self._post, self._files
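A minimal end-to-end sketch of ``parse()`` follows. It assumes a standalone script (settings configured by hand with Django's defaults) and an invented two-part body; in a Django project the request machinery drives all of this for you.

# Hedged usage sketch; the body, boundary, and field names are invented.
import io

from django.conf import settings

settings.configure()  # rely on default DATA_UPLOAD_* limits, charset, etc.

from django.core.files.uploadhandler import MemoryFileUploadHandler
from django.http.multipartparser import MultiPartParser

body = (
    b"--b0undary\r\n"
    b'Content-Disposition: form-data; name="title"\r\n\r\n'
    b"hello\r\n"
    b"--b0undary\r\n"
    b'Content-Disposition: form-data; name="doc"; filename="doc.txt"\r\n'
    b"Content-Type: text/plain\r\n\r\n"
    b"file contents\r\n"
    b"--b0undary--\r\n"
)
META = {
    "CONTENT_TYPE": "multipart/form-data; boundary=b0undary",
    "CONTENT_LENGTH": str(len(body)),
}
post, files = MultiPartParser(META, io.BytesIO(body), [MemoryFileUploadHandler()]).parse()
# post["title"] == "hello"; files["doc"] is an InMemoryUploadedFile holding b"file contents"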

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signaling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(
                    force_str(old_field_name, self._encoding, errors="replace"),
                    file_obj,
                )
                break

    def sanitize_file_name(self, file_name):
        """
        Sanitize the filename of an upload.

        Remove all possible path separators, even though that might remove more
        than actually required by the target system. Filenames that could
        potentially cause problems (current/parent dir) are also discarded.

        It should be noted that this function could still return a "filepath"
        like "C:some_file.txt" which is handled later on by the storage layer.
        So while this function does sanitize filenames to some extent, the
        resulting filename should still be considered as untrusted user input.
        """
        file_name = html.unescape(file_name)
        file_name = file_name.rsplit("/")[-1]
        file_name = file_name.rsplit("\\")[-1]
        # Remove non-printable characters.
        file_name = "".join([char for char in file_name if char.isprintable()])

        if file_name in {"", ".", ".."}:
            return None
        return file_name

    IE_sanitize = sanitize_file_name
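A few illustrative calls (inputs invented for the sketch; since ``self`` is unused, the method can be exercised directly on the class):

# Hedged sketch of sanitize_file_name() behavior on made-up inputs.
assert MultiPartParser.sanitize_file_name(None, "../../etc/passwd") == "passwd"
assert MultiPartParser.sanitize_file_name(None, r"C:\Users\bob\notes.txt") == "notes.txt"
assert MultiPartParser.sanitize_file_name(None, "&#46;&#46;") is None  # HTML-unescapes to ".."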

    def _close_files(self):
        # Free up all file handles.
        # FIXME: this currently assumes that upload handlers store the file as 'file'
        # We should document that...
        # (Maybe add handler.free_file to complement new_file)
        for handler in self._upload_handlers:
            if hasattr(handler, "file"):
                handler.file.close()

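The handler override hook checked at the top of ``parse()`` can be illustrated with a hedged sketch: a hypothetical handler subclassing Django's ``FileUploadHandler`` whose ``handle_raw_input()`` returns a ``(POST, FILES)`` pair, which makes ``parse()`` return immediately without walking the multipart body.

# Hypothetical handler (name and limit invented); not part of this module.
from django.core.files.uploadhandler import FileUploadHandler
from django.http import QueryDict
from django.utils.datastructures import MultiValueDict


class RejectOversizedBodies(FileUploadHandler):
    max_bytes = 10 * 1024 * 1024  # illustrative limit

    def handle_raw_input(self, input_data, META, content_length, boundary, encoding=None):
        if content_length > self.max_bytes:
            # Short-circuit: hand back empty POST/FILES without reading the body.
            return QueryDict(encoding=encoding), MultiValueDict()
        return None  # defer to the normal parsing loop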

class LazyStream:
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """

    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = b""
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = self._remaining if size is None else size
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield b"".join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream and stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, "remaining bytes to read should never go negative"

                try:
                    chunk = next(self)
                except StopIteration:
                    return
                else:
                    emitting = chunk[:remaining]
                    self.unget(chunk[remaining:])
                    remaining -= len(emitting)
                    yield emitting

        return b"".join(parts())

    def __next__(self):
        """
        Used when the exact number of bytes to read is unimportant.

        Return whatever chunk is conveniently returned from the iterator.
        Useful to avoid unnecessary bookkeeping if performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = b""
        else:
            output = next(self._producer)
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replace the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Place bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = bytes + self._leftover

    def _update_unget_history(self, num_bytes):
        """
        Update the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len(
            [
                current_number
                for current_number in self._unget_history
                if current_number == num_bytes
            ]
        )

        if number_equal > 40:
            raise SuspiciousMultipartForm(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )

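A short behavioral sketch of ``LazyStream`` (values invented for illustration): a bounded ``read()`` stashes the unread remainder as leftover, and ``unget()`` pushes bytes back so later reads see them first.

# Hedged sketch of LazyStream semantics.
stream = LazyStream(iter([b"hello ", b"world"]))
assert stream.read(4) == b"hell"   # the unread b"o " is kept as leftover
assert stream.tell() == 4
stream.unget(b"HE")                # rewinds tell() by two bytes
assert stream.read() == b"HEo world"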

class ChunkIter:
    """
    An iterable that will yield chunks of data. Given a file-like object to the
    constructor, yield chunks of read operations from that object.
    """

    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def __next__(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self

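``ChunkIter`` in isolation (the in-memory data and chunk size are chosen only for the sketch):

# Hedged sketch: ChunkIter yields fixed-size reads until the file-like object is empty.
import io

chunks = list(ChunkIter(io.BytesIO(b"abcdefghij"), chunk_size=4))
assert chunks == [b"abcd", b"efgh", b"ij"]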

class InterBoundaryIter:
    """
    A Producer that will iterate over boundaries.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()


class BoundaryIter:
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to next() after locating the boundary will raise
    StopIteration.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Peek at the first byte to make sure the stream isn't already
        # exhausted; put it back so it stays available to __next__().
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)


    def __iter__(self):
        return self

    def __next__(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = b"".join(chunks)
        boundary = self._find_boundary(chunk)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data):
        """
        Find a multipart boundary in data.

        Should no boundary exist in the data, return None. Otherwise, return
        a tuple containing the indices of the following:
            * the end of current encapsulation
            * the start of the next encapsulation
        """
        index = data.find(self._boundary)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            last = max(0, end - 1)
            if data[last : last + 1] == b"\n":
                end -= 1
                last = max(0, end - 1)
            if data[last : last + 1] == b"\r":
                end -= 1
            return end, next

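A small sketch of ``BoundaryIter`` (boundary and data invented): it yields the bytes before the boundary, drops the boundary itself, and ungets everything after it back onto the stream.

# Hedged sketch of BoundaryIter; the trailing CRLF before the boundary is trimmed
# by _find_boundary().
stream = LazyStream(iter([b"field data\r\n--b0undary\r\ntrailing part"]))
part = b"".join(BoundaryIter(stream, b"--b0undary"))
assert part == b"field data"
assert stream.read() == b"\r\ntrailing part"  # pushed back for the next consumer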

def exhaust(stream_or_iterable):
    """Exhaust an iterator or stream."""
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)
    collections.deque(iterator, maxlen=0)  # consume iterator quickly.


def parse_boundary_stream(stream, max_header_size):
    """
    Parse one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the top of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find(b"\r\n\r\n")

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4 :])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split(b"\r\n"):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            main_value_pair, params = parse_header_parameters(line.decode())
            name, value = main_value_pair.split(":", 1)
            params = {k: v.encode() for k, v in params.items()}
        except ValueError:  # Invalid header.
            continue

        if name == "content-disposition":
            TYPE = FIELD
            if params.get("filename"):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)

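One part in isolation (the header and payload are invented for the sketch): ``parse_boundary_stream()`` splits the header block from the payload at the blank line and fills the meta dictionary via ``parse_header_parameters``.

# Hedged sketch; note the main header value keeps its leading space and the
# parameter values are bytes.
part = LazyStream(iter([b'Content-Disposition: form-data; name="title"\r\n\r\nhello']))
item_type, meta, payload = parse_boundary_stream(part, 1024)
assert item_type == FIELD  # a content-disposition without a filename
assert meta["content-disposition"] == (" form-data", {"name": b"title"})
assert payload.read() == b"hello"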

class Parser:
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = b"--" + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
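Finally, the low-level ``Parser`` yields one ``(item_type, meta_data, stream)`` triple per part; this is exactly the iteration that ``MultiPartParser.parse()`` consumes. A hedged sketch with an invented single-field body:

# Hedged sketch of Parser; the boundary is passed without the leading "--",
# which Parser.__init__ prepends itself.
import io

body = (
    b"--b0undary\r\n"
    b'Content-Disposition: form-data; name="title"\r\n\r\n'
    b"hello\r\n"
    b"--b0undary--\r\n"
)
stream = LazyStream(ChunkIter(io.BytesIO(body), 1024))
for item_type, meta_data, part_stream in Parser(stream, b"b0undary"):
    if item_type == FIELD:
        print(meta_data["content-disposition"][1]["name"], part_stream.read())
# prints: b'title' b'hello'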