Coverage for /pythoncovmergedfiles/medio/medio/src/aiohttp/aiohttp/http_parser.py: 83%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

596 statements  

1import abc 

2import asyncio 

3import re 

4import string 

5import sys 

6from contextlib import suppress 

7from enum import IntEnum 

8from re import Pattern 

9from typing import ( 

10 TYPE_CHECKING, 

11 Any, 

12 ClassVar, 

13 Final, 

14 Generic, 

15 Literal, 

16 NamedTuple, 

17 TypeVar, 

18) 

19 

20from multidict import CIMultiDict, istr 

21from yarl import URL 

22 

23from . import hdrs 

24from .base_protocol import BaseProtocol 

25from .compression_utils import ( 

26 HAS_BROTLI, 

27 HAS_ZSTD, 

28 BrotliDecompressor, 

29 ZLibDecompressor, 

30 ZSTDDecompressor, 

31) 

32from .helpers import ( 

33 _EXC_SENTINEL, 

34 DEBUG, 

35 DEFAULT_CHUNK_SIZE, 

36 EMPTY_BODY_METHODS, 

37 EMPTY_BODY_STATUS_CODES, 

38 NO_EXTENSIONS, 

39 BaseTimerContext, 

40 HeadersDictProxy, 

41 set_exception, 

42) 

43from .http_exceptions import ( 

44 BadHttpMessage, 

45 BadHttpMethod, 

46 BadStatusLine, 

47 ContentEncodingError, 

48 ContentLengthError, 

49 InvalidHeader, 

50 InvalidURLError, 

51 LineTooLong, 

52 TransferEncodingError, 

53) 

54from .http_writer import HttpVersion, HttpVersion10, HttpVersion11 

55from .streams import EMPTY_PAYLOAD, StreamReader 

56from .typedefs import RawHeaders 

57 

58if TYPE_CHECKING: 

59 from .client_proto import ResponseHandler 

60 

61__all__ = ( 

62 "HeadersParser", 

63 "HttpParser", 

64 "HttpRequestParser", 

65 "HttpResponseParser", 

66 "RawRequestMessage", 

67 "RawResponseMessage", 

68) 

69 

70_T = TypeVar("_T") 

71 

72_SEP = Literal[b"\r\n", b"\n"] 

73 

74ASCIISET: Final[set[str]] = set(string.printable) 

75 

76# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview 

77# and https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens 

78# 

79# method = token 

80# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / 

81# "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA 

82# token = 1*tchar 

83_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~") 

84TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+") 

85# https://www.rfc-editor.org/rfc/rfc9110#section-5.5-5 

86_FIELD_VALUE_FORBIDDEN_CTL_RE: Final[Pattern[str]] = re.compile( 

87 r"[\x00-\x08\x0a-\x1f\x7f]" 

88) 

89VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII) 

90DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII) 

91HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+") 

92 

93# RFC 9110 singleton headers — duplicates are rejected in strict mode. 

94# In lax mode (response parser default), the check is skipped entirely 

95# since real-world servers (e.g. Google APIs, Werkzeug) commonly send 

96# duplicate headers like Content-Type or Server. 

97# Lowercased for case-insensitive matching against wire names. 

98SINGLETON_HEADERS: Final[frozenset[str]] = frozenset( 

99 { 

100 "content-length", 

101 "content-location", 

102 "content-range", 

103 "content-type", 

104 "etag", 

105 "host", 

106 "max-forwards", 

107 "server", 

108 "transfer-encoding", 

109 "user-agent", 

110 } 

111) 

112 

113 

114class RawRequestMessage(NamedTuple): 

115 method: str 

116 path: str 

117 version: HttpVersion 

118 headers: HeadersDictProxy 

119 raw_headers: RawHeaders 

120 should_close: bool 

121 compression: str | None 

122 upgrade: bool 

123 chunked: bool 

124 url: URL 

125 

126 

127class RawResponseMessage(NamedTuple): 

128 version: HttpVersion 

129 code: int 

130 reason: str 

131 headers: HeadersDictProxy 

132 raw_headers: RawHeaders 

133 should_close: bool 

134 compression: str | None 

135 upgrade: bool 

136 chunked: bool 

137 

138 

139_MsgT = TypeVar("_MsgT", RawRequestMessage, RawResponseMessage) 

140 

141 

142class PayloadState(IntEnum): 

143 PAYLOAD_COMPLETE = 0 

144 PAYLOAD_NEEDS_INPUT = 1 

145 PAYLOAD_HAS_PENDING_INPUT = 2 

146 

147 

148class ParseState(IntEnum): 

149 PARSE_NONE = 0 

150 PARSE_LENGTH = 1 

151 PARSE_CHUNKED = 2 

152 PARSE_UNTIL_EOF = 3 

153 

154 

155class ChunkState(IntEnum): 

156 PARSE_CHUNKED_SIZE = 0 

157 PARSE_CHUNKED_CHUNK = 1 

158 PARSE_CHUNKED_CHUNK_EOF = 2 

159 PARSE_TRAILERS = 4 

160 

161 

162class HeadersParser: 

163 def __init__(self, max_field_size: int = 8190, lax: bool = False) -> None: 

164 self.max_field_size = max_field_size 

165 self._lax = lax 

166 

167 def parse_headers(self, lines: list[bytes]) -> tuple[HeadersDictProxy, RawHeaders]: 

168 headers: CIMultiDict[str] = CIMultiDict() 

169 # note: "raw" does not mean inclusion of OWS before/after the field value 

170 raw_headers = [] 

171 

172 lines_idx = 0 

173 line = lines[lines_idx] 

174 line_count = len(lines) 

175 

176 while line: 

177 # Parse initial header name : value pair. 

178 try: 

179 bname, bvalue = line.split(b":", 1) 

180 except ValueError: 

181 raise InvalidHeader(line) from None 

182 

183 if len(bname) == 0: 

184 raise InvalidHeader(bname) 

185 

186 # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2 

187 if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"} 

188 raise InvalidHeader(line) 

189 

190 bvalue = bvalue.lstrip(b" \t") 

191 name = bname.decode("utf-8", "surrogateescape") 

192 if not TOKENRE.fullmatch(name): 

193 raise InvalidHeader(bname) 

194 

195 # next line 

196 lines_idx += 1 

197 line = lines[lines_idx] 

198 

199 # consume continuation lines 

200 continuation = self._lax and line and line[0] in (32, 9) # (' ', '\t') 

201 

202 # Deprecated: https://www.rfc-editor.org/rfc/rfc9112.html#name-obsolete-line-folding 

203 if continuation: 

204 header_length = len(bvalue) 

205 bvalue_lst = [bvalue] 

206 while continuation: 

207 header_length += len(line) 

208 if header_length > self.max_field_size: 

209 header_line = bname + b": " + b"".join(bvalue_lst) 

210 raise LineTooLong( 

211 header_line[:100] + b"...", self.max_field_size 

212 ) 

213 bvalue_lst.append(line) 

214 

215 # next line 

216 lines_idx += 1 

217 if lines_idx < line_count: 

218 line = lines[lines_idx] 

219 if line: 

220 continuation = line[0] in (32, 9) # (' ', '\t') 

221 else: 

222 line = b"" 

223 break 

224 bvalue = b"".join(bvalue_lst) 

225 

226 bvalue = bvalue.strip(b" \t") 

227 value = bvalue.decode("utf-8", "surrogateescape") 

228 

229 # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 

230 if self._lax: 

231 if "\n" in value or "\r" in value or "\x00" in value: 

232 raise InvalidHeader(bvalue) 

233 elif _FIELD_VALUE_FORBIDDEN_CTL_RE.search(value): 

234 raise InvalidHeader(bvalue) 

235 

236 if not self._lax and name in headers and name.lower() in SINGLETON_HEADERS: 

237 raise BadHttpMessage(f"Duplicate '{name}' header found.") 

238 headers.add(name, value) 

239 raw_headers.append((bname, bvalue)) 

240 

241 return (HeadersDictProxy(headers), tuple(raw_headers)) 

242 

243 

244def _is_supported_upgrade(headers: HeadersDictProxy) -> bool: 

245 """Check if the upgrade header is supported.""" 

246 u = headers.get(hdrs.UPGRADE, "") 

247 # .lower() can transform non-ascii characters. 

248 return u.isascii() and u.lower() in {"tcp", "websocket"} 

249 

250 

251class HttpParser(abc.ABC, Generic[_MsgT]): 

252 lax: ClassVar[bool] = False 

253 

254 def __init__( 

255 self, 

256 protocol: BaseProtocol, 

257 loop: asyncio.AbstractEventLoop, 

258 limit: int, 

259 max_line_size: int = 8190, 

260 max_headers: int = 128, 

261 max_field_size: int = 8190, 

262 timer: BaseTimerContext | None = None, 

263 code: int | None = None, 

264 method: str | None = None, 

265 payload_exception: type[BaseException] | None = None, 

266 response_with_body: bool = True, 

267 read_until_eof: bool = False, 

268 auto_decompress: bool = True, 

269 ) -> None: 

270 self.protocol = protocol 

271 self.loop = loop 

272 self.max_line_size = max_line_size 

273 self.max_field_size = max_field_size 

274 self.max_headers = max_headers 

275 self.timer = timer 

276 self.code = code 

277 self.method = method 

278 self.payload_exception = payload_exception 

279 self.response_with_body = response_with_body 

280 self.read_until_eof = read_until_eof 

281 

282 self._lines: list[bytes] = [] 

283 self._tail = b"" 

284 self._upgraded = False 

285 self._payload = None 

286 self._payload_parser: HttpPayloadParser | None = None 

287 self._payload_has_more_data = False 

288 self._auto_decompress = auto_decompress 

289 self._limit = limit 

290 self._headers_parser = HeadersParser(max_field_size, self.lax) 

291 

292 @abc.abstractmethod 

293 def parse_message(self, lines: list[bytes]) -> _MsgT: ... 

294 

295 @abc.abstractmethod 

296 def _is_chunked_te(self, te: str) -> bool: ... 

297 

298 def pause_reading(self) -> None: 

299 assert self._payload_parser is not None 

300 self._payload_parser.pause_reading() 

301 

302 def feed_eof(self) -> _MsgT | None: 

303 if self._payload_parser is not None: 

304 self._payload_parser.feed_eof() 

305 if self._payload_parser.done: 

306 self._payload_parser = None 

307 else: 

308 # try to extract partial message 

309 if self._tail: 

310 self._lines.append(self._tail) 

311 

312 if self._lines: 

313 if self._lines[-1] != "\r\n": 

314 self._lines.append(b"") 

315 with suppress(Exception): 

316 return self.parse_message(self._lines) 

317 return None 

318 

319 def feed_data( 

320 self, 

321 data: bytes, 

322 SEP: _SEP = b"\r\n", 

323 EMPTY: bytes = b"", 

324 CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH, 

325 METH_CONNECT: str = hdrs.METH_CONNECT, 

326 SEC_WEBSOCKET_KEY1: istr = hdrs.SEC_WEBSOCKET_KEY1, 

327 ) -> tuple[list[tuple[_MsgT, StreamReader]], bool, bytes]: 

328 messages = [] 

329 

330 if self._tail: 

331 data, self._tail = self._tail + data, b"" 

332 

333 data_len = len(data) 

334 start_pos = 0 

335 loop = self.loop 

336 max_line_length = self.max_line_size 

337 

338 should_close = False 

339 while start_pos < data_len or self._payload_has_more_data: 

340 # read HTTP message (request/response line + headers), \r\n\r\n 

341 # and split by lines 

342 if self._payload_parser is None and not self._upgraded: 

343 pos = data.find(SEP, start_pos) 

344 # consume \r\n 

345 if pos == start_pos and not self._lines: 

346 start_pos = pos + len(SEP) 

347 continue 

348 

349 if pos >= start_pos: 

350 if should_close: 

351 raise BadHttpMessage("Data after `Connection: close`") 

352 

353 # line found 

354 line = data[start_pos:pos] 

355 if SEP == b"\n": # For lax response parsing 

356 line = line.rstrip(b"\r") 

357 if len(line) > max_line_length: 

358 raise LineTooLong(line[:100] + b"...", max_line_length) 

359 

360 self._lines.append(line) 

361 # After processing the status/request line, everything is a header. 

362 max_line_length = self.max_field_size 

363 

364 if len(self._lines) > self.max_headers: 

365 raise BadHttpMessage("Too many headers received") 

366 

367 start_pos = pos + len(SEP) 

368 

369 # \r\n\r\n found 

370 if self._lines[-1] == EMPTY: 

371 max_trailers = self.max_headers - len(self._lines) 

372 try: 

373 msg: _MsgT = self.parse_message(self._lines) 

374 finally: 

375 self._lines.clear() 

376 

377 def get_content_length() -> int | None: 

378 # payload length 

379 length_hdr = msg.headers.get(CONTENT_LENGTH) 

380 if length_hdr is None: 

381 return None 

382 

383 # Shouldn't allow +/- or other number formats. 

384 # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2 

385 # msg.headers is already stripped of leading/trailing wsp 

386 if not DIGITS.fullmatch(length_hdr): 

387 raise InvalidHeader(CONTENT_LENGTH) 

388 

389 return int(length_hdr) 

390 

391 length = get_content_length() 

392 # do not support old websocket spec 

393 if SEC_WEBSOCKET_KEY1 in msg.headers: 

394 raise InvalidHeader(SEC_WEBSOCKET_KEY1) 

395 

396 self._upgraded = msg.upgrade and _is_supported_upgrade( 

397 msg.headers 

398 ) 

399 

400 method = getattr(msg, "method", self.method) 

401 # code is only present on responses 

402 code = getattr(msg, "code", 0) 

403 

404 assert self.protocol is not None 

405 # calculate payload 

406 empty_body = code in EMPTY_BODY_STATUS_CODES or bool( 

407 method and method in EMPTY_BODY_METHODS 

408 ) 

409 if not empty_body and ( 

410 ((length is not None and length > 0) or msg.chunked) 

411 and not self._upgraded 

412 ): 

413 payload = StreamReader( 

414 self.protocol, 

415 timer=self.timer, 

416 loop=loop, 

417 limit=self._limit, 

418 ) 

419 payload_parser = HttpPayloadParser( 

420 payload, 

421 length=length, 

422 chunked=msg.chunked, 

423 method=method, 

424 compression=msg.compression, 

425 code=self.code, 

426 response_with_body=self.response_with_body, 

427 auto_decompress=self._auto_decompress, 

428 lax=self.lax, 

429 headers_parser=self._headers_parser, 

430 max_line_size=self.max_line_size, 

431 max_field_size=self.max_field_size, 

432 max_trailers=max_trailers, 

433 limit=self._limit, 

434 ) 

435 if not payload_parser.done: 

436 self._payload_parser = payload_parser 

437 elif method == METH_CONNECT: 

438 assert isinstance(msg, RawRequestMessage) 

439 payload = StreamReader( 

440 self.protocol, 

441 timer=self.timer, 

442 loop=loop, 

443 limit=self._limit, 

444 ) 

445 self._upgraded = True 

446 self._payload_parser = HttpPayloadParser( 

447 payload, 

448 method=msg.method, 

449 compression=msg.compression, 

450 auto_decompress=self._auto_decompress, 

451 lax=self.lax, 

452 headers_parser=self._headers_parser, 

453 max_line_size=self.max_line_size, 

454 max_field_size=self.max_field_size, 

455 max_trailers=max_trailers, 

456 limit=self._limit, 

457 ) 

458 elif not empty_body and length is None and self.read_until_eof: 

459 payload = StreamReader( 

460 self.protocol, 

461 timer=self.timer, 

462 loop=loop, 

463 limit=self._limit, 

464 ) 

465 payload_parser = HttpPayloadParser( 

466 payload, 

467 length=length, 

468 chunked=msg.chunked, 

469 method=method, 

470 compression=msg.compression, 

471 code=self.code, 

472 response_with_body=self.response_with_body, 

473 auto_decompress=self._auto_decompress, 

474 lax=self.lax, 

475 headers_parser=self._headers_parser, 

476 max_line_size=self.max_line_size, 

477 max_field_size=self.max_field_size, 

478 max_trailers=max_trailers, 

479 limit=self._limit, 

480 ) 

481 if not payload_parser.done: 

482 self._payload_parser = payload_parser 

483 else: 

484 payload = EMPTY_PAYLOAD 

485 

486 messages.append((msg, payload)) 

487 should_close = msg.should_close 

488 else: 

489 self._tail = data[start_pos:] 

490 if len(self._tail) > self.max_line_size: 

491 raise LineTooLong(self._tail[:100] + b"...", self.max_line_size) 

492 data = EMPTY 

493 break 

494 

495 # no parser, just store 

496 elif self._payload_parser is None and self._upgraded: 

497 assert not self._lines 

498 break 

499 

500 # feed payload 

501 else: 

502 assert not self._lines 

503 assert self._payload_parser is not None 

504 try: 

505 payload_state, data = self._payload_parser.feed_data( 

506 data[start_pos:], SEP 

507 ) 

508 except Exception as underlying_exc: 

509 reraised_exc: BaseException = underlying_exc 

510 if self.payload_exception is not None: 

511 reraised_exc = self.payload_exception(str(underlying_exc)) 

512 

513 set_exception( 

514 self._payload_parser.payload, 

515 reraised_exc, 

516 underlying_exc, 

517 ) 

518 

519 payload_state = PayloadState.PAYLOAD_COMPLETE 

520 data = b"" 

521 if isinstance( 

522 underlying_exc, (InvalidHeader, TransferEncodingError) 

523 ): 

524 raise 

525 

526 self._payload_has_more_data = ( 

527 payload_state == PayloadState.PAYLOAD_HAS_PENDING_INPUT 

528 ) 

529 

530 if payload_state is not PayloadState.PAYLOAD_COMPLETE: 

531 # We've either consumed all available data, or we're pausing 

532 # until the reader buffer is freed up. 

533 break 

534 

535 start_pos = 0 

536 data_len = len(data) 

537 self._payload_parser = None 

538 

539 if data and start_pos < data_len: 

540 data = data[start_pos:] 

541 else: 

542 data = EMPTY 

543 

544 return messages, self._upgraded, data 

545 

546 def parse_headers( 

547 self, lines: list[bytes] 

548 ) -> tuple[HeadersDictProxy, RawHeaders, bool | None, str | None, bool, bool]: 

549 """Parses RFC 5322 headers from a stream. 

550 

551 Line continuations are supported. Returns list of header name 

552 and value pairs. Header name is in upper case. 

553 """ 

554 headers, raw_headers = self._headers_parser.parse_headers(lines) 

555 close_conn = None 

556 encoding = None 

557 upgrade = False 

558 chunked = False 

559 

560 # keep-alive and protocol switching 

561 # RFC 9110 section 7.6.1 defines Connection as a comma-separated list. 

562 # We use a simple comma split here rather than getall() for performance, 

563 # as the target tokens (close, keep-alive, upgrade) are simple ASCII 

564 # values that never contain commas. 

565 conn_values = headers.get(hdrs.CONNECTION) 

566 if conn_values: 

567 conn_tokens = { 

568 token.lower() 

569 for token in (part.strip(" \t") for part in conn_values.split(",")) 

570 if token and token.isascii() 

571 } 

572 

573 if "close" in conn_tokens: 

574 close_conn = True 

575 elif "keep-alive" in conn_tokens: 

576 close_conn = False 

577 

578 # https://www.rfc-editor.org/rfc/rfc9110.html#name-101-switching-protocols 

579 if "upgrade" in conn_tokens and headers.get(hdrs.UPGRADE): 

580 upgrade = True 

581 

582 # encoding 

583 enc = headers.get(hdrs.CONTENT_ENCODING, "") 

584 if enc.isascii() and enc.lower() in {"gzip", "deflate", "br", "zstd"}: 

585 encoding = enc 

586 

587 # chunking 

588 te = headers.get(hdrs.TRANSFER_ENCODING) 

589 if te is not None: 

590 if self._is_chunked_te(te): 

591 chunked = True 

592 

593 if hdrs.CONTENT_LENGTH in headers: 

594 raise BadHttpMessage( 

595 "Transfer-Encoding can't be present with Content-Length", 

596 ) 

597 

598 return (headers, raw_headers, close_conn, encoding, upgrade, chunked) 

599 

600 def set_upgraded(self, val: bool) -> None: 

601 """Set connection upgraded (to websocket) mode. 

602 

603 :param bool val: new state. 

604 """ 

605 self._upgraded = val 

606 

607 

608class HttpRequestParser(HttpParser[RawRequestMessage]): 

609 """Read request status line. 

610 

611 Exception .http_exceptions.BadStatusLine 

612 could be raised in case of any errors in status line. 

613 Returns RawRequestMessage. 

614 """ 

615 

616 def parse_message(self, lines: list[bytes]) -> RawRequestMessage: 

617 # request line 

618 line = lines[0].decode("utf-8", "surrogateescape") 

619 try: 

620 method, path, version = line.split(" ", maxsplit=2) 

621 except ValueError: 

622 raise BadHttpMethod(line) from None 

623 

624 # method 

625 if not TOKENRE.fullmatch(method): 

626 raise BadHttpMethod(method) 

627 

628 # version 

629 match = VERSRE.fullmatch(version) 

630 if match is None: 

631 raise BadStatusLine(line) 

632 version_o = HttpVersion(int(match.group(1)), int(match.group(2))) 

633 

634 if method == "CONNECT": 

635 # authority-form, 

636 # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.3 

637 url = URL.build(authority=path, encoded=True) 

638 elif path.startswith("/"): 

639 # origin-form, 

640 # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1 

641 path_part, _hash_separator, url_fragment = path.partition("#") 

642 path_part, _question_mark_separator, qs_part = path_part.partition("?") 

643 

644 # NOTE: `yarl.URL.build()` is used to mimic what the Cython-based 

645 # NOTE: parser does, otherwise it results into the same 

646 # NOTE: HTTP Request-Line input producing different 

647 # NOTE: `yarl.URL()` objects 

648 url = URL.build( 

649 path=path_part, 

650 query_string=qs_part, 

651 fragment=url_fragment, 

652 encoded=True, 

653 ) 

654 elif path == "*" and method == "OPTIONS": 

655 # asterisk-form, 

656 url = URL(path, encoded=True) 

657 else: 

658 # absolute-form for proxy maybe, 

659 # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.2 

660 url = URL(path, encoded=True) 

661 if url.scheme == "": 

662 # not absolute-form 

663 raise InvalidURLError( 

664 path.encode(errors="surrogateescape").decode("latin1") 

665 ) 

666 

667 # read headers 

668 ( 

669 headers, 

670 raw_headers, 

671 close, 

672 compression, 

673 upgrade, 

674 chunked, 

675 ) = self.parse_headers(lines[1:]) 

676 

677 if version_o == HttpVersion11 and hdrs.HOST not in headers: 

678 raise BadHttpMessage("Missing 'Host' header in request.") 

679 

680 if close is None: # then the headers weren't set in the request 

681 if version_o <= HttpVersion10: # HTTP 1.0 must asks to not close 

682 close = True 

683 else: # HTTP 1.1 must ask to close. 

684 close = False 

685 

686 return RawRequestMessage( 

687 method, 

688 path, 

689 version_o, 

690 headers, 

691 raw_headers, 

692 close, 

693 compression, 

694 upgrade, 

695 chunked, 

696 url, 

697 ) 

698 

699 def _is_chunked_te(self, te: str) -> bool: 

700 # https://www.rfc-editor.org/rfc/rfc9112#section-7.1-3 

701 # "A sender MUST NOT apply the chunked transfer coding more 

702 # than once to a message body" 

703 parts = [p.strip(" \t") for p in te.split(",")] 

704 chunked_count = sum(1 for p in parts if p.isascii() and p.lower() == "chunked") 

705 if chunked_count > 1: 

706 raise BadHttpMessage("Request has duplicate `chunked` Transfer-Encoding") 

707 last = parts[-1] 

708 # .lower() transforms some non-ascii chars, so must check first. 

709 if last.isascii() and last.lower() == "chunked": 

710 return True 

711 # https://www.rfc-editor.org/rfc/rfc9112#section-6.3-2.4.3 

712 raise BadHttpMessage("Request has invalid `Transfer-Encoding`") 

713 

714 

715class HttpResponseParser(HttpParser[RawResponseMessage]): 

716 """Read response status line and headers. 

717 

718 BadStatusLine could be raised in case of any errors in status line. 

719 Returns RawResponseMessage. 

720 """ 

721 

722 protocol: "ResponseHandler" 

723 

724 # Lax mode should only be enabled on response parser. 

725 lax = not DEBUG 

726 

727 def feed_data( 

728 self, 

729 data: bytes, 

730 SEP: _SEP | None = None, 

731 *args: Any, 

732 **kwargs: Any, 

733 ) -> tuple[list[tuple[RawResponseMessage, StreamReader]], bool, bytes]: 

734 if SEP is None: 

735 SEP = b"\r\n" if DEBUG else b"\n" 

736 return super().feed_data(data, SEP, *args, **kwargs) 

737 

738 def parse_message(self, lines: list[bytes]) -> RawResponseMessage: 

739 line = lines[0].decode("utf-8", "surrogateescape") 

740 try: 

741 version, status = line.split(maxsplit=1) 

742 except ValueError: 

743 raise BadStatusLine(line) from None 

744 

745 try: 

746 status, reason = status.split(maxsplit=1) 

747 except ValueError: 

748 status = status.strip() 

749 reason = "" 

750 

751 # version 

752 match = VERSRE.fullmatch(version) 

753 if match is None: 

754 raise BadStatusLine(line) 

755 version_o = HttpVersion(int(match.group(1)), int(match.group(2))) 

756 

757 # The status code is a three-digit ASCII number, no padding 

758 if len(status) != 3 or not DIGITS.fullmatch(status): 

759 raise BadStatusLine(line) 

760 status_i = int(status) 

761 

762 # read headers 

763 ( 

764 headers, 

765 raw_headers, 

766 close, 

767 compression, 

768 upgrade, 

769 chunked, 

770 ) = self.parse_headers(lines[1:]) 

771 

772 if close is None: 

773 if version_o <= HttpVersion10: 

774 close = True 

775 # https://www.rfc-editor.org/rfc/rfc9112.html#name-message-body-length 

776 elif 100 <= status_i < 200 or status_i in {204, 304}: 

777 close = False 

778 elif hdrs.CONTENT_LENGTH in headers or hdrs.TRANSFER_ENCODING in headers: 

779 close = False 

780 else: 

781 # https://www.rfc-editor.org/rfc/rfc9112.html#section-6.3-2.8 

782 close = True 

783 

784 return RawResponseMessage( 

785 version_o, 

786 status_i, 

787 reason.strip(), 

788 headers, 

789 raw_headers, 

790 close, 

791 compression, 

792 upgrade, 

793 chunked, 

794 ) 

795 

796 def _is_chunked_te(self, te: str) -> bool: 

797 # https://www.rfc-editor.org/rfc/rfc9112#section-6.3-2.4.2 

798 return te.rsplit(",", maxsplit=1)[-1].strip(" \t").lower() == "chunked" 

799 

800 

801class HttpPayloadParser: 

802 def __init__( 

803 self, 

804 payload: StreamReader, 

805 length: int | None = None, 

806 chunked: bool = False, 

807 compression: str | None = None, 

808 code: int | None = None, 

809 method: str | None = None, 

810 response_with_body: bool = True, 

811 auto_decompress: bool = True, 

812 lax: bool = False, 

813 *, 

814 headers_parser: HeadersParser, 

815 max_line_size: int = 8190, 

816 max_field_size: int = 8190, 

817 max_trailers: int = 128, 

818 limit: int = DEFAULT_CHUNK_SIZE, 

819 ) -> None: 

820 self._length = 0 

821 self._paused = False 

822 self._type = ParseState.PARSE_UNTIL_EOF 

823 self._chunk = ChunkState.PARSE_CHUNKED_SIZE 

824 self._chunk_size = 0 

825 self._chunk_tail = b"" 

826 self._auto_decompress = auto_decompress 

827 self._lax = lax 

828 self._headers_parser = headers_parser 

829 self._max_line_size = max_line_size 

830 self._max_field_size = max_field_size 

831 self._max_trailers = max_trailers 

832 self._more_data_available = False 

833 self._trailer_lines: list[bytes] = [] 

834 self.done = False 

835 self._eof_pending = False 

836 

837 # payload decompression wrapper 

838 if response_with_body and compression and self._auto_decompress: 

839 real_payload: StreamReader | DeflateBuffer = DeflateBuffer( 

840 payload, compression, max_decompress_size=limit 

841 ) 

842 else: 

843 real_payload = payload 

844 

845 # payload parser 

846 if not response_with_body: 

847 # don't parse payload if it's not expected to be received 

848 self._type = ParseState.PARSE_NONE 

849 real_payload.feed_eof() 

850 self.done = True 

851 elif chunked: 

852 self._type = ParseState.PARSE_CHUNKED 

853 elif length is not None: 

854 self._type = ParseState.PARSE_LENGTH 

855 self._length = length 

856 if self._length == 0: 

857 real_payload.feed_eof() 

858 self.done = True 

859 

860 self.payload = real_payload 

861 

862 def pause_reading(self) -> None: 

863 self._paused = True 

864 

865 def feed_eof(self) -> None: 

866 if self._type == ParseState.PARSE_UNTIL_EOF: 

867 self._eof_pending = True 

868 while self._more_data_available: 

869 if self._paused: 

870 self._paused = False 

871 return # Will resume via feed_data(b"") later 

872 self._more_data_available = self.payload.feed_data(b"") 

873 self.payload.feed_eof() 

874 self.done = True 

875 self._eof_pending = False 

876 elif self._type == ParseState.PARSE_LENGTH: 

877 raise ContentLengthError( 

878 "Not enough data to satisfy content length header." 

879 ) 

880 elif self._type == ParseState.PARSE_CHUNKED: 

881 raise TransferEncodingError( 

882 "Not enough data to satisfy transfer length header." 

883 ) 

884 

885 def feed_data( 

886 self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";" 

887 ) -> tuple[PayloadState, bytes]: 

888 """Receive a chunk of data to process. 

889 

890 Return: 

891 PayloadState - The current state of payload processing. 

892 This function may be called with empty bytes after returning 

893 PAYLOAD_HAS_PENDING_INPUT to continue processing after a pause. 

894 bytes - If payload is complete, this is the unconsumed bytes intended for the 

895 next message/payload, b"" otherwise. 

896 """ 

897 # Read specified amount of bytes 

898 if self._type == ParseState.PARSE_LENGTH: 

899 if self._chunk_tail: 

900 chunk = self._chunk_tail + chunk 

901 self._chunk_tail = b"" 

902 

903 required = self._length 

904 self._length = max(required - len(chunk), 0) 

905 self._more_data_available = self.payload.feed_data(chunk[:required]) 

906 while self._more_data_available: 

907 if self._paused: 

908 self._paused = False 

909 self._chunk_tail = chunk[required:] 

910 return PayloadState.PAYLOAD_HAS_PENDING_INPUT, b"" 

911 self._more_data_available = self.payload.feed_data(b"") 

912 

913 if self._length == 0: 

914 self.payload.feed_eof() 

915 return PayloadState.PAYLOAD_COMPLETE, chunk[required:] 

916 # Chunked transfer encoding parser 

917 elif self._type == ParseState.PARSE_CHUNKED: 

918 if self._chunk_tail: 

919 # We should check the length is sane when not processing payload body. 

920 if self._chunk != ChunkState.PARSE_CHUNKED_CHUNK: 

921 max_line_length = self._max_line_size 

922 if self._chunk == ChunkState.PARSE_TRAILERS: 

923 max_line_length = self._max_field_size 

924 if len(self._chunk_tail) > max_line_length: 

925 raise LineTooLong( 

926 self._chunk_tail[:100] + b"...", max_line_length 

927 ) 

928 

929 chunk = self._chunk_tail + chunk 

930 self._chunk_tail = b"" 

931 

932 while chunk or self._more_data_available: 

933 # read next chunk size 

934 if self._chunk == ChunkState.PARSE_CHUNKED_SIZE: 

935 pos = chunk.find(SEP) 

936 if pos >= 0: 

937 i = chunk.find(CHUNK_EXT, 0, pos) 

938 if i >= 0: 

939 size_b = chunk[:i] # strip chunk-extensions 

940 # Verify no LF in the chunk-extension 

941 if b"\n" in (ext := chunk[i:pos]): 

942 exc = TransferEncodingError( 

943 f"Unexpected LF in chunk-extension: {ext!r}" 

944 ) 

945 set_exception(self.payload, exc) 

946 raise exc 

947 else: 

948 size_b = chunk[:pos] 

949 

950 if self._lax: # Allow whitespace in lax mode. 

951 size_b = size_b.strip() 

952 

953 if not re.fullmatch(HEXDIGITS, size_b): 

954 exc = TransferEncodingError( 

955 chunk[:pos].decode("ascii", "surrogateescape") 

956 ) 

957 set_exception(self.payload, exc) 

958 raise exc 

959 size = int(bytes(size_b), 16) 

960 

961 chunk = chunk[pos + len(SEP) :] 

962 if size == 0: # eof marker 

963 self._chunk = ChunkState.PARSE_TRAILERS 

964 if self._lax and chunk.startswith(b"\r"): 

965 chunk = chunk[1:] 

966 else: 

967 self._chunk = ChunkState.PARSE_CHUNKED_CHUNK 

968 self._chunk_size = size 

969 self.payload.begin_http_chunk_receiving() 

970 else: 

971 self._chunk_tail = chunk 

972 return PayloadState.PAYLOAD_NEEDS_INPUT, b"" 

973 

974 # read chunk and feed buffer 

975 if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK: 

976 if self._paused: 

977 self._paused = False 

978 self._chunk_tail = chunk 

979 return PayloadState.PAYLOAD_HAS_PENDING_INPUT, b"" 

980 

981 required = self._chunk_size 

982 self._chunk_size = max(required - len(chunk), 0) 

983 self._more_data_available = self.payload.feed_data(chunk[:required]) 

984 chunk = chunk[required:] 

985 

986 if self._more_data_available: 

987 continue 

988 

989 if self._chunk_size: 

990 self._paused = False 

991 return PayloadState.PAYLOAD_NEEDS_INPUT, b"" 

992 self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF 

993 self.payload.end_http_chunk_receiving() 

994 

995 # toss the CRLF at the end of the chunk 

996 if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF: 

997 if self._lax and chunk.startswith(b"\r"): 

998 chunk = chunk[1:] 

999 if chunk[: len(SEP)] == SEP: 

1000 chunk = chunk[len(SEP) :] 

1001 self._chunk = ChunkState.PARSE_CHUNKED_SIZE 

1002 elif len(chunk) >= len(SEP) or chunk != SEP[: len(chunk)]: 

1003 exc = TransferEncodingError( 

1004 "Chunk size mismatch: expected CRLF after chunk data" 

1005 ) 

1006 set_exception(self.payload, exc) 

1007 raise exc 

1008 else: 

1009 self._chunk_tail = chunk 

1010 return PayloadState.PAYLOAD_NEEDS_INPUT, b"" 

1011 

1012 if self._chunk == ChunkState.PARSE_TRAILERS: 

1013 pos = chunk.find(SEP) 

1014 if pos < 0: # No line found 

1015 self._chunk_tail = chunk 

1016 return PayloadState.PAYLOAD_NEEDS_INPUT, b"" 

1017 

1018 line = chunk[:pos] 

1019 chunk = chunk[pos + len(SEP) :] 

1020 if SEP == b"\n": # For lax response parsing 

1021 line = line.rstrip(b"\r") 

1022 

1023 if len(line) > self._max_field_size: 

1024 raise LineTooLong(line[:100] + b"...", self._max_field_size) 

1025 

1026 self._trailer_lines.append(line) 

1027 

1028 if len(self._trailer_lines) > self._max_trailers: 

1029 raise BadHttpMessage("Too many trailers received") 

1030 

1031 # \r\n\r\n found, end of stream 

1032 if self._trailer_lines[-1] == b"": 

1033 # Headers and trailers are defined the same way, 

1034 # so we reuse the HeadersParser here. 

1035 try: 

1036 trailers, raw_trailers = self._headers_parser.parse_headers( 

1037 self._trailer_lines 

1038 ) 

1039 finally: 

1040 self._trailer_lines.clear() 

1041 self.payload.feed_eof() 

1042 return PayloadState.PAYLOAD_COMPLETE, chunk 

1043 

1044 # Read all bytes until eof 

1045 elif self._type == ParseState.PARSE_UNTIL_EOF: 

1046 self._more_data_available = self.payload.feed_data(chunk) 

1047 while self._more_data_available: 

1048 if self._paused: 

1049 self._paused = False 

1050 return PayloadState.PAYLOAD_HAS_PENDING_INPUT, b"" 

1051 self._more_data_available = self.payload.feed_data(b"") 

1052 

1053 if self._eof_pending: 

1054 self.payload.feed_eof() 

1055 self.done = True 

1056 self._eof_pending = False 

1057 return PayloadState.PAYLOAD_COMPLETE, b"" 

1058 

1059 return PayloadState.PAYLOAD_NEEDS_INPUT, b"" 

1060 

1061 

1062class DeflateBuffer: 

1063 """DeflateStream decompress stream and feed data into specified stream.""" 

1064 

1065 def __init__( 

1066 self, 

1067 out: StreamReader, 

1068 encoding: str | None, 

1069 max_decompress_size: int = DEFAULT_CHUNK_SIZE, 

1070 ) -> None: 

1071 self.out = out 

1072 self.size = 0 

1073 out.total_compressed_bytes = self.size 

1074 self.encoding = encoding 

1075 self._started_decoding = False 

1076 

1077 self.decompressor: BrotliDecompressor | ZLibDecompressor | ZSTDDecompressor 

1078 if encoding == "br": 

1079 if not HAS_BROTLI: 

1080 raise ContentEncodingError( 

1081 "Can not decode content-encoding: brotli (br). " 

1082 "Please install `Brotli`" 

1083 ) 

1084 self.decompressor = BrotliDecompressor() 

1085 elif encoding == "zstd": 

1086 if not HAS_ZSTD: 

1087 raise ContentEncodingError( 

1088 "Can not decode content-encoding: zstandard (zstd). " 

1089 "Please install `backports.zstd`" 

1090 ) 

1091 self.decompressor = ZSTDDecompressor() 

1092 else: 

1093 self.decompressor = ZLibDecompressor(encoding=encoding) 

1094 

1095 self._max_decompress_size = max_decompress_size 

1096 

1097 def set_exception( 

1098 self, 

1099 exc: type[BaseException] | BaseException, 

1100 exc_cause: BaseException = _EXC_SENTINEL, 

1101 ) -> None: 

1102 set_exception(self.out, exc, exc_cause) 

1103 

1104 def feed_data(self, chunk: bytes) -> bool: 

1105 """Return True if more data is available and this method should be called again with b"".""" 

1106 self.size += len(chunk) 

1107 self.out.total_compressed_bytes = self.size 

1108 

1109 # RFC1950 

1110 # bits 0..3 = CM = 0b1000 = 8 = "deflate" 

1111 # bits 4..7 = CINFO = 1..7 = windows size. 

1112 if ( 

1113 not self._started_decoding 

1114 and self.encoding == "deflate" 

1115 and chunk[0] & 0xF != 8 

1116 ): 

1117 # Change the decoder to decompress incorrectly compressed data 

1118 # Actually we should issue a warning about non-RFC-compliant data. 

1119 self.decompressor = ZLibDecompressor( 

1120 encoding=self.encoding, suppress_deflate_header=True 

1121 ) 

1122 

1123 low_water = self.out._low_water 

1124 max_length = ( 

1125 0 if low_water >= sys.maxsize else max(self._max_decompress_size, low_water) 

1126 ) 

1127 try: 

1128 chunk = self.decompressor.decompress_sync(chunk, max_length=max_length) 

1129 except Exception: 

1130 raise ContentEncodingError( 

1131 "Can not decode content-encoding: %s" % self.encoding 

1132 ) 

1133 

1134 self._started_decoding = True 

1135 

1136 if chunk: 

1137 self.out.feed_data(chunk) 

1138 return self.decompressor.data_available 

1139 

1140 def feed_eof(self) -> None: 

1141 chunk = self.decompressor.flush() 

1142 # This should never contain data as we defer the call until exhausting 

1143 # the decompression. If .flush() is returning data, this may indicate a 

1144 # zip bomb vulnerability as it will decompress all remaining data at once. 

1145 assert not chunk 

1146 

1147 if self.size > 0: 

1148 # decompressor is not brotli unless encoding is "br" 

1149 if self.encoding == "deflate" and not self.decompressor.eof: # type: ignore[union-attr] 

1150 raise ContentEncodingError("deflate") 

1151 

1152 self.out.feed_eof() 

1153 

1154 def begin_http_chunk_receiving(self) -> None: 

1155 self.out.begin_http_chunk_receiving() 

1156 

1157 def end_http_chunk_receiving(self) -> None: 

1158 self.out.end_http_chunk_receiving() 

1159 

1160 

1161HttpRequestParserPy = HttpRequestParser 

1162HttpResponseParserPy = HttpResponseParser 

1163RawRequestMessagePy = RawRequestMessage 

1164RawResponseMessagePy = RawResponseMessage 

1165 

1166with suppress(ImportError): 

1167 if not NO_EXTENSIONS: 

1168 from ._http_parser import ( # type: ignore[import-not-found,no-redef] 

1169 HttpRequestParser, 

1170 HttpResponseParser, 

1171 RawRequestMessage, 

1172 RawResponseMessage, 

1173 ) 

1174 

1175 HttpRequestParserC = HttpRequestParser 

1176 HttpResponseParserC = HttpResponseParser 

1177 RawRequestMessageC = RawRequestMessage 

1178 RawResponseMessageC = RawResponseMessage