Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/websockets/http11.py: 75%

1from __future__ import annotations

3import dataclasses

4import os

5import re

6import sys

7import warnings

8from collections.abc import Generator

9from typing import Callable

11from .datastructures import Headers

12from .exceptions import SecurityError

13from .version import version as websockets_version

__all__ = ["SERVER", "USER_AGENT", "Request", "Response"]


# Interpreter version as "major.minor", embedded in the default identifiers.
PYTHON_VERSION = f"{sys.version_info[0]}.{sys.version_info[1]}"

# Value of the User-Agent header sent with HTTP requests. It can be
# overridden with the WEBSOCKETS_USER_AGENT environment variable.
USER_AGENT = os.getenv(
    "WEBSOCKETS_USER_AGENT",
    f"Python/{PYTHON_VERSION} websockets/{websockets_version}",
)

# Value of the Server header sent with HTTP responses. It can be
# overridden with the WEBSOCKETS_SERVER environment variable.
SERVER = os.getenv(
    "WEBSOCKETS_SERVER",
    f"Python/{PYTHON_VERSION} websockets/{websockets_version}",
)

# Upper bound on the number of header lines. Combined with the line length
# limit below, total header size tops out around 128 * 8 KiB = 1 MiB.
MAX_NUM_HEADERS = int(os.getenv("WEBSOCKETS_MAX_NUM_HEADERS", "128"))

# Upper bound on the request line and on each header line. 8 KiB is the
# most common default configuration of popular HTTP servers.
MAX_LINE_LENGTH = int(os.getenv("WEBSOCKETS_MAX_LINE_LENGTH", "8192"))

# Upper bound on HTTP response bodies. Bodies are only read to surface an
# error message returned by a server, not to perform large file transfers.
MAX_BODY_SIZE = int(os.getenv("WEBSOCKETS_MAX_BODY_SIZE", "1_048_576"))  # 1 MiB

def d(value: bytes | bytearray) -> str:
    """
    Render raw bytes as text that is safe to embed in an error message.

    The bytes are decoded as UTF-8. Anything that doesn't decode is shown as
    a backslash escape instead of raising an exception.

    """
    return str(value, encoding="utf-8", errors="backslashreplace")

# Grammar reference: https://datatracker.ietf.org/doc/html/rfc7230#appendix-B.

# Header names: a non-empty run of "token" characters.
_token_re = re.compile(rb"[-!#$%&\'*+.^_`|~0-9a-zA-Z]+")

# Header values: any run, possibly empty, of HTAB (\x09), SP (\x20),
# VCHAR (\x21-\x7e), and obs-text (\x80-\xff).
#
# Obsolete line folding is deliberately unsupported.
#
# The ABNF is more complicated because it attempts to express that optional
# whitespace is ignored. Callers strip whitespace instead and the stripped
# value isn't revalidated.
#
# See also https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
_value_re = re.compile(rb"[\x09\x20-\x7e\x80-\xff]*")

@dataclasses.dataclass
class Request:
    """
    WebSocket handshake request.

    Attributes:
        path: Request path, including optional query.
        headers: Request headers.

    """

    path: str
    headers: Headers
    # body isn't useful in the context of this library.

    _exception: Exception | None = None

    @property
    def exception(self) -> Exception | None:  # pragma: no cover
        """
        Deprecated accessor for the exception stored on this request.

        Emits a :exc:`DeprecationWarning` on every access.

        """
        warnings.warn(  # deprecated in 10.3 - 2022-04-17
            "Request.exception is deprecated; use ServerProtocol.handshake_exc instead",
            DeprecationWarning,
            # Attribute the warning to the code reading the property rather
            # than to this module, so that default warning filters, which
            # only show DeprecationWarning triggered from __main__, can
            # surface it to the user.
            stacklevel=2,
        )
        return self._exception

    @classmethod
    def parse(
        cls,
        read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
    ) -> Generator[None, None, Request]:
        """
        Parse a WebSocket handshake request.

        This is a generator-based coroutine.

        The request path isn't URL-decoded or validated in any way, except
        that it must contain only ASCII characters. Header names must be
        ASCII too; header values are decoded as ISO-8859-1.

        :meth:`parse` doesn't attempt to read the request body because
        WebSocket handshake requests don't have one. If the request contains a
        body, it may be read from the data stream after :meth:`parse` returns.

        Args:
            read_line: Generator-based coroutine that reads a LF-terminated
                line or raises an exception if there isn't enough data.

        Raises:
            EOFError: If the connection is closed without a full HTTP request.
            SecurityError: If the request exceeds a security limit.
            ValueError: If the request isn't well formatted. This includes a
                non-ASCII path, reported as :exc:`UnicodeDecodeError`, and a
                ``Content-Length`` other than zero.
            NotImplementedError: If the request has a ``Transfer-Encoding``
                header.

        """
        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.1

        # Parsing is simple because fixed values are expected for method and
        # version and because path isn't checked. Since WebSocket software tends
        # to implement HTTP/1.1 strictly, there's little need for lenient parsing.

        try:
            request_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP request line") from exc

        try:
            method, raw_path, protocol = request_line.split(b" ", 2)
        except ValueError:  # not enough values to unpack (expected 3, got 1-2)
            raise ValueError(f"invalid HTTP request line: {d(request_line)}") from None
        if protocol != b"HTTP/1.1":
            raise ValueError(
                f"unsupported protocol; expected HTTP/1.1: {d(request_line)}"
            )
        if method != b"GET":
            raise ValueError(f"unsupported HTTP method; expected GET; got {d(method)}")

        # RFC 9110 defers the definition of URIs to RFC 3986, which allows only
        # a subset of ASCII. Non-ASCII IRIs must be UTF-8 then percent-encoded.
        path = raw_path.decode("ascii")

        headers = yield from parse_headers(read_line)

        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.3.3

        if "Transfer-Encoding" in headers:
            raise NotImplementedError("transfer codings aren't supported")

        if "Content-Length" in headers:
            # Some devices send a Content-Length header with a value of 0.
            # This raises ValueError if Content-Length isn't an integer too.
            if int(headers["Content-Length"]) != 0:
                raise ValueError("unsupported request body")

        return cls(path, headers)

    def serialize(self) -> bytes:
        """
        Serialize a WebSocket handshake request.

        Returns:
            The request line followed by the serialized headers.

        """
        # Since the request line and headers only contain ASCII characters,
        # we can keep this simple.
        request = f"GET {self.path} HTTP/1.1\r\n".encode()
        request += self.headers.serialize()
        return request

182

183

@dataclasses.dataclass
class Response:
    """
    WebSocket handshake response.

    Attributes:
        status_code: Response code.
        reason_phrase: Response reason.
        headers: Response headers.
        body: Response body.

    """

    status_code: int
    reason_phrase: str
    headers: Headers
    body: bytes | bytearray = b""

    _exception: Exception | None = None

    @property
    def exception(self) -> Exception | None:  # pragma: no cover
        """
        Deprecated accessor for the exception stored on this response.

        Emits a :exc:`DeprecationWarning` on every access.

        """
        warnings.warn(  # deprecated in 10.3 - 2022-04-17
            "Response.exception is deprecated; "
            "use ClientProtocol.handshake_exc instead",
            DeprecationWarning,
            # Attribute the warning to the code reading the property rather
            # than to this module, so that default warning filters, which
            # only show DeprecationWarning triggered from __main__, can
            # surface it to the user.
            stacklevel=2,
        )
        return self._exception

    @classmethod
    def parse(
        cls,
        read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
        read_exact: Callable[[int], Generator[None, None, bytes | bytearray]],
        read_to_eof: Callable[[int], Generator[None, None, bytes | bytearray]],
        proxy: bool = False,
    ) -> Generator[None, None, Response]:
        """
        Parse a WebSocket handshake response.

        This is a generator-based coroutine.

        The reason phrase and headers are expected to contain only ASCII
        characters. Other characters in the reason phrase and in header
        values are decoded as ISO-8859-1.

        Args:
            read_line: Generator-based coroutine that reads a LF-terminated
                line or raises an exception if there isn't enough data.
            read_exact: Generator-based coroutine that reads the requested
                bytes or raises an exception if there isn't enough data.
            read_to_eof: Generator-based coroutine that reads until the end
                of the stream.
            proxy: When true, HTTP/1.0 is accepted in addition to HTTP/1.1
                and the body isn't read; it is left empty.

        Raises:
            EOFError: If the connection is closed without a full HTTP response.
            SecurityError: If the response exceeds a security limit.
            LookupError: If the response isn't well formatted.
            ValueError: If the response isn't well formatted.
            NotImplementedError: If the body uses a transfer coding other
                than ``chunked``.

        """
        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.2

        try:
            status_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP status line") from exc

        try:
            protocol, raw_status_code, raw_reason = status_line.split(b" ", 2)
        except ValueError:  # not enough values to unpack (expected 3, got 1-2)
            raise ValueError(f"invalid HTTP status line: {d(status_line)}") from None
        if proxy:  # some proxies still use HTTP/1.0
            if protocol not in [b"HTTP/1.1", b"HTTP/1.0"]:
                raise ValueError(
                    f"unsupported protocol; expected HTTP/1.1 or HTTP/1.0: "
                    f"{d(status_line)}"
                )
        else:
            if protocol != b"HTTP/1.1":
                raise ValueError(
                    f"unsupported protocol; expected HTTP/1.1: {d(status_line)}"
                )
        try:
            status_code = int(raw_status_code)
        except ValueError:  # invalid literal for int() with base 10
            raise ValueError(
                f"invalid status code; expected integer; got {d(raw_status_code)}"
            ) from None
        if not 100 <= status_code < 600:
            raise ValueError(
                f"invalid status code; expected 100–599; got {d(raw_status_code)}"
            )
        if not _value_re.fullmatch(raw_reason):
            raise ValueError(f"invalid HTTP reason phrase: {d(raw_reason)}")

        # RFC 2616 implies ISO-8859-1. It's easy to reverse and cannot crash.
        # Non-ASCII never worked reliably and the reason isn't useful anyway.
        reason = raw_reason.decode("iso-8859-1")

        headers = yield from parse_headers(read_line)

        body: bytes | bytearray
        if proxy:
            body = b""
        else:
            body = yield from read_body(
                status_code, headers, read_line, read_exact, read_to_eof
            )

        return cls(status_code, reason, headers, body)

    def serialize(self) -> bytes:
        """
        Serialize a WebSocket handshake response.

        Returns:
            The status line, followed by the serialized headers and the body.

        """
        # Since the status line and headers only contain ASCII characters,
        # we can keep this simple.
        response = f"HTTP/1.1 {self.status_code} {self.reason_phrase}\r\n".encode()
        response += self.headers.serialize()
        response += self.body
        return response

306

307

def parse_line(
    read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
) -> Generator[None, None, bytes | bytearray]:
    """
    Read one CRLF-terminated line and return it without the terminator.

    This is a generator-based coroutine.

    Args:
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data. It is called
            with ``MAX_LINE_LENGTH`` as its argument.

    Raises:
        EOFError: If the line doesn't end with CRLF.
        SecurityError: If ``read_line`` raises :exc:`RuntimeError`, which is
            reported as the line being too long.

    """
    try:
        raw = yield from read_line(MAX_LINE_LENGTH)
    except RuntimeError:
        raise SecurityError("line too long")
    # Not mandatory but safe - https://datatracker.ietf.org/doc/html/rfc7230#section-3.5
    if raw[-2:] != b"\r\n":
        raise EOFError("line without CRLF")
    return raw[:-2]

333

334

def parse_headers(
    read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
) -> Generator[None, None, Headers]:
    """
    Read header lines up to the blank line that ends them.

    This is a generator-based coroutine.

    Header names must be ASCII tokens. Header values are stripped of
    surrounding spaces and tabs, then decoded as ISO-8859-1.

    Args:
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data.

    Raises:
        EOFError: If the connection is closed without complete headers.
        SecurityError: If a line is too long or if there are more than
            ``MAX_NUM_HEADERS`` header lines.
        ValueError: If a header line, name, or value isn't well formatted.

    """
    # https://datatracker.ietf.org/doc/html/rfc7230#section-3.2

    # Obsolete line folding is deliberately unsupported.

    parsed = Headers()
    # One extra read is budgeted for the blank line that ends the headers.
    lines_left = MAX_NUM_HEADERS + 1
    while lines_left > 0:
        lines_left -= 1

        try:
            raw_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP headers") from exc
        if not raw_line:
            break

        name_bytes, colon, value_bytes = raw_line.partition(b":")
        if not colon:  # no separator between name and value
            raise ValueError(f"invalid HTTP header line: {d(raw_line)}")
        if _token_re.fullmatch(name_bytes) is None:
            raise ValueError(f"invalid HTTP header name: {d(name_bytes)}")
        value_bytes = value_bytes.strip(b" \t")
        if _value_re.fullmatch(value_bytes) is None:
            raise ValueError(f"invalid HTTP header value: {d(value_bytes)}")

        # The name matched the token regex, so it is guaranteed to be ASCII.
        # Values should be ASCII as well. Section 5.5 of RFC 9110 says:
        # "Historically, HTTP allowed field content with text in the
        # ISO-8859-1 charset." That charset is easy to reverse and cannot
        # crash, making it a decent choice.
        # The value was validated just above, so it isn't revalidated on
        # insertion.
        parsed.set_insecure(
            name_bytes.decode("ascii"),
            value_bytes.decode("iso-8859-1"),
        )

    else:
        # The loop ran out of budget without seeing the blank line.
        raise SecurityError("too many HTTP headers")

    return parsed

389

390

def read_body(
    status_code: int,
    headers: Headers,
    read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
    read_exact: Callable[[int], Generator[None, None, bytes | bytearray]],
    read_to_eof: Callable[[int], Generator[None, None, bytes | bytearray]],
) -> Generator[None, None, bytes | bytearray]:
    """
    Read the body of an HTTP response.

    This is a generator-based coroutine.

    The framing is selected in this order: no body for 1xx, 204, and 304
    responses; chunked transfer coding if a ``Transfer-Encoding`` header is
    present; a fixed size if a ``Content-Length`` header is present;
    otherwise, everything until the end of the stream.

    Args:
        status_code: Status code of the response.
        headers: Headers of the response.
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data.
        read_exact: Generator-based coroutine that reads the requested
            bytes or raises an exception if there isn't enough data.
        read_to_eof: Generator-based coroutine that reads until the end
            of the stream.

    Returns:
        The response body; empty when the response has none.

    Raises:
        SecurityError: If the body is larger than ``MAX_BODY_SIZE`` or if a
            declared size has too many digits.
        ValueError: If a chunk size or the ``Content-Length`` isn't a valid
            non-negative integer, or if a chunk isn't followed by CRLF.
        NotImplementedError: If the transfer coding isn't ``chunked``.

    """
    # https://datatracker.ietf.org/doc/html/rfc7230#section-3.3.3

    # Since websockets only does GET requests (no HEAD, no CONNECT), all
    # responses except 1xx, 204, and 304 include a message body.
    if 100 <= status_code < 200 or status_code == 204 or status_code == 304:
        return b""

    # MultipleValuesError is sufficiently unlikely that we don't attempt to
    # handle it when accessing headers. Instead we document that its parent
    # class, LookupError, may be raised.
    # Conversions from str to int are protected by sys.set_int_max_str_digits.

    elif (coding := headers.get("Transfer-Encoding")) is not None:
        if coding != "chunked":
            raise NotImplementedError(f"transfer coding {coding} isn't supported")

        body = b""
        while True:
            chunk_size_line = yield from parse_line(read_line)
            # Chunk extensions, if any, follow a semicolon and are ignored.
            raw_chunk_size = chunk_size_line.split(b";", 1)[0]
            # Set a lower limit than default_max_str_digits; 15 hexadecimal
            # digits allow just under 1 EiB, which is plenty.
            if len(raw_chunk_size) > 15:
                raise SecurityError(f"chunk too large: 0x{d(raw_chunk_size)} bytes")
            chunk_size = int(raw_chunk_size, 16)
            # int() accepts a sign. A negative size would slip through the
            # upper bound check below and reach read_exact() as a negative
            # count, so reject it here.
            if chunk_size < 0:
                raise ValueError(f"invalid chunk size: {d(raw_chunk_size)}")
            if chunk_size == 0:
                break
            if len(body) + chunk_size > MAX_BODY_SIZE:
                raise SecurityError(
                    f"chunk too large: {chunk_size} bytes after {len(body)} bytes"
                )
            body += yield from read_exact(chunk_size)
            if (yield from read_exact(2)) != b"\r\n":
                raise ValueError("chunk without CRLF")
        # Read the trailer.
        yield from parse_headers(read_line)
        return body

    elif (raw_content_length := headers.get("Content-Length")) is not None:
        # Set a lower limit than default_max_str_digits; 18 decimal digits
        # allow just under 1 EB, which is plenty.
        if len(raw_content_length) > 18:
            raise SecurityError(f"body too large: {raw_content_length} bytes")
        content_length = int(raw_content_length)
        # int() accepts a sign. A negative length would slip through the
        # upper bound check below and reach read_exact() as a negative
        # count, so reject it here.
        if content_length < 0:
            raise ValueError(f"invalid Content-Length: {raw_content_length}")
        if content_length > MAX_BODY_SIZE:
            raise SecurityError(f"body too large: {content_length} bytes")
        return (yield from read_exact(content_length))

    else:
        try:
            return (yield from read_to_eof(MAX_BODY_SIZE))
        except RuntimeError:
            raise SecurityError(f"body too large: over {MAX_BODY_SIZE} bytes")