1from __future__ import annotations
2
3import dataclasses
4import os
5import re
6import sys
7import warnings
8from collections.abc import Generator
9from typing import Callable
10
11from .datastructures import Headers
12from .exceptions import SecurityError
13from .version import version as websockets_version
14
15
__all__ = [
    "SERVER",
    "USER_AGENT",
    "Request",
    "Response",
]


# Version of the running interpreter, formatted as "major.minor".
PYTHON_VERSION = f"{sys.version_info[0]}.{sys.version_info[1]}"

# Value of the User-Agent header in HTTP requests.
# Overridden by the WEBSOCKETS_USER_AGENT environment variable.
USER_AGENT = os.getenv(
    "WEBSOCKETS_USER_AGENT",
    f"Python/{PYTHON_VERSION} websockets/{websockets_version}",
)

# Value of the Server header in HTTP responses.
# Overridden by the WEBSOCKETS_SERVER environment variable.
SERVER = os.getenv(
    "WEBSOCKETS_SERVER",
    f"Python/{PYTHON_VERSION} websockets/{websockets_version}",
)

# Upper bound on the number of headers. Together with the line length limit
# below, the total size of headers is capped around 128 * 8 KiB = 1 MiB.
MAX_NUM_HEADERS = int(os.getenv("WEBSOCKETS_MAX_NUM_HEADERS", "128"))

# Upper bound on the length of the request line, the status line, and each
# header line. 8 KiB is the most common default in popular HTTP servers.
MAX_LINE_LENGTH = int(os.getenv("WEBSOCKETS_MAX_LINE_LENGTH", "8192"))

# Upper bound on the size of an HTTP response body. Bodies are only read to
# retrieve an error message returned by a server, not to transfer large files.
MAX_BODY_SIZE = int(os.getenv("WEBSOCKETS_MAX_BODY_SIZE", "1_048_576"))  # 1 MiB
48
49
def d(value: bytes | bytearray) -> str:
    """
    Convert a bytestring to text for inclusion in an error message.

    Bytes that cannot be decoded are rendered as backslash escapes, so
    decoding never fails on malformed input.

    """
    return str(value, encoding="utf-8", errors="backslashreplace")
56
57
58# See https://datatracker.ietf.org/doc/html/rfc7230#appendix-B.
59
60# Regex for validating header names.
61
62_token_re = re.compile(rb"[-!#$%&\'*+.^_`|~0-9a-zA-Z]+")
63
64# Regex for validating header values.
65
66# We don't attempt to support obsolete line folding.
67
68# Include HTAB (\x09), SP (\x20), VCHAR (\x21-\x7e), obs-text (\x80-\xff).
69
70# The ABNF is complicated because it attempts to express that optional
71# whitespace is ignored. We strip whitespace and don't revalidate that.
72
73# See also https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
74
75_value_re = re.compile(rb"[\x09\x20-\x7e\x80-\xff]*")
76
77
@dataclasses.dataclass
class Request:
    """
    WebSocket handshake request.

    Attributes:
        path: Request path, including optional query.
        headers: Request headers.

    """

    path: str
    headers: Headers
    # body isn't useful in the context of this library.

    _exception: Exception | None = None

    @property
    def exception(self) -> Exception | None:  # pragma: no cover
        """
        Deprecated accessor for the exception stored on this request.

        Emits a :exc:`DeprecationWarning` on every access.

        """
        warnings.warn(  # deprecated in 10.3 - 2022-04-17
            "Request.exception is deprecated; use ServerProtocol.handshake_exc instead",
            DeprecationWarning,
            # Attribute the warning to the code reading the property rather
            # than to this module, so users can locate the deprecated usage.
            stacklevel=2,
        )
        return self._exception

    @classmethod
    def parse(
        cls,
        read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
    ) -> Generator[None, None, Request]:
        """
        Parse a WebSocket handshake request.

        This is a generator-based coroutine.

        The request path isn't URL-decoded or validated in any way.

        The request path and headers are expected to contain only ASCII
        characters. Other characters are represented with surrogate escapes.

        :meth:`parse` doesn't attempt to read the request body because
        WebSocket handshake requests don't have one. If the request contains a
        body, it may be read from the data stream after :meth:`parse` returns.

        Args:
            read_line: Generator-based coroutine that reads a LF-terminated
                line or raises an exception if there isn't enough data.

        Raises:
            EOFError: If the connection is closed without a full HTTP request.
            SecurityError: If the request exceeds a security limit.
            ValueError: If the request isn't well formatted.
            NotImplementedError: If the request has a Transfer-Encoding header.

        """
        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.1

        # Parsing is simple because fixed values are expected for method and
        # version and because path isn't checked. Since WebSocket software tends
        # to implement HTTP/1.1 strictly, there's little need for lenient parsing.

        try:
            request_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP request line") from exc

        try:
            method, raw_path, protocol = request_line.split(b" ", 2)
        except ValueError:  # not enough values to unpack (expected 3, got 1-2)
            raise ValueError(f"invalid HTTP request line: {d(request_line)}") from None
        if protocol != b"HTTP/1.1":
            raise ValueError(
                f"unsupported protocol; expected HTTP/1.1: {d(request_line)}"
            )
        if method != b"GET":
            raise ValueError(f"unsupported HTTP method; expected GET; got {d(method)}")
        path = raw_path.decode("ascii", "surrogateescape")

        headers = yield from parse_headers(read_line)

        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.3.3

        if "Transfer-Encoding" in headers:
            raise NotImplementedError("transfer codings aren't supported")

        if "Content-Length" in headers:
            # Some devices send a Content-Length header with a value of 0.
            # This raises ValueError if Content-Length isn't an integer too.
            if int(headers["Content-Length"]) != 0:
                raise ValueError("unsupported request body")

        return cls(path, headers)

    def serialize(self) -> bytes:
        """
        Serialize a WebSocket handshake request.

        Returns:
            The request line followed by the serialized headers.

        """
        # Since the request line and headers only contain ASCII characters,
        # we can keep this simple.
        request = f"GET {self.path} HTTP/1.1\r\n".encode()
        request += self.headers.serialize()
        return request
179
180
@dataclasses.dataclass
class Response:
    """
    WebSocket handshake response.

    Attributes:
        status_code: Response code.
        reason_phrase: Response reason.
        headers: Response headers.
        body: Response body.

    """

    status_code: int
    reason_phrase: str
    headers: Headers
    body: bytes | bytearray = b""

    _exception: Exception | None = None

    @property
    def exception(self) -> Exception | None:  # pragma: no cover
        """
        Deprecated accessor for the exception stored on this response.

        Emits a :exc:`DeprecationWarning` on every access.

        """
        warnings.warn(  # deprecated in 10.3 - 2022-04-17
            "Response.exception is deprecated; "
            "use ClientProtocol.handshake_exc instead",
            DeprecationWarning,
            # Attribute the warning to the code reading the property rather
            # than to this module, so users can locate the deprecated usage.
            stacklevel=2,
        )
        return self._exception

    @classmethod
    def parse(
        cls,
        read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
        read_exact: Callable[[int], Generator[None, None, bytes | bytearray]],
        read_to_eof: Callable[[int], Generator[None, None, bytes | bytearray]],
        proxy: bool = False,
    ) -> Generator[None, None, Response]:
        """
        Parse a WebSocket handshake response.

        This is a generator-based coroutine.

        The reason phrase and headers are expected to contain only ASCII
        characters. Other characters are represented with surrogate escapes.

        Args:
            read_line: Generator-based coroutine that reads a LF-terminated
                line or raises an exception if there isn't enough data.
            read_exact: Generator-based coroutine that reads the requested
                bytes or raises an exception if there isn't enough data.
            read_to_eof: Generator-based coroutine that reads until the end
                of the stream.
            proxy: When true, HTTP/1.0 is accepted in addition to HTTP/1.1
                and the response body isn't read; ``body`` is left empty.

        Raises:
            EOFError: If the connection is closed without a full HTTP response.
            SecurityError: If the response exceeds a security limit.
            LookupError: If the response isn't well formatted.
            ValueError: If the response isn't well formatted.
            NotImplementedError: If the response uses a transfer coding other
                than chunked.

        """
        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.2

        try:
            status_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP status line") from exc

        try:
            protocol, raw_status_code, raw_reason = status_line.split(b" ", 2)
        except ValueError:  # not enough values to unpack (expected 3, got 1-2)
            raise ValueError(f"invalid HTTP status line: {d(status_line)}") from None
        if proxy:  # some proxies still use HTTP/1.0
            if protocol not in [b"HTTP/1.1", b"HTTP/1.0"]:
                raise ValueError(
                    f"unsupported protocol; expected HTTP/1.1 or HTTP/1.0: "
                    f"{d(status_line)}"
                )
        else:
            if protocol != b"HTTP/1.1":
                raise ValueError(
                    f"unsupported protocol; expected HTTP/1.1: {d(status_line)}"
                )
        try:
            status_code = int(raw_status_code)
        except ValueError:  # invalid literal for int() with base 10
            raise ValueError(
                f"invalid status code; expected integer; got {d(raw_status_code)}"
            ) from None
        if not 100 <= status_code < 600:
            raise ValueError(
                f"invalid status code; expected 100–599; got {d(raw_status_code)}"
            )
        if not _value_re.fullmatch(raw_reason):
            raise ValueError(f"invalid HTTP reason phrase: {d(raw_reason)}")
        reason = raw_reason.decode("ascii", "surrogateescape")

        headers = yield from parse_headers(read_line)

        body: bytes | bytearray
        if proxy:
            # The body of a proxy response is deliberately left unread.
            body = b""
        else:
            body = yield from read_body(
                status_code, headers, read_line, read_exact, read_to_eof
            )

        return cls(status_code, reason, headers, body)

    def serialize(self) -> bytes:
        """
        Serialize a WebSocket handshake response.

        Returns:
            The status line followed by the serialized headers and the body.

        """
        # Since the status line and headers only contain ASCII characters,
        # we can keep this simple.
        response = f"HTTP/1.1 {self.status_code} {self.reason_phrase}\r\n".encode()
        response += self.headers.serialize()
        response += self.body
        return response
300
301
def parse_line(
    read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
) -> Generator[None, None, bytes | bytearray]:
    """
    Read one line and return it without its CRLF terminator.

    This is a generator-based coroutine.

    Args:
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data.

    Raises:
        EOFError: If the line that was read doesn't end with CRLF.
        SecurityError: If ``read_line`` raises :exc:`RuntimeError`, which is
            reported as the line being too long.

    """
    try:
        raw = yield from read_line(MAX_LINE_LENGTH)
    except RuntimeError:
        raise SecurityError("line too long")
    # Requiring CRLF rather than a bare LF is not mandatory but safe -
    # https://datatracker.ietf.org/doc/html/rfc7230#section-3.5
    terminator = raw[-2:]
    if terminator != b"\r\n":
        raise EOFError("line without CRLF")
    return raw[:-2]
327
328
def parse_headers(
    read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
) -> Generator[None, None, Headers]:
    """
    Read header lines up to the blank line that terminates them.

    This is a generator-based coroutine.

    Header values may contain non-ASCII bytes; these are decoded with
    surrogate escapes.

    Args:
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data.

    Raises:
        EOFError: If the connection is closed without complete headers.
        SecurityError: If the request exceeds a security limit.
        ValueError: If the request isn't well formatted.

    """
    # https://datatracker.ietf.org/doc/html/rfc7230#section-3.2

    # Obsolete line folding is deliberately unsupported.

    parsed = Headers()

    # One iteration per header, plus one for the terminating blank line.
    for _ in range(MAX_NUM_HEADERS + 1):
        try:
            header_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP headers") from exc

        if not header_line:
            # A blank line marks the end of the headers.
            return parsed

        try:
            name_bytes, value_bytes = header_line.split(b":", 1)
        except ValueError:  # no colon in the line
            raise ValueError(f"invalid HTTP header line: {d(header_line)}") from None

        if _token_re.fullmatch(name_bytes) is None:
            raise ValueError(f"invalid HTTP header name: {d(name_bytes)}")

        # Optional whitespace around the value is dropped before validation.
        value_bytes = value_bytes.strip(b" \t")
        if _value_re.fullmatch(value_bytes) is None:
            raise ValueError(f"invalid HTTP header value: {d(value_bytes)}")

        # The name matched the token pattern, so it is pure ASCII.
        header_name = name_bytes.decode("ascii")
        header_value = value_bytes.decode("ascii", "surrogateescape")
        parsed[header_name] = header_value

    # The limit was reached without seeing the terminating blank line.
    raise SecurityError("too many HTTP headers")
378
379
def read_body(
    status_code: int,
    headers: Headers,
    read_line: Callable[[int], Generator[None, None, bytes | bytearray]],
    read_exact: Callable[[int], Generator[None, None, bytes | bytearray]],
    read_to_eof: Callable[[int], Generator[None, None, bytes | bytearray]],
) -> Generator[None, None, bytes | bytearray]:
    """
    Read the body of an HTTP response.

    This is a generator-based coroutine.

    The way the body is delimited is selected in this order: no body for
    1xx, 204, and 304 responses; chunked transfer coding when there is a
    Transfer-Encoding header; a fixed size when there is a Content-Length
    header; otherwise everything until the end of the stream.

    Args:
        status_code: Status code of the response.
        headers: Headers of the response.
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data.
        read_exact: Generator-based coroutine that reads the requested
            bytes or raises an exception if there isn't enough data.
        read_to_eof: Generator-based coroutine that reads until the end
            of the stream.

    Returns:
        The response body; empty when the response doesn't have one.

    Raises:
        EOFError: If a chunk size line or a trailer line lacks its CRLF.
        SecurityError: If the body exceeds a security limit.
        LookupError: If a header involved in framing has multiple values.
        NotImplementedError: If the transfer coding isn't ``chunked``.
        ValueError: If a chunk size or Content-Length isn't a non-negative
            integer, or if a chunk isn't followed by CRLF.

    """
    # https://datatracker.ietf.org/doc/html/rfc7230#section-3.3.3

    # Since websockets only does GET requests (no HEAD, no CONNECT), all
    # responses except 1xx, 204, and 304 include a message body.
    if 100 <= status_code < 200 or status_code == 204 or status_code == 304:
        return b""

    # MultipleValuesError is sufficiently unlikely that we don't attempt to
    # handle it when accessing headers. Instead we document that its parent
    # class, LookupError, may be raised.
    # Conversions from str to int are protected by sys.set_int_max_str_digits.

    elif (coding := headers.get("Transfer-Encoding")) is not None:
        if coding != "chunked":
            raise NotImplementedError(f"transfer coding {coding} isn't supported")

        body = b""
        while True:
            chunk_size_line = yield from parse_line(read_line)
            # Chunk extensions, if any, follow a semicolon and are ignored.
            raw_chunk_size = chunk_size_line.split(b";", 1)[0]
            # Set a lower limit than default_max_str_digits; 1 EiB is plenty.
            if len(raw_chunk_size) > 15:
                str_chunk_size = raw_chunk_size.decode(errors="backslashreplace")
                raise SecurityError(f"chunk too large: 0x{str_chunk_size} bytes")
            chunk_size = int(raw_chunk_size, 16)
            # int() accepts a leading minus sign. A negative size would pass
            # the upper bound check below, so it must be rejected explicitly.
            if chunk_size < 0:
                str_chunk_size = raw_chunk_size.decode(errors="backslashreplace")
                raise ValueError(f"invalid chunk size: {str_chunk_size}")
            if chunk_size == 0:
                break
            if len(body) + chunk_size > MAX_BODY_SIZE:
                raise SecurityError(
                    f"chunk too large: {chunk_size} bytes after {len(body)} bytes"
                )
            body += yield from read_exact(chunk_size)
            if (yield from read_exact(2)) != b"\r\n":
                raise ValueError("chunk without CRLF")
        # Read the trailer.
        yield from parse_headers(read_line)
        return body

    elif (raw_content_length := headers.get("Content-Length")) is not None:
        # Set a lower limit than default_max_str_digits; 1 EB is plenty.
        if len(raw_content_length) > 18:
            raise SecurityError(f"body too large: {raw_content_length} bytes")
        content_length = int(raw_content_length)
        # int() accepts a leading minus sign. A negative length would pass
        # the upper bound check below, so it must be rejected explicitly.
        if content_length < 0:
            raise ValueError(f"invalid Content-Length: {raw_content_length}")
        if content_length > MAX_BODY_SIZE:
            raise SecurityError(f"body too large: {content_length} bytes")
        return (yield from read_exact(content_length))

    else:
        try:
            return (yield from read_to_eof(MAX_BODY_SIZE))
        except RuntimeError:
            raise SecurityError(f"body too large: over {MAX_BODY_SIZE} bytes")