1from __future__ import annotations
2
3import dataclasses
4import os
5import re
6import sys
7import warnings
8from collections.abc import Generator
9from typing import Callable
10
11from .datastructures import Headers
12from .exceptions import SecurityError
13from .version import version as websockets_version
14
15
__all__ = ["SERVER", "USER_AGENT", "Request", "Response"]


# Interpreter version as "major.minor", embedded in the default header values.
PYTHON_VERSION = f"{sys.version_info.major}.{sys.version_info.minor}"

# Value of the User-Agent header sent in HTTP requests; the
# WEBSOCKETS_USER_AGENT environment variable overrides the default.
USER_AGENT = os.getenv(
    "WEBSOCKETS_USER_AGENT",
    f"Python/{PYTHON_VERSION} websockets/{websockets_version}",
)

# Value of the Server header sent in HTTP responses; the WEBSOCKETS_SERVER
# environment variable overrides the default.
SERVER = os.getenv(
    "WEBSOCKETS_SERVER",
    f"Python/{PYTHON_VERSION} websockets/{websockets_version}",
)

# Upper bound on the number of header lines. Combined with the default line
# length limit, headers total at most around 128 * 8 KiB = 1 MiB.
MAX_NUM_HEADERS = int(os.getenv("WEBSOCKETS_MAX_NUM_HEADERS", "128"))

# Upper bound on the length of the request line, the status line, and each
# header line. 8 KiB is the most common default configuration of popular HTTP
# servers.
MAX_LINE_LENGTH = int(os.getenv("WEBSOCKETS_MAX_LINE_LENGTH", "8192"))

# Upper bound on the size of an HTTP response body (1 MiB by default). Bodies
# are only read to retrieve an error message returned by a server; large file
# transfers aren't a goal.
MAX_BODY_SIZE = int(os.getenv("WEBSOCKETS_MAX_BODY_SIZE", "1_048_576"))
48
49
def d(value: bytes) -> str:
    """
    Render a bytestring as text for inclusion in an error message.

    Bytes that cannot be decoded are shown as backslash escapes instead of
    raising an exception.

    """
    return value.decode("utf-8", errors="backslashreplace")
56
57
# See https://datatracker.ietf.org/doc/html/rfc7230#appendix-B.

# Regex for validating header names.

# A header name is a non-empty token. parse_headers() applies this pattern
# with fullmatch(), so the entire name must consist of token characters.

_token_re = re.compile(rb"[-!#$%&\'*+.^_`|~0-9a-zA-Z]+")

# Regex for validating header values.

# It is applied with fullmatch() to header values, after surrounding spaces
# and tabs are stripped, and to the reason phrase of the status line. The
# empty string is accepted.

# We don't attempt to support obsolete line folding.

# Include HTAB (\x09), SP (\x20), VCHAR (\x21-\x7e), obs-text (\x80-\xff).

# The ABNF is complicated because it attempts to express that optional
# whitespace is ignored. We strip whitespace and don't revalidate that.

# See also https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189

_value_re = re.compile(rb"[\x09\x20-\x7e\x80-\xff]*")
76
77
@dataclasses.dataclass
class Request:
    """
    WebSocket handshake request.

    Attributes:
        path: Request path, including optional query.
        headers: Request headers.
    """

    path: str
    headers: Headers
    # body isn't useful in the context of this library.

    _exception: Exception | None = None

    @property
    def exception(self) -> Exception | None:  # pragma: no cover
        """
        Deprecated accessor for the exception stored on this request.

        Emits a :exc:`DeprecationWarning` on every access.

        """
        warnings.warn(  # deprecated in 10.3 - 2022-04-17
            "Request.exception is deprecated; use ServerProtocol.handshake_exc instead",
            DeprecationWarning,
            # Attribute the warning to the code reading the property rather
            # than to this module, so users can locate the deprecated usage.
            stacklevel=2,
        )
        return self._exception

    @classmethod
    def parse(
        cls,
        read_line: Callable[[int], Generator[None, None, bytes]],
    ) -> Generator[None, None, Request]:
        """
        Parse a WebSocket handshake request.

        This is a generator-based coroutine.

        The request path isn't URL-decoded or validated in any way.

        The request path and headers are expected to contain only ASCII
        characters. Other characters are represented with surrogate escapes.

        :meth:`parse` doesn't attempt to read the request body because
        WebSocket handshake requests don't have one. If the request contains a
        body, it may be read from the data stream after :meth:`parse` returns.

        Args:
            read_line: Generator-based coroutine that reads a LF-terminated
                line or raises an exception if there isn't enough data.

        Returns:
            The parsed request, with its path and headers.

        Raises:
            EOFError: If the connection is closed without a full HTTP request.
            SecurityError: If the request exceeds a security limit.
            ValueError: If the request isn't well formatted, including when
                it declares a non-empty body with Content-Length.
            NotImplementedError: If the request has a Transfer-Encoding
                header.

        """
        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.1

        # Parsing is simple because fixed values are expected for method and
        # version and because path isn't checked. Since WebSocket software tends
        # to implement HTTP/1.1 strictly, there's little need for lenient parsing.

        try:
            request_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP request line") from exc

        try:
            # maxsplit=2 keeps any further spaces inside the protocol field,
            # which then fails the comparison with HTTP/1.1 below.
            method, raw_path, protocol = request_line.split(b" ", 2)
        except ValueError:  # not enough values to unpack (expected 3, got 1-2)
            raise ValueError(f"invalid HTTP request line: {d(request_line)}") from None
        if protocol != b"HTTP/1.1":
            raise ValueError(
                f"unsupported protocol; expected HTTP/1.1: {d(request_line)}"
            )
        if method != b"GET":
            raise ValueError(f"unsupported HTTP method; expected GET; got {d(method)}")
        path = raw_path.decode("ascii", "surrogateescape")

        headers = yield from parse_headers(read_line)

        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.3.3

        if "Transfer-Encoding" in headers:
            raise NotImplementedError("transfer codings aren't supported")

        if "Content-Length" in headers:
            # Some devices send a Content-Length header with a value of 0.
            # This raises ValueError if Content-Length isn't an integer too.
            if int(headers["Content-Length"]) != 0:
                raise ValueError("unsupported request body")

        return cls(path, headers)

    def serialize(self) -> bytes:
        """
        Serialize a WebSocket handshake request.

        Returns:
            The request line followed by the serialized headers.

        """
        # Since the request line and headers only contain ASCII characters,
        # we can keep this simple.
        request = f"GET {self.path} HTTP/1.1\r\n".encode()
        request += self.headers.serialize()
        return request
179
180
@dataclasses.dataclass
class Response:
    """
    WebSocket handshake response.

    Attributes:
        status_code: Response code.
        reason_phrase: Response reason.
        headers: Response headers.
        body: Response body.

    """

    status_code: int
    reason_phrase: str
    headers: Headers
    body: bytes = b""

    _exception: Exception | None = None

    @property
    def exception(self) -> Exception | None:  # pragma: no cover
        """
        Deprecated accessor for the exception stored on this response.

        Emits a :exc:`DeprecationWarning` on every access.

        """
        warnings.warn(  # deprecated in 10.3 - 2022-04-17
            "Response.exception is deprecated; "
            "use ClientProtocol.handshake_exc instead",
            DeprecationWarning,
            # Attribute the warning to the code reading the property rather
            # than to this module, so users can locate the deprecated usage.
            stacklevel=2,
        )
        return self._exception

    @classmethod
    def parse(
        cls,
        read_line: Callable[[int], Generator[None, None, bytes]],
        read_exact: Callable[[int], Generator[None, None, bytes]],
        read_to_eof: Callable[[int], Generator[None, None, bytes]],
        proxy: bool = False,
    ) -> Generator[None, None, Response]:
        """
        Parse a WebSocket handshake response.

        This is a generator-based coroutine.

        The reason phrase and headers are expected to contain only ASCII
        characters. Other characters are represented with surrogate escapes.

        Args:
            read_line: Generator-based coroutine that reads a LF-terminated
                line or raises an exception if there isn't enough data.
            read_exact: Generator-based coroutine that reads the requested
                bytes or raises an exception if there isn't enough data.
            read_to_eof: Generator-based coroutine that reads until the end
                of the stream.
            proxy: Relax parsing for responses from proxies: accept HTTP/1.0
                in addition to HTTP/1.1 and don't read a body.

        Returns:
            The parsed response. Its body is empty when ``proxy`` is true.

        Raises:
            EOFError: If the connection is closed without a full HTTP response.
            SecurityError: If the response exceeds a security limit.
            LookupError: If the response isn't well formatted.
            ValueError: If the response isn't well formatted.
            NotImplementedError: If the response uses a transfer coding other
                than chunked.

        """
        # https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.2

        try:
            status_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP status line") from exc

        try:
            # maxsplit=2 keeps spaces inside the reason phrase intact.
            protocol, raw_status_code, raw_reason = status_line.split(b" ", 2)
        except ValueError:  # not enough values to unpack (expected 3, got 1-2)
            raise ValueError(f"invalid HTTP status line: {d(status_line)}") from None
        if proxy:  # some proxies still use HTTP/1.0
            if protocol not in (b"HTTP/1.1", b"HTTP/1.0"):
                raise ValueError(
                    "unsupported protocol; expected HTTP/1.1 or HTTP/1.0: "
                    f"{d(status_line)}"
                )
        else:
            if protocol != b"HTTP/1.1":
                raise ValueError(
                    f"unsupported protocol; expected HTTP/1.1: {d(status_line)}"
                )
        try:
            status_code = int(raw_status_code)
        except ValueError:  # invalid literal for int() with base 10
            raise ValueError(
                f"invalid status code; expected integer; got {d(raw_status_code)}"
            ) from None
        if not 100 <= status_code < 600:
            raise ValueError(
                f"invalid status code; expected 100–599; got {d(raw_status_code)}"
            )
        if not _value_re.fullmatch(raw_reason):
            raise ValueError(f"invalid HTTP reason phrase: {d(raw_reason)}")
        reason = raw_reason.decode("ascii", "surrogateescape")

        headers = yield from parse_headers(read_line)

        if proxy:
            # No attempt is made to read a body in proxy mode.
            body = b""
        else:
            body = yield from read_body(
                status_code, headers, read_line, read_exact, read_to_eof
            )

        return cls(status_code, reason, headers, body)

    def serialize(self) -> bytes:
        """
        Serialize a WebSocket handshake response.

        Returns:
            The status line, followed by the serialized headers and the body.

        """
        # Since the status line and headers only contain ASCII characters,
        # we can keep this simple.
        response = f"HTTP/1.1 {self.status_code} {self.reason_phrase}\r\n".encode()
        response += self.headers.serialize()
        response += self.body
        return response
299
300
def parse_line(
    read_line: Callable[[int], Generator[None, None, bytes]],
) -> Generator[None, None, bytes]:
    """
    Read one CRLF-terminated line and return it without the CRLF.

    This is a generator-based coroutine.

    Args:
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data. It is called
            with ``MAX_LINE_LENGTH`` as its argument.

    Returns:
        The line, with the trailing CRLF removed.

    Raises:
        EOFError: If the line that was read doesn't end with CRLF.
        SecurityError: If ``read_line`` raises :exc:`RuntimeError`, which is
            reported as the line being too long.

    """
    try:
        raw = yield from read_line(MAX_LINE_LENGTH)
    except RuntimeError:
        raise SecurityError("line too long")
    # Not mandatory but safe - https://datatracker.ietf.org/doc/html/rfc7230#section-3.5
    if raw.endswith(b"\r\n"):
        return raw[:-2]
    raise EOFError("line without CRLF")
326
327
def parse_headers(
    read_line: Callable[[int], Generator[None, None, bytes]],
) -> Generator[None, None, Headers]:
    """
    Read HTTP header lines up to and including the blank line ending them.

    This is a generator-based coroutine.

    Header names must be ASCII tokens. Non-ASCII bytes in header values are
    represented with surrogate escapes.

    Args:
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data.

    Returns:
        The headers, in the order in which they were read.

    Raises:
        EOFError: If the connection is closed without complete headers.
        SecurityError: If there are more than ``MAX_NUM_HEADERS`` header
            lines or if a line is too long.
        ValueError: If a header line has no colon, or if a header name or
            value contains invalid characters.

    """
    # https://datatracker.ietf.org/doc/html/rfc7230#section-3.2

    # Obsolete line folding isn't supported.

    parsed = Headers()
    # The extra iteration leaves room for the blank line that follows
    # MAX_NUM_HEADERS header lines.
    for _ in range(MAX_NUM_HEADERS + 1):
        try:
            header_line = yield from parse_line(read_line)
        except EOFError as exc:
            raise EOFError("connection closed while reading HTTP headers") from exc
        if not header_line:
            # A blank line marks the end of the headers.
            return parsed

        name_bytes, colon, value_bytes = header_line.partition(b":")
        if not colon:
            raise ValueError(f"invalid HTTP header line: {d(header_line)}")
        if _token_re.fullmatch(name_bytes) is None:
            raise ValueError(f"invalid HTTP header name: {d(name_bytes)}")
        # Optional whitespace around the value isn't part of it.
        value_bytes = value_bytes.strip(b" \t")
        if _value_re.fullmatch(value_bytes) is None:
            raise ValueError(f"invalid HTTP header value: {d(value_bytes)}")

        # The name is guaranteed to be ASCII at this point; the value may
        # contain obs-text bytes, hence the surrogate escapes.
        parsed[name_bytes.decode("ascii")] = value_bytes.decode(
            "ascii", "surrogateescape"
        )

    # The loop ran out of iterations without seeing the blank line.
    raise SecurityError("too many HTTP headers")
377
378
# A chunk size is 1*HEXDIG and Content-Length is 1*DIGIT. int() alone would
# also accept signs, "0x" prefixes, and underscores; a negative size would
# then slip past the size limits and reach read_exact().
_chunk_size_re = re.compile(rb"[0-9A-Fa-f]+")
_content_length_re = re.compile(r"[0-9]+")


def read_body(
    status_code: int,
    headers: Headers,
    read_line: Callable[[int], Generator[None, None, bytes]],
    read_exact: Callable[[int], Generator[None, None, bytes]],
    read_to_eof: Callable[[int], Generator[None, None, bytes]],
) -> Generator[None, None, bytes]:
    """
    Read the body of an HTTP response.

    This is a generator-based coroutine.

    The body is delimited, in order of precedence, by chunked transfer
    coding, by the Content-Length header, or by the end of the stream. No
    body is read for 1xx, 204, and 304 responses.

    Args:
        status_code: Status code of the response.
        headers: Headers of the response.
        read_line: Generator-based coroutine that reads a LF-terminated line
            or raises an exception if there isn't enough data.
        read_exact: Generator-based coroutine that reads the requested
            bytes or raises an exception if there isn't enough data.
        read_to_eof: Generator-based coroutine that reads until the end
            of the stream.

    Returns:
        The body; empty for responses that don't have one.

    Raises:
        SecurityError: If the body is larger than ``MAX_BODY_SIZE``.
        LookupError: If a header that is consulted has multiple values.
        ValueError: If a chunk size or Content-Length isn't a valid
            non-negative number or if a chunk isn't followed by CRLF.
        NotImplementedError: If the transfer coding isn't chunked.

    """
    # https://datatracker.ietf.org/doc/html/rfc7230#section-3.3.3

    # Since websockets only does GET requests (no HEAD, no CONNECT), all
    # responses except 1xx, 204, and 304 include a message body.
    if 100 <= status_code < 200 or status_code == 204 or status_code == 304:
        return b""

    # MultipleValuesError is sufficiently unlikely that we don't attempt to
    # handle it when accessing headers. Instead we document that its parent
    # class, LookupError, may be raised.
    # Conversions from str to int are protected by sys.set_int_max_str_digits.

    elif (coding := headers.get("Transfer-Encoding")) is not None:
        # Transfer coding names are case-insensitive.
        if coding.lower() != "chunked":
            raise NotImplementedError(f"transfer coding {coding} isn't supported")

        # Collect chunks and join them once at the end; repeatedly extending
        # an immutable bytes object would copy the whole body for each chunk.
        chunks: list[bytes] = []
        body_size = 0
        while True:
            chunk_size_line = yield from parse_line(read_line)
            # Ignore chunk extensions, i.e. everything after the first
            # semicolon, and tolerate whitespace around the size.
            raw_chunk_size = chunk_size_line.split(b";", 1)[0].strip(b" \t")
            # Set a lower limit than default_max_str_digits; 1 EiB is plenty.
            if len(raw_chunk_size) > 15:
                str_chunk_size = raw_chunk_size.decode(errors="backslashreplace")
                raise SecurityError(f"chunk too large: 0x{str_chunk_size} bytes")
            if not _chunk_size_re.fullmatch(raw_chunk_size):
                str_chunk_size = raw_chunk_size.decode(errors="backslashreplace")
                raise ValueError(f"invalid chunk size: {str_chunk_size}")
            chunk_size = int(raw_chunk_size, 16)
            if chunk_size == 0:
                # The last chunk has a size of zero.
                break
            if body_size + chunk_size > MAX_BODY_SIZE:
                raise SecurityError(
                    f"chunk too large: {chunk_size} bytes after {body_size} bytes"
                )
            chunks.append((yield from read_exact(chunk_size)))
            body_size += chunk_size
            if (yield from read_exact(2)) != b"\r\n":
                raise ValueError("chunk without CRLF")
        # Read the trailer.
        yield from parse_headers(read_line)
        return b"".join(chunks)

    elif (raw_content_length := headers.get("Content-Length")) is not None:
        # Set a lower limit than default_max_str_digits; 1 EB is plenty.
        if len(raw_content_length) > 18:
            raise SecurityError(f"body too large: {raw_content_length} bytes")
        if not _content_length_re.fullmatch(raw_content_length):
            raise ValueError(f"invalid Content-Length: {raw_content_length}")
        content_length = int(raw_content_length)
        if content_length > MAX_BODY_SIZE:
            raise SecurityError(f"body too large: {content_length} bytes")
        return (yield from read_exact(content_length))

    else:
        # Without framing information, the body extends to the end of the
        # stream.
        try:
            return (yield from read_to_eof(MAX_BODY_SIZE))
        except RuntimeError:
            raise SecurityError(f"body too large: over {MAX_BODY_SIZE} bytes")