Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/werkzeug/urls.py: 72%
532 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-09 06:08 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-09 06:08 +0000
1"""Functions for working with URLs.
3Contains implementations of functions from :mod:`urllib.parse` that
4handle bytes and strings.
5"""
6from __future__ import annotations
8import codecs
9import os
10import re
11import typing as t
12import warnings
13from urllib.parse import quote
14from urllib.parse import unquote
15from urllib.parse import urlencode
16from urllib.parse import urlsplit
17from urllib.parse import urlunsplit
19from ._internal import _check_str_tuple
20from ._internal import _decode_idna
21from ._internal import _make_encode_wrapper
22from ._internal import _to_str
23from .datastructures import iter_multi_items
25if t.TYPE_CHECKING:
26 from . import datastructures as ds
28# A regular expression for what a valid schema looks like
29_scheme_re = re.compile(r"^[a-zA-Z0-9+-.]+$")
31# Characters that are safe in any part of an URL.
32_always_safe_chars = (
33 "abcdefghijklmnopqrstuvwxyz"
34 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
35 "0123456789"
36 "-._~"
37 "$!'()*+,;" # RFC3986 sub-delims set, not including query string delimiters &=
38)
39_always_safe = frozenset(_always_safe_chars.encode("ascii"))
41_hexdigits = "0123456789ABCDEFabcdef"
42_hextobyte = {
43 f"{a}{b}".encode("ascii"): int(f"{a}{b}", 16)
44 for a in _hexdigits
45 for b in _hexdigits
46}
47_bytetohex = [f"%{char:02X}".encode("ascii") for char in range(256)]
class _URLTuple(t.NamedTuple):
    # Field order matches the result of urllib.parse.urlsplit:
    # (scheme, netloc, path, query, fragment).
    scheme: str
    netloc: str
    path: str
    query: str
    fragment: str
class BaseURL(_URLTuple):
    """Superclass of :py:class:`URL` and :py:class:`BytesURL`.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use the ``urllib.parse`` library instead.
    """

    __slots__ = ()
    # Delimiters used by the ``_split_*`` helpers below. Subclasses set
    # them to str (URL) or bytes (BytesURL) so the same parsing logic
    # works for both representations.
    _at: str
    _colon: str
    _lbracket: str
    _rbracket: str

    def __new__(cls, *args: t.Any, **kwargs: t.Any) -> BaseURL:
        # Warn on every construction: the whole class hierarchy is
        # deprecated in favor of urllib.parse.
        warnings.warn(
            f"'werkzeug.urls.{cls.__name__}' is deprecated and will be removed in"
            " Werkzeug 3.0. Use the 'urllib.parse' library instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return super().__new__(cls, *args, **kwargs)

    def __str__(self) -> str:
        return self.to_url()

    def replace(self, **kwargs: t.Any) -> BaseURL:
        """Return an URL with the same values, except for those parameters
        given new values by whichever keyword arguments are specified."""
        return self._replace(**kwargs)

    @property
    def host(self) -> str | None:
        """The host part of the URL if available, otherwise `None`. The
        host is either the hostname or the IP address mentioned in the
        URL. It will not contain the port.
        """
        return self._split_host()[0]

    @property
    def ascii_host(self) -> str | None:
        """Works exactly like :attr:`host` but will return a result that
        is restricted to ASCII. If it finds a netloc that is not ASCII
        it will attempt to idna decode it. This is useful for socket
        operations when the URL might include internationalized characters.
        """
        rv = self.host
        if rv is not None and isinstance(rv, str):
            try:
                rv = rv.encode("idna").decode("ascii")
            except UnicodeError:
                # Host is not IDNA-encodable; return it unchanged.
                pass
        return rv

    @property
    def port(self) -> int | None:
        """The port in the URL as an integer if it was present, `None`
        otherwise. This does not fill in default ports.
        """
        try:
            rv = int(_to_str(self._split_host()[1]))
            # Out-of-range values fall through and yield None.
            if 0 <= rv <= 65535:
                return rv
        except (ValueError, TypeError):
            # No port present, or it is not numeric.
            pass
        return None

    @property
    def auth(self) -> str | None:
        """The authentication part in the URL if available, `None`
        otherwise.
        """
        return self._split_netloc()[0]

    @property
    def username(self) -> str | None:
        """The username if it was part of the URL, `None` otherwise.
        This undergoes URL decoding and will always be a string.
        """
        rv = self._split_auth()[0]
        if rv is not None:
            return _url_unquote_legacy(rv)
        return None

    @property
    def raw_username(self) -> str | None:
        """The username if it was part of the URL, `None` otherwise.
        Unlike :attr:`username` this one is not being decoded.
        """
        return self._split_auth()[0]

    @property
    def password(self) -> str | None:
        """The password if it was part of the URL, `None` otherwise.
        This undergoes URL decoding and will always be a string.
        """
        rv = self._split_auth()[1]
        if rv is not None:
            return _url_unquote_legacy(rv)
        return None

    @property
    def raw_password(self) -> str | None:
        """The password if it was part of the URL, `None` otherwise.
        Unlike :attr:`password` this one is not being decoded.
        """
        return self._split_auth()[1]

    def decode_query(self, *args: t.Any, **kwargs: t.Any) -> ds.MultiDict[str, str]:
        """Decodes the query part of the URL. This is a shortcut for
        calling :func:`url_decode` on the query argument. The arguments and
        keyword arguments are forwarded to :func:`url_decode` unchanged.
        """
        return url_decode(self.query, *args, **kwargs)

    def join(self, *args: t.Any, **kwargs: t.Any) -> BaseURL:
        """Joins this URL with another one. This is just a convenience
        function for calling into :meth:`url_join` and then parsing the
        return value again.
        """
        return url_parse(url_join(self, *args, **kwargs))

    def to_url(self) -> str:
        """Returns a URL string or bytes depending on the type of the
        information stored. This is just a convenience function
        for calling :meth:`url_unparse` for this URL.
        """
        return url_unparse(self)

    def encode_netloc(self) -> str:
        """Encodes the netloc part to an ASCII safe URL as bytes."""
        rv = self.ascii_host or ""
        if ":" in rv:
            # Bare IPv6 addresses must be bracketed in the netloc.
            rv = f"[{rv}]"
        port = self.port
        if port is not None:
            rv = f"{rv}:{port}"
        # filter(None, ...) drops empty user/password parts so the ":"
        # separator only appears when both are present.
        auth = ":".join(
            filter(
                None,
                [
                    url_quote(self.raw_username or "", "utf-8", "strict", "/:%"),
                    url_quote(self.raw_password or "", "utf-8", "strict", "/:%"),
                ],
            )
        )
        if auth:
            rv = f"{auth}@{rv}"
        return rv

    def decode_netloc(self) -> str:
        """Decodes the netloc part into a string."""
        host = self.host or ""

        if isinstance(host, bytes):
            host = host.decode()

        # Punycode -> Unicode for internationalized domain names.
        rv = _decode_idna(host)

        if ":" in rv:
            # Re-bracket IPv6 addresses.
            rv = f"[{rv}]"
        port = self.port
        if port is not None:
            rv = f"{rv}:{port}"
        auth = ":".join(
            filter(
                None,
                [
                    _url_unquote_legacy(self.raw_username or "", "/:%@"),
                    _url_unquote_legacy(self.raw_password or "", "/:%@"),
                ],
            )
        )
        if auth:
            rv = f"{auth}@{rv}"
        return rv

    def to_uri_tuple(self) -> BaseURL:
        """Returns a :class:`BytesURL` tuple that holds a URI. This will
        encode all the information in the URL properly to ASCII using the
        rules a web browser would follow.

        It's usually more interesting to directly call :meth:`iri_to_uri` which
        will return a string.
        """
        return url_parse(iri_to_uri(self))

    def to_iri_tuple(self) -> BaseURL:
        """Returns a :class:`URL` tuple that holds a IRI. This will try
        to decode as much information as possible in the URL without
        losing information similar to how a web browser does it for the
        URL bar.

        It's usually more interesting to directly call :meth:`uri_to_iri` which
        will return a string.
        """
        return url_parse(uri_to_iri(self))

    def get_file_location(
        self, pathformat: str | None = None
    ) -> tuple[str | None, str | None]:
        """Returns a tuple with the location of the file in the form
        ``(server, location)``. If the netloc is empty in the URL or
        points to localhost, it's represented as ``None``.

        The `pathformat` by default is autodetection but needs to be set
        when working with URLs of a specific system. The supported values
        are ``'windows'`` when working with Windows or DOS paths and
        ``'posix'`` when working with posix paths.

        If the URL does not point to a local file, the server and location
        are both represented as ``None``.

        :param pathformat: The expected format of the path component.
                           Currently ``'windows'`` and ``'posix'`` are
                           supported. Defaults to ``None`` which is
                           autodetect.
        """
        if self.scheme != "file":
            return None, None

        path = url_unquote(self.path)
        host = self.netloc or None

        if pathformat is None:
            # Autodetect the path flavor from the running OS.
            if os.name == "nt":
                pathformat = "windows"
            else:
                pathformat = "posix"

        if pathformat == "windows":
            if path[:1] == "/" and path[1:2].isalpha() and path[2:3] in "|:":
                # Turn ``/C:/...`` (or the legacy ``/C|/...``) into ``C:/...``.
                path = f"{path[1:2]}:{path[3:]}"
            windows_share = path[:3] in ("\\" * 3, "/" * 3)
            import ntpath

            path = ntpath.normpath(path)
            # Windows shared drives are represented as ``\\host\\directory``.
            # That results in a URL like ``file://///host/directory``, and a
            # path like ``///host/directory``. We need to special-case this
            # because the path contains the hostname.
            if windows_share and host is None:
                parts = path.lstrip("\\").split("\\", 1)
                if len(parts) == 2:
                    host, path = parts
                else:
                    host = parts[0]
                    path = ""
        elif pathformat == "posix":
            import posixpath

            path = posixpath.normpath(path)
        else:
            raise TypeError(f"Invalid path format {pathformat!r}")

        if host in ("127.0.0.1", "::1", "localhost"):
            # Loopback hosts are normalized to "no host".
            host = None

        return host, path

    def _split_netloc(self) -> tuple[str | None, str]:
        # Split "auth@host:port" into (auth, "host:port").
        if self._at in self.netloc:
            auth, _, netloc = self.netloc.partition(self._at)
            return auth, netloc
        return None, self.netloc

    def _split_auth(self) -> tuple[str | None, str | None]:
        # Split the auth part into (username, password).
        auth = self._split_netloc()[0]
        if not auth:
            return None, None
        if self._colon not in auth:
            return auth, None

        username, _, password = auth.partition(self._colon)
        return username, password

    def _split_host(self) -> tuple[str | None, str | None]:
        # Split "host:port" into (host, port), handling bracketed IPv6.
        rv = self._split_netloc()[1]
        if not rv:
            return None, None

        if not rv.startswith(self._lbracket):
            if self._colon in rv:
                host, _, port = rv.partition(self._colon)
                return host, port
            return rv, None

        idx = rv.find(self._rbracket)
        if idx < 0:
            # Unterminated bracket: treat the whole thing as the host.
            return rv, None

        host = rv[1:idx]
        rest = rv[idx + 1 :]
        if rest.startswith(self._colon):
            return host, rest[1:]
        return host, None
class URL(BaseURL):
    """Represents a parsed URL. This behaves like a regular tuple but
    also has some extra attributes that give further insight into the
    URL.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use the ``urllib.parse`` library instead.
    """

    __slots__ = ()
    # str delimiters for the BaseURL splitting helpers.
    _at = "@"
    _colon = ":"
    _lbracket = "["
    _rbracket = "]"

    def encode(self, charset: str = "utf-8", errors: str = "replace") -> BytesURL:
        """Encodes the URL to a tuple made out of bytes. The charset is
        only being used for the path, query and fragment.
        """
        # Scheme must be plain ASCII; the netloc performs its own IDNA
        # handling; the remaining fields use the caller's charset.
        encoded = (
            self.scheme.encode("ascii"),
            self.encode_netloc(),
        ) + tuple(part.encode(charset, errors) for part in self[2:])
        return BytesURL(*encoded)
class BytesURL(BaseURL):
    """Represents a parsed URL in bytes.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use the ``urllib.parse`` library instead.
    """

    __slots__ = ()
    # bytes delimiters for the BaseURL splitting helpers.
    _at = b"@"  # type: ignore
    _colon = b":"  # type: ignore
    _lbracket = b"["  # type: ignore
    _rbracket = b"]"  # type: ignore

    def __str__(self) -> str:
        # Undecodable bytes are replaced rather than raising.
        return self.to_url().decode("utf-8", "replace")  # type: ignore

    def encode_netloc(self) -> bytes:  # type: ignore
        """Returns the netloc unchanged as bytes."""
        return self.netloc  # type: ignore

    def decode(self, charset: str = "utf-8", errors: str = "replace") -> URL:
        """Decodes the URL to a tuple made out of strings. The charset is
        only being used for the path, query and fragment.
        """
        # Mirror of URL.encode: ASCII scheme, IDNA-aware netloc, and
        # charset-decoded path/query/fragment.
        decoded = (
            self.scheme.decode("ascii"),  # type: ignore
            self.decode_netloc(),
        ) + tuple(
            part.decode(charset, errors)  # type: ignore
            for part in (self.path, self.query, self.fragment)
        )
        return URL(*decoded)
# Cache of hex->byte lookup tables, keyed by the set of byte values that
# must stay percent-encoded. The empty set maps to the full table.
_unquote_maps: dict[frozenset[int], dict[bytes, int]] = {frozenset(): _hextobyte}


def _unquote_to_bytes(string: str | bytes, unsafe: str | bytes = "") -> bytes:
    """Percent-decode ``string`` to bytes, leaving any escape that decodes
    to a byte listed in ``unsafe`` in its quoted form.
    """
    if isinstance(string, str):
        string = string.encode("utf-8")

    if isinstance(unsafe, str):
        unsafe = unsafe.encode("utf-8")

    unsafe_key = frozenset(bytearray(unsafe))
    # Build (and memoize) a table that simply omits the unsafe bytes, so
    # their escapes fail the lookup below and are kept verbatim.
    hex_map = _unquote_maps.get(unsafe_key)

    if hex_map is None:
        hex_map = _unquote_maps[unsafe_key] = {
            h: b for h, b in _hextobyte.items() if b not in unsafe_key
        }

    pieces = iter(string.split(b"%"))
    decoded = bytearray(next(pieces, b""))

    for piece in pieces:
        hex_code = piece[:2]

        if hex_code in hex_map:
            decoded.append(hex_map[hex_code])
            decoded.extend(piece[2:])
        else:
            # Not a decodable (or allowed) escape: keep the "%" literally.
            decoded.append(37)  # %
            decoded.extend(piece)

    return bytes(decoded)
def _url_encode_impl(
    obj: t.Mapping[str, str] | t.Iterable[tuple[str, str]],
    charset: str,
    sort: bool,
    key: t.Callable[[tuple[str, str]], t.Any] | None,
) -> t.Iterator[str]:
    """Yield ``key=value`` pairs, form-quoted, for a query string.

    :param obj: A mapping or iterable of ``(key, value)`` pairs.
    :param charset: Charset used to encode non-bytes keys and values.
    :param sort: Sort the pairs before yielding.
    :param key: Sort key function, forwarded to :func:`sorted`.
    """
    # ``iter_multi_items`` is already imported at module level; the
    # previous function-local re-import was redundant and shadowing.
    iterable: t.Iterable[tuple[str, str]] = iter_multi_items(obj)

    if sort:
        iterable = sorted(iterable, key=key)

    for key_str, value_str in iterable:
        # A None value means "no value at all"; skip the pair entirely.
        if value_str is None:
            continue

        if not isinstance(key_str, bytes):
            key_bytes = str(key_str).encode(charset)
        else:
            key_bytes = key_str

        if not isinstance(value_str, bytes):
            value_bytes = str(value_str).encode(charset)
        else:
            value_bytes = value_str

        yield f"{_fast_url_quote_plus(key_bytes)}={_fast_url_quote_plus(value_bytes)}"
def _url_unquote_legacy(value: str, unsafe: str = "") -> str:
    # Unquote as strict UTF-8 first; if the percent-decoded bytes are not
    # valid UTF-8, fall back to latin1, which can decode any byte.
    try:
        return url_unquote(value, charset="utf-8", errors="strict", unsafe=unsafe)
    except UnicodeError:
        return url_unquote(value, charset="latin1", unsafe=unsafe)
def url_parse(
    url: str, scheme: str | None = None, allow_fragments: bool = True
) -> BaseURL:
    """Parses a URL from a string into a :class:`URL` tuple. If the URL
    is lacking a scheme it can be provided as second argument. Otherwise,
    it is ignored. Optionally fragments can be stripped from the URL
    by setting `allow_fragments` to `False`.

    The inverse of this function is :func:`url_unparse`.

    :param url: the URL to parse.
    :param scheme: the default schema to use if the URL is schemaless.
    :param allow_fragments: if set to `False` a fragment will be removed
        from the URL.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use ``urllib.parse.urlsplit`` instead.
    """
    warnings.warn(
        "'werkzeug.urls.url_parse' is deprecated and will be removed in Werkzeug 3.0."
        " Use 'urllib.parse.urlsplit' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    # ``s`` converts literals to str or bytes to match the input type so
    # the same code path handles both.
    s = _make_encode_wrapper(url)
    is_text_based = isinstance(url, str)

    if scheme is None:
        scheme = s("")
    netloc = query = fragment = s("")
    i = url.find(s(":"))
    if i > 0 and _scheme_re.match(_to_str(url[:i], errors="replace")):
        # make sure "iri" is not actually a port number (in which case
        # "scheme" is really part of the path)
        rest = url[i + 1 :]
        if not rest or any(c not in s("0123456789") for c in rest):
            # not a port number
            scheme, url = url[:i].lower(), rest

    if url[:2] == s("//"):
        # The netloc runs from after "//" to the first "/", "?" or "#".
        delim = len(url)
        for c in s("/?#"):
            wdelim = url.find(c, 2)
            if wdelim >= 0:
                delim = min(delim, wdelim)
        netloc, url = url[2:delim], url[delim:]
        # An IPv6 netloc must have balanced brackets.
        if (s("[") in netloc and s("]") not in netloc) or (
            s("]") in netloc and s("[") not in netloc
        ):
            raise ValueError("Invalid IPv6 URL")

    if allow_fragments and s("#") in url:
        url, fragment = url.split(s("#"), 1)
    if s("?") in url:
        url, query = url.split(s("?"), 1)

    # Return the tuple flavor matching the input type.
    result_type = URL if is_text_based else BytesURL

    return result_type(scheme, netloc, url, query, fragment)
def _make_fast_url_quote(
    charset: str = "utf-8",
    errors: str = "strict",
    safe: str | bytes = "/:",
    unsafe: str | bytes = "",
) -> t.Callable[[bytes], str]:
    """Precompile the translation table for a URL encoding function.

    Unlike :func:`url_quote`, the generated function only takes the
    string to quote.

    :param charset: The charset to encode the result with.
    :param errors: How to handle encoding errors.
    :param safe: An optional sequence of safe characters to never encode.
    :param unsafe: An optional sequence of unsafe characters to always encode.
    """
    safe_bytes = safe.encode(charset, errors) if isinstance(safe, str) else safe
    unsafe_bytes = unsafe.encode(charset, errors) if isinstance(unsafe, str) else unsafe
    # Unsafe bytes win over both the caller's safe set and the default
    # always-safe characters.
    keep = (frozenset(bytearray(safe_bytes)) | _always_safe) - frozenset(
        bytearray(unsafe_bytes)
    )
    # One table entry per possible byte value: either the literal
    # character or its percent-encoded form.
    table = [chr(b) if b in keep else f"%{b:02X}" for b in range(256)]

    def quote(string: bytes) -> str:
        return "".join([table[b] for b in string])

    return quote
# Default quoter, precompiled once at import time.
_fast_url_quote = _make_fast_url_quote()
# Form-data quoter: " " is kept safe here so it can be converted to "+"
# below, while a literal "+" is forced to be percent-encoded.
_fast_quote_plus = _make_fast_url_quote(safe=" ", unsafe="+")


def _fast_url_quote_plus(string: bytes) -> str:
    # Quote with spaces preserved, then apply the form-encoding
    # space -> "+" rule.
    return _fast_quote_plus(string).replace(" ", "+")
def url_quote(
    string: str | bytes,
    charset: str = "utf-8",
    errors: str = "strict",
    safe: str | bytes = "/:",
    unsafe: str | bytes = "",
) -> str:
    """URL encode a single string with a given encoding.

    :param s: the string to quote.
    :param charset: the charset to be used.
    :param safe: an optional sequence of safe characters.
    :param unsafe: an optional sequence of unsafe characters.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use ``urllib.parse.quote`` instead.

    .. versionadded:: 0.9.2
        The `unsafe` parameter was added.
    """
    warnings.warn(
        "'werkzeug.urls.url_quote' is deprecated and will be removed in Werkzeug 3.0."
        " Use 'urllib.parse.quote' instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Normalize the input and both character sets down to bytes.
    if not isinstance(string, (str, bytes, bytearray)):
        string = str(string)
    if isinstance(string, str):
        string = string.encode(charset, errors)
    if isinstance(safe, str):
        safe = safe.encode(charset, errors)
    if isinstance(unsafe, str):
        unsafe = unsafe.encode(charset, errors)
    # Unsafe bytes override both the caller's safe set and the default
    # always-safe characters.
    keep = (frozenset(bytearray(safe)) | _always_safe) - frozenset(bytearray(unsafe))
    out = bytearray()
    for byte in bytearray(string):
        if byte in keep:
            out.append(byte)
        else:
            out.extend(_bytetohex[byte])
    return bytes(out).decode(charset)
def url_quote_plus(
    string: str, charset: str = "utf-8", errors: str = "strict", safe: str = ""
) -> str:
    """URL encode a single string with the given encoding and convert
    whitespace to "+".

    :param s: The string to quote.
    :param charset: The charset to be used.
    :param safe: An optional sequence of safe characters.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use ``urllib.parse.quote_plus`` instead.
    """
    # The warning previously said "removed in Werkzeug 2.4", contradicting
    # the docstring and every other deprecation message in this module.
    warnings.warn(
        "'werkzeug.urls.url_quote_plus' is deprecated and will be removed in Werkzeug"
        " 3.0. Use 'urllib.parse.quote_plus' instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Mark " " as safe so url_quote leaves it alone, then convert it to
    # "+"; a literal "+" is passed as unsafe so it gets percent-encoded.
    return url_quote(string, charset, errors, safe + " ", "+").replace(" ", "+")
def url_unparse(components: tuple[str, str, str, str, str]) -> str:
    """The reverse operation to :meth:`url_parse`. This accepts arbitrary
    as well as :class:`URL` tuples and returns a URL as a string.

    :param components: the parsed URL as tuple which should be converted
        into a URL string.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use ``urllib.parse.urlunsplit`` instead.
    """
    warnings.warn(
        "'werkzeug.urls.url_unparse' is deprecated and will be removed in Werkzeug 3.0."
        " Use 'urllib.parse.urlunsplit' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    # All five components must be uniformly str or uniformly bytes.
    _check_str_tuple(components)
    scheme, netloc, path, query, fragment = components
    # ``s`` converts literals to str or bytes to match the input type.
    s = _make_encode_wrapper(scheme)
    url = s("")

    # We generally treat file:///x and file:/x the same which is also
    # what browsers seem to do. This also allows us to ignore a schema
    # register for netloc utilization or having to differentiate between
    # empty and missing netloc.
    if netloc or (scheme and path.startswith(s("/"))):
        if path and path[:1] != s("/"):
            path = s("/") + path
        url = s("//") + (netloc or s("")) + path
    elif path:
        url += path
    if scheme:
        url = scheme + s(":") + url
    if query:
        url = url + s("?") + query
    if fragment:
        url = url + s("#") + fragment
    return url
def url_unquote(
    s: str | bytes,
    charset: str = "utf-8",
    errors: str = "replace",
    unsafe: str = "",
) -> str:
    """URL decode a single string with a given encoding. If the charset
    is set to `None` no decoding is performed and raw bytes are
    returned.

    :param s: the string to unquote.
    :param charset: the charset of the query string. If set to `None`
        no decoding will take place.
    :param errors: the error handling for the charset decoding.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use ``urllib.parse.unquote`` instead.
    """
    warnings.warn(
        "'werkzeug.urls.url_unquote' is deprecated and will be removed in Werkzeug 3.0."
        " Use 'urllib.parse.unquote' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    raw = _unquote_to_bytes(s, unsafe)
    # charset=None means the caller wants the raw bytes back undecoded.
    return raw if charset is None else raw.decode(charset, errors)
def url_unquote_plus(
    s: str | bytes, charset: str = "utf-8", errors: str = "replace"
) -> str:
    """URL decode a single string with the given `charset` and decode "+" to
    whitespace.

    Per default encoding errors are ignored. If you want a different behavior
    you can set `errors` to ``'replace'`` or ``'strict'``.

    :param s: The string to unquote.
    :param charset: the charset of the query string. If set to `None`
        no decoding will take place.
    :param errors: The error handling for the `charset` decoding.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use ``urllib.parse.unquote_plus`` instead.
    """
    # The warning previously said "removed in Werkzeug 2.4", contradicting
    # the docstring and every other deprecation message in this module.
    warnings.warn(
        "'werkzeug.urls.url_unquote_plus' is deprecated and will be removed in Werkzeug"
        " 3.0. Use 'urllib.parse.unquote_plus' instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Apply the form-encoding "+" -> " " rule before percent-decoding,
    # for both str and bytes input.
    if isinstance(s, str):
        s = s.replace("+", " ")
    else:
        s = s.replace(b"+", b" ")

    return url_unquote(s, charset, errors)
def url_fix(s: str, charset: str = "utf-8") -> str:
    r"""Sometimes you get an URL by a user that just isn't a real URL because
    it contains unsafe characters like ' ' and so on. This function can fix
    some of the problems in a similar way browsers handle data entered by the
    user:

    >>> url_fix('http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

    :param s: the string with the URL to fix.
    :param charset: The target charset for the URL if the url was given
        as a string.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0.
    """
    warnings.warn(
        "'werkzeug.urls.url_fix' is deprecated and will be removed in Werkzeug 3.0.",
        DeprecationWarning,
        stacklevel=2,
    )
    # Switch to text processing and convert backslashes (invalid in URLs
    # anyway) to forward slashes, consistent with what Chrome does.
    s = _to_str(s, charset, "replace").replace("\\", "/")

    # Repair malformed Windows file URLs such as ``file://c:/...`` by
    # inserting the missing third slash.
    if s.startswith("file://") and s[7:8].isalpha() and s[8:10] in (":/", "|/"):
        s = f"file:///{s[7:]}"

    parsed = url_parse(s)
    # Query and fragment share the same safe set.
    qs_safe = ":&%=+$!*'(),"
    fixed_path = url_quote(parsed.path, charset, safe="/%+$!*'(),")
    fixed_query = url_quote_plus(parsed.query, charset, safe=qs_safe)
    fixed_anchor = url_quote_plus(parsed.fragment, charset, safe=qs_safe)
    return url_unparse(
        (parsed.scheme, parsed.encode_netloc(), fixed_path, fixed_query, fixed_anchor)
    )
def _codec_error_url_quote(e: UnicodeError) -> tuple[str, int]:
    """Codec error handler used in :func:`uri_to_iri` after unquoting:
    instead of replacing or dropping undecodable bytes, percent-quote
    them back.
    """
    # UnicodeError does expose .object/.start/.end at runtime even though
    # the static type stubs don't declare them, hence the ignores.
    invalid = e.object[e.start : e.end]  # type: ignore
    return quote(invalid, safe=""), e.end  # type: ignore


codecs.register_error("werkzeug.url_quote", _codec_error_url_quote)
808def _make_unquote_part(name: str, chars: str) -> t.Callable[[str, str, str], str]:
809 """Create a function that unquotes all percent encoded characters except those
810 given. This allows working with unquoted characters if possible while not changing
811 the meaning of a given part of a URL.
812 """
813 choices = "|".join(f"{ord(c):02X}" for c in sorted(chars))
814 pattern = re.compile(f"((?:%(?:{choices}))+)", re.I)
816 def _unquote_partial(value: str, encoding: str, errors: str) -> str:
817 parts = iter(pattern.split(value))
818 out = []
820 for part in parts:
821 out.append(unquote(part, encoding, errors))
822 out.append(next(parts, ""))
824 return "".join(out)
826 _unquote_partial.__name__ = f"_unquote_{name}"
827 return _unquote_partial
830# characters that should remain quoted in URL parts
831# based on https://url.spec.whatwg.org/#percent-encoded-bytes
832# always keep all controls, space, and % quoted
833_always_unsafe = bytes((*range(0x21), 0x25, 0x7F)).decode()
834_unquote_fragment = _make_unquote_part("fragment", _always_unsafe)
835_unquote_query = _make_unquote_part("query", _always_unsafe + "&=+#")
836_unquote_path = _make_unquote_part("path", _always_unsafe + "/?#")
837_unquote_user = _make_unquote_part("user", _always_unsafe + ":@/?#")
def uri_to_iri(
    uri: str | tuple[str, str, str, str, str],
    charset: str | None = None,
    errors: str | None = None,
) -> str:
    """Convert a URI to an IRI. All valid UTF-8 characters are unquoted,
    leaving all reserved and invalid characters quoted. If the URL has
    a domain, it is decoded from Punycode.

    >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF")
    'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF'

    :param uri: The URI to convert.
    :param charset: The encoding to encode unquoted bytes with.
    :param errors: Error handler to use during ``bytes.encode``. By
        default, invalid bytes are left quoted.

    .. versionchanged:: 2.3
        Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters, are
        deprecated and will be removed in Werkzeug 3.0.

    .. versionchanged:: 2.3
        Which characters remain quoted is specific to each part of the URL.

    .. versionchanged:: 0.15
        All reserved and invalid characters remain quoted. Previously,
        only some reserved characters were preserved, and invalid bytes
        were replaced instead of left quoted.

    .. versionadded:: 0.6
    """
    if isinstance(uri, tuple):
        warnings.warn(
            "Passing a tuple is deprecated and will not be supported in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
        uri = urlunsplit(uri)

    if isinstance(uri, bytes):
        warnings.warn(
            "Passing bytes is deprecated and will not be supported in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
        uri = uri.decode()

    if charset is not None:
        warnings.warn(
            "The 'charset' parameter is deprecated and will be removed"
            " in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
    else:
        charset = "utf-8"

    if errors is not None:
        warnings.warn(
            "The 'errors' parameter is deprecated and will be removed in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
    else:
        # The default handler (registered above) re-quotes invalid bytes
        # instead of replacing them.
        errors = "werkzeug.url_quote"

    parts = urlsplit(uri)
    # Each URL part keeps its own set of characters percent-encoded.
    path = _unquote_path(parts.path, charset, errors)
    query = _unquote_query(parts.query, charset, errors)
    fragment = _unquote_fragment(parts.fragment, charset, errors)

    if parts.hostname:
        # Punycode -> Unicode for internationalized domain names.
        netloc = _decode_idna(parts.hostname)
    else:
        netloc = ""

    if ":" in netloc:
        # Re-bracket bare IPv6 addresses.
        netloc = f"[{netloc}]"

    if parts.port:
        netloc = f"{netloc}:{parts.port}"

    if parts.username:
        auth = _unquote_user(parts.username, charset, errors)

        if parts.password:
            auth = f"{auth}:{_unquote_user(parts.password, charset, errors)}"

        netloc = f"{auth}@{netloc}"

    return urlunsplit((parts.scheme, netloc, path, query, fragment))
def iri_to_uri(
    iri: str | tuple[str, str, str, str, str],
    charset: str | None = None,
    errors: str | None = None,
    safe_conversion: bool | None = None,
) -> str:
    """Convert an IRI to a URI. All non-ASCII and unsafe characters are
    quoted. If the URL has a domain, it is encoded to Punycode.

    >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF')
    'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF'

    :param iri: The IRI to convert.
    :param charset: The encoding of the IRI.
    :param errors: Error handler to use during ``bytes.encode``.

    .. versionchanged:: 2.3
        Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters, are
        deprecated and will be removed in Werkzeug 3.0.

    .. versionchanged:: 2.3
        Which characters remain unquoted is specific to each part of the URL.

    .. versionchanged:: 2.3
        The ``safe_conversion`` parameter is deprecated and will be removed in Werkzeug
        2.4.

    .. versionchanged:: 0.15
        All reserved characters remain unquoted. Previously, only some reserved
        characters were left unquoted.

    .. versionchanged:: 0.9.6
        The ``safe_conversion`` parameter was added.

    .. versionadded:: 0.6
    """
    if charset is not None:
        warnings.warn(
            "The 'charset' parameter is deprecated and will be removed"
            " in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
    else:
        charset = "utf-8"

    if isinstance(iri, tuple):
        warnings.warn(
            "Passing a tuple is deprecated and will not be supported in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
        iri = urlunsplit(iri)

    if isinstance(iri, bytes):
        warnings.warn(
            "Passing bytes is deprecated and will not be supported in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
        iri = iri.decode(charset)

    if errors is not None:
        warnings.warn(
            "The 'errors' parameter is deprecated and will be removed in Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )
    else:
        errors = "strict"

    if safe_conversion is not None:
        warnings.warn(
            "The 'safe_conversion' parameter is deprecated and will be removed in"
            " Werkzeug 3.0.",
            DeprecationWarning,
            stacklevel=2,
        )

        if safe_conversion:
            # If we're not sure if it's safe to normalize the URL, and it only contains
            # ASCII characters, return it as-is.
            try:
                ascii_iri = iri.encode("ascii")

                # Only return if it doesn't have whitespace. (Why?)
                if len(ascii_iri.split()) == 1:
                    return iri
            except UnicodeError:
                pass

    parts = urlsplit(iri)
    # safe = https://url.spec.whatwg.org/#url-path-segment-string
    # as well as percent for things that are already quoted
    path = quote(parts.path, safe="%!$&'()*+,/:;=@", encoding=charset, errors=errors)
    query = quote(parts.query, safe="%!$&'()*+,/:;=?@", encoding=charset, errors=errors)
    fragment = quote(
        parts.fragment, safe="%!#$&'()*+,/:;=?@", encoding=charset, errors=errors
    )

    if parts.hostname:
        # Unicode -> Punycode so the netloc is pure ASCII.
        netloc = parts.hostname.encode("idna").decode("ascii")
    else:
        netloc = ""

    if ":" in netloc:
        # Re-bracket bare IPv6 addresses.
        netloc = f"[{netloc}]"

    if parts.port:
        netloc = f"{netloc}:{parts.port}"

    if parts.username:
        auth = quote(parts.username, safe="%!$&'()*+,;=")

        if parts.password:
            pass_quoted = quote(parts.password, safe="%!$&'()*+,;=")
            auth = f"{auth}:{pass_quoted}"

        netloc = f"{auth}@{netloc}"

    return urlunsplit((parts.scheme, netloc, path, query, fragment))
1056def _invalid_iri_to_uri(iri: str) -> str:
1057 """The URL scheme ``itms-services://`` must contain the ``//`` even though it does
1058 not have a host component. There may be other invalid schemes as well. Currently,
1059 responses will always call ``iri_to_uri`` on the redirect ``Location`` header, which
1060 removes the ``//``. For now, if the IRI only contains ASCII and does not contain
1061 spaces, pass it on as-is. In Werkzeug 3.0, this should become a
1062 ``response.process_location`` flag.
1064 :meta private:
1065 """
1066 try:
1067 iri.encode("ascii")
1068 except UnicodeError:
1069 pass
1070 else:
1071 if len(iri.split(None, 1)) == 1:
1072 return iri
1074 return iri_to_uri(iri)
def url_decode(
    s: t.AnyStr,
    charset: str = "utf-8",
    include_empty: bool = True,
    errors: str = "replace",
    separator: str = "&",
    cls: type[ds.MultiDict] | None = None,
) -> ds.MultiDict[str, str]:
    """Parse a query string into a :class:`MultiDict`.

    :param s: The query string to parse.
    :param charset: Decode bytes to string with this charset. If not
        given, bytes are returned as-is.
    :param include_empty: Include keys with empty values in the dict.
    :param errors: Error handling behavior when decoding bytes.
    :param separator: Separator character between pairs.
    :param cls: Container to hold result instead of :class:`MultiDict`.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 3.0. Use ``urllib.parse.parse_qs`` instead.

    .. versionchanged:: 2.1
        The ``decode_keys`` parameter was removed.

    .. versionchanged:: 0.5
        In previous versions ";" and "&" could be used for url decoding.
        Now only "&" is supported. If you want to use ";", a different
        ``separator`` can be provided.

    .. versionchanged:: 0.5
        The ``cls`` parameter was added.
    """
    warnings.warn(
        "'werkzeug.urls.url_decode' is deprecated and will be removed in Werkzeug 2.4."
        " Use 'urllib.parse.parse_qs' instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    if cls is None:
        from .datastructures import MultiDict  # noqa: F811

        cls = MultiDict

    # The separator's type (str vs bytes) must match the input's before
    # splitting.
    if isinstance(s, str):
        if not isinstance(separator, str):
            separator = separator.decode(charset or "ascii")
    elif isinstance(s, bytes) and not isinstance(separator, bytes):
        separator = separator.encode(charset or "ascii")  # type: ignore

    pairs = s.split(separator)  # type: ignore
    return cls(_url_decode_impl(pairs, charset, include_empty, errors))
def url_decode_stream(
    stream: t.IO[bytes],
    charset: str = "utf-8",
    include_empty: bool = True,
    errors: str = "replace",
    separator: bytes = b"&",
    cls: type[ds.MultiDict] | None = None,
    limit: int | None = None,
) -> ds.MultiDict[str, str]:
    """Like :func:`url_decode`, but reads the query string from a stream.
    Stream and limit behave as in functions such as
    :func:`~werkzeug.wsgi.make_line_iter`. Pairs are generated lazily and
    fed directly to `cls`, so data can be consumed while it is parsed.

    :param stream: a stream with the encoded querystring
    :param charset: the charset of the query string. If set to `None`
        no decoding will take place.
    :param include_empty: Set to `False` if you don't want empty values to
        appear in the dict.
    :param errors: the decoding error behavior.
    :param separator: the pair separator to be used, defaults to ``&``
    :param cls: an optional dict class to use. If this is not specified
        or `None` the default :class:`MultiDict` is used.
    :param limit: the content length of the URL data. Not necessary if
        a limited stream is provided.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 2.4. Use ``urllib.parse.parse_qs`` instead.

    .. versionchanged:: 2.1
        The ``decode_keys`` and ``return_iterator`` parameters were removed.

    .. versionadded:: 0.8
    """
    warnings.warn(
        "'werkzeug.urls.url_decode_stream' is deprecated and will be removed in"
        " Werkzeug 2.4. Use 'urllib.parse.parse_qs' instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    if cls is None:
        from .datastructures import MultiDict  # noqa: F811

        cls = MultiDict

    from .wsgi import make_chunk_iter

    chunks = make_chunk_iter(stream, separator, limit)
    return cls(_url_decode_impl(chunks, charset, include_empty, errors))
def _url_decode_impl(
    pair_iter: t.Iterable[t.AnyStr], charset: str, include_empty: bool, errors: str
) -> t.Iterator[tuple[str, str]]:
    """Yield unquoted ``(key, value)`` pairs from an iterable of raw
    ``key=value`` chunks, skipping empty chunks. A chunk without ``=``
    becomes a pair with an empty value (or is dropped when
    ``include_empty`` is false).
    """
    for raw in pair_iter:
        if not raw:
            continue

        # Wrapper produces literals of the same type (str/bytes) as the chunk.
        wrap = _make_encode_wrapper(raw)
        eq = wrap("=")

        if eq not in raw:
            # No "=": the whole chunk is the key with an empty value.
            if not include_empty:
                continue
            name, val = raw, wrap("")
        else:
            name, val = raw.split(eq, 1)

        yield (
            url_unquote_plus(name, charset, errors),
            url_unquote_plus(val, charset, errors),
        )
def url_encode(
    obj: t.Mapping[str, str] | t.Iterable[tuple[str, str]],
    charset: str = "utf-8",
    sort: bool = False,
    key: t.Callable[[tuple[str, str]], t.Any] | None = None,
    separator: str = "&",
) -> str:
    """URL encode a dict/`MultiDict`. Pairs whose value is `None` are
    omitted from the result. By default only values are encoded into the
    target charset.

    :param obj: the object to encode into a query string.
    :param charset: the charset of the query string.
    :param sort: set to `True` if you want parameters to be sorted by `key`.
    :param separator: the separator to be used for the pairs.
    :param key: an optional function to be used for sorting. For more details
        check out the :func:`sorted` documentation.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 2.4. Use ``urllib.parse.urlencode`` instead.

    .. versionchanged:: 2.1
        The ``encode_keys`` parameter was removed.

    .. versionchanged:: 0.5
        Added the ``sort``, ``key``, and ``separator`` parameters.
    """
    warnings.warn(
        "'werkzeug.urls.url_encode' is deprecated and will be removed in Werkzeug 2.4."
        " Use 'urllib.parse.urlencode' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    sep = _to_str(separator, "ascii")
    return sep.join(_url_encode_impl(obj, charset, sort, key))
def url_encode_stream(
    obj: t.Mapping[str, str] | t.Iterable[tuple[str, str]],
    stream: t.IO[str] | None = None,
    charset: str = "utf-8",
    sort: bool = False,
    key: t.Callable[[tuple[str, str]], t.Any] | None = None,
    separator: str = "&",
) -> None:
    """Like :meth:`url_encode` but writes the result to a stream object.
    If the stream is `None`, a generator over all encoded pairs is
    returned instead.

    :param obj: the object to encode into a query string.
    :param stream: a stream to write the encoded object into or `None` if
        an iterator over the encoded pairs should be returned. In
        that case the separator argument is ignored.
    :param charset: the charset of the query string.
    :param sort: set to `True` if you want parameters to be sorted by `key`.
    :param separator: the separator to be used for the pairs.
    :param key: an optional function to be used for sorting. For more details
        check out the :func:`sorted` documentation.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 2.4. Use ``urllib.parse.urlencode`` instead.

    .. versionchanged:: 2.1
        The ``encode_keys`` parameter was removed.

    .. versionadded:: 0.8
    """
    warnings.warn(
        "'werkzeug.urls.url_encode_stream' is deprecated and will be removed in"
        " Werkzeug 2.4. Use 'urllib.parse.urlencode' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    sep = _to_str(separator, "ascii")
    chunks = _url_encode_impl(obj, charset, sort, key)

    if stream is None:
        return chunks  # type: ignore

    # Write the separator before every chunk except the first.
    first = True

    for chunk in chunks:
        if not first:
            stream.write(sep)

        stream.write(chunk)
        first = False

    return None
def url_join(
    base: str | tuple[str, str, str, str, str],
    url: str | tuple[str, str, str, str, str],
    allow_fragments: bool = True,
) -> str:
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Tuples are first serialized with :func:`url_unparse`. Both arguments
    must then be the same string type (checked by ``_check_str_tuple``).
    The relative reference is resolved against the base similarly to
    ``urllib.parse.urljoin``, including collapsing ``.`` and ``..`` path
    segments.

    :param base: the base URL for the join operation.
    :param url: the URL to join.
    :param allow_fragments: indicates whether fragments should be allowed.

    .. deprecated:: 2.3
        Will be removed in Werkzeug 2.4. Use ``urllib.parse.urljoin`` instead.
    """
    warnings.warn(
        "'werkzeug.urls.url_join' is deprecated and will be removed in Werkzeug 2.4."
        " Use 'urllib.parse.urljoin' instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    if isinstance(base, tuple):
        base = url_unparse(base)
    if isinstance(url, tuple):
        url = url_unparse(url)

    _check_str_tuple((base, url))
    # ``s`` makes literals of the same type (str/bytes) as ``base``.
    s = _make_encode_wrapper(base)

    # An empty base or url short-circuits to the other argument.
    if not base:
        return url
    if not url:
        return base

    bscheme, bnetloc, bpath, bquery, bfragment = url_parse(
        base, allow_fragments=allow_fragments
    )
    scheme, netloc, path, query, fragment = url_parse(url, bscheme, allow_fragments)
    # A different scheme means ``url`` is already absolute and unrelated.
    if scheme != bscheme:
        return url
    # An explicit netloc also makes ``url`` authoritative for everything
    # after the scheme.
    if netloc:
        return url_unparse((scheme, netloc, path, query, fragment))
    netloc = bnetloc

    if path[:1] == s("/"):
        # Absolute path: ignore the base path entirely.
        segments = path.split(s("/"))
    elif not path:
        # No path: keep the base path, and the base query if none given.
        segments = bpath.split(s("/"))
        if not query:
            query = bquery
    else:
        # Relative path: resolve against the base path's directory.
        segments = bpath.split(s("/"))[:-1] + path.split(s("/"))

    # If the rightmost part is "./" we want to keep the slash but
    # remove the dot.
    if segments[-1] == s("."):
        segments[-1] = s("")

    # Resolve ".." and "." — drop "." segments, then repeatedly collapse
    # each "parent/.." pair until none remain.
    segments = [segment for segment in segments if segment != s(".")]
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == s("..") and segments[i - 1] not in (s(""), s("..")):
                del segments[i - 1 : i + 1]
                break
            i += 1
        else:
            break

    # Remove ".." segments left at the start of an absolute path; they
    # cannot be resolved any further up.
    unwanted_marker = [s(""), s("..")]
    while segments[:2] == unwanted_marker:
        del segments[1]

    path = s("/").join(segments)
    return url_unparse((scheme, netloc, path, query, fragment))
def _urlencode(
    query: t.Mapping[str, str] | t.Iterable[tuple[str, str]], encoding: str = "utf-8"
) -> str:
    """Encode a mapping or pair iterable as a query string, dropping
    pairs whose value is ``None``.
    """
    filtered = [(k, v) for k, v in iter_multi_items(query) if v is not None]
    # safe chars per https://url.spec.whatwg.org/#percent-encoded-bytes
    return urlencode(filtered, safe="!$'()*,/:;?@", encoding=encoding)