Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/werkzeug/urls.py: 25%

434 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:35 +0000

1"""Functions for working with URLs. 

2 

3Contains implementations of functions from :mod:`urllib.parse` that 

4handle bytes and strings. 

5""" 

6import codecs 

7import os 

8import re 

9import typing as t 

10 

11from ._internal import _check_str_tuple 

12from ._internal import _decode_idna 

13from ._internal import _encode_idna 

14from ._internal import _make_encode_wrapper 

15from ._internal import _to_str 

16 

17if t.TYPE_CHECKING: 

18 from . import datastructures as ds 

19 

20# A regular expression for what a valid schema looks like 

21_scheme_re = re.compile(r"^[a-zA-Z0-9+-.]+$") 

22 

23# Characters that are safe in any part of an URL. 

24_always_safe = frozenset( 

25 bytearray( 

26 b"abcdefghijklmnopqrstuvwxyz" 

27 b"ABCDEFGHIJKLMNOPQRSTUVWXYZ" 

28 b"0123456789" 

29 b"-._~" 

30 b"$!'()*+,;" # RFC3986 sub-delims set, not including query string delimiters &= 

31 ) 

32) 

33 

34_hexdigits = "0123456789ABCDEFabcdef" 

35_hextobyte = { 

36 f"{a}{b}".encode("ascii"): int(f"{a}{b}", 16) 

37 for a in _hexdigits 

38 for b in _hexdigits 

39} 

40_bytetohex = [f"%{char:02X}".encode("ascii") for char in range(256)] 

41 

42 

class _URLTuple(t.NamedTuple):
    """The five components of a split URL, in the same order as
    :func:`urllib.parse.urlsplit`.  Fields are annotated as ``str`` but
    :class:`BytesURL` stores ``bytes`` values in them.
    """

    scheme: str
    netloc: str
    path: str
    query: str
    fragment: str

49 

50 

class BaseURL(_URLTuple):
    """Superclass of :py:class:`URL` and :py:class:`BytesURL`."""

    __slots__ = ()
    # Separator characters used by the ``_split_*`` helpers below.
    # :class:`URL` sets these to ``str`` values and :class:`BytesURL`
    # to ``bytes`` so the same splitting code works on both.
    _at: str
    _colon: str
    _lbracket: str
    _rbracket: str

    def __str__(self) -> str:
        return self.to_url()

    def replace(self, **kwargs: t.Any) -> "BaseURL":
        """Return an URL with the same values, except for those parameters
        given new values by whichever keyword arguments are specified."""
        return self._replace(**kwargs)

    @property
    def host(self) -> t.Optional[str]:
        """The host part of the URL if available, otherwise `None`. The
        host is either the hostname or the IP address mentioned in the
        URL. It will not contain the port.
        """
        return self._split_host()[0]

    @property
    def ascii_host(self) -> t.Optional[str]:
        """Works exactly like :attr:`host` but will return a result that
        is restricted to ASCII. If it finds a netloc that is not ASCII
        it will attempt to idna decode it. This is useful for socket
        operations when the URL might include internationalized characters.
        """
        rv = self.host
        if rv is not None and isinstance(rv, str):
            try:
                rv = _encode_idna(rv)  # type: ignore
            except UnicodeError:
                # Host is not IDNA-encodable; fall back to dropping any
                # non-ASCII characters.
                rv = rv.encode("ascii", "ignore")  # type: ignore
        return _to_str(rv, "ascii", "ignore")

    @property
    def port(self) -> t.Optional[int]:
        """The port in the URL as an integer if it was present, `None`
        otherwise. This does not fill in default ports.
        """
        try:
            rv = int(_to_str(self._split_host()[1]))
            if 0 <= rv <= 65535:
                return rv
        except (ValueError, TypeError):
            # TypeError: no port present; ValueError: port not numeric.
            pass
        return None

    @property
    def auth(self) -> t.Optional[str]:
        """The authentication part in the URL if available, `None`
        otherwise.
        """
        return self._split_netloc()[0]

    @property
    def username(self) -> t.Optional[str]:
        """The username if it was part of the URL, `None` otherwise.
        This undergoes URL decoding and will always be a string.
        """
        rv = self._split_auth()[0]
        if rv is not None:
            return _url_unquote_legacy(rv)
        return None

    @property
    def raw_username(self) -> t.Optional[str]:
        """The username if it was part of the URL, `None` otherwise.
        Unlike :attr:`username` this one is not being decoded.
        """
        return self._split_auth()[0]

    @property
    def password(self) -> t.Optional[str]:
        """The password if it was part of the URL, `None` otherwise.
        This undergoes URL decoding and will always be a string.
        """
        rv = self._split_auth()[1]
        if rv is not None:
            return _url_unquote_legacy(rv)
        return None

    @property
    def raw_password(self) -> t.Optional[str]:
        """The password if it was part of the URL, `None` otherwise.
        Unlike :attr:`password` this one is not being decoded.
        """
        return self._split_auth()[1]

    def decode_query(self, *args: t.Any, **kwargs: t.Any) -> "ds.MultiDict[str, str]":
        """Decodes the query part of the URL. This is a shortcut for
        calling :func:`url_decode` on the query argument. The arguments and
        keyword arguments are forwarded to :func:`url_decode` unchanged.
        """
        return url_decode(self.query, *args, **kwargs)

    def join(self, *args: t.Any, **kwargs: t.Any) -> "BaseURL":
        """Joins this URL with another one. This is just a convenience
        function for calling into :meth:`url_join` and then parsing the
        return value again.
        """
        return url_parse(url_join(self, *args, **kwargs))

    def to_url(self) -> str:
        """Returns a URL string or bytes depending on the type of the
        information stored. This is just a convenience function
        for calling :meth:`url_unparse` for this URL.
        """
        return url_unparse(self)

    def encode_netloc(self) -> str:
        """Encodes the netloc part to an ASCII safe URL as bytes."""
        rv = self.ascii_host or ""
        if ":" in rv:
            # Bracket IPv6 literals so the port separator stays unambiguous.
            rv = f"[{rv}]"
        port = self.port
        if port is not None:
            rv = f"{rv}:{port}"
        auth = ":".join(
            filter(
                None,
                [
                    url_quote(self.raw_username or "", "utf-8", "strict", "/:%"),
                    url_quote(self.raw_password or "", "utf-8", "strict", "/:%"),
                ],
            )
        )
        if auth:
            rv = f"{auth}@{rv}"
        return rv

    def decode_netloc(self) -> str:
        """Decodes the netloc part into a string."""
        rv = _decode_idna(self.host or "")

        if ":" in rv:
            # Bracket IPv6 literals so the port separator stays unambiguous.
            rv = f"[{rv}]"
        port = self.port
        if port is not None:
            rv = f"{rv}:{port}"
        auth = ":".join(
            filter(
                None,
                [
                    _url_unquote_legacy(self.raw_username or "", "/:%@"),
                    _url_unquote_legacy(self.raw_password or "", "/:%@"),
                ],
            )
        )
        if auth:
            rv = f"{auth}@{rv}"
        return rv

    def to_uri_tuple(self) -> "BaseURL":
        """Returns a :class:`BytesURL` tuple that holds a URI. This will
        encode all the information in the URL properly to ASCII using the
        rules a web browser would follow.

        It's usually more interesting to directly call :meth:`iri_to_uri` which
        will return a string.
        """
        return url_parse(iri_to_uri(self))

    def to_iri_tuple(self) -> "BaseURL":
        """Returns a :class:`URL` tuple that holds a IRI. This will try
        to decode as much information as possible in the URL without
        losing information similar to how a web browser does it for the
        URL bar.

        It's usually more interesting to directly call :meth:`uri_to_iri` which
        will return a string.
        """
        return url_parse(uri_to_iri(self))

    def get_file_location(
        self, pathformat: t.Optional[str] = None
    ) -> t.Tuple[t.Optional[str], t.Optional[str]]:
        """Returns a tuple with the location of the file in the form
        ``(server, location)``. If the netloc is empty in the URL or
        points to localhost, it's represented as ``None``.

        The `pathformat` by default is autodetection but needs to be set
        when working with URLs of a specific system. The supported values
        are ``'windows'`` when working with Windows or DOS paths and
        ``'posix'`` when working with posix paths.

        If the URL does not point to a local file, the server and location
        are both represented as ``None``.

        :param pathformat: The expected format of the path component.
                           Currently ``'windows'`` and ``'posix'`` are
                           supported. Defaults to ``None`` which is
                           autodetect.
        """
        if self.scheme != "file":
            return None, None

        path = url_unquote(self.path)
        host = self.netloc or None

        if pathformat is None:
            # Autodetect based on the OS this code is running on.
            if os.name == "nt":
                pathformat = "windows"
            else:
                pathformat = "posix"

        if pathformat == "windows":
            if path[:1] == "/" and path[1:2].isalpha() and path[2:3] in "|:":
                # Turn "/C:/..." (or the old "/C|/..." form) into "C:/...".
                path = f"{path[1:2]}:{path[3:]}"
            windows_share = path[:3] in ("\\" * 3, "/" * 3)
            import ntpath

            path = ntpath.normpath(path)
            # Windows shared drives are represented as ``\\host\\directory``.
            # That results in a URL like ``file://///host/directory``, and a
            # path like ``///host/directory``. We need to special-case this
            # because the path contains the hostname.
            if windows_share and host is None:
                parts = path.lstrip("\\").split("\\", 1)
                if len(parts) == 2:
                    host, path = parts
                else:
                    host = parts[0]
                    path = ""
        elif pathformat == "posix":
            import posixpath

            path = posixpath.normpath(path)
        else:
            raise TypeError(f"Invalid path format {pathformat!r}")

        if host in ("127.0.0.1", "::1", "localhost"):
            host = None

        return host, path

    def _split_netloc(self) -> t.Tuple[t.Optional[str], str]:
        """Split the netloc into ``(auth, host_and_port)``.  ``auth`` is
        ``None`` when no ``@`` is present."""
        if self._at in self.netloc:
            auth, _, netloc = self.netloc.partition(self._at)
            return auth, netloc
        return None, self.netloc

    def _split_auth(self) -> t.Tuple[t.Optional[str], t.Optional[str]]:
        """Split the auth part into ``(username, password)``; either may
        be ``None`` when absent."""
        auth = self._split_netloc()[0]
        if not auth:
            return None, None
        if self._colon not in auth:
            return auth, None

        username, _, password = auth.partition(self._colon)
        return username, password

    def _split_host(self) -> t.Tuple[t.Optional[str], t.Optional[str]]:
        """Split the host part into ``(host, port)``; either may be
        ``None``.  Handles bracketed IPv6 literals like ``[::1]:80``."""
        rv = self._split_netloc()[1]
        if not rv:
            return None, None

        if not rv.startswith(self._lbracket):
            if self._colon in rv:
                host, _, port = rv.partition(self._colon)
                return host, port
            return rv, None

        # Bracketed form: find the closing bracket, then an optional port.
        idx = rv.find(self._rbracket)
        if idx < 0:
            return rv, None

        host = rv[1:idx]
        rest = rv[idx + 1 :]
        if rest.startswith(self._colon):
            return host, rest[1:]
        return host, None

328 

329 

class URL(BaseURL):
    """Represents a parsed URL. This behaves like a regular tuple but
    also has some extra attributes that give further insight into the
    URL.
    """

    __slots__ = ()
    _at = "@"
    _colon = ":"
    _lbracket = "["
    _rbracket = "]"

    def encode(self, charset: str = "utf-8", errors: str = "replace") -> "BytesURL":
        """Encodes the URL to a tuple made out of bytes. The charset is
        only being used for the path, query and fragment.
        """
        # Scheme and netloc have their own ASCII-only encodings; the
        # remaining components share *charset*/*errors*.
        encoded_parts = (
            part.encode(charset, errors)  # type: ignore
            for part in (self.path, self.query, self.fragment)
        )
        return BytesURL(
            self.scheme.encode("ascii"),  # type: ignore
            self.encode_netloc(),
            *encoded_parts,
        )

353 

354 

class BytesURL(BaseURL):
    """Represents a parsed URL in bytes."""

    __slots__ = ()
    # Byte-valued separators so :class:`BaseURL`'s splitting helpers
    # operate on ``bytes`` components.
    _at = b"@"  # type: ignore
    _colon = b":"  # type: ignore
    _lbracket = b"["  # type: ignore
    _rbracket = b"]"  # type: ignore

    def __str__(self) -> str:
        # ``to_url`` returns bytes for this class; decode for display.
        return self.to_url().decode("utf-8", "replace")  # type: ignore

    def encode_netloc(self) -> bytes:  # type: ignore
        """Returns the netloc unchanged as bytes."""
        return self.netloc  # type: ignore

    def decode(self, charset: str = "utf-8", errors: str = "replace") -> "URL":
        """Decodes the URL to a tuple made out of strings. The charset is
        only being used for the path, query and fragment.
        """
        return URL(
            self.scheme.decode("ascii"),  # type: ignore
            self.decode_netloc(),
            self.path.decode(charset, errors),  # type: ignore
            self.query.decode(charset, errors),  # type: ignore
            self.fragment.decode(charset, errors),  # type: ignore
        )

382 

383 

# Cache of percent-decoding tables keyed by the set of byte values that
# must stay encoded ("unsafe").  The empty set maps to the full table.
_unquote_maps: t.Dict[t.FrozenSet[int], t.Dict[bytes, int]] = {frozenset(): _hextobyte}

385 

386 

def _unquote_to_bytes(
    string: t.Union[str, bytes], unsafe: t.Union[str, bytes] = ""
) -> bytes:
    """Percent-decode *string* to raw bytes, leaving any escape whose
    value is listed in *unsafe* encoded.  Decoding tables are cached per
    unsafe set in ``_unquote_maps``.
    """
    if isinstance(string, str):
        string = string.encode("utf-8")

    if isinstance(unsafe, str):
        unsafe = unsafe.encode("utf-8")

    unsafe_set = frozenset(bytearray(unsafe))
    table = _unquote_maps.get(unsafe_set)

    if table is None:
        # Build and memoize a table without the unsafe byte values.
        table = _unquote_maps[unsafe_set] = {
            code: value
            for code, value in _hextobyte.items()
            if value not in unsafe_set
        }

    pieces = iter(string.split(b"%"))
    # Everything before the first "%" is copied verbatim.
    out = bytearray(next(pieces, b""))

    for piece in pieces:
        hex_code = piece[:2]

        if hex_code in table:
            out.append(table[hex_code])
            out.extend(piece[2:])
        else:
            # Not a decodable escape for this table; keep the "%" literal.
            out.append(37)  # ord("%")
            out.extend(piece)

    return bytes(out)

418 

419 

def _url_encode_impl(
    obj: t.Union[t.Mapping[str, str], t.Iterable[t.Tuple[str, str]]],
    charset: str,
    sort: bool,
    key: t.Optional[t.Callable[[t.Tuple[str, str]], t.Any]],
) -> t.Iterator[str]:
    """Yield quoted ``key=value`` strings for :func:`url_encode`,
    skipping pairs whose value is ``None``.
    """
    from .datastructures import iter_multi_items

    pairs: t.Iterable[t.Tuple[str, str]] = iter_multi_items(obj)

    if sort:
        pairs = sorted(pairs, key=key)

    for k, v in pairs:
        if v is None:
            continue

        # Coerce non-bytes keys/values to text first, then encode.
        k_bytes = k if isinstance(k, bytes) else str(k).encode(charset)
        v_bytes = v if isinstance(v, bytes) else str(v).encode(charset)
        yield f"{_fast_url_quote_plus(k_bytes)}={_fast_url_quote_plus(v_bytes)}"

448 

449 

def _url_unquote_legacy(value: str, unsafe: str = "") -> str:
    """Unquote *value* as strict UTF-8, falling back to latin1 (with the
    default ``'replace'`` error handling) when the percent-decoded bytes
    are not valid UTF-8.
    """
    try:
        return url_unquote(value, charset="utf-8", errors="strict", unsafe=unsafe)
    except UnicodeError:
        return url_unquote(value, charset="latin1", unsafe=unsafe)

455 

456 

def url_parse(
    url: str, scheme: t.Optional[str] = None, allow_fragments: bool = True
) -> BaseURL:
    """Parses a URL from a string into a :class:`URL` tuple. If the URL
    is lacking a scheme it can be provided as second argument. Otherwise,
    it is ignored. Optionally fragments can be stripped from the URL
    by setting `allow_fragments` to `False`.

    The inverse of this function is :func:`url_unparse`.

    :param url: the URL to parse.
    :param scheme: the default schema to use if the URL is schemaless.
    :param allow_fragments: if set to `False` a fragment will be removed
                            from the URL.
    """
    # ``s`` converts literals to the input's type (str or bytes) so the
    # same parsing code handles both representations.
    s = _make_encode_wrapper(url)
    is_text_based = isinstance(url, str)

    if scheme is None:
        scheme = s("")
    netloc = query = fragment = s("")
    i = url.find(s(":"))
    if i > 0 and _scheme_re.match(_to_str(url[:i], errors="replace")):
        # make sure "iri" is not actually a port number (in which case
        # "scheme" is really part of the path)
        rest = url[i + 1 :]
        if not rest or any(c not in s("0123456789") for c in rest):
            # not a port number
            scheme, url = url[:i].lower(), rest

    if url[:2] == s("//"):
        # The netloc ends at the first "/", "?", or "#", whichever
        # appears first (or the end of the string).
        delim = len(url)
        for c in s("/?#"):
            wdelim = url.find(c, 2)
            if wdelim >= 0:
                delim = min(delim, wdelim)
        netloc, url = url[2:delim], url[delim:]
        # Unbalanced brackets indicate a malformed IPv6 literal.
        if (s("[") in netloc and s("]") not in netloc) or (
            s("]") in netloc and s("[") not in netloc
        ):
            raise ValueError("Invalid IPv6 URL")

    if allow_fragments and s("#") in url:
        url, fragment = url.split(s("#"), 1)
    if s("?") in url:
        url, query = url.split(s("?"), 1)

    result_type = URL if is_text_based else BytesURL
    return result_type(scheme, netloc, url, query, fragment)

506 

507 

def _make_fast_url_quote(
    charset: str = "utf-8",
    errors: str = "strict",
    safe: t.Union[str, bytes] = "/:",
    unsafe: t.Union[str, bytes] = "",
) -> t.Callable[[bytes], str]:
    """Precompile the translation table for a URL encoding function.

    Unlike :func:`url_quote`, the generated function only takes the
    string to quote.

    :param charset: The charset to encode the result with.
    :param errors: How to handle encoding errors.
    :param safe: An optional sequence of safe characters to never encode.
    :param unsafe: An optional sequence of unsafe characters to always encode.
    """
    safe_bytes = safe.encode(charset, errors) if isinstance(safe, str) else safe
    unsafe_bytes = unsafe.encode(charset, errors) if isinstance(unsafe, str) else unsafe

    keep = (frozenset(bytearray(safe_bytes)) | _always_safe) - frozenset(
        bytearray(unsafe_bytes)
    )
    # One entry per possible byte value: the literal character when safe,
    # otherwise its "%XX" escape.
    table = [chr(value) if value in keep else f"%{value:02X}" for value in range(256)]

    def quote(string: bytes) -> str:
        return "".join([table[byte] for byte in string])

    return quote

537 

538 

# Module-level quoters precompiled with the common settings: one for
# plain URL quoting, and one (space safe, "+" unsafe) used by
# ``_fast_url_quote_plus`` for query strings.
_fast_url_quote = _make_fast_url_quote()
_fast_quote_plus = _make_fast_url_quote(safe=" ", unsafe="+")

541 

542 

def _fast_url_quote_plus(string: bytes) -> str:
    """Quote *string* for a query string: "+" is percent-encoded and
    spaces become "+"."""
    return _fast_quote_plus(string).replace(" ", "+")

545 

546 

def url_quote(
    string: t.Union[str, bytes],
    charset: str = "utf-8",
    errors: str = "strict",
    safe: t.Union[str, bytes] = "/:",
    unsafe: t.Union[str, bytes] = "",
) -> str:
    """URL encode a single string with a given encoding.

    :param s: the string to quote.
    :param charset: the charset to be used.
    :param safe: an optional sequence of safe characters.
    :param unsafe: an optional sequence of unsafe characters.

    .. versionadded:: 0.9.2
        The `unsafe` parameter was added.
    """
    # Coerce arbitrary objects to text, then everything to bytes.
    if not isinstance(string, (str, bytes, bytearray)):
        string = str(string)

    if isinstance(string, str):
        string = string.encode(charset, errors)

    if isinstance(safe, str):
        safe = safe.encode(charset, errors)

    if isinstance(unsafe, str):
        unsafe = unsafe.encode(charset, errors)

    # Bytes to emit literally: always-safe plus caller-safe, minus
    # anything explicitly marked unsafe.
    keep = (frozenset(bytearray(safe)) | _always_safe) - frozenset(bytearray(unsafe))
    out = bytearray()

    for byte in bytearray(string):
        if byte in keep:
            out.append(byte)
        else:
            out.extend(_bytetohex[byte])

    return bytes(out).decode(charset)

580 

581 

def url_quote_plus(
    string: str, charset: str = "utf-8", errors: str = "strict", safe: str = ""
) -> str:
    """URL encode a single string with the given encoding and convert
    whitespace to "+".

    :param s: The string to quote.
    :param charset: The charset to be used.
    :param safe: An optional sequence of safe characters.
    """
    # Keep spaces unquoted, force literal "+" to be quoted, then turn
    # the remaining spaces into "+".
    quoted = url_quote(string, charset, errors, safe + " ", "+")
    return quoted.replace(" ", "+")

593 

594 

def url_unparse(components: t.Tuple[str, str, str, str, str]) -> str:
    """The reverse operation to :meth:`url_parse`. This accepts arbitrary
    as well as :class:`URL` tuples and returns a URL as a string.

    :param components: the parsed URL as tuple which should be converted
                       into a URL string.
    """
    _check_str_tuple(components)
    scheme, netloc, path, query, fragment = components
    # ``s`` converts literals to the components' type (str or bytes).
    s = _make_encode_wrapper(scheme)
    url = s("")

    # We generally treat file:///x and file:/x the same which is also
    # what browsers seem to do. This also allows us to ignore a schema
    # register for netloc utilization or having to differentiate between
    # empty and missing netloc.
    if netloc or (scheme and path.startswith(s("/"))):
        if path and path[:1] != s("/"):
            path = s("/") + path
        url = s("//") + (netloc or s("")) + path
    elif path:
        url += path
    if scheme:
        url = scheme + s(":") + url
    if query:
        url = url + s("?") + query
    if fragment:
        url = url + s("#") + fragment
    return url

624 

625 

def url_unquote(
    s: t.Union[str, bytes],
    charset: str = "utf-8",
    errors: str = "replace",
    unsafe: str = "",
) -> str:
    """URL decode a single string with a given encoding. If the charset
    is set to `None` no decoding is performed and raw bytes are
    returned.

    :param s: the string to unquote.
    :param charset: the charset of the query string. If set to `None`
                    no decoding will take place.
    :param errors: the error handling for the charset decoding.
    """
    rv = _unquote_to_bytes(s, unsafe)
    if charset is None:
        # NOTE: despite the ``-> str`` annotation, this path returns
        # ``bytes`` as documented above.
        return rv
    return rv.decode(charset, errors)

645 

646 

def url_unquote_plus(
    s: t.Union[str, bytes], charset: str = "utf-8", errors: str = "replace"
) -> str:
    """URL decode a single string with the given `charset` and decode "+" to
    whitespace.

    Per default encoding errors are replaced. If you want a different
    behavior you can set `errors` to ``'ignore'`` or ``'strict'``.

    :param s: The string to unquote.
    :param charset: the charset of the query string. If set to `None`
                    no decoding will take place.
    :param errors: The error handling for the `charset` decoding.
    """
    # "+" is replaced before percent-decoding so "%2B" still decodes to
    # a literal plus sign.
    if isinstance(s, str):
        s = s.replace("+", " ")
    else:
        s = s.replace(b"+", b" ")
    return url_unquote(s, charset, errors)

666 

667 

def url_fix(s: str, charset: str = "utf-8") -> str:
    r"""Sometimes you get an URL by a user that just isn't a real URL because
    it contains unsafe characters like ' ' and so on. This function can fix
    some of the problems in a similar way browsers handle data entered by the
    user:

    >>> url_fix('http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

    :param s: the string with the URL to fix.
    :param charset: The target charset for the URL if the url was given
                    as a string.
    """
    # Switch to text processing and convert backslashes (invalid in URLs
    # anyway) to forward slashes, which matches Chrome's behavior.
    s = _to_str(s, charset, "replace").replace("\\", "/")

    # Repair malformed Windows file URLs like "file://C:/..." manually.
    if s.startswith("file://") and s[7:8].isalpha() and s[8:10] in (":/", "|/"):
        s = f"file:///{s[7:]}"

    parts = url_parse(s)
    query_safe = ":&%=+$!*'(),"
    fixed_path = url_quote(parts.path, charset, safe="/%+$!*'(),")
    fixed_query = url_quote_plus(parts.query, charset, safe=query_safe)
    fixed_fragment = url_quote_plus(parts.fragment, charset, safe=query_safe)
    return url_unparse(
        (parts.scheme, parts.encode_netloc(), fixed_path, fixed_query, fixed_fragment)
    )

696 

697 

# not-unreserved characters remain quoted when unquoting to IRI:
# every ASCII character that is not in ``_always_safe``.
_to_iri_unsafe = "".join([chr(c) for c in range(128) if c not in _always_safe])

700 

701 

def _codec_error_url_quote(e: UnicodeError) -> t.Tuple[str, int]:
    """Used in :func:`uri_to_iri` after unquoting to re-quote any
    invalid bytes.
    """
    # The codecs docs state UnicodeError carries ``object``, ``start``
    # and ``end``, but mypy doesn't model those attributes.
    invalid = e.object[e.start : e.end]  # type: ignore
    return _fast_url_quote(invalid), e.end  # type: ignore

710 

711 

# Register the handler so str.encode/bytes.decode can use it by name.
codecs.register_error("werkzeug.url_quote", _codec_error_url_quote)

713 

714 

def uri_to_iri(
    uri: t.Union[str, t.Tuple[str, str, str, str, str]],
    charset: str = "utf-8",
    errors: str = "werkzeug.url_quote",
) -> str:
    """Convert a URI to an IRI. All valid UTF-8 characters are unquoted,
    leaving all reserved and invalid characters quoted. If the URL has
    a domain, it is decoded from Punycode.

    >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF")
    'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF'

    :param uri: The URI to convert.
    :param charset: The encoding to encode unquoted bytes with.
    :param errors: Error handler to use during ``bytes.encode``. By
        default, invalid bytes are left quoted.

    .. versionchanged:: 0.15
        All reserved and invalid characters remain quoted. Previously,
        only some reserved characters were preserved, and invalid bytes
        were replaced instead of left quoted.

    .. versionadded:: 0.6
    """
    # Accept a pre-split tuple by joining it back into a string first.
    if isinstance(uri, tuple):
        uri = url_unparse(uri)

    parsed = url_parse(_to_str(uri, charset))
    path, query, fragment = (
        url_unquote(part, charset, errors, _to_iri_unsafe)
        for part in (parsed.path, parsed.query, parsed.fragment)
    )
    return url_unparse((parsed.scheme, parsed.decode_netloc(), path, query, fragment))

747 

748 

# reserved characters remain unquoted when quoting to URI
# (RFC 3986 gen-delims + sub-delims, plus "%" so existing escapes survive).
_to_uri_safe = ":/?#[]@!$&'()*+,;=%"

751 

752 

def iri_to_uri(
    iri: t.Union[str, t.Tuple[str, str, str, str, str]],
    charset: str = "utf-8",
    errors: str = "strict",
    safe_conversion: bool = False,
) -> str:
    """Convert an IRI to a URI. All non-ASCII and unsafe characters are
    quoted. If the URL has a domain, it is encoded to Punycode.

    >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF')
    'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF'

    :param iri: The IRI to convert.
    :param charset: The encoding of the IRI.
    :param errors: Error handler to use during ``bytes.encode``.
    :param safe_conversion: Return the URL unchanged if it only contains
        ASCII characters and no whitespace. See the explanation below.

    There is a general problem with IRI conversion with some protocols
    that are in violation of the URI specification. Consider the
    following two IRIs::

        magnet:?xt=uri:whatever
        itms-services://?action=download-manifest

    After parsing, we don't know if the scheme requires the ``//``,
    which is dropped if empty, but conveys different meanings in the
    final URL if it's present or not. In this case, you can use
    ``safe_conversion``, which will return the URL unchanged if it only
    contains ASCII characters and no whitespace. This can result in a
    URI with unquoted characters if it was not already quoted correctly,
    but preserves the URL's semantics. Werkzeug uses this for the
    ``Location`` header for redirects.

    .. versionchanged:: 0.15
        All reserved characters remain unquoted. Previously, only some
        reserved characters were left unquoted.

    .. versionchanged:: 0.9.6
        The ``safe_conversion`` parameter was added.

    .. versionadded:: 0.6
    """
    # Accept a pre-split tuple by joining it back into a string first.
    if isinstance(iri, tuple):
        iri = url_unparse(iri)

    if safe_conversion:
        # If we're not sure if it's safe to convert the URL, and it only
        # contains ASCII characters, return it unconverted.
        try:
            native_iri = _to_str(iri)
            ascii_iri = native_iri.encode("ascii")

            # Only return if it doesn't have whitespace. (Why?)
            if len(ascii_iri.split()) == 1:
                return native_iri
        except UnicodeError:
            # Contains non-ASCII characters; fall through to conversion.
            pass

    iri = url_parse(_to_str(iri, charset, errors))
    path = url_quote(iri.path, charset, errors, _to_uri_safe)
    query = url_quote(iri.query, charset, errors, _to_uri_safe)
    fragment = url_quote(iri.fragment, charset, errors, _to_uri_safe)
    return url_unparse((iri.scheme, iri.encode_netloc(), path, query, fragment))

817 

818 

def url_decode(
    s: t.AnyStr,
    charset: str = "utf-8",
    include_empty: bool = True,
    errors: str = "replace",
    separator: str = "&",
    cls: t.Optional[t.Type["ds.MultiDict"]] = None,
) -> "ds.MultiDict[str, str]":
    """Parse a query string and return it as a :class:`MultiDict`.

    :param s: The query string to parse.
    :param charset: Decode bytes to string with this charset. If not
        given, bytes are returned as-is.
    :param include_empty: Include keys with empty values in the dict.
    :param errors: Error handling behavior when decoding bytes.
    :param separator: Separator character between pairs.
    :param cls: Container to hold result instead of :class:`MultiDict`.

    .. versionchanged:: 2.0
        The ``decode_keys`` parameter is deprecated and will be removed
        in Werkzeug 2.1.

    .. versionchanged:: 0.5
        In previous versions ";" and "&" could be used for url decoding.
        Now only "&" is supported. If you want to use ";", a different
        ``separator`` can be provided.

    .. versionchanged:: 0.5
        The ``cls`` parameter was added.
    """
    if cls is None:
        from .datastructures import MultiDict  # noqa: F811

        cls = MultiDict

    # Coerce the separator to the input's type so ``split`` works.
    if isinstance(s, str):
        if not isinstance(separator, str):
            separator = separator.decode(charset or "ascii")
    elif isinstance(s, bytes) and not isinstance(separator, bytes):
        separator = separator.encode(charset or "ascii")  # type: ignore

    raw_pairs = s.split(separator)  # type: ignore
    return cls(_url_decode_impl(raw_pairs, charset, include_empty, errors))

862 

863 

def url_decode_stream(
    stream: t.IO[bytes],
    charset: str = "utf-8",
    include_empty: bool = True,
    errors: str = "replace",
    separator: bytes = b"&",
    cls: t.Optional[t.Type["ds.MultiDict"]] = None,
    limit: t.Optional[int] = None,
) -> "ds.MultiDict[str, str]":
    """Works like :func:`url_decode` but decodes a stream. The behavior
    of stream and limit follows functions like
    :func:`~werkzeug.wsgi.make_line_iter`. The generator of pairs is
    directly fed to the `cls` so you can consume the data while it's
    parsed.

    :param stream: a stream with the encoded querystring
    :param charset: the charset of the query string. If set to `None`
        no decoding will take place.
    :param include_empty: Set to `False` if you don't want empty values to
        appear in the dict.
    :param errors: the decoding error behavior.
    :param separator: the pair separator to be used, defaults to ``&``
    :param cls: an optional dict class to use. If this is not specified
        or `None` the default :class:`MultiDict` is used.
    :param limit: the content length of the URL data. Not necessary if
        a limited stream is provided.

    .. versionchanged:: 2.0
        The ``decode_keys`` and ``return_iterator`` parameters are
        deprecated and will be removed in Werkzeug 2.1.

    .. versionadded:: 0.8
    """
    from .wsgi import make_chunk_iter

    if cls is None:
        from .datastructures import MultiDict  # noqa: F811

        cls = MultiDict

    # Feed the lazy pair decoder straight into the container so data is
    # consumed as it is parsed.
    chunks = make_chunk_iter(stream, separator, limit)
    return cls(_url_decode_impl(chunks, charset, include_empty, errors))

909 

def _url_decode_impl(
    pair_iter: t.Iterable[t.AnyStr], charset: str, include_empty: bool, errors: str
) -> t.Iterator[t.Tuple[str, str]]:
    """Yield decoded ``(key, value)`` tuples from raw querystring pairs,
    optionally skipping pairs that have no ``=``."""
    for pair in pair_iter:
        if not pair:
            continue

        # ``s`` converts literals to the pair's type (str or bytes).
        s = _make_encode_wrapper(pair)
        eq = s("=")

        if eq in pair:
            key, value = pair.split(eq, 1)
        elif include_empty:
            key, value = pair, s("")
        else:
            continue

        yield (
            url_unquote_plus(key, charset, errors),
            url_unquote_plus(value, charset, errors),
        )

929 

930 

def url_encode(
    obj: t.Union[t.Mapping[str, str], t.Iterable[t.Tuple[str, str]]],
    charset: str = "utf-8",
    sort: bool = False,
    key: t.Optional[t.Callable[[t.Tuple[str, str]], t.Any]] = None,
    separator: str = "&",
) -> str:
    """URL encode a dict/`MultiDict`. If a value is `None` it will not appear
    in the result string. Per default only values are encoded into the target
    charset strings.

    :param obj: the object to encode into a query string.
    :param charset: the charset of the query string.
    :param sort: set to `True` if you want parameters to be sorted by `key`.
    :param separator: the separator to be used for the pairs.
    :param key: an optional function to be used for sorting. For more details
        check out the :func:`sorted` documentation.

    .. versionchanged:: 2.0
        The ``encode_keys`` parameter is deprecated and will be removed
        in Werkzeug 2.1.

    .. versionchanged:: 0.5
        Added the ``sort``, ``key``, and ``separator`` parameters.
    """
    # The separator may arrive as bytes; normalize to str before joining.
    sep = _to_str(separator, "ascii")
    encoded_pairs = _url_encode_impl(obj, charset, sort, key)
    return sep.join(encoded_pairs)

958 

959 

def url_encode_stream(
    obj: t.Union[t.Mapping[str, str], t.Iterable[t.Tuple[str, str]]],
    stream: t.Optional[t.IO[str]] = None,
    charset: str = "utf-8",
    sort: bool = False,
    key: t.Optional[t.Callable[[t.Tuple[str, str]], t.Any]] = None,
    separator: str = "&",
) -> None:
    """Like :meth:`url_encode` but writes the results to a stream
    object. If the stream is `None` a generator over all encoded
    pairs is returned.

    :param obj: the object to encode into a query string.
    :param stream: a stream to write the encoded object into or `None` if
        an iterator over the encoded pairs should be returned. In
        that case the separator argument is ignored.
    :param charset: the charset of the query string.
    :param sort: set to `True` if you want parameters to be sorted by `key`.
    :param separator: the separator to be used for the pairs.
    :param key: an optional function to be used for sorting. For more details
        check out the :func:`sorted` documentation.

    .. versionchanged:: 2.0
        The ``encode_keys`` parameter is deprecated and will be removed
        in Werkzeug 2.1.

    .. versionadded:: 0.8
    """
    sep = _to_str(separator, "ascii")
    pairs = _url_encode_impl(obj, charset, sort, key)

    # Despite the ``-> None`` annotation, the historical contract is to
    # hand back the generator itself when no stream is given.
    if stream is None:
        return pairs  # type: ignore

    # Write the separator *between* pairs only — never before the first.
    first = True
    for chunk in pairs:
        if not first:
            stream.write(sep)
        stream.write(chunk)
        first = False

    return None

997 

998 

def url_join(
    base: t.Union[str, t.Tuple[str, str, str, str, str]],
    url: t.Union[str, t.Tuple[str, str, str, str, str]],
    allow_fragments: bool = True,
) -> str:
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    :param base: the base URL for the join operation.
    :param url: the URL to join.
    :param allow_fragments: indicates whether fragments should be allowed.
    """
    # Either argument may be a pre-parsed 5-tuple; flatten to a string first.
    if isinstance(base, tuple):
        base = url_unparse(base)
    if isinstance(url, tuple):
        url = url_unparse(url)

    # Both must be str or both bytes; mixing is an error.
    _check_str_tuple((base, url))
    # Wrapper so the string literals below work for str and bytes alike.
    s = _make_encode_wrapper(base)

    # Trivial cases: an empty side yields the other unchanged.
    if not base:
        return url
    if not url:
        return base

    bscheme, bnetloc, bpath, bquery, bfragment = url_parse(
        base, allow_fragments=allow_fragments
    )
    # Parsing with ``bscheme`` as default makes a scheme-less ``url``
    # inherit the base's scheme.
    scheme, netloc, path, query, fragment = url_parse(url, bscheme, allow_fragments)
    # A different scheme means ``url`` is already absolute in its own scheme.
    if scheme != bscheme:
        return url
    # An explicit netloc also makes ``url`` authoritative on its own.
    if netloc:
        return url_unparse((scheme, netloc, path, query, fragment))
    netloc = bnetloc

    if path[:1] == s("/"):
        # Absolute path: ignore the base path entirely.
        segments = path.split(s("/"))
    elif not path:
        # No path at all: keep the base path, and the base query too
        # unless ``url`` supplied its own.
        segments = bpath.split(s("/"))
        if not query:
            query = bquery
    else:
        # Relative path: resolve against the base path's directory
        # (everything up to, but not including, its last segment).
        segments = bpath.split(s("/"))[:-1] + path.split(s("/"))

    # If the rightmost part is "./" we want to keep the slash but
    # remove the dot.
    if segments[-1] == s("."):
        segments[-1] = s("")

    # Resolve ".." and "."
    segments = [segment for segment in segments if segment != s(".")]
    # Repeatedly collapse one "<segment>/.." pair per pass; the inner loop
    # restarts after each deletion, and the ``while``'s ``else`` fires when
    # a full pass finds nothing left to collapse.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == s("..") and segments[i - 1] not in (s(""), s("..")):
                del segments[i - 1 : i + 1]
                break
            i += 1
        else:
            break

    # Remove trailing ".." if the URL is absolute
    unwanted_marker = [s(""), s("..")]
    while segments[:2] == unwanted_marker:
        del segments[1]

    path = s("/").join(segments)
    return url_unparse((scheme, netloc, path, query, fragment))