Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/rfc3986/abnf_regexp.py: 98%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Licensed under the Apache License, Version 2.0 (the "License");
2# you may not use this file except in compliance with the License.
3# You may obtain a copy of the License at
4#
5# http://www.apache.org/licenses/LICENSE-2.0
6#
7# Unless required by applicable law or agreed to in writing, software
8# distributed under the License is distributed on an "AS IS" BASIS,
9# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
10# implied.
11# See the License for the specific language governing permissions and
12# limitations under the License.
13"""Module for the regular expressions crafted from ABNF."""
15import sys
# Character classes from RFC 3986 section 2. Each class is exposed as a plain
# string, most also as a set, and the ``*_RE`` names are fragments already
# escaped for embedding inside a regular-expression character class.
# https://tools.ietf.org/html/rfc3986#page-13
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
# https://tools.ietf.org/html/rfc3986#page-13
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
# Escape the '*' for use in regular expressions
SUB_DELIMITERS_RE = r"!$&'()\*+,;="
# reserved = gen-delims / sub-delims
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
DIGIT = "0123456789"
# https://tools.ietf.org/html/rfc3986#section-2.3
#   unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
# "!" is a sub-delim (i.e. reserved), so it must not be listed here; it still
# reaches NON_PCT_ENCODED_SET below through RESERVED_CHARS_SET.
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + "._~-"
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
# Every character that is either reserved or unreserved.
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
# We need to escape the '-' in this case:
UNRESERVED_RE = r"A-Za-z0-9._~\-"

# Percent encoded character values
PERCENT_ENCODED = PCT_ENCODED = "%[A-Fa-f0-9]{2}"
# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
# ("%" binds tighter than "+", so only the last literal is interpolated.)
PCHAR = "([" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":@]|%s)" % PCT_ENCODED
# NOTE(sigmavirus24): The scheme pattern is deliberately stricter than the
# one given in Appendix B, so that text which is not a scheme is not
# over-eagerly consumed as one.
SCHEME_RE = "[a-zA-Z][a-zA-Z0-9+.-]*"
# Authority stops at a backslash, "/", "?" or "#".
_AUTHORITY_RE = r"[^\\/?#]*"
_PATH_RE = "[^?#]*"
_QUERY_RE = "[^#]*"
# The scoped (?s:...) flag lets "." match newlines inside the fragment only.
_FRAGMENT_RE = "(?s:.*)"

# Per-component patterns, extracted from
# http://tools.ietf.org/html/rfc3986#appendix-B
COMPONENT_PATTERN_DICT = dict(
    scheme=SCHEME_RE,
    authority=_AUTHORITY_RE,
    path=_PATH_RE,
    query=_QUERY_RE,
    fragment=_FRAGMENT_RE,
)

# See http://tools.ietf.org/html/rfc3986#appendix-B
# Every interesting component is a named group, so a match can be read with
# SRE_Match#groupdict. The delimiters live in non-capturing groups, so
# SRE_Match#groups yields exactly the five components as well.
URL_PARSING_RE = "".join(
    [
        r"(?:(?P<scheme>{scheme}):)?",
        r"(?://(?P<authority>{authority}))?",
        r"(?P<path>{path})",
        r"(?:\?(?P<query>{query}))?",
        r"(?:#(?P<fragment>{fragment}))?",
    ]
).format(**COMPONENT_PATTERN_DICT)
# #########################
# Authority Matcher Section
# #########################

# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# A regular name (e.g., www.google.com, api.github.com) is any run,
# possibly empty, of percent-encoded octets, sub-delims and unreserved
# characters; the whole run is captured as one group.
REGULAR_NAME_RE = REG_NAME = (
    "((?:"
    + "%[0-9A-Fa-f]{2}"
    + "|["
    + SUB_DELIMITERS_RE
    + UNRESERVED_RE
    + "])*)"
)
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1.
# Only the dotted shape is checked: four groups of one to three digits.
IPv4_RE = r"([0-9]{1,3}\.){3}[0-9]{1,3}"
# One piece (h16) of an IPv6 address: one to four hexadecimal digits
HEXDIG_RE = "[0-9A-Fa-f]{1,4}"
# Least-significant 32 bits of an IPv6 address: two h16 pieces or an IPv4
LS32_RE = "(" + HEXDIG_RE + ":" + HEXDIG_RE + "|" + IPv4_RE + ")"
# The two building blocks of the IPv6 patterns defined at
# http://tools.ietf.org/html/rfc3986#page-20
_subs = dict(hex=HEXDIG_RE, ls32=LS32_RE)

# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
# about ABNF (Augmented Backus-Naur Form) use in the comments.
# One entry per alternative of the IPv6address rule, in the RFC's order.
variations = [
    # 6( h16 ":" ) ls32
    "(" + HEXDIG_RE + ":){6}" + LS32_RE,
    # "::" 5( h16 ":" ) ls32
    "::(" + HEXDIG_RE + ":){5}" + LS32_RE,
    # [ h16 ] "::" 4( h16 ":" ) ls32
    "(" + HEXDIG_RE + ")?::(" + HEXDIG_RE + ":){4}" + LS32_RE,
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    "((" + HEXDIG_RE + ":)?" + HEXDIG_RE + ")?::(" + HEXDIG_RE + ":){3}"
    + LS32_RE,
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    "((" + HEXDIG_RE + ":){0,2}" + HEXDIG_RE + ")?::(" + HEXDIG_RE + ":){2}"
    + LS32_RE,
    # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
    "((" + HEXDIG_RE + ":){0,3}" + HEXDIG_RE + ")?::" + HEXDIG_RE + ":"
    + LS32_RE,
    # [ *4( h16 ":" ) h16 ] "::" ls32
    "((" + HEXDIG_RE + ":){0,4}" + HEXDIG_RE + ")?::" + LS32_RE,
    # [ *5( h16 ":" ) h16 ] "::" h16
    "((" + HEXDIG_RE + ":){0,5}" + HEXDIG_RE + ")?::" + HEXDIG_RE,
    # [ *6( h16 ":" ) h16 ] "::"
    "((" + HEXDIG_RE + ":){0,6}" + HEXDIG_RE + ")?::",
]

# Each variation gets its own group; the alternation is wrapped in one more.
IPv6_RE = "(" + "|".join("(" + form + ")" for form in variations) + ")"
# IPvFuture: "v", hex digits, ".", then one or more unreserved characters,
# sub-delims or ":".
IPv_FUTURE_RE = (
    r"v[0-9A-Fa-f]+\.[" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":]+"
)

# RFC 6874 Zone ID ABNF
ZONE_ID = "(?:[{}]|{})+".format(UNRESERVED_RE, PCT_ENCODED)

# IPv6 address with an optional zone id. The RFC4007 flavour accepts the
# zone introduced by either "%25" or a bare "%"; the other accepts "%25" only.
IPv6_ADDRZ_RFC4007_RE = "{}(?:(?:%25|%){})?".format(IPv6_RE, ZONE_ID)
IPv6_ADDRZ_RE = "{}(?:%25{})?".format(IPv6_RE, ZONE_ID)

# A bracketed literal: an IPv6 address (RFC4007 zone flavour) or IPvFuture.
IP_LITERAL_RE = r"\[(" + IPv6_ADDRZ_RFC4007_RE + "|" + IPv_FUTURE_RE + r")\]"

# Pattern for matching the host piece of the authority
HOST_RE = HOST_PATTERN = (
    "(" + "|".join((REG_NAME, IPv4_RE, IP_LITERAL_RE)) + ")"
)
# Anchored at the start only; there is no trailing "$".
USERINFO_RE = "^([{}{}:]|{})+".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# One to five decimal digits
PORT_RE = "[0-9]{1,5}"
# ####################
# Path Matcher Section
# ####################

# The path patterns below follow
# http://tools.ietf.org/html/rfc3986#section-3.3
segments = {
    # Possibly empty segment
    "segment": PCHAR + "*",
    # Non-zero length segment
    "segment-nz": PCHAR + "+",
    # Non-zero length segment without ":"
    "segment-nz-nc": PCHAR.replace(":", "") + "+",
}

# Path types taken from Section 3.3 (linked above)
PATH_EMPTY = "(?:)"
# Zero or more "/segment" pieces
PATH_ABEMPTY = "(/" + segments["segment"] + ")*"
# A non-empty first segment followed by zero or more "/segment" pieces
PATH_ROOTLESS = segments["segment-nz"] + PATH_ABEMPTY
# Same as rootless, but the first segment may not contain ":"
PATH_NOSCHEME = segments["segment-nz-nc"] + PATH_ABEMPTY
# "/" optionally followed by a rootless path
PATH_ABSOLUTE = "/(" + PATH_ROOTLESS + ")?"
# Any one of the path forms, anchored at both ends
PATH_RE = (
    "^("
    + "|".join(
        [
            PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_NOSCHEME,
            PATH_ROOTLESS,
            PATH_EMPTY,
        ]
    )
    + ")$"
)

# Fragment and query share one pattern: any run of "/", "?", ":", "@",
# unreserved characters, sub-delims or percent-encoded octets.
FRAGMENT_RE = QUERY_RE = "^([/?:@{}{}]|{})*$".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# ##########################
# Relative reference matcher
# ##########################

# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
# Alternatives: "//" authority path-abempty, path-absolute, path-noscheme,
# or the empty path.
RELATIVE_PART_RE = (
    "("
    + "|".join(
        [
            "//" + COMPONENT_PATTERN_DICT["authority"] + PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_NOSCHEME,
            PATH_EMPTY,
        ]
    )
    + ")"
)

# See http://tools.ietf.org/html/rfc3986#section-3 for definition
# Same shape as the relative part, with path-rootless where the relative
# part has path-noscheme.
HIER_PART_RE = (
    "("
    + "|".join(
        [
            "//" + COMPONENT_PATTERN_DICT["authority"] + PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_ROOTLESS,
            PATH_EMPTY,
        ]
    )
    + ")"
)
# ###############
# IRIs / RFC 3987
# ###############

# Character-class ranges that fit in the Basic Multilingual Plane; these are
# all that a narrow-unicode build gets.
IPRIVATE = "\ue000-\uf8ff"
UCSCHAR_RE = "\u00a0-\ud7ff\uf900-\ufdcf\ufdf0-\uffef"

# Only wide-unicode gets the high-ranges of UCSCHAR
if sys.maxunicode > 0xFFFF:  # pragma: no cover
    IPRIVATE += "\U000f0000-\U000ffffd\U00100000-\U0010fffd"
    # Planes 1 through 13 each contribute U+xx0000-U+xxFFFD ...
    UCSCHAR_RE += "".join(
        chr(plane * 0x10000) + "-" + chr(plane * 0x10000 + 0xFFFD)
        for plane in range(0x1, 0xE)
    )
    # ... while plane 14 starts at U+E1000.
    UCSCHAR_RE += "\U000e1000-\U000efffd"
# iunreserved: the ASCII unreserved ranges followed by the ucschar ranges,
# written for use inside a regex character class.
IUNRESERVED_RE = r"A-Za-z0-9\._~\-" + UCSCHAR_RE
# ipchar: iunreserved, sub-delims, ":" or "@", or a percent-encoded octet
IPCHAR = "([{}{}:@]|{})".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

isegments = {
    # Possibly empty segment
    "isegment": IPCHAR + "*",
    # Non-zero length segment
    "isegment-nz": IPCHAR + "+",
    # Non-zero length segment without ":"
    "isegment-nz-nc": IPCHAR.replace(":", "") + "+",
}

# The repeated "/isegment" tail is a capturing group inside the rootless and
# noscheme forms but non-capturing in IPATH_ABEMPTY, so the two spellings
# are kept separate.
IPATH_ROOTLESS = isegments["isegment-nz"] + "(/" + isegments["isegment"] + ")*"
IPATH_NOSCHEME = (
    isegments["isegment-nz-nc"] + "(/" + isegments["isegment"] + ")*"
)
IPATH_ABSOLUTE = "/(?:" + IPATH_ROOTLESS + ")?"
IPATH_ABEMPTY = "(?:/" + isegments["isegment"] + ")*"
# Any one of the IRI path forms, anchored at both ends
IPATH_RE = (
    "^(?:"
    + "|".join(
        [
            IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_NOSCHEME,
            IPATH_ROOTLESS,
            PATH_EMPTY,
        ]
    )
    + ")$"
)
# IRI regular name: any run, possibly empty, of percent-encoded octets,
# sub-delims and iunreserved characters (non-capturing).
IREGULAR_NAME_RE = IREG_NAME = (
    "(?:"
    + "%[0-9A-Fa-f]{2}"
    + "|["
    + SUB_DELIMITERS_RE
    + IUNRESERVED_RE
    + "])*"
)

# IRI host: a regular name, an IPv4 address or a bracketed IP literal
IHOST_RE = IHOST_PATTERN = (
    "(" + "|".join((IREG_NAME, IPv4_RE, IP_LITERAL_RE)) + ")"
)

# Anchored at the start only; there is no trailing "$".
IUSERINFO_RE = "^(?:[{}{}:]|{})+".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

IFRAGMENT_RE = "^(?:[/?:@{}{}]|{})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# Same as the fragment pattern, with the IPRIVATE ranges also allowed.
IQUERY_RE = "^(?:[/?:@{}{}{}]|{})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, IPRIVATE, PCT_ENCODED
)
# IRI relative part: "//" authority ipath-abempty, ipath-absolute,
# ipath-noscheme, or the empty path.
IRELATIVE_PART_RE = (
    "("
    + "|".join(
        [
            "//" + COMPONENT_PATTERN_DICT["authority"] + IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_NOSCHEME,
            PATH_EMPTY,
        ]
    )
    + ")"
)

# IRI hierarchical part: same shape, with ipath-rootless where the relative
# part has ipath-noscheme.
IHIER_PART_RE = (
    "("
    + "|".join(
        [
            "//" + COMPONENT_PATTERN_DICT["authority"] + IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_ROOTLESS,
            PATH_EMPTY,
        ]
    )
    + ")"
)