1# Licensed under the Apache License, Version 2.0 (the "License");
2# you may not use this file except in compliance with the License.
3# You may obtain a copy of the License at
4#
5# http://www.apache.org/licenses/LICENSE-2.0
6#
7# Unless required by applicable law or agreed to in writing, software
8# distributed under the License is distributed on an "AS IS" BASIS,
9# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
10# implied.
11# See the License for the specific language governing permissions and
12# limitations under the License.
13"""Module for the regular expressions crafted from ABNF."""
14import sys
15
# gen-delims, see https://tools.ietf.org/html/rfc3986#page-13
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
# sub-delims, see https://tools.ietf.org/html/rfc3986#page-13
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
# The sub-delims with '*' escaped, for use in regular expressions
SUB_DELIMITERS_RE = r"!$&'()\*+,;="
# reserved = gen-delims / sub-delims
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
DIGIT = "0123456789"
# https://tools.ietf.org/html/rfc3986#section-2.3
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
# "!" is a sub-delim, not an unreserved character, so it is deliberately
# absent here; this keeps the string and set forms in agreement with
# UNRESERVED_RE below.
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + "._-~"
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
# Union of the reserved and unreserved characters ("!" is still a member
# because it is one of the sub-delims).
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
# Unreserved characters as the inside of a character class. The '-' is
# escaped so that it is not read as a range operator.
UNRESERVED_RE = r"A-Za-z0-9._~\-"

# Percent encoded character values: '%' followed by two hexadecimal digits
PERCENT_ENCODED = PCT_ENCODED = "%[A-Fa-f0-9]{2}"
# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
PCHAR = "([" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":@]|" + PCT_ENCODED + ")"
37
# NOTE(sigmavirus24): The scheme pattern is deliberately stricter than the
# one in Appendix B so that text which is not a scheme is not consumed as
# one.
SCHEME_RE = "[a-zA-Z][a-zA-Z0-9+.-]*"
# Everything up to the first backslash, '/', '?' or '#'
_AUTHORITY_RE = r"[^\\/?#]*"
# Everything up to the first '?' or '#'
_PATH_RE = "[^?#]*"
# Everything up to the first '#'
_QUERY_RE = "[^#]*"
# The remainder of the string; the scoped (?s) flag lets '.' match newlines
_FRAGMENT_RE = "(?s:.*)"

# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
COMPONENT_PATTERN_DICT = dict(
    scheme=SCHEME_RE,
    authority=_AUTHORITY_RE,
    path=_PATH_RE,
    query=_QUERY_RE,
    fragment=_FRAGMENT_RE,
)

# See http://tools.ietf.org/html/rfc3986#appendix-B
# Only the five components are captured, each under its own name, so the
# result can be read with either SRE_Match#groupdict or SRE_Match#groups;
# the delimiters around them sit in non-capturing groups.
URL_PARSING_RE = "".join(
    (
        r"(?:(?P<scheme>%(scheme)s):)?",
        r"(?://(?P<authority>%(authority)s))?",
        r"(?P<path>%(path)s)",
        r"(?:\?(?P<query>%(query)s))?",
        r"(?:#(?P<fragment>%(fragment)s))?",
    )
) % COMPONENT_PATTERN_DICT
66
67
68# #########################
69# Authority Matcher Section
70# #########################
71
# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# A regular name, e.g., www.google.com, api.github.com: any number
# (including zero) of percent-encoded octets, sub-delims and unreserved
# characters, captured as a single group.
REGULAR_NAME_RE = REG_NAME = (
    "((?:%[0-9A-Fa-f]{2}|[" + SUB_DELIMITERS_RE + UNRESERVED_RE + "])*)"
)
# An IPv4 address in dotted form, e.g., 192.168.255.255 or 127.0.0.1. Each
# part is only required to be one to three digits; its numeric range is
# not checked by this pattern.
IPv4_RE = r"([0-9]{1,3}\.){3}[0-9]{1,3}"
# One piece of an IPv6 address: one to four hexadecimal digits
HEXDIG_RE = "[0-9A-Fa-f]{1,4}"
# Least-significant 32 bits of an IPv6 address: either two hexadecimal
# pieces separated by ':' or an IPv4 address
LS32_RE = "(%s:%s|%s)" % (HEXDIG_RE, HEXDIG_RE, IPv4_RE)
# Values substituted into the IPv6 templates below, whose ABNF is defined at
# http://tools.ietf.org/html/rfc3986#page-20
_subs = dict(hex=HEXDIG_RE, ls32=LS32_RE)

# The comment above each template is the ABNF alternative it implements,
# where h16 = hexdig. See https://tools.ietf.org/html/rfc5234 for details
# about ABNF (Augmented Backus-Naur Form).
variations = [
    template % _subs
    for template in (
        # 6( h16 ":" ) ls32
        "(%(hex)s:){6}%(ls32)s",
        # "::" 5( h16 ":" ) ls32
        "::(%(hex)s:){5}%(ls32)s",
        # [ h16 ] "::" 4( h16 ":" ) ls32
        "(%(hex)s)?::(%(hex)s:){4}%(ls32)s",
        # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
        "((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s",
        # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
        "((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s",
        # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
        "((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
        # [ *4( h16 ":" ) h16 ] "::" ls32
        "((%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
        # [ *5( h16 ":" ) h16 ] "::" h16
        "((%(hex)s:){0,5}%(hex)s)?::%(hex)s",
        # [ *6( h16 ":" ) h16 ] "::"
        "((%(hex)s:){0,6}%(hex)s)?::",
    )
]

# Any one of the nine forms above, each wrapped in its own group
IPv6_RE = "(" + "|".join("(" + form + ")" for form in variations) + ")"
111
# 'v', one or more hexadecimal digits, '.', then one or more unreserved
# characters, sub-delims or ':'
IPv_FUTURE_RE = (
    r"v[0-9A-Fa-f]+\.[" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":]+"
)

# RFC 6874 Zone ID ABNF
ZONE_ID = "(?:[{}]|{})+".format(UNRESERVED_RE, PCT_ENCODED)

# IPv6 address with an optional zone ID. The RFC4007 form accepts a bare
# '%' as well as '%25' before the zone ID; the other form needs '%25'.
IPv6_ADDRZ_RFC4007_RE = "{}(?:(?:%25|%){})?".format(IPv6_RE, ZONE_ID)
IPv6_ADDRZ_RE = "{}(?:%25{})?".format(IPv6_RE, ZONE_ID)

# A bracketed IPv6 (optionally zoned) or IPvFuture address
IP_LITERAL_RE = (
    r"\[(" + IPv6_ADDRZ_RFC4007_RE + "|" + IPv_FUTURE_RE + r")\]"
)

# Pattern for matching the host piece of the authority
HOST_RE = HOST_PATTERN = (
    "(" + "|".join((REG_NAME, IPv4_RE, IP_LITERAL_RE)) + ")"
)
# One or more userinfo characters, anchored at the start only
USERINFO_RE = "^([{}{}:]|{})+".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# One to five decimal digits
PORT_RE = "[0-9]{1,5}"
137
138# ####################
139# Path Matcher Section
140# ####################
141
# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
# about the path patterns defined below.
segments = {
    # Possibly empty segment
    "segment": "{}*".format(PCHAR),
    # Non-zero length segment
    "segment-nz": "{}+".format(PCHAR),
    # Non-zero length segment without ":"
    "segment-nz-nc": "{}+".format(PCHAR.replace(":", "")),
}

# Path types taken from Section 3.3 (linked above)
PATH_EMPTY = "(?:)"
# Zero or more "/segment" pieces; also the tail of the two patterns below
PATH_ABEMPTY = "(/" + segments["segment"] + ")*"
PATH_ROOTLESS = segments["segment-nz"] + PATH_ABEMPTY
PATH_NOSCHEME = segments["segment-nz-nc"] + PATH_ABEMPTY
PATH_ABSOLUTE = "/(" + PATH_ROOTLESS + ")?"
# Any of the five path types, anchored at both ends
PATH_RE = (
    "^("
    + "|".join(
        (
            PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_NOSCHEME,
            PATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + ")$"
)

# Query and fragment share one pattern: any number of '/', '?', ':', '@',
# unreserved characters, sub-delims or percent-encoded octets
FRAGMENT_RE = QUERY_RE = "^([/?:@{}{}]|{})*$".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
169
170# ##########################
171# Relative reference matcher
172# ##########################
173
# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
# Either "//" authority path-abempty, or one of the authority-less paths
RELATIVE_PART_RE = (
    "("
    + "|".join(
        (
            "//" + COMPONENT_PATTERN_DICT["authority"] + PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_NOSCHEME,
            PATH_EMPTY,
        )
    )
    + ")"
)

# See http://tools.ietf.org/html/rfc3986#section-3 for definition
# Same shape as the relative part, with path-rootless in place of
# path-noscheme
HIER_PART_RE = (
    "("
    + "|".join(
        (
            "//" + COMPONENT_PATTERN_DICT["authority"] + PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + ")"
)
191
192# ###############
193# IRIs / RFC 3987
194# ###############
195
# Ranges within the Basic Multilingual Plane, used on every build
IPRIVATE = "\uE000-\uF8FF"
UCSCHAR_RE = "\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"
# Only wide-unicode gets the high-ranges of IPRIVATE and UCSCHAR
if sys.maxunicode > 0xFFFF:  # pragma: no cover
    IPRIVATE += "\U000F0000-\U000FFFFD\U00100000-\U0010FFFD"
    UCSCHAR_RE += (
        "\U00010000-\U0001FFFD\U00020000-\U0002FFFD"
        "\U00030000-\U0003FFFD\U00040000-\U0004FFFD"
        "\U00050000-\U0005FFFD\U00060000-\U0006FFFD"
        "\U00070000-\U0007FFFD\U00080000-\U0008FFFD"
        "\U00090000-\U0009FFFD\U000A0000-\U000AFFFD"
        "\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD"
        "\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD"
    )

# The ASCII unreserved characters ('.' and '-' escaped) followed by the
# UCSCHAR ranges, for use inside a character class
IUNRESERVED_RE = r"A-Za-z0-9\._~\-" + UCSCHAR_RE
# One path character: a single character from the IRI unreserved set,
# sub-delims, ':' or '@', or a percent-encoded octet
IPCHAR = "([{}{}:@]|{})".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

isegments = {
    # Possibly empty segment
    "isegment": "{}*".format(IPCHAR),
    # Non-zero length segment
    "isegment-nz": "{}+".format(IPCHAR),
    # Non-zero length segment without ":"
    "isegment-nz-nc": "{}+".format(IPCHAR.replace(":", "")),
}

# The trailing "/isegment" repetition is a capturing group in the rootless
# and noscheme patterns but a non-capturing one in IPATH_ABEMPTY, so the
# three are spelled out separately.
IPATH_ROOTLESS = (
    isegments["isegment-nz"] + "(/" + isegments["isegment"] + ")*"
)
IPATH_NOSCHEME = (
    isegments["isegment-nz-nc"] + "(/" + isegments["isegment"] + ")*"
)
IPATH_ABSOLUTE = "/(?:" + IPATH_ROOTLESS + ")?"
IPATH_ABEMPTY = "(?:/" + isegments["isegment"] + ")*"
# Any of the five path types, anchored at both ends
IPATH_RE = (
    "^(?:"
    + "|".join(
        (
            IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_NOSCHEME,
            IPATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + ")$"
)
235
# Any number (including zero) of percent-encoded octets, sub-delims and
# IRI unreserved characters
IREGULAR_NAME_RE = IREG_NAME = (
    "(?:%[0-9A-Fa-f]{2}|[" + SUB_DELIMITERS_RE + IUNRESERVED_RE + "])*"
)

# Host piece of an IRI authority: regular name, IPv4 address or IP literal
IHOST_RE = IHOST_PATTERN = (
    "(" + "|".join((IREG_NAME, IPv4_RE, IP_LITERAL_RE)) + ")"
)

# One or more userinfo characters, anchored at the start only
IUSERINFO_RE = "^(?:[{}{}:]|{})+".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

# Fragment and query differ only in that the query's character class also
# includes the IPRIVATE ranges.
IFRAGMENT_RE = "^(?:[/?:@{}{}]|{})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
IQUERY_RE = "^(?:[/?:@{}{}{}]|{})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, IPRIVATE, PCT_ENCODED
)
260
# Either "//" authority ipath-abempty, or one of the authority-less paths
IRELATIVE_PART_RE = (
    "("
    + "|".join(
        (
            "//" + COMPONENT_PATTERN_DICT["authority"] + IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_NOSCHEME,
            PATH_EMPTY,
        )
    )
    + ")"
)

# Same shape as the relative part, with ipath-rootless in place of
# ipath-noscheme
IHIER_PART_RE = (
    "("
    + "|".join(
        (
            "//" + COMPONENT_PATTERN_DICT["authority"] + IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + ")"
)