1"""
2Functions for reversing a regular expression (used in reverse URL resolving).
3Used internally by Django and not intended for external use.
4
5This is not, and is not intended to be, a complete reg-exp decompiler. It
6should be good enough for a large class of URLS, however.
7"""
8
9import re
10
11from django.utils.functional import SimpleLazyObject
12
13# Mapping of an escape character to a representative of that class. So, e.g.,
14# "\w" is replaced by "x" in a reverse URL. A value of None means to ignore
15# this sequence. Any missing key is mapped to itself.
16ESCAPE_MAPPINGS = {
17 "A": None,
18 "b": None,
19 "B": None,
20 "d": "0",
21 "D": "x",
22 "s": " ",
23 "S": "x",
24 "w": "x",
25 "W": "!",
26 "Z": None,
27}
28
29
30class Choice(list):
31 """Represent multiple possibilities at this point in a pattern string."""
32
33
34class Group(list):
35 """Represent a capturing group in the pattern string."""
36
37
38class NonCapture(list):
39 """Represent a non-capturing group in the pattern string."""
40
41
42def normalize(pattern):
43 r"""
44 Given a reg-exp pattern, normalize it to an iterable of forms that
45 suffice for reverse matching. This does the following:
46
47 (1) For any repeating sections, keeps the minimum number of occurrences
48 permitted (this means zero for optional groups).
49 (2) If an optional group includes parameters, include one occurrence of
50 that group (along with the zero occurrence case from step (1)).
51 (3) Select the first (essentially an arbitrary) element from any character
52 class. Select an arbitrary character for any unordered class (e.g. '.'
53 or '\w') in the pattern.
54 (4) Ignore look-ahead and look-behind assertions.
55 (5) Raise an error on any disjunctive ('|') constructs.
56
57 Django's URLs for forward resolving are either all positional arguments or
58 all keyword arguments. That is assumed here, as well. Although reverse
59 resolving can be done using positional args when keyword args are
60 specified, the two cannot be mixed in the same reverse() call.
61 """
62 # Do a linear scan to work out the special features of this pattern. The
63 # idea is that we scan once here and collect all the information we need to
64 # make future decisions.
65 result = []
66 non_capturing_groups = []
67 consume_next = True
68 pattern_iter = next_char(iter(pattern))
69 num_args = 0
70
71 # A "while" loop is used here because later on we need to be able to peek
72 # at the next character and possibly go around without consuming another
73 # one at the top of the loop.
74 try:
75 ch, escaped = next(pattern_iter)
76 except StopIteration:
77 return [("", [])]
78
79 try:
80 while True:
81 if escaped:
82 result.append(ch)
83 elif ch == ".":
84 # Replace "any character" with an arbitrary representative.
85 result.append(".")
86 elif ch == "|":
87 # FIXME: One day we'll should do this, but not in 1.0.
88 raise NotImplementedError("Awaiting Implementation")
89 elif ch == "^":
90 pass
91 elif ch == "$":
92 break
93 elif ch == ")":
94 # This can only be the end of a non-capturing group, since all
95 # other unescaped parentheses are handled by the grouping
96 # section later (and the full group is handled there).
97 #
98 # We regroup everything inside the capturing group so that it
99 # can be quantified, if necessary.
100 start = non_capturing_groups.pop()
101 inner = NonCapture(result[start:])
102 result = result[:start] + [inner]
103 elif ch == "[":
104 # Replace ranges with the first character in the range.
105 ch, escaped = next(pattern_iter)
106 result.append(ch)
107 ch, escaped = next(pattern_iter)
108 while escaped or ch != "]":
109 ch, escaped = next(pattern_iter)
110 elif ch == "(":
111 # Some kind of group.
112 ch, escaped = next(pattern_iter)
113 if ch != "?" or escaped:
114 # A positional group
115 name = "_%d" % num_args
116 num_args += 1
117 result.append(Group((("%%(%s)s" % name), name)))
118 walk_to_end(ch, pattern_iter)
119 else:
120 ch, escaped = next(pattern_iter)
121 if ch in "!=<":
122 # All of these are ignorable. Walk to the end of the
123 # group.
124 walk_to_end(ch, pattern_iter)
125 elif ch == ":":
126 # Non-capturing group
127 non_capturing_groups.append(len(result))
128 elif ch != "P":
129 # Anything else, other than a named group, is something
130 # we cannot reverse.
131 raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch)
132 else:
133 ch, escaped = next(pattern_iter)
134 if ch not in ("<", "="):
135 raise ValueError(
136 "Non-reversible reg-exp portion: '(?P%s'" % ch
137 )
138 # We are in a named capturing group. Extra the name and
139 # then skip to the end.
140 if ch == "<":
141 terminal_char = ">"
142 # We are in a named backreference.
143 else:
144 terminal_char = ")"
145 name = []
146 ch, escaped = next(pattern_iter)
147 while ch != terminal_char:
148 name.append(ch)
149 ch, escaped = next(pattern_iter)
150 param = "".join(name)
151 # Named backreferences have already consumed the
152 # parenthesis.
153 if terminal_char != ")":
154 result.append(Group((("%%(%s)s" % param), param)))
155 walk_to_end(ch, pattern_iter)
156 else:
157 result.append(Group((("%%(%s)s" % param), None)))
158 elif ch in "*?+{":
159 # Quantifiers affect the previous item in the result list.
160 count, ch = get_quantifier(ch, pattern_iter)
161 if ch:
162 # We had to look ahead, but it wasn't need to compute the
163 # quantifier, so use this character next time around the
164 # main loop.
165 consume_next = False
166
167 if count == 0:
168 if contains(result[-1], Group):
169 # If we are quantifying a capturing group (or
170 # something containing such a group) and the minimum is
171 # zero, we must also handle the case of one occurrence
172 # being present. All the quantifiers (except {0,0},
173 # which we conveniently ignore) that have a 0 minimum
174 # also allow a single occurrence.
175 result[-1] = Choice([None, result[-1]])
176 else:
177 result.pop()
178 elif count > 1:
179 result.extend([result[-1]] * (count - 1))
180 else:
181 # Anything else is a literal.
182 result.append(ch)
183
184 if consume_next:
185 ch, escaped = next(pattern_iter)
186 consume_next = True
187 except StopIteration:
188 pass
189 except NotImplementedError:
190 # A case of using the disjunctive form. No results for you!
191 return [("", [])]
192
193 return list(zip(*flatten_result(result)))
194
195
196def next_char(input_iter):
197 r"""
198 An iterator that yields the next character from "pattern_iter", respecting
199 escape sequences. An escaped character is replaced by a representative of
200 its class (e.g. \w -> "x"). If the escaped character is one that is
201 skipped, it is not returned (the next character is returned instead).
202
203 Yield the next character, along with a boolean indicating whether it is a
204 raw (unescaped) character or not.
205 """
206 for ch in input_iter:
207 if ch != "\\":
208 yield ch, False
209 continue
210 ch = next(input_iter)
211 representative = ESCAPE_MAPPINGS.get(ch, ch)
212 if representative is None:
213 continue
214 yield representative, True
215
216
217def walk_to_end(ch, input_iter):
218 """
219 The iterator is currently inside a capturing group. Walk to the close of
220 this group, skipping over any nested groups and handling escaped
221 parentheses correctly.
222 """
223 if ch == "(":
224 nesting = 1
225 else:
226 nesting = 0
227 for ch, escaped in input_iter:
228 if escaped:
229 continue
230 elif ch == "(":
231 nesting += 1
232 elif ch == ")":
233 if not nesting:
234 return
235 nesting -= 1
236
237
238def get_quantifier(ch, input_iter):
239 """
240 Parse a quantifier from the input, where "ch" is the first character in the
241 quantifier.
242
243 Return the minimum number of occurrences permitted by the quantifier and
244 either None or the next character from the input_iter if the next character
245 is not part of the quantifier.
246 """
247 if ch in "*?+":
248 try:
249 ch2, escaped = next(input_iter)
250 except StopIteration:
251 ch2 = None
252 if ch2 == "?":
253 ch2 = None
254 if ch == "+":
255 return 1, ch2
256 return 0, ch2
257
258 quant = []
259 while ch != "}":
260 ch, escaped = next(input_iter)
261 quant.append(ch)
262 quant = quant[:-1]
263 values = "".join(quant).split(",")
264
265 # Consume the trailing '?', if necessary.
266 try:
267 ch, escaped = next(input_iter)
268 except StopIteration:
269 ch = None
270 if ch == "?":
271 ch = None
272 return int(values[0]), ch
273
274
275def contains(source, inst):
276 """
277 Return True if the "source" contains an instance of "inst". False,
278 otherwise.
279 """
280 if isinstance(source, inst):
281 return True
282 if isinstance(source, NonCapture):
283 for elt in source:
284 if contains(elt, inst):
285 return True
286 return False
287
288
289def flatten_result(source):
290 """
291 Turn the given source sequence into a list of reg-exp possibilities and
292 their arguments. Return a list of strings and a list of argument lists.
293 Each of the two lists will be of the same length.
294 """
295 if source is None:
296 return [""], [[]]
297 if isinstance(source, Group):
298 if source[1] is None:
299 params = []
300 else:
301 params = [source[1]]
302 return [source[0]], [params]
303 result = [""]
304 result_args = [[]]
305 pos = last = 0
306 for pos, elt in enumerate(source):
307 if isinstance(elt, str):
308 continue
309 piece = "".join(source[last:pos])
310 if isinstance(elt, Group):
311 piece += elt[0]
312 param = elt[1]
313 else:
314 param = None
315 last = pos + 1
316 for i in range(len(result)):
317 result[i] += piece
318 if param:
319 result_args[i].append(param)
320 if isinstance(elt, (Choice, NonCapture)):
321 if isinstance(elt, NonCapture):
322 elt = [elt]
323 inner_result, inner_args = [], []
324 for item in elt:
325 res, args = flatten_result(item)
326 inner_result.extend(res)
327 inner_args.extend(args)
328 new_result = []
329 new_args = []
330 for item, args in zip(result, result_args):
331 for i_item, i_args in zip(inner_result, inner_args):
332 new_result.append(item + i_item)
333 new_args.append(args[:] + i_args)
334 result = new_result
335 result_args = new_args
336 if pos >= last:
337 piece = "".join(source[last:])
338 for i in range(len(result)):
339 result[i] += piece
340 return result, result_args
341
342
343def _lazy_re_compile(regex, flags=0):
344 """Lazily compile a regex with flags."""
345
346 def _compile():
347 # Compile the regex if it was not passed pre-compiled.
348 if isinstance(regex, (str, bytes)):
349 return re.compile(regex, flags)
350 else:
351 assert not flags, "flags must be empty if regex is passed pre-compiled"
352 return regex
353
354 return SimpleLazyObject(_compile)