1"""
This module implements Git's wildmatch pattern matching, which itself is derived
from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
4"""
5
6import re
7import warnings
8from typing import (
9 AnyStr,
10 Optional, # Replaced by `X | None` in 3.10.
11 Tuple) # Replaced by `tuple` in 3.9.
12
13from .. import util
14from ..pattern import RegexPattern
15
# Byte-string input is decoded with this codec before it is processed, and the
# result is encoded back to bytes with the same codec afterwards.
_BYTES_ENCODING = 'latin1'
"""
The encoding to use when parsing a byte string pattern.
"""

# Name of the named group that the generated regular expressions place around
# the slash separating a matched directory from whatever follows it.
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""
26
27
class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` is raised to signal that a git wild
    match pattern is invalid. It is a :class:`ValueError`, so handlers written
    for that built-in exception will also catch it.
    """
34
35
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns the uncompiled regular expression (:class:`str`,
        :class:`bytes`, or :data:`None`); and whether matched files should be
        included (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither a :class:`str` nor a
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if *pattern*
        is not a valid git pattern.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the untouched pattern for error messages.
        original_pattern = pattern

        # Leading whitespace is always stripped.
        pattern = pattern.lstrip()

        # Trailing whitespace is stripped unless it is quoted with a backslash
        # (e.g. "foo\ "). A backslash only quotes the whitespace when it is not
        # itself escaped, i.e. when the run of backslashes directly before the
        # whitespace has an odd length.
        stripped = pattern.rstrip()
        if len(stripped) < len(pattern):
            backslash_count = len(stripped) - len(stripped.rstrip('\\'))
            if backslash_count % 2:
                # Keep the single whitespace character quoted by the last
                # backslash; any whitespace after it is still discarded.
                stripped = pattern[:len(stripped) + 1]
        pattern = stripped

        if not pattern or pattern.startswith('#') or pattern == '/':
            # Null-operations (neither include nor exclude files):
            # - A blank pattern.
            # - A pattern starting with a hash ('#') serves as a comment.
            #   Escape the hash with a back-slash to match a literal hash
            #   (i.e., '\#').
            # - EDGE CASE: According to `git check-ignore` (v2.4.1), a single
            #   '/' does not match any file.
            return None, None

        if pattern.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the
            # pattern (exclude instead of include). Escape the exclamation
            # mark with a back-slash to match a literal exclamation mark
            # (i.e., '\!').
            include = False
            # Remove leading exclamation mark.
            pattern = pattern[1:]
        else:
            include = True

        # Allow a regex override for edge cases that cannot be handled through
        # normalization.
        override_regex = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse
        # each sequence down to one double-asterisk. Iterate over the segments
        # in reverse so deleting an entry does not shift the indices that are
        # still to be visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
            # EDGE CASE: The '**/' pattern should match everything except
            # individual files in the root directory. This case cannot be
            # adequately handled through normalization. Use the override.
            override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths
            # directly on the root directory instead of any descendant paths.
            # So, remove empty first segment to make pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single pattern without a beginning slash ('/') will match any
            # descendant path. This is equivalent to "**/{pattern}". So,
            # prepend with double-asterisks to make pattern relative to root.
            # - EDGE CASE: This also holds for a single pattern with a
            #   trailing slash (e.g. dir/).
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # EDGE CASE (no action needed): A pattern without a beginning slash
        # ('/') but containing at least one prepended directory (e.g.
        # "dir/{pattern}") should not match "**/dir/{pattern}", according to
        # `git check-ignore` (v2.4.1).

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at
            # all. This must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant
            # paths if it is a directory but not if it is a regular file. This
            # is equivalent to "{pattern}/**". So, set last segment to a
            # double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        if override_regex is not None:
            # Use regex override.
            regex = override_regex

        else:
            # Build regular expression from pattern.
            output = ['^']
            need_slash = False
            end = len(pattern_segs) - 1
            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0 and i == end:
                        # A pattern consisting solely of double-asterisks
                        # ('**') will match every path.
                        output.append('[^/]+(?:/.*)?')

                    elif i == 0:
                        # A normalized pattern beginning with double-asterisks
                        # ('**') will match any leading path segments.
                        output.append('(?:.+/)?')
                        need_slash = False

                    elif i == end:
                        # A normalized pattern ending with double-asterisks
                        # ('**') will match any trailing path segments. Only a
                        # directory pattern marks the separating slash.
                        if is_dir_pattern:
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            output.append('/.*')

                    else:
                        # A pattern with inner double-asterisks ('**') will
                        # match multiple (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                    continue

                # Any other segment matches exactly one path segment.
                if need_slash:
                    output.append('/')

                if seg == '*':
                    # Match single (non-empty) path segment.
                    output.append('[^/]+')
                else:
                    # Match segment glob pattern.
                    try:
                        output.append(cls._translate_segment_glob(seg))
                    except ValueError as e:
                        raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                if i == end:
                    # A pattern ending without a slash ('/') will match a file
                    # or a directory (with paths underneath it). E.g., "foo"
                    # matches "foo", "foo/bar", "foo/bar/baz", etc.
                    output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                need_slash = True

            output.append('$')
            regex = ''.join(output)

        if return_type is bytes:
            # Hand back the same string type the caller supplied.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to
        its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (a backslash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character
                # (except a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex, but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket (-1 when there is none).
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past
                    # the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression
                        # negation "[^...]" is undefined in a glob pattern.
                        # Python's `fnmatch.translate()` escapes the caret
                        # ('^') as a literal. Git supports using a caret for
                        # negation. Maintain consistency with Git because
                        # that is the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape backslashes so
                    # they are treated as literal backslashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as
                    # a bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither a :class:`str` nor a
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        meta_characters = r"[]!*#?"

        # Prefix every meta character with a backslash; copy the rest as-is.
        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string
384
# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)
386
387
class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is a deprecated stand-in for
    :class:`GitWildMatchPattern`. It is kept only for compatibility with v0.4
    and issues a :class:`DeprecationWarning` whenever it is used.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize through the parent class with
        the arguments unchanged.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the deprecation warning for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # A stack level of 3 skips this helper and the method that called it,
        # so the warning is attributed to the code using the deprecated class.
        warnings.warn(message, category=DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then delegate to the parent implementation
        with the arguments unchanged.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
418
# Register `GitIgnorePattern` as "gitignore" for backward compatibility with
# v0.4. This name is deprecated in favor of "gitwildmatch".
util.register_pattern('gitignore', GitIgnorePattern)