1"""
2This module implements Git's wildmatch pattern matching which itself is derived
3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
4"""
5
6import re
7import warnings
8from typing import (
9 AnyStr,
10 Optional) # Replaced by `X | None` in 3.10.
11
12from .. import (
13 util)
14from ..pattern import (
15 RegexPattern)
16from .._typing import (
17 override) # Added in 3.12.
18
_BYTES_ENCODING = 'latin1'
"""
The codec used to decode a byte string pattern before it is parsed, and to
encode the resulting regular expression back to bytes.
"""

_DIR_MARK = 'ps_d'
"""
The name of the regex capture group that marks a directory. This is only used
by :class:`GitIgnoreSpec`.
"""

_DIR_MARK_CG = '(?P<' + _DIR_MARK + '>/)'
"""
The regular expression fragment that captures the directory marker (a single
slash) in the named group :data:`_DIR_MARK`.
"""

_DIR_MARK_OPT = '(?:' + _DIR_MARK_CG + '|$)'
"""
The regular expression fragment that matches either the directory marker or the
end of the path.
"""
39
40
class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` exception is raised when a Git
    wildmatch pattern is invalid. It derives from :class:`ValueError`.
    """
47
48
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
        regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
        :data:`None`); and whether matched files should be included (:data:`True`),
        excluded (:data:`False`), or if it is a null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the pattern
        is invalid (e.g., it is empty after normalization, or it ends with a
        dangling escape character).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Remember the pattern as given (after decoding) for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern ends
            # with a backslash followed by a space, only strip from the left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        # Null-operations (neither include nor exclude files):
        # - A blank pattern.
        # - A pattern starting with a hash ('#') serves as a comment. Escape the
        #   hash with a back-slash to match a literal hash (i.e., '\#').
        # - EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
        #   does not match any file.
        if not pattern or pattern.startswith('#') or pattern == '/':
            return None, None

        # A pattern starting with an exclamation mark ('!') negates the pattern
        # (exclude instead of include). Escape the exclamation mark with a
        # back-slash to match a literal exclamation mark (i.e., '\!').
        include = not pattern.startswith('!')
        if not include:
            # Remove leading exclamation mark.
            pattern = pattern[1:]

        # Allow a regex override for edge cases that cannot be handled through
        # normalization.
        override_regex: Optional[str] = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
        # sequence down to one double-asterisk. Iterate over the segments in
        # reverse so deleting an entry does not shift the indices still to be
        # visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if pattern_segs == ['**', '']:
            # EDGE CASE: The '**/' pattern should match everything except individual
            # files in the root directory. This case cannot be adequately handled
            # through normalization. Use the override.
            override_regex = _DIR_MARK_CG

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths directly
            # on the root directory instead of any descendant paths. So, remove
            # empty first segment to make pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single segment pattern without a beginning slash ('/') will match
            # any descendant path. This is equivalent to "**/{pattern}". So, prepend
            # with double-asterisks to make pattern relative to root.
            # - EDGE CASE: This also holds for a single segment pattern with a
            #   trailing slash (e.g. 'dir/').
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # Otherwise:
        # EDGE CASE: A pattern without a beginning slash ('/') but contains at
        # least one prepended directory (e.g. "dir/{pattern}") should not match
        # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1). Nothing
        # to normalize in that case.

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at all. This
            # must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant paths if
            # it is a directory but not if it is a regular file. This is equivalent
            # to "{pattern}/**". So, set last segment to a double-asterisk to
            # include all descendants.
            pattern_segs[-1] = '**'

        if override_regex is None:
            if pattern_segs == ['**']:
                # The pattern "**" will match every path. Special case this pattern.
                override_regex = '.'

            elif pattern_segs == ['**', '*']:
                # The pattern "*" will be normalized to "**/*" and will match every
                # path. Special case this pattern for efficiency.
                override_regex = '.'

            elif pattern_segs == ['**', '*', '**']:
                # The pattern "*/" will be normalized to "**/*/**" which will match
                # every file not in the root directory. Special case this pattern for
                # efficiency.
                override_regex = _DIR_MARK_CG if is_dir_pattern else '/'

        regex: str
        if override_regex is not None:
            # Use regex override.
            regex = override_regex

        else:
            # Build regular expression from pattern.
            output: list[str] = []
            need_slash = False
            end = len(pattern_segs) - 1
            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0:
                        # A normalized pattern beginning with double-asterisks ('**') will
                        # match any leading path segments.
                        output.append('^(?:.+/)?')

                    elif i < end:
                        # A pattern with inner double-asterisks ('**') will match multiple
                        # (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                    else:
                        assert i == end, (i, end)
                        # A normalized pattern ending with double-asterisks ('**') will
                        # match any trailing path segments.
                        if is_dir_pattern:
                            output.append(_DIR_MARK_CG)
                        else:
                            output.append('/')

                else:
                    # Match path segment.
                    if i == 0:
                        # Anchor to root directory.
                        output.append('^')

                    if need_slash:
                        output.append('/')

                    if seg == '*':
                        # Match whole path segment.
                        output.append('[^/]+')

                    else:
                        # Match segment glob pattern.
                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            raise GitWildMatchPatternError(
                                f"Invalid git pattern: {original_pattern!r}"
                            ) from e

                    if i == end:
                        # A pattern ending without a slash ('/') will match a file or a
                        # directory (with paths underneath it). E.g., "foo" matches "foo",
                        # "foo/bar", "foo/bar/baz", etc.
                        output.append(_DIR_MARK_OPT)

                    need_slash = True

            regex = ''.join(output)

        if return_type is bytes:
            # Return the same string type that was given.
            return regex.encode(_BYTES_ENCODING), include

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to the
        # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape_next = False
        # Collect the regex fragments and join them once at the end.
        parts: list[str] = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape_next:
                # The previous character was a back-slash, so this character is
                # matched literally.
                escape_next = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape_next = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes), including
                # an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except a
                # slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning exclamation
                # mark, the whole bracket expression can be used directly as regex, but
                # we have to find where the expression ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket. Stop once we reach the end or find it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one past the
                    # closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression negation "[^...]"
                        # is undefined in a glob pattern. Python's `fnmatch.translate()`
                        # escapes the caret ('^') as a literal. Git supports using a
                        # caret for negation. Maintain consistency with Git because that is
                        # the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so they are
                    # treated as literal back-slashes by regex as defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a bracket
                    # literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape_next:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        #
        # The back-slash is included because it is the escape character itself:
        # `_translate_segment_glob()` consumes an unescaped back-slash and treats
        # the following character as a literal, so a literal back-slash in *s*
        # must be doubled to survive translation.
        meta_characters = "\\[]!*#?"

        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string
419
# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)
421
422
class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is a deprecated alias of
    :class:`GitWildMatchPattern`. It is kept solely for compatibility with
    v0.4 and emits a :class:`DeprecationWarning` whenever it is used.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize the pattern by passing
        all arguments through to the parent class.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # A stack level of 3 skips this helper and the method that called it.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @override
    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then delegate the conversion to the
        parent class with all arguments passed through.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
454
# Register the deprecated `GitIgnorePattern` under the name "gitignore" for
# backward compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)