1"""
2This module implements Git's wildmatch pattern matching which itself is derived
3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
4"""
5
6import re
7import warnings
8from typing import (
9 AnyStr,
10 Optional) # Replaced by `X | None` in 3.10.
11
12from .. import (
13 util)
14from ..pattern import (
15 RegexPattern)
16from .._typing import (
17 override) # Added in 3.12.
18
# Byte string patterns are decoded with this codec before translation and the
# resulting regex is encoded back with it. latin-1 maps every byte value
# (0-255) to exactly one code point, so the decode/encode pair round-trips
# arbitrary bytes without raising.
_BYTES_ENCODING = 'latin1'
"""
The encoding to use when parsing a byte string pattern.
"""

# This name is embedded as a named group, ``(?P<ps_d>/)``, around the slash
# that separates a matched directory from its descendants in the generated
# regular expressions.
# NOTE(review): the consumer of the group (`GitIgnoreSpec`) is not defined in
# this module; confirm how it reads the group before renaming it.
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""
29
30
class GitWildMatchPatternError(ValueError):
    """
    Raised when a Git wildmatch pattern is invalid and cannot be translated
    into a regular expression.

    It derives from :class:`ValueError`, so callers that already catch
    :class:`ValueError` also catch this error.
    """
37
38
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.

    The class translates a single ".gitignore"-style pattern line into an
    uncompiled regular expression (see :meth:`pattern_to_regex`) and offers
    :meth:`escape` to quote a literal file name so it can be used as a pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
        regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
        :data:`None`); and whether matched files should be included (:data:`True`),
        excluded (:data:`False`), or if it is a null-operation (:data:`None`). The
        regular expression has the same type as *pattern*.

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`.

        Raises :class:`GitWildMatchPatternError` if the pattern is invalid, e.g.
        nothing is left of it after normalization or it ends with a dangling
        escape character.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Remember the pattern before stripping so error messages show what the
        # caller actually passed (decoded, for byte strings).
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with a backslash. If the pattern ends
            # with a backslash followed by a space, only strip from the left so the
            # escaped trailing space is preserved.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        regex: Optional[str]
        include: Optional[bool]

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment (neither
            # includes nor excludes files). Escape the hash with a back-slash to match
            # a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does
            # not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates the pattern
                # (exclude instead of include). Escape the exclamation mark with a
                # back-slash to match a literal exclamation mark (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled through
            # normalization.
            override_regex: Optional[str] = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Check whether the pattern is specifically a directory pattern before
            # normalization. A trailing slash leaves an empty last segment.
            is_dir_pattern = not pattern_segs[-1]

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
            # sequence down to one double-asterisk. Iterate over the segments in
            # reverse so deleting an entry does not shift the indices still to be
            # visited.
            for i in range(len(pattern_segs) - 1, 0, -1):
                prev = pattern_segs[i - 1]
                seg = pattern_segs[i]
                if prev == '**' and seg == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except individual
                # files in the root directory. This case cannot be adequately handled
                # through normalization. Use the override.
                override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths directly
                # on the root directory instead of any descendant paths. So, remove
                # empty first segment to make pattern relative to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match any
                # descendant path. This is equivalent to "**/{pattern}". So, prepend
                # with double-asterisks to make pattern relative to root.
                # - EDGE CASE: This also holds for a single pattern with a trailing
                #   slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            else:
                # EDGE CASE: A pattern without a beginning slash ('/') but contains at
                # least one prepended directory (e.g. "dir/{pattern}") should not match
                # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1).
                pass

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern at all. This
                # must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all descendant paths if
                # it is a directory but not if it is a regular file. This is equivalent
                # to "{pattern}/**". So, set last segment to a double-asterisk to
                # include all descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                # Whether the next non-'**' segment must be preceded by a slash.
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks ('**') will
                            # match every path.
                            output.append('[^/]+(?:/.*)?')

                        elif i == 0:
                            # A normalized pattern beginning with double-asterisks
                            # ('**') will match any leading path segments.
                            output.append('(?:.+/)?')
                            need_slash = False

                        elif i == end:
                            # A normalized pattern ending with double-asterisks ('**') will
                            # match any trailing path segments.
                            if is_dir_pattern:
                                output.append(f'(?P<{_DIR_MARK}>/).*')
                            else:
                                output.append('/.*')

                        else:
                            # A pattern with inner double-asterisks ('**') will match multiple
                            # (or zero) inner path segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    else:
                        if seg == '*':
                            # Match single, non-empty path segment.
                            seg_regex = '[^/]+'
                        else:
                            # Match segment glob pattern.
                            try:
                                seg_regex = cls._translate_segment_glob(seg)
                            except ValueError as e:
                                raise GitWildMatchPatternError(
                                    f"Invalid git pattern: {original_pattern!r}"
                                ) from e

                        if need_slash:
                            output.append('/')

                        output.append(seg_regex)

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file or a
                            # directory (with paths underneath it). E.g., "foo" matches "foo",
                            # "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor excludes
            # files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # Hand back the same string type the caller passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a single path segment glob pattern
        (a segment never contains a slash) to its corresponding regular
        expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (backslash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to the
        # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escaping = False
        # Collect the pieces and join once at the end instead of repeatedly
        # concatenating strings.
        parts: list[str] = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escaping:
                # The previous character was a backslash, take this character
                # literally.
                escaping = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escaping = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes), including
                # an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except a
                # slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning exclamation
                # mark, the whole bracket expression can be used directly as regex, but
                # we have to find where the expression ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket (-1 if there is none).
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past the
                    # closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression negation "[^...]"
                        # is undefined in a glob pattern. Python's `fnmatch.translate()`
                        # escapes the caret ('^') as a literal. Git supports using a
                        # caret for negation. Maintain consistency with Git because that is
                        # the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape backslashes so they are
                    # treated as literal backslashes by regex as defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a bracket
                    # literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escaping:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`), of the same
        type as *s*. Each of the characters ``\\ [ ] ! * # ?`` is prefixed with a
        backslash.

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        #
        # The backslash itself must be escaped as well: it is the escape character
        # of the pattern syntax, so a literal backslash left as-is would swallow
        # the character after it (or make the pattern invalid when it is last).
        meta_characters = "\\[]!*#?"

        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string
391
# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)
393
394
class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated predecessor of :class:`GitWildMatchPattern`, kept only to
    maintain compatibility with v0.4.

    It behaves like :class:`GitWildMatchPattern` but emits a
    :class:`DeprecationWarning` whenever an instance is created or
    :meth:`pattern_to_regex` is called.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize the pattern by forwarding
        all arguments unchanged to the parent class.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # stacklevel=3 skips this helper and the method that called it, so the
        # warning is attributed to the code using the deprecated class.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @override
    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then return the result of the parent
        class's :meth:`pattern_to_regex` for the same arguments.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
426
# Register the deprecated `GitIgnorePattern` under the legacy name "gitignore"
# for backward compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)