1"""
2This module implements Git's wildmatch pattern matching which itself is derived
3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
4"""
5
6import re
7import warnings
8from typing import (
9 AnyStr,
10 Optional, # Replaced by `X | None` in 3.10.
11 Tuple) # Replaced by `tuple` in 3.9.
12
13from .. import util
14from ..pattern import RegexPattern
15
# Codec used to round-trip byte-string input in this module: bytes are decoded
# with it before processing and the resulting text is encoded with it again
# before being returned.
_BYTES_ENCODING = 'latin1'
"""
The encoding to use when parsing a byte string pattern.
"""

# Name of the named capture group that the generated regular expressions wrap
# around the slash separating a matched directory from the paths beneath it.
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""
26
27
class GitWildMatchPatternError(ValueError):
    """
    Raised when a Git wildmatch pattern is invalid and cannot be converted
    into a regular expression. It subclasses :class:`ValueError`, so callers
    catching :class:`ValueError` also catch this error.
    """
34
35
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
        regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
        :data:`None`); and whether matched files should be included (:data:`True`),
        excluded (:data:`False`), or if it is a null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`.

        Raises :class:`GitWildMatchPatternError` if *pattern* is invalid (e.g., it
        is empty after normalization, or ends with a dangling back-slash).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the decoded, unstripped pattern for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern ends
            # with a backslash followed by a space, only strip from the left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        # Null-operations (neither include nor exclude files):
        # - A blank pattern.
        # - A pattern starting with a hash ('#') serves as a comment. Escape the
        #   hash with a back-slash to match a literal hash (i.e., '\#').
        # - EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
        #   does not match any file.
        if not pattern or pattern.startswith('#') or pattern == '/':
            return None, None

        include: bool
        if pattern.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the
            # pattern (exclude instead of include). Escape the exclamation mark
            # with a back-slash to match a literal exclamation mark (i.e., '\!').
            include = False
            # Remove leading exclamation mark.
            pattern = pattern[1:]
        else:
            include = True

        # Allow a regex override for edge cases that cannot be handled through
        # normalization.
        override_regex: Optional[str] = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
        # sequence down to one double-asterisk. Iterate over the segments in
        # reverse so deleting an entry never shifts an index still to be visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
            # EDGE CASE: The '**/' pattern should match everything except
            # individual files in the root directory. This case cannot be
            # adequately handled through normalization. Use the override.
            override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths
            # directly on the root directory instead of any descendant paths. So,
            # remove empty first segment to make pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single pattern without a beginning slash ('/') will match any
            # descendant path. This is equivalent to "**/{pattern}". So, prepend
            # with double-asterisks to make pattern relative to root.
            # - EDGE CASE: This also holds for a single pattern with a trailing
            #   slash (e.g. dir/).
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # Otherwise: EDGE CASE: A pattern without a beginning slash ('/') but
        # containing at least one prepended directory (e.g. "dir/{pattern}")
        # should not match "**/dir/{pattern}", according to `git check-ignore`
        # (v2.4.1). Nothing to normalize.

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at all.
            # This must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant paths
            # if it is a directory but not if it is a regular file. This is
            # equivalent to "{pattern}/**". So, set last segment to a
            # double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        regex: str
        if override_regex is not None:
            # Use regex override.
            regex = override_regex

        else:
            # Build regular expression from pattern.
            output = ['^']
            need_slash = False
            end = len(pattern_segs) - 1
            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0 and i == end:
                        # A pattern consisting solely of double-asterisks ('**')
                        # will match every path.
                        output.append('[^/]+(?:/.*)?')

                    elif i == 0:
                        # A normalized pattern beginning with double-asterisks
                        # ('**') will match any leading path segments.
                        output.append('(?:.+/)?')
                        need_slash = False

                    elif i == end:
                        # A normalized pattern ending with double-asterisks
                        # ('**') will match any trailing path segments.
                        if is_dir_pattern:
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            output.append('/.*')

                    else:
                        # A pattern with inner double-asterisks ('**') will
                        # match multiple (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                    continue

                # Ordinary segment: separate it from the previous one.
                if need_slash:
                    output.append('/')

                if seg == '*':
                    # Match single path segment.
                    output.append('[^/]+')
                else:
                    # Match segment glob pattern.
                    try:
                        output.append(cls._translate_segment_glob(seg))
                    except ValueError as e:
                        raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                if i == end:
                    # A pattern ending without a slash ('/') will match a file or
                    # a directory (with paths underneath it). E.g., "foo" matches
                    # "foo", "foo/bar", "foo/bar/baz", etc.
                    output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                need_slash = True

            output.append('$')
            regex = ''.join(output)

        # Hand back the same string type that was passed in.
        if return_type is bytes:
            return regex.encode(_BYTES_ENCODING), include

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to the
        # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        # Collect the regex fragments and join them once at the end.
        parts = []
        escape = False
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except a
                # slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex, but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket (-1 if there is none).
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past the
                    # closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression
                        # negation "[^...]" is undefined in a glob pattern.
                        # Python's `fnmatch.translate()` escapes the caret ('^')
                        # as a literal. Git supports using a caret for negation.
                        # Maintain consistency with Git because that is the
                        # expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so they
                    # are treated as literal back-slashes by regex as defined by
                    # POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a
                    # bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        meta_characters = r"[]!*#?"

        # Prefix every meta character with a back-slash; copy the rest verbatim.
        out_string = "".join(
            "\\" + char if char in meta_characters else char
            for char in string
        )

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)

        return out_string
387
# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)
389
390
class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated alias of :class:`GitWildMatchPattern`, kept only to maintain
    compatibility with v0.4. Constructing an instance or calling
    :meth:`pattern_to_regex` emits a :class:`DeprecationWarning`.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize the pattern by forwarding
        all arguments to the parent class.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then delegate to the parent class
        implementation and return its result unchanged.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
421
# Register `GitIgnorePattern` under the legacy name "gitignore" for backward
# compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)