Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 87%
163 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-14 06:25 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-14 06:25 +0000
"""
This module implements Git's wildmatch pattern matching, which itself is
derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
"""
6import re
7import warnings
8from typing import (
9 AnyStr,
10 Optional, # Replaced by `X | None` in 3.10.
11 Tuple) # Replaced by `tuple` in 3.9.
13from .. import util
14from ..pattern import RegexPattern
_BYTES_ENCODING = "latin1"
"""
Encoding used to decode a byte string pattern before parsing it, and to encode
the resulting regular expression (or escaped string) back to bytes.
"""

_DIR_MARK = "ps_d"
"""
Name of the regex group that marks the directory separator in the generated
regular expressions. This is only used by :class:`GitIgnoreSpec`.
"""
class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` exception is raised when a git wild
    match pattern is invalid. It is a :class:`ValueError`, so callers catching
    :class:`ValueError` will also catch it.
    """
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
        regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
        :data:`None`); and whether matched files should be included (:data:`True`),
        excluded (:data:`False`), or if it is a null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the pattern
        is invalid (e.g., nothing is left after normalization, or it ends with
        a dangling escape character).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the unstripped (but decoded) pattern for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern ends
            # with a backslash followed by a space, only strip from the left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        if not pattern:
            # A blank pattern is a null-operation (neither includes nor excludes
            # files).
            return None, None

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment (neither
            # includes nor excludes files). Escape the hash with a back-slash to
            # match a literal hash (i.e., '\#').
            return None, None

        if pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
            # does not match any file.
            return None, None

        # A pattern starting with an exclamation mark ('!') negates the pattern
        # (exclude instead of include). Escape the exclamation mark with a
        # back-slash to match a literal exclamation mark (i.e., '\!').
        include = not pattern.startswith('!')
        if not include:
            # Remove leading exclamation mark.
            pattern = pattern[1:]

        # Allow a regex override for edge cases that cannot be handled through
        # normalization.
        override_regex = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
        # sequence down to one double-asterisk. Iterate over the segments in
        # reverse so deleting an entry never shifts an index not yet visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
            # EDGE CASE: The '**/' pattern should match everything except
            # individual files in the root directory. This case cannot be
            # adequately handled through normalization. Use the override.
            override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths
            # directly on the root directory instead of any descendant paths. So,
            # remove empty first segment to make pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single pattern without a beginning slash ('/') will match any
            # descendant path. This is equivalent to "**/{pattern}". So, prepend
            # with double-asterisks to make pattern relative to root.
            # - EDGE CASE: This also holds for a single pattern with a trailing
            #   slash (e.g. dir/).
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # Otherwise:
        # EDGE CASE: A pattern without a beginning slash ('/') but contains at
        # least one prepended directory (e.g. "dir/{pattern}") should not match
        # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1).

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at all.
            # This must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant paths
            # if it is a directory but not if it is a regular file. This is
            # equivalent to "{pattern}/**". So, set last segment to a
            # double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        if override_regex is not None:
            # Use regex override.
            regex = override_regex

        else:
            # Build regular expression from pattern.
            output = ['^']
            need_slash = False
            end = len(pattern_segs) - 1
            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0 and i == end:
                        # A pattern consisting solely of double-asterisks ('**')
                        # will match every path.
                        output.append('[^/]+(?:/.*)?')

                    elif i == 0:
                        # A normalized pattern beginning with double-asterisks
                        # ('**') will match any leading path segments.
                        output.append('(?:.+/)?')
                        need_slash = False

                    elif i == end:
                        # A normalized pattern ending with double-asterisks
                        # ('**') will match any trailing path segments.
                        if is_dir_pattern:
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            output.append('/.*')

                    else:
                        # A pattern with inner double-asterisks ('**') will match
                        # multiple (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                else:
                    if need_slash:
                        output.append('/')

                    if seg == '*':
                        # Match single path segment.
                        output.append('[^/]+')
                    else:
                        # Match segment glob pattern.
                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            raise GitWildMatchPatternError(
                                f"Invalid git pattern: {original_pattern!r}"
                            ) from e

                    if i == end:
                        # A pattern ending without a slash ('/') will match a file
                        # or a directory (with paths underneath it). E.g., "foo"
                        # matches "foo", "foo/bar", "foo/bar/baz", etc.
                        output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                    need_slash = True

            output.append('$')
            regex = ''.join(output)

        if return_type is bytes:
            # Hand back the same string type the caller passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if the pattern ends with an escape character
        (backslash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to the
        # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except a
                # slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex, but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in '!^':
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket (-1 when there is none).
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past the
                    # closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in '!^':
                        # Bracket expression needs to be negated. POSIX declares
                        # that the regex bracket expression negation "[^...]" is
                        # undefined in a glob pattern. Python's
                        # `fnmatch.translate()` escapes the caret ('^') as a
                        # literal. Git supports using a caret for negation.
                        # Maintain consistency with Git because that is the
                        # expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape backslashes so they
                    # are treated as literal backslashes by regex as defined by
                    # POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a
                    # bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        # The backslash is included because it is the escape character itself:
        # left unescaped, "a\b" would be parsed as an escaped 'b' and match
        # "ab" rather than the original name.
        meta_characters = "\\[]!*#?"

        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string
# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)
class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated by
    :class:`GitWildMatchPattern`. It behaves like its parent but emits a
    :class:`DeprecationWarning`, and only exists to maintain compatibility with
    v0.4.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize exactly as the parent class
        does with the same arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Warn about deprecation.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # stacklevel=3 skips this helper and the method that called it.
        warnings.warn(message, category=DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then delegate to the parent implementation and
        return its result unchanged.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
# Register the deprecated `GitIgnorePattern` as "gitignore" for backward
# compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)