Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 86%
160 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 07:17 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 07:17 +0000
1"""
2This module implements Git's wildmatch pattern matching which itself is
3derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore"
4files.
5"""
7import re
8import warnings
9from typing import (
10 AnyStr,
11 Optional,
12 Tuple)
14from .. import util
15from ..pattern import RegexPattern
17_BYTES_ENCODING = 'latin1'
18"""
19The encoding to use when parsing a byte string pattern.
20"""
22_DIR_MARK = 'ps_d'
23"""
24The regex group name for the directory marker. This is only used by
25:class:`GitIgnoreSpec`.
26"""
class GitWildMatchPatternError(ValueError):
    """
    Raised when a git wild match pattern is invalid.

    The :class:`GitWildMatchPatternError` derives from :class:`ValueError`,
    so code that already catches :class:`ValueError` also catches it.
    """
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git
    wildmatch pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`,
        or :data:`None`); and whether matched files should be included
        (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither a :class:`str` nor
        a :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the
        pattern is invalid.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            # Parse as text; the result is encoded back before returning.
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the untouched pattern for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash.
            # If a pattern that ends with backslash followed by a space,
            # only strip from left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment
            # (neither includes nor excludes files). Escape the hash with a
            # back-slash to match a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single
            # '/' does not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates the
                # pattern (exclude instead of include). Escape the exclamation
                # mark with a back-slash to match a literal exclamation mark
                # (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled
            # through normalization.
            override_regex = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences.
            # Collapse each sequence down to one double-asterisk. Iterate over
            # the segments in reverse and remove the duplicate double
            # asterisks as we go (reverse order keeps the remaining indices
            # valid while deleting).
            for i in range(len(pattern_segs) - 1, 0, -1):
                if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except
                # individual files in the root directory. This case cannot be
                # adequately handled through normalization. Use the override.
                override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths
                # directly on the root directory instead of any descendant
                # paths. So, remove empty first segment to make pattern relative
                # to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match
                # any descendant path. This is equivalent to "**/{pattern}". So,
                # prepend with double-asterisks to make pattern relative to
                # root.
                # EDGE CASE: This also holds for a single pattern with a
                # trailing slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            # Otherwise:
            # EDGE CASE: A pattern without a beginning slash ('/') but
            # contains at least one prepended directory (e.g.
            # "dir/{pattern}") should not match "**/dir/{pattern}",
            # according to `git check-ignore` (v2.4.1). Nothing to do.

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern at
                # all. This must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all descendant
                # paths if it is a directory but not if it is a regular file.
                # This is equivalent to "{pattern}/**". So, set last segment to
                # a double-asterisk to include all descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks ('**')
                            # will match every path.
                            output.append(f'[^/]+(?:(?P<{_DIR_MARK}>/).*)?')
                        elif i == 0:
                            # A normalized pattern beginning with double-asterisks
                            # ('**') will match any leading path segments.
                            output.append('(?:.+/)?')
                            need_slash = False
                        elif i == end:
                            # A normalized pattern ending with double-asterisks ('**')
                            # will match any trailing path segments.
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            # A pattern with inner double-asterisks ('**') will match
                            # multiple (or zero) inner path segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    else:
                        # An ordinary segment: either a lone '*' or a glob.
                        if need_slash:
                            output.append('/')

                        if seg == '*':
                            # Match single path segment.
                            output.append('[^/]+')
                        else:
                            # Match segment glob pattern.
                            try:
                                output.append(cls._translate_segment_glob(seg))
                            except ValueError as e:
                                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file
                            # or a directory (with paths underneath it). E.g., "foo"
                            # matches "foo", "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # Return the same string type that was passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used in
        the constructor to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape
        character (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex pieces and join them once at the end instead of
        # repeatedly concatenating strings.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except
                # a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket (-1 when there is none).
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past
                    # the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        # POSIX declares that the regex bracket expression negation
                        # "[^...]" is undefined in a glob pattern. Python's
                        # `fnmatch.translate()` escapes the caret ('^') as a
                        # literal. Git supports using a caret for negation.
                        # Maintain consistency with Git because that is the expected
                        # behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so
                    # they are treated as literal back-slashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a
                    # bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither a :class:`str` nor a
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        # The back-slash is the escape character itself, so a literal
        # back-slash must be escaped too. Otherwise it would escape the
        # character after it, or make the pattern invalid when it is the
        # last character.
        meta_characters = "\\[]!*#?"

        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string
# Register `GitWildMatchPattern` as "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)
class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated alias of :class:`GitWildMatchPattern`.

    The :class:`GitIgnorePattern` class only exists to maintain
    compatibility with v0.4. Use :class:`GitWildMatchPattern` instead.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize the pattern with the
        given arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Emit the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use "
            "GitWildMatchPattern ('gitwildmatch') instead."
        )
        # The stack level skips this helper and the method that called it.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then convert the pattern with the
        parent implementation.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
# Register the deprecated `GitIgnorePattern` as "gitignore" for backward
# compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)