Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 16%
158 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
1"""
2This module implements Git's wildmatch pattern matching which itself is
3derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore"
4files.
5"""
7import re
8import warnings
9from typing import (
10 AnyStr,
11 Optional,
12 Tuple)
14from .. import util
15from ..pattern import RegexPattern
17_BYTES_ENCODING = 'latin1'
18"""
19The encoding to use when parsing a byte string pattern.
20"""
22_DIR_MARK = 'ps_d'
23"""
24The regex group name for the directory marker. This is only used by
25:class:`GitIgnoreSpec`.
26"""
class GitWildMatchPatternError(ValueError):
    """
    Raised when a pattern is not a valid git wild match pattern.

    It derives from :class:`ValueError`, so callers that already catch
    :class:`ValueError` will also catch this error.
    """
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git
    wildmatch pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression. Surrounding whitespace is ignored, except
        for a trailing space that is escaped with a back-slash (i.e., '\\ ').

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`,
        or :data:`None`); and whether matched files should be included
        (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither a :class:`str` nor
        a :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the
        pattern is invalid.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the unmodified pattern for error messages.
        original_pattern = pattern
        pattern = cls._strip_pattern(pattern)

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment
            # (neither includes nor excludes files). Escape the hash with a
            # back-slash to match a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single
            # '/' does not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates the
                # pattern (exclude instead of include). Escape the exclamation
                # mark with a back-slash to match a literal exclamation mark
                # (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled
            # through normalization.
            override_regex = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences.
            # Collapse each sequence down to one double-asterisk. Iterate over
            # the segments in reverse and remove the duplicate double
            # asterisks as we go (reverse order keeps the remaining indices
            # valid while deleting).
            for i in range(len(pattern_segs) - 1, 0, -1):
                prev = pattern_segs[i-1]
                seg = pattern_segs[i]
                if prev == '**' and seg == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except
                # individual files in the root directory. This case cannot be
                # adequately handled through normalization. Use the override.
                override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths
                # directly on the root directory instead of any descendant
                # paths. So, remove empty first segment to make pattern relative
                # to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match
                # any descendant path. This is equivalent to "**/{pattern}". So,
                # prepend with double-asterisks to make pattern relative to
                # root.
                # EDGE CASE: This also holds for a single pattern with a
                # trailing slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            else:
                # EDGE CASE: A pattern without a beginning slash ('/') but
                # contains at least one prepended directory (e.g.
                # "dir/{pattern}") should not match "**/dir/{pattern}",
                # according to `git check-ignore` (v2.4.1).
                pass

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern at
                # all. This must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all descendant
                # paths if it is a directory but not if it is a regular file.
                # This is equivalent to "{pattern}/**". So, set last segment to
                # a double-asterisk to include all descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                # Whether a '/' must be emitted before the next segment.
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks ('**')
                            # will match every path.
                            output.append(f'[^/]+(?:(?P<{_DIR_MARK}>/).*)?')
                        elif i == 0:
                            # A normalized pattern beginning with double-asterisks
                            # ('**') will match any leading path segments.
                            output.append('(?:.+/)?')
                            need_slash = False
                        elif i == end:
                            # A normalized pattern ending with double-asterisks ('**')
                            # will match any trailing path segments.
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            # A pattern with inner double-asterisks ('**') will match
                            # multiple (or zero) inner path segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    elif seg == '*':
                        # Match single path segment.
                        if need_slash:
                            output.append('/')

                        output.append('[^/]+')

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file
                            # or a directory (with paths underneath it). E.g., "foo"
                            # matches "foo", "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                    else:
                        # Match segment glob pattern.
                        if need_slash:
                            output.append('/')

                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file
                            # or a directory (with paths underneath it). E.g., "foo"
                            # matches "foo", "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # Return the same string type that was passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _strip_pattern(pattern: str) -> str:
        """
        Strip the insignificant surrounding whitespace from the pattern.

        *pattern* (:class:`str`) is the raw pattern.

        Returns the stripped pattern (:class:`str`). A trailing space that is
        escaped with a back-slash (i.e., '\\ ') is kept because it is part of
        the pattern; any whitespace after it is removed.
        """
        pattern = pattern.lstrip()
        stripped = pattern.rstrip()
        if len(stripped) < len(pattern) and pattern[len(stripped)] == ' ':
            # The first trailing whitespace character is a space. It is
            # escaped only when it is preceded by an odd number of
            # back-slashes; an even run is just escaped back-slashes.
            backslashes = len(stripped) - len(stripped.rstrip('\\'))
            if backslashes % 2:
                return pattern[:len(stripped) + 1]

        return stripped

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to
        its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if the pattern ends with an escape
        character (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex pieces and join them once at the end.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except
                # a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i
                # Pass bracket expression negation.
                if j < end and pattern[j] == '!':
                    j += 1
                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1
                # Find closing bracket. Stop once we reach the end or find it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one past
                    # the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] == '!':
                        # Bracket expression needs to be negated.
                        expr += '^'
                        i += 1
                    elif pattern[i] == '^':
                        # POSIX declares that the regex bracket expression negation
                        # "[^...]" is undefined in a glob pattern. Python's
                        # `fnmatch.translate()` escapes the caret ('^') as a
                        # literal. To maintain consistency with undefined behavior,
                        # I am escaping the '^' as well.
                        expr += '\\^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so
                    # they are treated as literal back-slashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a
                    # bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither a :class:`str` nor a
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        meta_characters = r"[]!*#?"

        # Prefix each meta character with a back-slash; copy the rest as-is.
        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string
# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)
class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated alias of :class:`GitWildMatchPattern`. This class only
    exists to maintain compatibility with v0.4; creating an instance or
    calling :meth:`pattern_to_regex` emits a :class:`DeprecationWarning`.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize like the parent class with
        the same arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the deprecation warning for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use "
            "GitWildMatchPattern ('gitwildmatch') instead."
        )
        # stacklevel=3 skips this helper and the method that called it, so
        # the warning is attributed to that method's caller.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then delegate to the parent implementation
        with the same arguments and return its result.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
# Register `GitIgnorePattern` as "gitignore" for backward compatibility
# with v0.4. The class is deprecated: instantiating it or calling its
# `pattern_to_regex()` emits a `DeprecationWarning`.
util.register_pattern('gitignore', GitIgnorePattern)