Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 18%
158 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:35 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:35 +0000
1# encoding: utf-8
2"""
3This module implements Git's wildmatch pattern matching which itself is
4derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore"
5files.
6"""
7from __future__ import unicode_literals
9import re
10import warnings
11try:
12 from typing import (
13 AnyStr,
14 Optional,
15 Text,
16 Tuple)
17except ImportError:
18 pass
20from .. import util
21from ..compat import unicode
22from ..pattern import RegexPattern
24#: The encoding to use when parsing a byte string pattern.
25_BYTES_ENCODING = 'latin1'
class GitWildMatchPatternError(ValueError):
    """
    Raised when a git wild match pattern is invalid.

    This is a :class:`ValueError` subclass, so code that already catches
    :class:`ValueError` also catches this error.
    """
class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git
    wildmatch pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(cls, pattern):
        # type: (AnyStr) -> Tuple[Optional[AnyStr], Optional[bool]]
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`unicode` or :class:`bytes`) is the pattern to
        convert into a regular expression. Surrounding whitespace is
        ignored, except for a trailing space that is quoted with a
        back-slash, which is kept as part of the pattern.

        Returns the uncompiled regular expression (:class:`unicode`,
        :class:`bytes`, or :data:`None`), and whether matched files should
        be included (:data:`True`), excluded (:data:`False`), or if it is
        a null-operation (:data:`None`). A byte string pattern yields a
        byte string regular expression.

        Raises :exc:`TypeError` if *pattern* is neither a unicode nor a
        byte string. Raises :exc:`GitWildMatchPatternError` if no pattern
        segment is left after normalization (e.g., the pattern "!").
        """
        if isinstance(pattern, unicode):
            return_type = unicode
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError("pattern:{!r} is not a unicode or byte string.".format(pattern))

        original_pattern = pattern

        # Ignore surrounding whitespace. EDGE CASE: A trailing space that
        # is quoted with a back-slash is part of the pattern and must be
        # kept. The space is quoted only when it follows an odd number of
        # back-slashes (an even number is a run of escaped back-slashes).
        pattern = pattern.lstrip()
        trimmed = pattern.rstrip()
        if pattern[len(trimmed):].startswith(' '):
            backslash_count = len(trimmed) - len(trimmed.rstrip('\\'))
            if backslash_count % 2 == 1:
                trimmed += ' '
        pattern = trimmed

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment
            # (neither includes nor excludes files). Escape the hash with a
            # back-slash to match a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single
            # '/' does not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates the
                # pattern (exclude instead of include). Escape the exclamation
                # mark with a back-slash to match a literal exclamation mark
                # (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            if pattern.startswith('\\') and pattern[1:2] not in ('*', '?', '[', '\\'):
                # Remove leading back-slash escape for escaped hash ('#') or
                # exclamation mark ('!'). A back-slash that escapes a glob
                # meta-character ('*', '?', '[' or '\') is kept so that the
                # segment translation still treats that character as a
                # literal instead of as a wildcard.
                pattern = pattern[1:]

            # Allow a regex override for edge cases that cannot be handled
            # through normalization.
            override_regex = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences.
            # Collapse each sequence down to one double-asterisk. Iterate over
            # the segments in reverse and remove the duplicate double
            # asterisks as we go.
            for i in range(len(pattern_segs) - 1, 0, -1):
                prev = pattern_segs[i-1]
                seg = pattern_segs[i]
                if prev == '**' and seg == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except
                # individual files in the root directory. This case cannot be
                # adequately handled through normalization. Use the override.
                override_regex = '^.+/.*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths
                # directly on the root directory instead of any descendant
                # paths. So, remove empty first segment to make pattern relative
                # to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match
                # any descendant path. This is equivalent to "**/{pattern}". So,
                # prepend with double-asterisks to make pattern relative to
                # root.
                # EDGE CASE: This also holds for a single pattern with a
                # trailing slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            else:
                # EDGE CASE: A pattern without a beginning slash ('/') but
                # contains at least one prepended directory (e.g.
                # "dir/{pattern}") should not match "**/dir/{pattern}",
                # according to `git check-ignore` (v2.4.1).
                pass

            if not pattern_segs:
                # After resolving the edge cases, we end up with no
                # pattern at all. This must be because the pattern is
                # invalid.
                raise GitWildMatchPatternError("Invalid git pattern: %r" % (original_pattern,))

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all
                # descendant paths if it is a directory but not if it is a
                # regular file. This is equivalent to "{pattern}/**". So, set
                # last segment to a double-asterisk to include all
                # descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                # Tracks whether a '/' separator must be emitted before the
                # next non-double-asterisk segment.
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks ('**')
                            # will match every path.
                            output.append('.+')
                        elif i == 0:
                            # A normalized pattern beginning with double-asterisks
                            # ('**') will match any leading path segments.
                            output.append('(?:.+/)?')
                            need_slash = False
                        elif i == end:
                            # A normalized pattern ending with double-asterisks ('**')
                            # will match any trailing path segments.
                            output.append('/.*')
                        else:
                            # A pattern with inner double-asterisks ('**') will match
                            # multiple (or zero) inner path segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    elif seg == '*':
                        # Match single path segment.
                        if need_slash:
                            output.append('/')
                        output.append('[^/]+')
                        need_slash = True

                    else:
                        # Match segment glob pattern.
                        if need_slash:
                            output.append('/')

                        output.append(cls._translate_segment_glob(seg))
                        if i == end and include is True:
                            # A pattern ending without a slash ('/') will match a file
                            # or a directory (with paths underneath it). E.g., "foo"
                            # matches "foo", "foo/bar", "foo/bar/baz", etc.
                            # EDGE CASE: However, this does not hold for exclusion cases
                            # according to `git check-ignore` (v2.4.1).
                            output.append('(?:/.*)?')

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # Hand back the same string type that was passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern):
        # type: (Text) -> Text
        """
        Translates the glob pattern to a regular expression. This is used
        by :meth:`pattern_to_regex` to translate a path segment glob
        pattern to its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex fragments and join them once at the end.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                # NOTE: A back-slash at the very end of the pattern has no
                # next character and contributes nothing to the result.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except
                # a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i
                # Pass bracket expression negation.
                if j < end and pattern[j] == '!':
                    j += 1
                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1
                # Find closing bracket. Stop once we reach the end or find it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one past
                    # the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] == '!':
                        # Bracket expression needs to be negated.
                        expr += '^'
                        i += 1
                    elif pattern[i] == '^':
                        # POSIX declares that the regex bracket expression negation
                        # "[^...]" is undefined in a glob pattern. Python's
                        # `fnmatch.translate()` escapes the caret ('^') as a
                        # literal. To maintain consistency with undefined behavior,
                        # I am escaping the '^' as well.
                        expr += '\\^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so
                    # they are treated as literal back-slashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a
                    # bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        return ''.join(parts)

    @staticmethod
    def escape(s):
        # type: (AnyStr) -> AnyStr
        """
        Escape special characters in the given string.

        *s* (:class:`unicode` or :class:`bytes`) a filename or a string
        that you want to escape, usually before adding it to a
        `.gitignore` file.

        Returns the escaped string (:class:`unicode` or :class:`bytes`),
        which has the same string type as *s*.

        Raises :exc:`TypeError` if *s* is neither a unicode nor a byte
        string.
        """
        if isinstance(s, unicode):
            return_type = unicode
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError("s:{!r} is not a unicode or byte string.".format(s))

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        # NOTE(review): The back-slash itself is not in this set, so
        # back-slashes already present in *s* are passed through unchanged.
        meta_characters = r"[]!*#?"

        # Prefix every meta-character with a back-slash.
        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string
# Register `GitWildMatchPattern` as "gitwildmatch".
util.register_pattern("gitwildmatch", GitWildMatchPattern)
class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated alias of :class:`GitWildMatchPattern`, kept only to
    maintain compatibility with v0.4. Creating an instance or calling
    :meth:`pattern_to_regex` emits a :class:`DeprecationWarning`.
    """

    def __init__(self, *args, **kw):
        """
        Emit the deprecation warning, then initialize as the parent
        class does with the same arguments.
        """
        self._deprecated()
        super(GitIgnorePattern, self).__init__(*args, **kw)

    @staticmethod
    def _deprecated():
        """
        Emit the deprecation warning for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. "
            "Use GitWildMatchPattern ('gitwildmatch') instead."
        )
        # The stack level of 3 skips this helper and the method that
        # called it, attributing the warning to that method's caller.
        warnings.warn(message, category=DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then delegate to the parent class
        implementation with the same arguments.
        """
        cls._deprecated()
        result = super(GitIgnorePattern, cls).pattern_to_regex(*args, **kw)
        return result
# Backward compatibility with v0.4: register the deprecated
# `GitIgnorePattern` class as "gitignore".
util.register_pattern("gitignore", GitIgnorePattern)