1"""
2This module provides :class:`GitIgnoreSpecPattern` which implements Git's
3`gitignore`_ patterns, and handles edge-cases where Git's behavior differs from
4what's documented. Git allows including files from excluded directories which
5appears to contradict the documentation. Git discards patterns with invalid
6range notation. This is used by :class:`~pathspec.gitignore.GitIgnoreSpec` to
7fully replicate Git's handling.
8
9.. _`gitignore`: https://git-scm.com/docs/gitignore
10"""
11
12from typing import (
13 Optional) # Replaced by `X | None` in 3.10.
14
15from pathspec._typing import (
16 AnyStr, # Removed in 3.18.
17 assert_unreachable,
18 override) # Added in 3.12.
19
20from .base import (
21 GitIgnorePatternError,
22 _BYTES_ENCODING,
23 _GitIgnoreBasePattern,
24 _RangeError)
25
_DIR_MARK = 'ps_d'
"""
The name of the regex capture group used as the directory marker. Only
:class:`GitIgnoreSpec` makes use of this group.
"""

_DIR_MARK_CG = '(?P<' + _DIR_MARK + '>/)'
"""
This regular expression captures the directory marker: a single slash ('/')
inside the named group :data:`_DIR_MARK`.
"""

_DIR_MARK_OPT = '(?:' + _DIR_MARK_CG + '|$)'
"""
This regular expression matches either the directory marker (captured in the
named group) or the end of the string.
"""
41
42
class GitIgnoreSpecPattern(_GitIgnoreBasePattern):
    """
    The :class:`GitIgnoreSpecPattern` class represents a compiled gitignore
    pattern with special handling for edge-cases to replicate Git's behavior.

    This is registered under the deprecated name "gitwildmatch" for backward
    compatibility with v0.12. The registered name will be removed in a future
    version.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @staticmethod
    def __normalize_segments(
        is_dir_pattern: bool,
        pattern_segs: list[str],
    ) -> tuple[Optional[list[str]], Optional[str]]:
        """
        Normalize the pattern segments to make processing easier.

        *is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
        pattern (i.e., ends with a slash '/').

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments. This may be modified in place.

        Raises :class:`ValueError` if no segments remain after normalization.

        Returns a :class:`tuple` containing either:

        - The normalized segments (:class:`list` of :class:`str`; or
          :data:`None`).

        - The regular expression override (:class:`str` or :data:`None`).

        Exactly one of the two is not :data:`None`.
        """
        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') should match relative to
            # the root directory. Remove the empty first segment to make the
            # pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (
            len(pattern_segs) == 2 and not pattern_segs[1]
        ):
            # A single segment pattern with or without a trailing slash ('/')
            # will match any descendant path. This is equivalent to
            # "**/{pattern}". Prepend a double-asterisk segment to make the
            # pattern relative to root.
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        else:
            # A pattern without a beginning slash ('/') but contains at least
            # one prepended directory (e.g., "dir/{pattern}") should match
            # relative to the root directory. No segment modification is
            # needed.
            pass

        if not pattern_segs:
            # After normalization, we end up with no pattern at all. This must
            # be because the pattern is invalid.
            raise ValueError("Pattern normalized to nothing.")

        if not pattern_segs[-1]:
            # A pattern ending with a slash ('/') will match all descendant
            # paths if it is a directory but not if it is a regular file. This
            # is equivalent to "{pattern}/**". Set the empty last segment to a
            # double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        # EDGE CASE: Collapse duplicate double-asterisk sequences (i.e.,
        # '**/**'). Iterate over the segments in reverse order and remove the
        # duplicate double asterisks as we go. Reverse order keeps the indices
        # that are still to be visited valid after each deletion.
        for i in range(len(pattern_segs) - 1, 0, -1):
            prev = pattern_segs[i-1]
            seg = pattern_segs[i]
            if prev == '**' and seg == '**':
                del pattern_segs[i]

        seg_count = len(pattern_segs)
        if seg_count == 1 and pattern_segs[0] == '**':
            if is_dir_pattern:
                # The pattern "**/" will be normalized to "**", but it should
                # match everything except for files in the root. Special case
                # this pattern.
                return (None, _DIR_MARK_CG)
            else:
                # The pattern "**" will match every path. Special case this
                # pattern.
                return (None, '.')

        elif (
            seg_count == 2
            and pattern_segs[0] == '**'
            and pattern_segs[1] == '*'
        ):
            # The pattern "*" will be normalized to "**/*" and will match every
            # path. Special case this pattern for efficiency.
            return (None, '.')

        elif (
            seg_count == 3
            and pattern_segs[0] == '**'
            and pattern_segs[1] == '*'
            and pattern_segs[2] == '**'
        ):
            # The pattern "*/" will be normalized to "**/*/**" which will match
            # every file not in the root directory. Special case this pattern
            # for efficiency.
            if is_dir_pattern:
                return (None, _DIR_MARK_CG)
            else:
                return (None, '/')

        # No regular expression override, return modified pattern segments.
        return (pattern_segs, None)

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`.

        Raises :class:`GitIgnorePatternError` if the pattern is invalid.

        Returns a :class:`tuple` containing:

        - *pattern* (:class:`str`, :class:`bytes` or :data:`None`) is the
          uncompiled regular expression.

        - *include* (:class:`bool` or :data:`None`) is whether matched files
          should be included (:data:`True`), excluded (:data:`False`), or is a
          null-operation (:data:`None`).
        """
        if isinstance(pattern, str):
            pattern_str = pattern
            return_type = str
        elif isinstance(pattern, bytes):
            pattern_str = pattern.decode(_BYTES_ENCODING)
            return_type = bytes
        else:
            raise TypeError(f"{pattern=!r} is not a unicode or byte string.")

        original_pattern = pattern_str
        del pattern

        # EDGE CASE: Leading spaces should be kept (only trailing spaces should
        # be removed). Git does not remove leading spaces.
        stripped = pattern_str.rstrip()

        # EDGE CASE: Spaces can be escaped with backslash. A trailing space is
        # escaped only when it is preceded by an odd number of backslashes (an
        # even run is made of escaped backslashes). Keep that one escaped space
        # and drop any unescaped whitespace that follows it.
        escape_count = len(stripped) - len(stripped.rstrip('\\'))
        if escape_count % 2 and pattern_str.startswith(' ', len(stripped)):
            stripped += ' '

        pattern_str = stripped

        regex: Optional[str]
        include: Optional[bool]

        if not pattern_str:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            return (None, None)

        elif pattern_str.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment (neither
            # includes nor excludes files). Escape the hash with a backslash to
            # match a literal hash (i.e., '\#').
            return (None, None)

        elif pattern_str == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
            # does not match any file.
            return (None, None)

        if pattern_str.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the
            # pattern (exclude instead of include). Escape the exclamation mark
            # with a backslash to match a literal exclamation mark (i.e.,
            # '\!').
            include = False
            # Remove leading exclamation mark.
            pattern_str = pattern_str[1:]
        else:
            include = True

        # Split pattern into segments.
        pattern_segs = pattern_str.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.
        try:
            pattern_segs, override_regex = cls.__normalize_segments(
                is_dir_pattern, pattern_segs,
            )
        except ValueError as e:
            raise GitIgnorePatternError((
                f"Invalid git pattern: {original_pattern!r}"
            )) from e  # GitIgnorePatternError

        if override_regex is not None:
            # Use regex override.
            regex = override_regex

        elif pattern_segs is not None:
            # Build regular expression from pattern.
            try:
                regex_parts = cls.__translate_segments(
                    is_dir_pattern, pattern_segs,
                )
            except _RangeError:
                # EDGE CASE: Git discards patterns with invalid range notation.
                return (None, None)
            except ValueError as e:
                raise GitIgnorePatternError((
                    f"Invalid git pattern: {original_pattern!r}"
                )) from e  # GitIgnorePatternError

            regex = ''.join(regex_parts)

        else:
            assert_unreachable((
                f"{override_regex=} and {pattern_segs=} cannot both be null."
            ))  # assert_unreachable

        # Encode regex if needed so the result has the same type as the input.
        out_regex: AnyStr
        if regex is not None and return_type is bytes:
            out_regex = regex.encode(_BYTES_ENCODING)
        else:
            out_regex = regex

        return (out_regex, include)

    @classmethod
    def __translate_segments(
        cls,
        is_dir_pattern: bool,
        pattern_segs: list[str],
    ) -> list[str]:
        """
        Translate the pattern segments to regular expressions.

        *is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
        pattern (i.e., ends with a slash '/').

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments.

        Raises :class:`_RangeError` if invalid range notation is found.

        Returns the regular expression parts (:class:`list` of :class:`str`).
        """
        # Build regular expression from pattern.
        out_parts = []
        # Whether a slash must be emitted before the next non-'**' segment.
        need_slash = False
        end = len(pattern_segs) - 1
        for i, seg in enumerate(pattern_segs):
            if seg == '**':
                if i == 0:
                    # A normalized pattern beginning with double-asterisks
                    # ('**') will match any leading path segments.
                    out_parts.append('^(?:.+/)?')

                elif i < end:
                    # A pattern with inner double-asterisks ('**') will match
                    # multiple (or zero) inner path segments.
                    out_parts.append('(?:/.+)?')
                    need_slash = True

                else:
                    assert i == end, (i, end)
                    # A normalized pattern ending with double-asterisks ('**')
                    # will match any trailing path segments.
                    if is_dir_pattern:
                        out_parts.append(_DIR_MARK_CG)
                    else:
                        out_parts.append('/')

            else:
                # Match path segment.
                if i == 0:
                    # Anchor to root directory.
                    out_parts.append('^')

                if need_slash:
                    out_parts.append('/')

                if seg == '*':
                    # Match whole path segment.
                    out_parts.append('[^/]+')

                else:
                    # Match segment glob pattern.
                    # - EDGE CASE: Git discards patterns with invalid range
                    #   notation.
                    out_parts.append(cls._translate_segment_glob(seg, 'raise'))

                if i == end:
                    # A pattern ending without a slash ('/') will match a file
                    # or a directory (with paths underneath it). E.g., "foo"
                    # matches "foo", "foo/bar", "foo/bar/baz", etc.
                    out_parts.append(_DIR_MARK_OPT)

                need_slash = True

        return out_parts