1"""
2This module provides :class:`GitIgnoreSpecPattern` which implements Git's
3`gitignore`_ patterns, and handles edge-cases where Git's behavior differs from
4what's documented. Git allows including files from excluded directories which
5appears to contradict the documentation. This is used by
6:class:`~pathspec.gitignore.GitIgnoreSpec` to fully replicate Git's handling.
7
8.. _`gitignore`: https://git-scm.com/docs/gitignore
9"""
10
11from typing import (
12 Optional) # Replaced by `X | None` in 3.10.
13
14from pathspec._typing import (
15 AnyStr, # Removed in 3.18.
16 assert_unreachable,
17 override) # Added in 3.12.
18
19from .base import (
20 GitIgnorePatternError,
21 _BYTES_ENCODING,
22 _GitIgnoreBasePattern)
23
# Name of the named capture group embedded in the regex fragments below.
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""

# Named capture group (named by `_DIR_MARK`) that matches exactly one slash
# ('/').
_DIR_MARK_CG = f'(?P<{_DIR_MARK}>/)'
"""
This regular expression matches the directory marker.
"""

# Non-capturing alternation: either the directory marker group (`_DIR_MARK_CG`)
# or the end of the string ('$').
_DIR_MARK_OPT = f'(?:{_DIR_MARK_CG}|$)'
"""
This regular expression matches the optional directory marker and sub-path.
"""
39
40
class GitIgnoreSpecPattern(_GitIgnoreBasePattern):
    """
    The :class:`GitIgnoreSpecPattern` class represents a compiled gitignore
    pattern with special handling for edge-cases to replicate Git's behavior.

    This is registered under the deprecated name "gitwildmatch" for backward
    compatibility with v0.12. The registered name will be removed in a future
    version.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @staticmethod
    def __normalize_segments(
        is_dir_pattern: bool,
        pattern_segs: list[str],
    ) -> tuple[Optional[list[str]], Optional[str]]:
        """
        Normalize the pattern segments to make processing easier.

        *is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
        pattern (i.e., ends with a slash '/').

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments. This may be modified in place.

        Raises :class:`ValueError` if no segments remain after normalization.

        Returns a :class:`tuple` containing either:

        - The normalized segments (:class:`list` of :class:`str`; or
          :data:`None`).

        - The regular expression override (:class:`str` or :data:`None`).

        Exactly one of the two items is not :data:`None`.
        """
        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') should match relative to
            # the root directory. Remove the empty first segment to make the
            # pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (
            len(pattern_segs) == 2 and not pattern_segs[1]
        ):
            # A single segment pattern with or without a trailing slash ('/')
            # will match any descendant path. This is equivalent to
            # "**/{pattern}". Prepend double-asterisk segment to make pattern
            # relative to root.
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        else:
            # A pattern without a beginning slash ('/') but contains at least
            # one prepended directory (e.g., "dir/{pattern}") should match
            # relative to the root directory. No segment modification is
            # needed.
            pass

        if not pattern_segs:
            # After normalization, we end up with no pattern at all. This must
            # be because the pattern is invalid.
            raise ValueError("Pattern normalized to nothing.")

        if not pattern_segs[-1]:
            # A pattern ending with a slash ('/') will match all descendant
            # paths if it is a directory but not if it is a regular file. This
            # is equivalent to "{pattern}/**". Set empty last segment to a
            # double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        # EDGE CASE: Collapse duplicate double-asterisk sequences (i.e.,
        # '**/**'). Iterate over the segments in reverse order so that deleting
        # index *i* never shifts the (lower) indices still to be visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            prev = pattern_segs[i - 1]
            seg = pattern_segs[i]
            if prev == '**' and seg == '**':
                del pattern_segs[i]

        seg_count = len(pattern_segs)
        if seg_count == 1 and pattern_segs[0] == '**':
            if is_dir_pattern:
                # The pattern "**/" will be normalized to "**", but it should
                # match everything except for files in the root. Special case
                # this pattern.
                return (None, _DIR_MARK_CG)
            else:
                # The pattern "**" will match every path. Special case this
                # pattern.
                return (None, '.')

        elif (
            seg_count == 2
            and pattern_segs[0] == '**'
            and pattern_segs[1] == '*'
        ):
            # The pattern "*" will be normalized to "**/*" and will match every
            # path. Special case this pattern for efficiency.
            return (None, '.')

        elif (
            seg_count == 3
            and pattern_segs[0] == '**'
            and pattern_segs[1] == '*'
            and pattern_segs[2] == '**'
        ):
            # The pattern "*/" will be normalized to "**/*/**" which will match
            # every file not in the root directory. Special case this pattern
            # for efficiency.
            if is_dir_pattern:
                return (None, _DIR_MARK_CG)
            else:
                return (None, '/')

        # No regular expression override, return modified pattern segments.
        return (pattern_segs, None)

    @staticmethod
    def __strip_trailing_whitespace(pattern_str: str) -> str:
        """
        Remove trailing whitespace from the pattern while keeping a trailing
        space that is escaped with a backslash (i.e., '\\ ').

        *pattern_str* (:class:`str`) is the pattern.

        Returns the stripped pattern (:class:`str`).
        """
        stripped = pattern_str.rstrip()
        if len(stripped) == len(pattern_str):
            # Nothing was stripped.
            return pattern_str

        if pattern_str[len(stripped)] != ' ':
            # The first stripped character is not a space, so there is no
            # escaped space to preserve.
            return stripped

        # EDGE CASE: Spaces can be escaped with backslash. The first stripped
        # space is escaped only when it is preceded by an odd number of
        # backslashes; an even run is a sequence of escaped backslashes and
        # leaves the space unescaped.
        backslash_count = len(stripped) - len(stripped.rstrip('\\'))
        if backslash_count % 2:
            # Restore the escaped space. Any whitespace after it (e.g., extra
            # spaces or a newline) stays removed.
            return stripped + ' '

        return stripped

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitIgnorePatternError` if the pattern is
        invalid.

        Returns a :class:`tuple` containing:

        - *pattern* (:class:`str`, :class:`bytes` or :data:`None`) is the
          uncompiled regular expression.

        - *include* (:class:`bool` or :data:`None`) is whether matched files
          should be included (:data:`True`), excluded (:data:`False`), or is a
          null-operation (:data:`None`).
        """
        if isinstance(pattern, str):
            pattern_str = pattern
            return_type = str
        elif isinstance(pattern, bytes):
            pattern_str = pattern.decode(_BYTES_ENCODING)
            return_type = bytes
        else:
            raise TypeError(f"{pattern=!r} is not a unicode or byte string.")

        # Keep the unmodified pattern for error messages.
        original_pattern = pattern_str
        del pattern

        # EDGE CASE: Leading spaces should be kept (only trailing spaces should
        # be removed). Git does not remove leading spaces. A trailing space
        # escaped with a backslash is part of the pattern and must not be
        # stripped from the right.
        pattern_str = cls.__strip_trailing_whitespace(pattern_str)

        regex: Optional[str]
        include: Optional[bool]

        if not pattern_str:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            return (None, None)

        elif pattern_str.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment (neither
            # includes nor excludes files). Escape the hash with a backslash to
            # match a literal hash (i.e., '\#').
            return (None, None)

        elif pattern_str == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
            # does not match any file.
            return (None, None)

        if pattern_str.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the
            # pattern (exclude instead of include). Escape the exclamation mark
            # with a back slash to match a literal exclamation mark (i.e.,
            # '\!').
            include = False
            # Remove leading exclamation mark.
            pattern_str = pattern_str[1:]
        else:
            include = True

        # Split pattern into segments.
        pattern_segs = pattern_str.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.
        try:
            pattern_segs, override_regex = cls.__normalize_segments(
                is_dir_pattern, pattern_segs,
            )
        except ValueError as e:
            raise GitIgnorePatternError((
                f"Invalid git pattern: {original_pattern!r}"
            )) from e  # GitIgnorePatternError

        if override_regex is not None:
            # Use regex override.
            regex = override_regex

        elif pattern_segs is not None:
            # Build regular expression from pattern.
            try:
                regex_parts = cls.__translate_segments(
                    is_dir_pattern, pattern_segs,
                )
            except ValueError as e:
                raise GitIgnorePatternError((
                    f"Invalid git pattern: {original_pattern!r}"
                )) from e  # GitIgnorePatternError

            regex = ''.join(regex_parts)

        else:
            assert_unreachable((
                f"{override_regex=} and {pattern_segs=} cannot both be null."
            ))  # assert_unreachable

        # Encode regex if needed.
        out_regex: AnyStr
        if regex is not None and return_type is bytes:
            out_regex = regex.encode(_BYTES_ENCODING)
        else:
            out_regex = regex

        return (out_regex, include)

    @classmethod
    def __translate_segments(
        cls,
        is_dir_pattern: bool,
        pattern_segs: list[str],
    ) -> list[str]:
        """
        Translate the pattern segments to regular expressions.

        *is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
        pattern (i.e., ends with a slash '/').

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments.

        The caller treats a :class:`ValueError` raised from here as an invalid
        pattern; presumably it originates from the segment glob translation.

        Returns the regular expression parts (:class:`list` of :class:`str`).
        """
        # Build regular expression from pattern.
        out_parts = []
        # Whether a '/' separator must be emitted before the next literal or
        # glob segment.
        need_slash = False
        end = len(pattern_segs) - 1
        for i, seg in enumerate(pattern_segs):
            if seg == '**':
                if i == 0:
                    # A normalized pattern beginning with double-asterisks
                    # ('**') will match any leading path segments. The
                    # expression already ends with the separator, so
                    # *need_slash* stays unset.
                    out_parts.append('^(?:.+/)?')

                elif i < end:
                    # A pattern with inner double-asterisks ('**') will match
                    # multiple (or zero) inner path segments.
                    out_parts.append('(?:/.+)?')
                    need_slash = True

                else:
                    assert i == end, (i, end)
                    # A normalized pattern ending with double-asterisks ('**')
                    # will match any trailing path segments.
                    if is_dir_pattern:
                        out_parts.append(_DIR_MARK_CG)
                    else:
                        out_parts.append('/')

            else:
                # Match path segment.
                if i == 0:
                    # Anchor to root directory.
                    out_parts.append('^')

                if need_slash:
                    out_parts.append('/')

                if seg == '*':
                    # Match whole path segment.
                    out_parts.append('[^/]+')

                else:
                    # Match segment glob pattern.
                    out_parts.append(cls._translate_segment_glob(seg))

                if i == end:
                    # A pattern ending without a slash ('/') will match a file
                    # or a directory (with paths underneath it). E.g., "foo"
                    # matches "foo", "foo/bar", "foo/bar/baz", etc.
                    out_parts.append(_DIR_MARK_OPT)

                need_slash = True

        return out_parts