1"""
2This module provides :class:`GitIgnoreBasicPattern` which implements Git's
3`gitignore`_ patterns as documented. This differs from how Git actually behaves
4when including files in excluded directories.
5
6.. _`gitignore`: https://git-scm.com/docs/gitignore
7"""
8
9from typing import (
10 Optional) # Replaced by `X | None` in 3.10.
11
12from pathspec import util
13from pathspec._typing import (
14 AnyStr, # Removed in 3.18.
15 assert_unreachable,
16 override) # Added in 3.12.
17
18from .base import (
19 GitIgnorePatternError,
20 _BYTES_ENCODING,
21 _GitIgnoreBasePattern)
22
23
class GitIgnoreBasicPattern(_GitIgnoreBasePattern):
    """
    The :class:`GitIgnoreBasicPattern` class represents a compiled gitignore
    pattern, interpreted as the gitignore documentation describes it. This is
    registered as "gitignore".
    """

    # No per-instance attributes are added; keep the class hierarchy dict-less.
    __slots__ = ()

    @staticmethod
    def __normalize_segments(
        is_dir_pattern: bool,
        pattern_segs: list[str],
    ) -> tuple[Optional[list[str]], Optional[str]]:
        """
        Rewrite the pattern segments into a canonical, root-relative form.

        *is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
        pattern (i.e., ends with a slash '/').

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments (the pattern split on '/'). This list is modified in place.

        Returns a :class:`tuple` where exactly one item is set:

        - The normalized segments (:class:`list` of :class:`str`; or
          :data:`None` when a regular expression override is returned).

        - The regular expression override (:class:`str`; or :data:`None` when
          the normalized segments are returned).

        Raises :class:`ValueError` if nothing is left of the pattern after
        normalization.
        """
        if pattern_segs[0] == '':
            # Leading slash ('/'): the pattern is anchored to the root
            # directory. Dropping the empty first segment leaves a
            # root-relative pattern.
            pattern_segs.pop(0)

        elif len(pattern_segs) == 1 or pattern_segs[1:] == ['']:
            # Single segment, with or without a trailing slash ('/'): the
            # pattern may match at any depth, i.e. it behaves like
            # "**/{pattern}". Make that explicit unless it already starts with
            # a double-asterisk.
            if pattern_segs[0] != '**':
                pattern_segs[:0] = ['**']

        # Otherwise the pattern has no leading slash but contains an inner
        # slash (e.g., "dir/{pattern}"), which is already root-relative.

        if not pattern_segs:
            # Nothing is left of the pattern, so it must have been invalid.
            raise ValueError("Pattern normalized to nothing.")

        if pattern_segs[-1] == '':
            # Trailing slash ('/'): the pattern matches everything underneath
            # a directory, i.e. it behaves like "{pattern}/**". Replace the
            # empty last segment accordingly.
            pattern_segs[-1] = '**'

        # EDGE CASE: Collapse runs of double-asterisk segments (i.e., '**/**')
        # into a single double-asterisk. The right-hand side is fully built
        # from the unmodified list before the slice assignment replaces the
        # list contents in place.
        pattern_segs[:] = [
            seg
            for pos, seg in enumerate(pattern_segs)
            if not (pos > 0 and seg == '**' and pattern_segs[pos - 1] == '**')
        ]

        if pattern_segs == ['**']:
            # "**/" normalizes to "**" but must match everything except files
            # in the root, whereas a plain "**" matches every path.
            return (None, '/' if is_dir_pattern else '.')

        if pattern_segs == ['**', '*']:
            # "*" normalizes to "**/*" which matches every path. Special cased
            # for efficiency.
            return (None, '.')

        if pattern_segs == ['**', '*', '**']:
            # "*/" normalizes to "**/*/**" which matches every file not in the
            # root directory. Special cased for efficiency.
            return (None, '/')

        # No special case applies; hand back the normalized segments.
        return (pattern_segs, None)

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the gitignore pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns a :class:`tuple` containing:

        - *pattern* (:class:`str`, :class:`bytes` or :data:`None`) is the
          uncompiled regular expression, of the same string type as the input.

        - *include* (:class:`bool` or :data:`None`) is whether matched files
          should be included (:data:`True`), excluded (:data:`False`), or is a
          null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`, and :class:`GitIgnorePatternError` if the pattern is
        invalid.
        """
        # Work on text internally; remember whether to encode the result.
        if isinstance(pattern, str):
            pattern_str = pattern
            wants_bytes = False
        elif isinstance(pattern, bytes):
            pattern_str = pattern.decode(_BYTES_ENCODING)
            wants_bytes = True
        else:
            raise TypeError(f"{pattern=!r} is not a unicode or byte string.")

        # Keep the untouched text for error messages.
        original_pattern = pattern_str

        # EDGE CASE: Only trailing whitespace is removed (leading spaces are
        # significant). A trailing space escaped with a backslash ('\\ ') is
        # part of the pattern, so in that case nothing is stripped at all.
        if not pattern_str.endswith('\\ '):
            pattern_str = pattern_str.rstrip()

        if not pattern_str or pattern_str.startswith('#'):
            # A blank pattern, or a comment (starts with a hash '#'), is a
            # null-operation: it neither includes nor excludes files. A
            # literal leading hash must be escaped with a backslash (i.e.,
            # '\#').
            return (None, None)

        # A leading exclamation mark ('!') negates the pattern (exclude
        # instead of include). A literal leading exclamation mark must be
        # escaped with a backslash (i.e., '\!').
        include = not pattern_str.startswith('!')
        if not include:
            # Drop the exclamation mark itself.
            pattern_str = pattern_str[1:]

        pattern_segs = pattern_str.split('/')

        # A trailing slash marks a directory pattern; this has to be decided
        # before normalization rewrites the segments.
        #
        # EDGE CASE: A lone slash ('/') is not addressed by the gitignore
        # documentation, and Git treats it as a no-op. Here it is interpreted
        # as matching every descendant path (equivalent to '**'), so it is
        # deliberately not flagged as a directory pattern (which would make it
        # behave like '**/').
        is_dir_pattern = pattern_str != '/' and pattern_segs[-1] == ''

        invalid_message = f"Invalid git pattern: {original_pattern!r}"

        try:
            pattern_segs, override_regex = cls.__normalize_segments(
                is_dir_pattern, pattern_segs,
            )
        except ValueError as e:
            raise GitIgnorePatternError(invalid_message) from e

        if override_regex is not None:
            # Normalization already settled on a complete regular expression.
            regex = override_regex

        elif pattern_segs is None:
            assert_unreachable((
                f"{override_regex=} and {pattern_segs=} cannot both be null."
            ))

        else:
            # Build the regular expression from the normalized segments.
            try:
                regex = ''.join(cls.__translate_segments(pattern_segs))
            except ValueError as e:
                raise GitIgnorePatternError(invalid_message) from e

        # Return the same string type that was passed in.
        out_regex: AnyStr
        if wants_bytes and regex is not None:
            out_regex = regex.encode(_BYTES_ENCODING)
        else:
            out_regex = regex

        return (out_regex, include)

    @classmethod
    def __translate_segments(cls, pattern_segs: list[str]) -> list[str]:
        """
        Translate the normalized pattern segments into regular expression
        parts.

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments.

        Returns the regular expression parts (:class:`list` of :class:`str`),
        to be concatenated in order.
        """
        regex_parts: list[str] = []
        last = len(pattern_segs) - 1

        # Whether a slash must be emitted before the next non-'**' segment.
        slash_pending = False

        for pos, seg in enumerate(pattern_segs):
            if seg == '**':
                if pos == 0:
                    # Leading double-asterisk: match any leading path
                    # segments.
                    # - NOTICE: '(?:^|/)' benchmarks slower using p15
                    #   (sm=0.9382, hs=0.9966, re2=0.9337).
                    regex_parts.append('^(?:.+/)?')

                elif pos < last:
                    # Inner double-asterisk: match multiple (or zero) inner
                    # path segments.
                    regex_parts.append('(?:/.+)?')
                    slash_pending = True

                else:
                    assert pos == last, (pos, last)
                    # Trailing double-asterisk: match any trailing path
                    # segments.
                    regex_parts.append('/')

                continue

            # Regular path segment.
            if pos == 0:
                # Anchor to the root directory.
                regex_parts.append('^')

            if slash_pending:
                regex_parts.append('/')

            whole_segment = (seg == '*')

            # A lone asterisk matches one whole path segment; anything else is
            # translated as a segment glob.
            regex_parts.append(
                '[^/]+' if whole_segment else cls._translate_segment_glob(seg)
            )

            if pos == last:
                # - A final asterisk ('*') matches a file or directory but not
                #   its descendants. E.g., "foo/*" matches "foo/test.json",
                #   "foo/bar/", but not "foo/bar/hello.c".
                # - Any other final segment matches a file, or a directory
                #   along with the paths underneath it. E.g., "foo" matches
                #   "foo", "foo/bar", "foo/bar/baz", etc.
                regex_parts.append('/?$' if whole_segment else '(?:/|$)')

            slash_pending = True

        return regex_parts
314
315
# Register GitIgnoreBasicPattern under the name "gitignore". This is a
# module-level statement, so the registration happens when this module is
# imported.
util.register_pattern('gitignore', GitIgnoreBasicPattern)