Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pathspec/patterns/gitwildmatch.py: 87%

1"""

2This module implements Git's wildmatch pattern matching which itself is derived

3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.

4"""

6import re

7import warnings

8from typing import (

9 AnyStr,

10 Optional) # Replaced by `X | None` in 3.10.

12from .. import (

13 util)

14from ..pattern import (

15 RegexPattern)

16from .._typing import (

17 override) # Added in 3.12.

_BYTES_ENCODING = 'latin1'
"""
Byte-string patterns are decoded with this encoding before parsing, and the
resulting regular expression is encoded back with it.
"""

_DIR_MARK = 'ps_d'
"""
Name of the regex capture group that marks a directory match. This is only used
by :class:`GitIgnoreSpec`.
"""

_DIR_MARK_CG = '(?P<' + _DIR_MARK + '>/)'
"""
Regular expression fragment: a named group (see :data:`_DIR_MARK`) capturing a
single slash, i.e. the directory marker.
"""

_DIR_MARK_OPT = '(?:' + _DIR_MARK_CG + '|$)'
"""
Regular expression fragment: either the directory marker (a captured slash) or
the end of the string.
"""

class GitWildMatchPatternError(ValueError):
    """
    Raised when a Git wildmatch pattern is invalid and cannot be converted into
    a regular expression. Being a :class:`ValueError` subclass, it is also
    caught by handlers written for :class:`ValueError`.
    """

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression. Leading whitespace is dropped. Trailing
        whitespace is dropped as well, except that a single trailing space is
        kept when it is quoted by a backslash (e.g. ``'foo\\ '``).

        Returns the uncompiled regular expression (:class:`str`,
        :class:`bytes`, or :data:`None`); and whether matched files should be
        included (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the pattern
        is invalid.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        original_pattern = pattern

        # Strip surrounding whitespace.
        # EDGE CASE: Spaces can be escaped with a backslash. Git ignores
        # trailing spaces unless they are quoted with a backslash, so when the
        # first stripped trailing character is a space preceded by an odd
        # number of backslashes (i.e. the last backslash is itself unescaped),
        # that one space is part of the pattern and is restored. Any further
        # trailing whitespace is still dropped.
        pattern = pattern.lstrip()
        trimmed = pattern.rstrip()
        if len(trimmed) < len(pattern) and pattern[len(trimmed)] == ' ':
            backslash_count = len(trimmed) - len(trimmed.rstrip('\\'))
            if backslash_count % 2:
                trimmed += ' '
        pattern = trimmed

        regex: Optional[str]
        include: Optional[bool]

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment (neither
            # includes nor excludes files). Escape the hash with a back-slash
            # to match a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
            # does not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates the
                # pattern (exclude instead of include). Escape the exclamation
                # mark with a back-slash to match a literal exclamation mark
                # (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled
            # through normalization.
            override_regex: Optional[str] = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Check whether the pattern is specifically a directory pattern
            # before normalization.
            is_dir_pattern = not pattern_segs[-1]

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences.
            # Collapse each sequence down to one double-asterisk. Iterate over
            # the segments in reverse and remove the duplicate double asterisks
            # as we go (reverse order keeps the remaining indices valid).
            for i in range(len(pattern_segs) - 1, 0, -1):
                prev = pattern_segs[i - 1]
                seg = pattern_segs[i]
                if prev == '**' and seg == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except
                # individual files in the root directory. This case cannot be
                # adequately handled through normalization. Use the override.
                override_regex = _DIR_MARK_CG

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths
                # directly on the root directory instead of any descendant
                # paths. So, remove empty first segment to make pattern
                # relative to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single segment pattern without a beginning slash ('/') will
                # match any descendant path. This is equivalent to
                # "**/{pattern}". So, prepend with double-asterisks to make
                # pattern relative to root.
                # - EDGE CASE: This also holds for a single segment pattern
                #   with a trailing slash (e.g. 'dir/').
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            # EDGE CASE (no action needed): A pattern without a beginning slash
            # ('/') but contains at least one prepended directory (e.g.
            # "dir/{pattern}") should not match "**/dir/{pattern}", according
            # to `git check-ignore` (v2.4.1).

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern at
                # all. This must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all descendant
                # paths if it is a directory but not if it is a regular file.
                # This is equivalent to "{pattern}/**". So, set last segment to
                # a double-asterisk to include all descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                seg_count = len(pattern_segs)
                if seg_count == 1 and pattern_segs[0] == '**':
                    # The pattern "**" will match every path. Special case this
                    # pattern.
                    override_regex = '.'

                elif (
                    seg_count == 2
                    and pattern_segs[0] == '**'
                    and pattern_segs[1] == '*'
                ):
                    # The pattern "*" will be normalized to "**/*" and will
                    # match every path. Special case this pattern for
                    # efficiency.
                    override_regex = '.'

                elif (
                    seg_count == 3
                    and pattern_segs[0] == '**'
                    and pattern_segs[1] == '*'
                    and pattern_segs[2] == '**'
                ):
                    # The pattern "*/" will be normalized to "**/*/**" which
                    # will match every file not in the root directory. Special
                    # case this pattern for efficiency.
                    if is_dir_pattern:
                        override_regex = _DIR_MARK_CG
                    else:
                        override_regex = '/'

            if override_regex is None:
                # Build regular expression from pattern.
                output = []
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0:
                            # A normalized pattern beginning with
                            # double-asterisks ('**') will match any leading
                            # path segments.
                            output.append('^(?:.+/)?')

                        elif i < end:
                            # A pattern with inner double-asterisks ('**') will
                            # match multiple (or zero) inner path segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                        else:
                            assert i == end, (i, end)
                            # A normalized pattern ending with double-asterisks
                            # ('**') will match any trailing path segments.
                            if is_dir_pattern:
                                output.append(_DIR_MARK_CG)
                            else:
                                output.append('/')

                    else:
                        # Match path segment.
                        if i == 0:
                            # Anchor to root directory.
                            output.append('^')

                        if need_slash:
                            output.append('/')

                        if seg == '*':
                            # Match whole path segment.
                            output.append('[^/]+')

                        else:
                            # Match segment glob pattern.
                            try:
                                output.append(cls._translate_segment_glob(seg))
                            except ValueError as e:
                                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                        if i == end:
                            # A pattern ending without a slash ('/') will match
                            # a file or a directory (with paths underneath it).
                            # E.g., "foo" matches "foo", "foo/bar",
                            # "foo/bar/baz", etc.
                            output.append(_DIR_MARK_OPT)

                        need_slash = True

                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # The input was a byte string, so hand back a byte-string regex.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to
        its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (backslash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex fragments and join them once at the end.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except
                # a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex, but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and (pattern[j] == '!' or pattern[j] == '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket. Stop once we reach the end or find it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one
                    # past the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated. POSIX
                        # declares that the regex bracket expression negation
                        # "[^...]" is undefined in a glob pattern, and Python's
                        # `fnmatch.translate()` escapes the caret ('^') as a
                        # literal. Git supports using a caret for negation, so
                        # both '!' and '^' negate here to stay consistent with
                        # Git.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape backslashes so
                    # they are treated as literal backslashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as
                    # a bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        # NOTE(review): the backslash itself is not in this set, so a literal
        # backslash in *s* is passed through unescaped. Confirm whether that is
        # intended before relying on round-tripping such names.
        meta_characters = r"[]!*#?"

        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string


util.register_pattern('gitwildmatch', GitWildMatchPattern)

421

422

class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated subclass of :class:`GitWildMatchPattern`, kept only for
    compatibility with v0.4. Constructing an instance or calling
    :meth:`pattern_to_regex` emits a :class:`DeprecationWarning` and then
    defers to the parent implementation.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize via the parent class
        with the arguments passed through unchanged.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # stacklevel=3 skips this helper and the method that called it, so the
        # warning is attributed to that method's caller.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @override
    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then return the parent class's
        conversion result for the arguments passed through unchanged.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)


# Keep the "gitignore" name registered so code written against v0.4 continues
# to resolve to `GitIgnorePattern`.
util.register_pattern('gitignore', GitIgnorePattern)