Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pathspec/patterns/gitwildmatch.py: 76%

1"""

2This module implements Git's wildmatch pattern matching which itself is derived

3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.

4"""

6import re

7import warnings

8from typing import (

9 AnyStr,

10 Optional) # Replaced by `X | None` in 3.10.

12from .. import (

13 util)

14from ..pattern import (

15 RegexPattern)

16from .._typing import (

17 override) # Added in 3.12.

_BYTES_ENCODING = 'latin1'
"""
The encoding used to decode a byte string pattern for parsing, and to encode
the resulting string back to bytes.
"""

_DIR_MARK = 'ps_d'
"""
The name of the regex group that marks the directory separator in a generated
regular expression. This is only used by :class:`GitIgnoreSpec`.
"""

class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` exception is raised when a git
    wildmatch pattern is invalid. It is a :class:`ValueError`.
    """

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class is a :class:`RegexPattern` whose
    regular expression is generated from a Git wildmatch pattern.
    """

    # No instance attributes are added here; keep the class hierarchy
    # dict-less.
    __slots__ = ()

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Translate a Git wildmatch pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the wildmatch pattern. A
        :class:`bytes` pattern is decoded for parsing, and the generated
        regular expression is encoded back to :class:`bytes`.

        Returns a :class:`tuple` containing the uncompiled regular expression
        (:class:`str`, :class:`bytes`, or :data:`None`); and the include flag,
        which is :data:`True` when matched files should be included,
        :data:`False` when they should be excluded (the pattern was negated
        with '!'), or :data:`None` when the pattern is a null-operation (a
        blank pattern, a comment, or a lone '/').

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the
        pattern is invalid.
        """
        if isinstance(pattern, str):
            wants_bytes = False
            text = pattern
        elif isinstance(pattern, bytes):
            wants_bytes = True
            text = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the decoded, unstripped pattern for error messages.
        unstripped = text

        # EDGE CASE: A trailing space escaped with a backslash is significant,
        # so in that case only the leading whitespace is removed.
        text = text.lstrip() if text.endswith('\\ ') else text.strip()

        # Null-operations (neither include nor exclude files):
        # - a blank pattern;
        # - a lone '/', which does not match any file according to
        #   `git check-ignore` (v2.4.1);
        # - a comment, i.e. a pattern starting with '#' (a literal leading
        #   hash must be written as '\#').
        if not text or text == '/' or text.startswith('#'):
            return None, None

        # A leading exclamation mark negates the pattern (exclude instead of
        # include). A literal leading exclamation mark must be written as
        # '\!'.
        include = not text.startswith('!')
        if not include:
            text = text[1:]

        segments = text.split('/')

        # A trailing slash makes this specifically a directory pattern. This
        # has to be recorded before the segments are normalized below.
        dir_only = segments[-1] == ''

        # EDGE CASE: Collapse every run of consecutive double-asterisk
        # segments down to a single double-asterisk.
        collapsed = []
        for seg in segments:
            if seg == '**' and collapsed and collapsed[-1] == '**':
                continue
            collapsed.append(seg)
        segments = collapsed

        # Named group marking the slash that follows a matched directory.
        dir_group = f'(?P<{_DIR_MARK}>/)'

        if segments == ['**', '']:
            # EDGE CASE: The '**/' pattern should match everything except
            # individual files in the root directory. This cannot be expressed
            # through normalization, so the regex is written out directly.
            regex = f'^.+{dir_group}.*$'

        else:
            if segments[0] == '':
                # A leading slash anchors the pattern to the root directory.
                # Drop the empty first segment so the remaining segments are
                # relative to root.
                del segments[0]

            elif len(segments) == 1 or (len(segments) == 2 and segments[1] == ''):
                # A single segment without a leading slash (optionally with a
                # trailing slash, e.g. "dir/") matches at any depth, which is
                # equivalent to "**/{pattern}".
                if segments[0] != '**':
                    segments.insert(0, '**')

            # Otherwise the pattern has no leading slash but contains at least
            # one directory (e.g. "dir/{pattern}"). According to
            # `git check-ignore` (v2.4.1) it should not match
            # "**/dir/{pattern}", so it is left relative to root.

            if not segments:
                # Nothing is left after normalization, so the pattern must be
                # invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {unstripped!r}")

            if len(segments) > 1 and segments[-1] == '':
                # A trailing slash matches all descendants of a directory but
                # not a regular file, which is equivalent to "{pattern}/**".
                segments[-1] = '**'

            # Suffix for a final non-'**' segment: it matches a file, or a
            # directory together with every path underneath it. E.g., "foo"
            # matches "foo", "foo/bar", "foo/bar/baz", etc.
            descendants = f'(?:{dir_group}.*)?'

            parts = ['^']
            need_slash = False
            last = len(segments) - 1
            for index, seg in enumerate(segments):
                at_start = index == 0
                at_end = index == last

                if seg == '**':
                    if at_start and at_end:
                        # A pattern consisting solely of double-asterisks will
                        # match every path.
                        parts.append('[^/]+(?:/.*)?')
                    elif at_start:
                        # Leading double-asterisks match any leading path
                        # segments.
                        parts.append('(?:.+/)?')
                        need_slash = False
                    elif at_end:
                        # Trailing double-asterisks match any trailing path
                        # segments.
                        parts.append(f'{dir_group}.*' if dir_only else '/.*')
                    else:
                        # Inner double-asterisks match multiple (or zero)
                        # inner path segments.
                        parts.append('(?:/.+)?')
                        need_slash = True
                    continue

                if need_slash:
                    parts.append('/')

                if seg == '*':
                    # A lone asterisk matches a single path segment.
                    parts.append('[^/]+')
                else:
                    # Any other segment is translated as a glob.
                    try:
                        translated = cls._translate_segment_glob(seg)
                    except ValueError as e:
                        raise GitWildMatchPatternError(f"Invalid git pattern: {unstripped!r}") from e
                    parts.append(translated)

                if at_end:
                    parts.append(descendants)

                need_slash = True

            parts.append('$')
            regex = ''.join(parts)

        if wants_bytes:
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translate the glob pattern of a single path segment into a regular
        expression. This is called by :meth:`pattern_to_regex` for each
        segment that is neither '*' nor '**'.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        chunks = []
        escaping = False
        pos, length = 0, len(pattern)
        while pos < length:
            char = pattern[pos]
            pos += 1

            if escaping:
                # The previous character was a backslash: take this character
                # literally.
                escaping = False
                chunks.append(re.escape(char))
                continue

            if char == '\\':
                # Escape character: the next character is taken literally.
                escaping = True
                continue

            if char == '*':
                # Multi-character wildcard: any string without slashes,
                # including the empty string.
                chunks.append('[^/]*')
                continue

            if char == '?':
                # Single-character wildcard: any one character except a slash.
                chunks.append('[^/]')
                continue

            if char != '[':
                # Regular character: escape it for regex.
                chunks.append(re.escape(char))
                continue

            # Bracket expression wildcard. Except for a leading exclamation
            # mark, the expression can be used directly as regex once its end
            # has been located.
            # - "[][!]" matches ']', '[' and '!'.
            # - "[]-]" matches ']' and '-'.
            # - "[!]a-]" matches any character except ']', 'a' and '-'.
            close = pos

            # Step over the negation marker.
            if close < length and pattern[close] in ('!', '^'):
                close += 1

            # A closing bracket in first position belongs to the expression.
            if close < length and pattern[close] == ']':
                close += 1

            # Locate the bracket that terminates the expression (-1 if none).
            close = pattern.find(']', close)

            if close == -1:
                # No closing bracket: treat the opening bracket as a literal
                # bracket instead of as the start of an expression.
                chunks.append('\\[')
                continue

            # Move one past the closing bracket:
            #
            #   [...]
            #    ^   ^
            #  pos   close
            #
            close += 1

            # Both '!' and '^' negate the expression. POSIX leaves "[^...]"
            # undefined in a glob pattern and Python's `fnmatch.translate()`
            # escapes the caret as a literal, but Git supports the caret for
            # negation, so stay consistent with Git.
            negated = pattern[pos] in ('!', '^')
            if negated:
                pos += 1

            # Double the backslashes so that regex treats them as literal
            # backslashes inside the bracket expression.
            body = pattern[pos:close].replace('\\', '\\\\')
            chunks.append(('[^' if negated else '[') + body)

            # Continue one past the closing bracket.
            pos = close

        if escaping:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(chunks)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Backslash-escape the pattern meta-characters in the given string.

        *s* (:class:`str` or :class:`bytes`) is a filename or other string to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`, matching
        the type of *s*).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            wants_bytes = False
            text = s
        elif isinstance(s, bytes):
            wants_bytes = True
            text = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        escapes = {ord(char): "\\" + char for char in r"[]!*#?"}
        escaped = text.translate(escapes)

        if wants_bytes:
            return escaped.encode(_BYTES_ENCODING)
        return escaped

391

# Register `GitWildMatchPattern` as "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)

393

394

class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated in favor of
    :class:`GitWildMatchPattern`. It is kept only for compatibility with v0.4.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize the pattern with the
        given arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Emit the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # The stack level skips this helper and the method that called it.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @override
    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then delegate to the parent
        implementation with the given arguments.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

426

# Keep the deprecated `GitIgnorePattern` registered as "gitignore" so that code
# written against v0.4 continues to work.
util.register_pattern('gitignore', GitIgnorePattern)