Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pathspec/patterns/gitwildmatch.py: 87%

1"""

2This module implements Git's wildmatch pattern matching which itself is derived

3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.

4"""

6import re

7import warnings

8from typing import (

9 AnyStr,

10 Optional, # Replaced by `X | None` in 3.10.

11 Tuple) # Replaced by `tuple` in 3.9.

13from .. import util

14from ..pattern import RegexPattern

# Byte-string input is decoded with this encoding before it is processed, and
# the resulting text (generated regex or escaped string) is encoded back with
# it, so callers that pass bytes receive bytes.
16_BYTES_ENCODING = 'latin1'

17"""

18The encoding to use when parsing a byte string pattern.

19"""

# Name of the named capture group that the generated regexes place around the
# '/' separating a matched directory from whatever lies beneath it.
21_DIR_MARK = 'ps_d'

22"""

23The regex group name for the directory marker. This is only used by

24:class:`GitIgnoreSpec`.

25"""

class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` indicates an invalid git wild match
    pattern.

    It derives from :class:`ValueError`, so callers that already catch
    :class:`ValueError` around pattern compilation keep working.
    """

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.

    :meth:`pattern_to_regex` converts a single ".gitignore" style line into an
    uncompiled regular expression, and :meth:`escape` back-slash escapes the
    pattern meta characters in a literal file name.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
        regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
        :data:`None`); and whether matched files should be included (:data:`True`),
        excluded (:data:`False`), or if it is a null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the pattern is
        invalid (e.g., nothing is left after normalization, or it ends with a
        dangling escape character).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the untouched text for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern ends
            # with backslash followed by a space, only strip from left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        # Null-operations (neither include nor exclude files):
        # - A blank pattern.
        # - A pattern starting with a hash ('#') serves as a comment. Escape the
        #   hash with a back-slash to match a literal hash (i.e., '\#').
        # - EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does
        #   not match any file.
        if not pattern or pattern.startswith('#') or pattern == '/':
            return None, None

        regex: Optional[str]
        include: Optional[bool]

        if pattern.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the pattern
            # (exclude instead of include). Escape the exclamation mark with a
            # back-slash to match a literal exclamation mark (i.e., '\!').
            include = False
            # Remove leading exclamation mark.
            pattern = pattern[1:]
        else:
            include = True

        # Allow a regex override for edge cases that cannot be handled through
        # normalization.
        override_regex: Optional[str] = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
        # sequence down to one double-asterisk. Iterate over the segments in
        # reverse and remove the duplicate double asterisks as we go, so that
        # deleting an element never shifts an index still to be visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
            # EDGE CASE: The '**/' pattern should match everything except individual
            # files in the root directory. This case cannot be adequately handled
            # through normalization. Use the override.
            override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths directly
            # on the root directory instead of any descendant paths. So, remove
            # empty first segment to make pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single pattern without a beginning slash ('/') will match any
            # descendant path. This is equivalent to "**/{pattern}". So, prepend
            # with double-asterisks to make pattern relative to root.
            # - EDGE CASE: This also holds for a single pattern with a trailing
            #   slash (e.g. dir/).
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # Otherwise:
        # EDGE CASE: A pattern without a beginning slash ('/') but contains at
        # least one prepended directory (e.g. "dir/{pattern}") should not match
        # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1). Nothing
        # to normalize in that case.

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at all. This
            # must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant paths if
            # it is a directory but not if it is a regular file. This is equivalent
            # to "{pattern}/**". So, set last segment to a double-asterisk to
            # include all descendants.
            pattern_segs[-1] = '**'

        if override_regex is None:
            # Build regular expression from pattern.
            output = ['^']
            # Whether a '/' must be emitted before the next literal/glob segment.
            need_slash = False
            end = len(pattern_segs) - 1
            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0 and i == end:
                        # A pattern consisting solely of double-asterisks ('**') will
                        # match every path.
                        output.append('[^/]+(?:/.*)?')

                    elif i == 0:
                        # A normalized pattern beginning with double-asterisks
                        # ('**') will match any leading path segments. The
                        # optional group already ends with its own slash.
                        output.append('(?:.+/)?')
                        need_slash = False

                    elif i == end:
                        # A normalized pattern ending with double-asterisks ('**') will
                        # match any trailing path segments. Only a pattern that was
                        # written with a trailing slash gets the directory marker.
                        if is_dir_pattern:
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            output.append('/.*')

                    else:
                        # A pattern with inner double-asterisks ('**') will match multiple
                        # (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                else:
                    if need_slash:
                        output.append('/')

                    if seg == '*':
                        # Match single path segment.
                        output.append('[^/]+')
                    else:
                        # Match segment glob pattern.
                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                    if i == end:
                        # A pattern ending without a slash ('/') will match a file or a
                        # directory (with paths underneath it). E.g., "foo" matches "foo",
                        # "foo/bar", "foo/bar/baz", etc.
                        output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                    need_slash = True

            output.append('$')
            regex = ''.join(output)

        else:
            # Use regex override.
            regex = override_regex

        if return_type is bytes:
            # Hand back the same string type the caller supplied.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has nothing left to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to the
        # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex fragments and join once at the end.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes), including
                # an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except a
                # slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning exclamation
                # mark, the whole bracket expression can be used directly as regex, but
                # we have to find where the expression ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in '!^':
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket.
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past the
                    # closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in '!^':
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression negation "[^...]"
                        # is undefined in a glob pattern. Python's `fnmatch.translate()`
                        # escapes the caret ('^') as a literal. Git supports using a
                        # caret for negation. Maintain consistency with Git because that is
                        # the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so they are
                    # treated as literal back-slashes by regex as defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a bracket
                    # literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        meta_characters = r"[]!*#?"

        # Prefix each meta character with a back-slash; copy everything else.
        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string

387

# Register `GitWildMatchPattern` under the name "gitwildmatch" (presumably so
# the pattern class can be looked up by that name; see `util.register_pattern`).
388util.register_pattern('gitwildmatch', GitWildMatchPattern)

389

390

class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated by :class:`GitWildMatchPattern`.
    This class only exists to maintain compatibility with v0.4.

    Instantiating it, or calling :meth:`pattern_to_regex` on it, emits a
    :class:`DeprecationWarning` and then defers to the parent class.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize as the parent class does.

        *args* and *kw* are passed through unchanged to the parent initializer.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Warn about deprecation.
        """
        # `stacklevel=3` skips this helper and the method that called it, so the
        # warning is attributed to the code that used the deprecated class.
        warnings.warn((
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        ), DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then convert the pattern as the parent class
        does.

        *args* and *kw* are passed through unchanged to the parent
        implementation, whose result is returned as-is.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

421

422# Register `GitIgnorePattern` as "gitignore" for backward compatibility with

423# v0.4.

# NOTE: the class registered here emits a DeprecationWarning whenever it is
# instantiated or its `pattern_to_regex()` is called.
424util.register_pattern('gitignore', GitIgnorePattern)