Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pathspec/patterns/gitwildmatch.py: 87%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

178 statements  

1""" 

2This module implements Git's wildmatch pattern matching which itself is derived 

3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files. 

4""" 

5 

6import re 

7import warnings 

8from typing import ( 

9 AnyStr, 

10 Optional) # Replaced by `X | None` in 3.10. 

11 

12from .. import ( 

13 util) 

14from ..pattern import ( 

15 RegexPattern) 

16from .._typing import ( 

17 override) # Added in 3.12. 

18 

19_BYTES_ENCODING = 'latin1' 

20""" 

21The encoding to use when parsing a byte string pattern. 

22""" 

23 

24_DIR_MARK = 'ps_d' 

25""" 

26The regex group name for the directory marker. This is only used by 

27:class:`GitIgnoreSpec`. 

28""" 

29 

30_DIR_MARK_CG = f'(?P<{_DIR_MARK}>/)' 

31""" 

32This regular expression matches the directory marker. 

33""" 

34 

35_DIR_MARK_OPT = f'(?:{_DIR_MARK_CG}|$)' 

36""" 

37This regular expression matches the optional directory marker and sub-path. 

38""" 

39 

40 

class GitWildMatchPatternError(ValueError):
    """
    Error signalling that a git wildmatch pattern is invalid. It is a
    :class:`ValueError`, so handlers for that type also catch it.
    """

47 

48 

49class GitWildMatchPattern(RegexPattern): 

50 """ 

51 The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch 

52 pattern. 

53 """ 

54 

55 # Keep the dict-less class hierarchy. 

56 __slots__ = () 

57 

58 @override 

59 @classmethod 

60 def pattern_to_regex( 

61 cls, 

62 pattern: AnyStr, 

63 ) -> tuple[Optional[AnyStr], Optional[bool]]: 

64 """ 

65 Convert the pattern into a regular expression. 

66 

67 *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a 

68 regular expression. 

69 

70 Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or 

71 :data:`None`); and whether matched files should be included (:data:`True`), 

72 excluded (:data:`False`), or if it is a null-operation (:data:`None`). 

73 """ 

74 if isinstance(pattern, str): 

75 return_type = str 

76 elif isinstance(pattern, bytes): 

77 return_type = bytes 

78 pattern = pattern.decode(_BYTES_ENCODING) 

79 else: 

80 raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.") 

81 

82 original_pattern = pattern 

83 

84 if pattern.endswith('\\ '): 

85 # EDGE CASE: Spaces can be escaped with backslash. If a pattern that ends 

86 # with backslash followed by a space, only strip from left. 

87 pattern = pattern.lstrip() 

88 else: 

89 pattern = pattern.strip() 

90 

91 regex: Optional[str] 

92 include: Optional[bool] 

93 

94 if pattern.startswith('#'): 

95 # A pattern starting with a hash ('#') serves as a comment (neither 

96 # includes nor excludes files). Escape the hash with a back-slash to match 

97 # a literal hash (i.e., '\#'). 

98 regex = None 

99 include = None 

100 

101 elif pattern == '/': 

102 # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does 

103 # not match any file. 

104 regex = None 

105 include = None 

106 

107 elif pattern: 

108 if pattern.startswith('!'): 

109 # A pattern starting with an exclamation mark ('!') negates the pattern 

110 # (exclude instead of include). Escape the exclamation mark with a 

111 # back-slash to match a literal exclamation mark (i.e., '\!'). 

112 include = False 

113 # Remove leading exclamation mark. 

114 pattern = pattern[1:] 

115 else: 

116 include = True 

117 

118 # Allow a regex override for edge cases that cannot be handled through 

119 # normalization. 

120 override_regex: Optional[str] = None 

121 

122 # Split pattern into segments. 

123 pattern_segs = pattern.split('/') 

124 

125 # Check whether the pattern is specifically a directory pattern before 

126 # normalization. 

127 is_dir_pattern = not pattern_segs[-1] 

128 

129 # Normalize pattern to make processing easier. 

130 

131 # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each 

132 # sequence down to one double-asterisk. Iterate over the segments in 

133 # reverse and remove the duplicate double asterisks as we go. 

134 for i in range(len(pattern_segs) - 1, 0, -1): 

135 prev = pattern_segs[i-1] 

136 seg = pattern_segs[i] 

137 if prev == '**' and seg == '**': 

138 del pattern_segs[i] 

139 

140 if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]: 

141 # EDGE CASE: The '**/' pattern should match everything except individual 

142 # files in the root directory. This case cannot be adequately handled 

143 # through normalization. Use the override. 

144 override_regex = _DIR_MARK_CG 

145 

146 if not pattern_segs[0]: 

147 # A pattern beginning with a slash ('/') will only match paths directly 

148 # on the root directory instead of any descendant paths. So, remove 

149 # empty first segment to make pattern relative to root. 

150 del pattern_segs[0] 

151 

152 elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]): 

153 # A single segment pattern without a beginning slash ('/') will match 

154 # any descendant path. This is equivalent to "**/{pattern}". So, prepend 

155 # with double-asterisks to make pattern relative to root. 

156 # - EDGE CASE: This also holds for a single segment pattern with a 

157 # trailing slash (e.g. 'dir/'). 

158 if pattern_segs[0] != '**': 

159 pattern_segs.insert(0, '**') 

160 

161 else: 

162 # EDGE CASE: A pattern without a beginning slash ('/') but contains at 

163 # least one prepended directory (e.g. "dir/{pattern}") should not match 

164 # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1). 

165 pass 

166 

167 if not pattern_segs: 

168 # After resolving the edge cases, we end up with no pattern at all. This 

169 # must be because the pattern is invalid. 

170 raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") 

171 

172 if not pattern_segs[-1] and len(pattern_segs) > 1: 

173 # A pattern ending with a slash ('/') will match all descendant paths if 

174 # it is a directory but not if it is a regular file. This is equivalent 

175 # to "{pattern}/**". So, set last segment to a double-asterisk to 

176 # include all descendants. 

177 pattern_segs[-1] = '**' 

178 

179 if override_regex is None: 

180 seg_count = len(pattern_segs) 

181 if seg_count == 1 and pattern_segs[0] == '**': 

182 # The pattern "**" will match every path. Special case this pattern. 

183 override_regex = '.' 

184 

185 elif ( 

186 seg_count == 2 

187 and pattern_segs[0] == '**' 

188 and pattern_segs[1] == '*' 

189 ): 

190 # The pattern "*" will be normalized to "**/*" and will match every 

191 # path. Special case this pattern for efficiency. 

192 override_regex = '.' 

193 

194 elif ( 

195 seg_count == 3 

196 and pattern_segs[0] == '**' 

197 and pattern_segs[1] == '*' 

198 and pattern_segs[2] == '**' 

199 ): 

200 # The pattern "*/" will be normalized to "**/*/**" which will match 

201 # every file not in the root directory. Special case this pattern for 

202 # efficiency. 

203 if is_dir_pattern: 

204 override_regex = _DIR_MARK_CG 

205 else: 

206 override_regex = '/' 

207 

208 if override_regex is None: 

209 # Build regular expression from pattern. 

210 output = [] 

211 need_slash = False 

212 end = len(pattern_segs) - 1 

213 for i, seg in enumerate(pattern_segs): 

214 if seg == '**': 

215 if i == 0: 

216 # A normalized pattern beginning with double-asterisks ('**') will 

217 # match any leading path segments. 

218 output.append('^(?:.+/)?') 

219 

220 elif i < end: 

221 # A pattern with inner double-asterisks ('**') will match multiple 

222 # (or zero) inner path segments. 

223 output.append('(?:/.+)?') 

224 need_slash = True 

225 

226 else: 

227 assert i == end, (i, end) 

228 # A normalized pattern ending with double-asterisks ('**') will 

229 # match any trailing path segments. 

230 if is_dir_pattern: 

231 output.append(_DIR_MARK_CG) 

232 else: 

233 output.append(f'/') 

234 

235 else: 

236 # Match path segment. 

237 if i == 0: 

238 # Anchor to root directory. 

239 output.append('^') 

240 

241 if need_slash: 

242 output.append('/') 

243 

244 if seg == '*': 

245 # Match whole path segment. 

246 output.append('[^/]+') 

247 

248 else: 

249 # Match segment glob pattern. 

250 try: 

251 output.append(cls._translate_segment_glob(seg)) 

252 except ValueError as e: 

253 raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e 

254 

255 if i == end: 

256 # A pattern ending without a slash ('/') will match a file or a 

257 # directory (with paths underneath it). E.g., "foo" matches "foo", 

258 # "foo/bar", "foo/bar/baz", etc. 

259 output.append(_DIR_MARK_OPT) 

260 

261 need_slash = True 

262 

263 regex = ''.join(output) 

264 

265 else: 

266 # Use regex override. 

267 regex = override_regex 

268 

269 else: 

270 # A blank pattern is a null-operation (neither includes nor excludes 

271 # files). 

272 regex = None 

273 include = None 

274 

275 if regex is not None and return_type is bytes: 

276 regex = regex.encode(_BYTES_ENCODING) 

277 

278 return regex, include 

279 

280 @staticmethod 

281 def _translate_segment_glob(pattern: str) -> str: 

282 """ 

283 Translates the glob pattern to a regular expression. This is used in the 

284 constructor to translate a path segment glob pattern to its corresponding 

285 regular expression. 

286 

287 *pattern* (:class:`str`) is the glob pattern. 

288 

289 Returns the regular expression (:class:`str`). 

290 """ 

291 # NOTE: This is derived from `fnmatch.translate()` and is similar to the 

292 # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set. 

293 

294 escape = False 

295 regex = '' 

296 i, end = 0, len(pattern) 

297 while i < end: 

298 # Get next character. 

299 char = pattern[i] 

300 i += 1 

301 

302 if escape: 

303 # Escape the character. 

304 escape = False 

305 regex += re.escape(char) 

306 

307 elif char == '\\': 

308 # Escape character, escape next character. 

309 escape = True 

310 

311 elif char == '*': 

312 # Multi-character wildcard. Match any string (except slashes), including 

313 # an empty string. 

314 regex += '[^/]*' 

315 

316 elif char == '?': 

317 # Single-character wildcard. Match any single character (except a 

318 # slash). 

319 regex += '[^/]' 

320 

321 elif char == '[': 

322 # Bracket expression wildcard. Except for the beginning exclamation 

323 # mark, the whole bracket expression can be used directly as regex, but 

324 # we have to find where the expression ends. 

325 # - "[][!]" matches ']', '[' and '!'. 

326 # - "[]-]" matches ']' and '-'. 

327 # - "[!]a-]" matches any character except ']', 'a' and '-'. 

328 j = i 

329 

330 # Pass bracket expression negation. 

331 if j < end and (pattern[j] == '!' or pattern[j] == '^'): 

332 j += 1 

333 

334 # Pass first closing bracket if it is at the beginning of the 

335 # expression. 

336 if j < end and pattern[j] == ']': 

337 j += 1 

338 

339 # Find closing bracket. Stop once we reach the end or find it. 

340 while j < end and pattern[j] != ']': 

341 j += 1 

342 

343 if j < end: 

344 # Found end of bracket expression. Increment j to be one past the 

345 # closing bracket: 

346 # 

347 # [...] 

348 # ^ ^ 

349 # i j 

350 # 

351 j += 1 

352 expr = '[' 

353 

354 if pattern[i] == '!': 

355 # Bracket expression needs to be negated. 

356 expr += '^' 

357 i += 1 

358 elif pattern[i] == '^': 

359 # POSIX declares that the regex bracket expression negation "[^...]" 

360 # is undefined in a glob pattern. Python's `fnmatch.translate()` 

361 # escapes the caret ('^') as a literal. Git supports the using a 

362 # caret for negation. Maintain consistency with Git because that is 

363 # the expected behavior. 

364 expr += '^' 

365 i += 1 

366 

367 # Build regex bracket expression. Escape slashes so they are treated 

368 # as literal slashes by regex as defined by POSIX. 

369 expr += pattern[i:j].replace('\\', '\\\\') 

370 

371 # Add regex bracket expression to regex result. 

372 regex += expr 

373 

374 # Set i to one past the closing bracket. 

375 i = j 

376 

377 else: 

378 # Failed to find closing bracket, treat opening bracket as a bracket 

379 # literal instead of as an expression. 

380 regex += '\\[' 

381 

382 else: 

383 # Regular character, escape it for regex. 

384 regex += re.escape(char) 

385 

386 if escape: 

387 raise ValueError(f"Escape character found with no next character to escape: {pattern!r}") 

388 

389 return regex 

390 

391 @staticmethod 

392 def escape(s: AnyStr) -> AnyStr: 

393 """ 

394 Escape special characters in the given string. 

395 

396 *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to 

397 escape, usually before adding it to a ".gitignore". 

398 

399 Returns the escaped string (:class:`str` or :class:`bytes`). 

400 """ 

401 if isinstance(s, str): 

402 return_type = str 

403 string = s 

404 elif isinstance(s, bytes): 

405 return_type = bytes 

406 string = s.decode(_BYTES_ENCODING) 

407 else: 

408 raise TypeError(f"s:{s!r} is not a unicode or byte string.") 

409 

410 # Reference: https://git-scm.com/docs/gitignore#_pattern_format 

411 meta_characters = r"[]!*#?" 

412 

413 out_string = "".join("\\" + x if x in meta_characters else x for x in string) 

414 

415 if return_type is bytes: 

416 return out_string.encode(_BYTES_ENCODING) 

417 else: 

418 return out_string 

419 

# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)

421 

422 

class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated stand-in for :class:`GitWildMatchPattern`, kept only for
    compatibility with v0.4. Use :class:`GitWildMatchPattern` instead.
    """

    def __init__(self, *args, **kwargs) -> None:
        """
        Emit the deprecation warning, then initialize through the parent
        class with the arguments unchanged.
        """
        self._deprecated()
        super().__init__(*args, **kwargs)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # stacklevel=3 skips this helper and the method that called it, so
        # the warning is attributed to that method's caller.
        warnings.warn(message, category=DeprecationWarning, stacklevel=3)

    @override
    @classmethod
    def pattern_to_regex(cls, *args, **kwargs):
        """
        Emit the deprecation warning, then delegate to the parent class with
        the arguments unchanged.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kwargs)

454 

# Register the deprecated `GitIgnorePattern` under the name "gitignore" for
# backward compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)