Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pathspec/patterns/gitwildmatch.py: 76%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

170 statements  

1""" 

2This module implements Git's wildmatch pattern matching which itself is derived 

3from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files. 

4""" 

5 

6import re 

7import warnings 

8from typing import ( 

9 AnyStr, 

10 Optional) # Replaced by `X | None` in 3.10. 

11 

12from .. import ( 

13 util) 

14from ..pattern import ( 

15 RegexPattern) 

16from .._typing import ( 

17 override) # Added in 3.12. 

18 

19_BYTES_ENCODING = 'latin1' 

20""" 

21The encoding to use when parsing a byte string pattern. 

22""" 

23 

24_DIR_MARK = 'ps_d' 

25""" 

26The regex group name for the directory marker. This is only used by 

27:class:`GitIgnoreSpec`. 

28""" 

29 

30 

class GitWildMatchPatternError(ValueError):
    """
    Error signalling that a git wildmatch pattern is invalid. It is a
    :class:`ValueError` subclass, so handlers catching :class:`ValueError` also
    catch it.
    """

37 

38 

39class GitWildMatchPattern(RegexPattern): 

40 """ 

41 The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch 

42 pattern. 

43 """ 

44 

45 # Keep the dict-less class hierarchy. 

46 __slots__ = () 

47 

48 @override 

49 @classmethod 

50 def pattern_to_regex( 

51 cls, 

52 pattern: AnyStr, 

53 ) -> tuple[Optional[AnyStr], Optional[bool]]: 

54 """ 

55 Convert the pattern into a regular expression. 

56 

57 *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a 

58 regular expression. 

59 

60 Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or 

61 :data:`None`); and whether matched files should be included (:data:`True`), 

62 excluded (:data:`False`), or if it is a null-operation (:data:`None`). 

63 """ 

64 if isinstance(pattern, str): 

65 return_type = str 

66 elif isinstance(pattern, bytes): 

67 return_type = bytes 

68 pattern = pattern.decode(_BYTES_ENCODING) 

69 else: 

70 raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.") 

71 

72 original_pattern = pattern 

73 

74 if pattern.endswith('\\ '): 

75 # EDGE CASE: Spaces can be escaped with backslash. If a pattern that ends 

76 # with backslash followed by a space, only strip from left. 

77 pattern = pattern.lstrip() 

78 else: 

79 pattern = pattern.strip() 

80 

81 regex: Optional[str] 

82 include: Optional[bool] 

83 

84 if pattern.startswith('#'): 

85 # A pattern starting with a hash ('#') serves as a comment (neither 

86 # includes nor excludes files). Escape the hash with a back-slash to match 

87 # a literal hash (i.e., '\#'). 

88 regex = None 

89 include = None 

90 

91 elif pattern == '/': 

92 # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does 

93 # not match any file. 

94 regex = None 

95 include = None 

96 

97 elif pattern: 

98 if pattern.startswith('!'): 

99 # A pattern starting with an exclamation mark ('!') negates the pattern 

100 # (exclude instead of include). Escape the exclamation mark with a 

101 # back-slash to match a literal exclamation mark (i.e., '\!'). 

102 include = False 

103 # Remove leading exclamation mark. 

104 pattern = pattern[1:] 

105 else: 

106 include = True 

107 

108 # Allow a regex override for edge cases that cannot be handled through 

109 # normalization. 

110 override_regex: Optional[str] = None 

111 

112 # Split pattern into segments. 

113 pattern_segs = pattern.split('/') 

114 

115 # Check whether the pattern is specifically a directory pattern before 

116 # normalization. 

117 is_dir_pattern = not pattern_segs[-1] 

118 

119 # Normalize pattern to make processing easier. 

120 

121 # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each 

122 # sequence down to one double-asterisk. Iterate over the segments in 

123 # reverse and remove the duplicate double asterisks as we go. 

124 for i in range(len(pattern_segs) - 1, 0, -1): 

125 prev = pattern_segs[i-1] 

126 seg = pattern_segs[i] 

127 if prev == '**' and seg == '**': 

128 del pattern_segs[i] 

129 

130 if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]: 

131 # EDGE CASE: The '**/' pattern should match everything except individual 

132 # files in the root directory. This case cannot be adequately handled 

133 # through normalization. Use the override. 

134 override_regex = f'^.+(?P<{_DIR_MARK}>/).*$' 

135 

136 if not pattern_segs[0]: 

137 # A pattern beginning with a slash ('/') will only match paths directly 

138 # on the root directory instead of any descendant paths. So, remove 

139 # empty first segment to make pattern relative to root. 

140 del pattern_segs[0] 

141 

142 elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]): 

143 # A single pattern without a beginning slash ('/') will match any 

144 # descendant path. This is equivalent to "**/{pattern}". So, prepend 

145 # with double-asterisks to make pattern relative to root. 

146 # - EDGE CASE: This also holds for a single pattern with a trailing 

147 # slash (e.g. dir/). 

148 if pattern_segs[0] != '**': 

149 pattern_segs.insert(0, '**') 

150 

151 else: 

152 # EDGE CASE: A pattern without a beginning slash ('/') but contains at 

153 # least one prepended directory (e.g. "dir/{pattern}") should not match 

154 # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1). 

155 pass 

156 

157 if not pattern_segs: 

158 # After resolving the edge cases, we end up with no pattern at all. This 

159 # must be because the pattern is invalid. 

160 raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") 

161 

162 if not pattern_segs[-1] and len(pattern_segs) > 1: 

163 # A pattern ending with a slash ('/') will match all descendant paths if 

164 # it is a directory but not if it is a regular file. This is equivalent 

165 # to "{pattern}/**". So, set last segment to a double-asterisk to 

166 # include all descendants. 

167 pattern_segs[-1] = '**' 

168 

169 if override_regex is None: 

170 # Build regular expression from pattern. 

171 output = ['^'] 

172 need_slash = False 

173 end = len(pattern_segs) - 1 

174 for i, seg in enumerate(pattern_segs): 

175 if seg == '**': 

176 if i == 0 and i == end: 

177 # A pattern consisting solely of double-asterisks ('**') will 

178 # match every path. 

179 output.append(f'[^/]+(?:/.*)?') 

180 

181 elif i == 0: 

182 # A normalized pattern beginning with double-asterisks 

183 # ('**') will match any leading path segments. 

184 output.append('(?:.+/)?') 

185 need_slash = False 

186 

187 elif i == end: 

188 # A normalized pattern ending with double-asterisks ('**') will 

189 # match any trailing path segments. 

190 if is_dir_pattern: 

191 output.append(f'(?P<{_DIR_MARK}>/).*') 

192 else: 

193 output.append(f'/.*') 

194 

195 else: 

196 # A pattern with inner double-asterisks ('**') will match multiple 

197 # (or zero) inner path segments. 

198 output.append('(?:/.+)?') 

199 need_slash = True 

200 

201 elif seg == '*': 

202 # Match single path segment. 

203 if need_slash: 

204 output.append('/') 

205 

206 output.append('[^/]+') 

207 

208 if i == end: 

209 # A pattern ending without a slash ('/') will match a file or a 

210 # directory (with paths underneath it). E.g., "foo" matches "foo", 

211 # "foo/bar", "foo/bar/baz", etc. 

212 output.append(f'(?:(?P<{_DIR_MARK}>/).*)?') 

213 

214 need_slash = True 

215 

216 else: 

217 # Match segment glob pattern. 

218 if need_slash: 

219 output.append('/') 

220 

221 try: 

222 output.append(cls._translate_segment_glob(seg)) 

223 except ValueError as e: 

224 raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e 

225 

226 if i == end: 

227 # A pattern ending without a slash ('/') will match a file or a 

228 # directory (with paths underneath it). E.g., "foo" matches "foo", 

229 # "foo/bar", "foo/bar/baz", etc. 

230 output.append(f'(?:(?P<{_DIR_MARK}>/).*)?') 

231 

232 need_slash = True 

233 

234 output.append('$') 

235 regex = ''.join(output) 

236 

237 else: 

238 # Use regex override. 

239 regex = override_regex 

240 

241 else: 

242 # A blank pattern is a null-operation (neither includes nor excludes 

243 # files). 

244 regex = None 

245 include = None 

246 

247 if regex is not None and return_type is bytes: 

248 regex = regex.encode(_BYTES_ENCODING) 

249 

250 return regex, include 

251 

252 @staticmethod 

253 def _translate_segment_glob(pattern: str) -> str: 

254 """ 

255 Translates the glob pattern to a regular expression. This is used in the 

256 constructor to translate a path segment glob pattern to its corresponding 

257 regular expression. 

258 

259 *pattern* (:class:`str`) is the glob pattern. 

260 

261 Returns the regular expression (:class:`str`). 

262 """ 

263 # NOTE: This is derived from `fnmatch.translate()` and is similar to the 

264 # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set. 

265 

266 escape = False 

267 regex = '' 

268 i, end = 0, len(pattern) 

269 while i < end: 

270 # Get next character. 

271 char = pattern[i] 

272 i += 1 

273 

274 if escape: 

275 # Escape the character. 

276 escape = False 

277 regex += re.escape(char) 

278 

279 elif char == '\\': 

280 # Escape character, escape next character. 

281 escape = True 

282 

283 elif char == '*': 

284 # Multi-character wildcard. Match any string (except slashes), including 

285 # an empty string. 

286 regex += '[^/]*' 

287 

288 elif char == '?': 

289 # Single-character wildcard. Match any single character (except a 

290 # slash). 

291 regex += '[^/]' 

292 

293 elif char == '[': 

294 # Bracket expression wildcard. Except for the beginning exclamation 

295 # mark, the whole bracket expression can be used directly as regex, but 

296 # we have to find where the expression ends. 

297 # - "[][!]" matches ']', '[' and '!'. 

298 # - "[]-]" matches ']' and '-'. 

299 # - "[!]a-]" matches any character except ']', 'a' and '-'. 

300 j = i 

301 

302 # Pass bracket expression negation. 

303 if j < end and (pattern[j] == '!' or pattern[j] == '^'): 

304 j += 1 

305 

306 # Pass first closing bracket if it is at the beginning of the 

307 # expression. 

308 if j < end and pattern[j] == ']': 

309 j += 1 

310 

311 # Find closing bracket. Stop once we reach the end or find it. 

312 while j < end and pattern[j] != ']': 

313 j += 1 

314 

315 if j < end: 

316 # Found end of bracket expression. Increment j to be one past the 

317 # closing bracket: 

318 # 

319 # [...] 

320 # ^ ^ 

321 # i j 

322 # 

323 j += 1 

324 expr = '[' 

325 

326 if pattern[i] == '!': 

327 # Bracket expression needs to be negated. 

328 expr += '^' 

329 i += 1 

330 elif pattern[i] == '^': 

331 # POSIX declares that the regex bracket expression negation "[^...]" 

332 # is undefined in a glob pattern. Python's `fnmatch.translate()` 

333 # escapes the caret ('^') as a literal. Git supports the using a 

334 # caret for negation. Maintain consistency with Git because that is 

335 # the expected behavior. 

336 expr += '^' 

337 i += 1 

338 

339 # Build regex bracket expression. Escape slashes so they are treated 

340 # as literal slashes by regex as defined by POSIX. 

341 expr += pattern[i:j].replace('\\', '\\\\') 

342 

343 # Add regex bracket expression to regex result. 

344 regex += expr 

345 

346 # Set i to one past the closing bracket. 

347 i = j 

348 

349 else: 

350 # Failed to find closing bracket, treat opening bracket as a bracket 

351 # literal instead of as an expression. 

352 regex += '\\[' 

353 

354 else: 

355 # Regular character, escape it for regex. 

356 regex += re.escape(char) 

357 

358 if escape: 

359 raise ValueError(f"Escape character found with no next character to escape: {pattern!r}") 

360 

361 return regex 

362 

363 @staticmethod 

364 def escape(s: AnyStr) -> AnyStr: 

365 """ 

366 Escape special characters in the given string. 

367 

368 *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to 

369 escape, usually before adding it to a ".gitignore". 

370 

371 Returns the escaped string (:class:`str` or :class:`bytes`). 

372 """ 

373 if isinstance(s, str): 

374 return_type = str 

375 string = s 

376 elif isinstance(s, bytes): 

377 return_type = bytes 

378 string = s.decode(_BYTES_ENCODING) 

379 else: 

380 raise TypeError(f"s:{s!r} is not a unicode or byte string.") 

381 

382 # Reference: https://git-scm.com/docs/gitignore#_pattern_format 

383 meta_characters = r"[]!*#?" 

384 

385 out_string = "".join("\\" + x if x in meta_characters else x for x in string) 

386 

387 if return_type is bytes: 

388 return out_string.encode(_BYTES_ENCODING) 

389 else: 

390 return out_string 

391 

# Register `GitWildMatchPattern` as "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)

393 

394 

class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated name for :class:`GitWildMatchPattern`, retained only for
    compatibility with v0.4. Constructing an instance or calling
    :meth:`pattern_to_regex` issues a :class:`DeprecationWarning` and then
    defers to :class:`GitWildMatchPattern`.
    """

    # NOTE(review): the parent class declares `__slots__ = ()` to keep the
    # hierarchy dict-less, but this subclass declares no `__slots__`, so its
    # instances get a `__dict__` -- confirm whether that is intended.

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize through the parent class with
        the arguments unchanged.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the deprecation warning for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        )
        # stacklevel=3 skips this helper and the method that invoked it, so the
        # warning is attributed to the code that called that method.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @override
    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then return the parent class's result for the
        same arguments.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

426 

427# Register `GitIgnorePattern` as "gitignore" for backward compatibility with 

428# v0.4. 

429util.register_pattern('gitignore', GitIgnorePattern)