Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pathspec/patterns/gitwildmatch.py: 87%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

167 statements  

"""
This module implements Git's wildmatch pattern matching which itself is derived
from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
"""

5 

6import re 

7import warnings 

8from typing import ( 

9 AnyStr, 

10 Optional, # Replaced by `X | None` in 3.10. 

11 Tuple) # Replaced by `tuple` in 3.9. 

12 

13from .. import util 

14from ..pattern import RegexPattern 

15 

16_BYTES_ENCODING = 'latin1' 

17""" 

18The encoding to use when parsing a byte string pattern. 

19""" 

20 

21_DIR_MARK = 'ps_d' 

22""" 

23The regex group name for the directory marker. This is only used by 

24:class:`GitIgnoreSpec`. 

25""" 

26 

27 

class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` indicates an invalid git wild match
    pattern.
    """

34 

35 

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns the uncompiled regular expression (:class:`str`,
        :class:`bytes`, or :data:`None`); and whether matched files should be
        included (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the
        pattern is invalid.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern
            # ends with backslash followed by a space, only strip from left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        # Null-operations (neither include nor exclude files):
        # - A blank pattern.
        # - A pattern starting with a hash ('#') serves as a comment. Escape
        #   the hash with a back-slash to match a literal hash (i.e., '\#').
        # - EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
        #   does not match any file.
        if not pattern or pattern.startswith('#') or pattern == '/':
            return None, None

        include: bool
        if pattern.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the
            # pattern (exclude instead of include). Escape the exclamation
            # mark with a back-slash to match a literal exclamation mark
            # (i.e., '\!').
            include = False
            # Remove leading exclamation mark.
            pattern = pattern[1:]
        else:
            include = True

        # Allow a regex override for edge cases that cannot be handled through
        # normalization.
        override_regex: Optional[str] = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse
        # each sequence down to one double-asterisk. Iterate over the segments
        # in reverse so deleting an entry never shifts an index that is still
        # to be visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
            # EDGE CASE: The '**/' pattern should match everything except
            # individual files in the root directory. This case cannot be
            # adequately handled through normalization. Use the override.
            override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths
            # directly on the root directory instead of any descendant paths.
            # So, remove empty first segment to make pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single pattern without a beginning slash ('/') will match any
            # descendant path. This is equivalent to "**/{pattern}". So,
            # prepend with double-asterisks to make pattern relative to root.
            # - EDGE CASE: This also holds for a single pattern with a
            #   trailing slash (e.g. dir/).
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # Otherwise:
        # EDGE CASE: A pattern without a beginning slash ('/') but contains at
        # least one prepended directory (e.g. "dir/{pattern}") should not
        # match "**/dir/{pattern}", according to `git check-ignore` (v2.4.1).
        # Nothing to normalize in that case.

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at
            # all. This must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant
            # paths if it is a directory but not if it is a regular file. This
            # is equivalent to "{pattern}/**". So, set last segment to a
            # double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        if override_regex is None:
            # Build regular expression from pattern.
            output = ['^']
            need_slash = False
            end = len(pattern_segs) - 1

            # A pattern ending without a slash ('/') will match a file or a
            # directory (with paths underneath it). E.g., "foo" matches "foo",
            # "foo/bar", "foo/bar/baz", etc. This suffix is appended after the
            # last segment when that segment is not a double-asterisk.
            dir_suffix = f'(?:(?P<{_DIR_MARK}>/).*)?'

            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0 and i == end:
                        # A pattern consisting solely of double-asterisks
                        # ('**') will match every path.
                        output.append('[^/]+(?:/.*)?')

                    elif i == 0:
                        # A normalized pattern beginning with double-asterisks
                        # ('**') will match any leading path segments.
                        output.append('(?:.+/)?')
                        need_slash = False

                    elif i == end:
                        # A normalized pattern ending with double-asterisks
                        # ('**') will match any trailing path segments.
                        if is_dir_pattern:
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            output.append('/.*')

                    else:
                        # A pattern with inner double-asterisks ('**') will
                        # match multiple (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                    continue

                # Any other segment is separated from the previous one by a
                # slash (unless the previous segment already supplied it).
                if need_slash:
                    output.append('/')

                if seg == '*':
                    # Match single path segment.
                    output.append('[^/]+')
                else:
                    # Match segment glob pattern.
                    try:
                        output.append(cls._translate_segment_glob(seg))
                    except ValueError as e:
                        raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                if i == end:
                    output.append(dir_suffix)

                need_slash = True

            output.append('$')
            regex = ''.join(output)

        else:
            # Use regex override.
            regex = override_regex

        if return_type is bytes:
            # Return the same string type that was passed in.
            return regex.encode(_BYTES_ENCODING), include

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used in
        the constructor to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex fragments and join once at the end.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character
                # (except a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex, but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket (-1 if there is none).
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past
                    # the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression
                        # negation "[^...]" is undefined in a glob pattern.
                        # Python's `fnmatch.translate()` escapes the caret
                        # ('^') as a literal. Git supports using a caret for
                        # negation. Maintain consistency with Git because that
                        # is the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so
                    # they are treated as literal back-slashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as
                    # a bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        meta_characters = r"[]!*#?"

        # Prefix each meta character with a back-slash.
        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)

        return out_string

387 

# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)

389 

390 

class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated by :class:`GitWildMatchPattern`.
    This class only exists to maintain compatibility with v0.4.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize the pattern with the given
        arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Warn about deprecation.
        """
        # stacklevel=3 attributes the warning to the caller of the method
        # that invoked this helper (1 = here, 2 = `__init__` or
        # `pattern_to_regex`, 3 = their caller).
        warnings.warn((
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        ), DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then convert the pattern using the parent
        implementation.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

421 

# Register `GitIgnorePattern` as "gitignore" for backward compatibility with
# v0.4.
util.register_pattern('gitignore', GitIgnorePattern)