Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

163 statements  

"""
This module implements Git's wildmatch pattern matching which itself is derived
from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
"""

5 

6import re 

7import warnings 

8from typing import ( 

9 AnyStr, 

10 Optional, # Replaced by `X | None` in 3.10. 

11 Tuple) # Replaced by `tuple` in 3.9. 

12 

13from .. import util 

14from ..pattern import RegexPattern 

15 

_BYTES_ENCODING = 'latin1'
"""
The encoding to use when parsing a byte string pattern.
"""

_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""

26 

27 

class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` indicates an invalid git wild match
    pattern.

    It derives from :class:`ValueError`, so callers that already catch
    :class:`ValueError` also catch this error.
    """

34 

35 

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
        regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
        :data:`None`); and whether matched files should be included (:data:`True`),
        excluded (:data:`False`), or if it is a null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`.

        Raises :class:`GitWildMatchPatternError` if the pattern is invalid (it
        normalizes to nothing, or it ends with a dangling escape character).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Remember the pattern as given (after decoding) for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern that ends
            # with backslash followed by a space, only strip from left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        # Null-operations (neither include nor exclude files):
        # - A blank pattern.
        # - A pattern starting with a hash ('#') serves as a comment. Escape the
        #   hash with a back-slash to match a literal hash (i.e., '\#').
        # - EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does
        #   not match any file.
        if not pattern or pattern.startswith('#') or pattern == '/':
            return None, None

        if pattern.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the pattern
            # (exclude instead of include). Escape the exclamation mark with a
            # back-slash to match a literal exclamation mark (i.e., '\!').
            include = False
            # Remove leading exclamation mark.
            pattern = pattern[1:]
        else:
            include = True

        # Allow a regex override for edge cases that cannot be handled through
        # normalization.
        override_regex = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
        # sequence down to one double-asterisk. Iterate over the segments in
        # reverse and remove the duplicate double asterisks as we go.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
            # EDGE CASE: The '**/' pattern should match everything except individual
            # files in the root directory. This case cannot be adequately handled
            # through normalization. Use the override.
            override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths directly
            # on the root directory instead of any descendant paths. So, remove
            # empty first segment to make pattern relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single pattern without a beginning slash ('/') will match any
            # descendant path. This is equivalent to "**/{pattern}". So, prepend
            # with double-asterisks to make pattern relative to root.
            # - EDGE CASE: This also holds for a single pattern with a trailing
            #   slash (e.g. dir/).
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # Otherwise:
        # EDGE CASE: A pattern without a beginning slash ('/') but contains at
        # least one prepended directory (e.g. "dir/{pattern}") should not match
        # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1). Nothing
        # to normalize in that case.

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at all. This
            # must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant paths if
            # it is a directory but not if it is a regular file. This is equivalent
            # to "{pattern}/**". So, set last segment to a double-asterisk to
            # include all descendants.
            pattern_segs[-1] = '**'

        if override_regex is None:
            # Build regular expression from pattern.
            output = ['^']
            need_slash = False
            end = len(pattern_segs) - 1
            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0 and i == end:
                        # A pattern consisting solely of double-asterisks ('**') will
                        # match every path.
                        output.append('[^/]+(?:/.*)?')

                    elif i == 0:
                        # A normalized pattern beginning with double-asterisks
                        # ('**') will match any leading path segments.
                        output.append('(?:.+/)?')
                        need_slash = False

                    elif i == end:
                        # A normalized pattern ending with double-asterisks ('**') will
                        # match any trailing path segments.
                        if is_dir_pattern:
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            output.append('/.*')

                    else:
                        # A pattern with inner double-asterisks ('**') will match multiple
                        # (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                else:
                    if need_slash:
                        output.append('/')

                    if seg == '*':
                        # Match single path segment.
                        output.append('[^/]+')
                    else:
                        # Match segment glob pattern.
                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                    if i == end:
                        # A pattern ending without a slash ('/') will match a file or a
                        # directory (with paths underneath it). E.g., "foo" matches "foo",
                        # "foo/bar", "foo/bar/baz", etc.
                        output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                    need_slash = True

            output.append('$')
            regex = ''.join(output)

        else:
            # Use regex override.
            regex = override_regex

        if return_type is bytes:
            # Hand back the same string type that was passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to the
        # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes), including
                # an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except a
                # slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning exclamation
                # mark, the whole bracket expression can be used directly as regex, but
                # we have to find where the expression ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket (-1 if there is none at or after j).
                close = pattern.find(']', j)

                if close != -1:
                    # Found end of bracket expression. Set j to be one past the
                    # closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j = close + 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression negation "[^...]"
                        # is undefined in a glob pattern. Python's `fnmatch.translate()`
                        # escapes the caret ('^') as a literal. Git supports using a
                        # caret for negation. Maintain consistency with Git because that is
                        # the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so they are
                    # treated as literal back-slashes by regex as defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a bracket
                    # literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        # NOTE(review): the back-slash itself is not in this set, so a literal
        # back-slash in *s* is passed through unescaped -- confirm this is intended.
        meta_characters = r"[]!*#?"

        # Prefix every meta character with a back-slash; copy the rest verbatim.
        out_string = "".join(("\\" + x) if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string

384 

# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)

386 

387 

class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated by :class:`GitWildMatchPattern`.
    This class only exists to maintain compatibility with v0.4.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize exactly like the parent class.

        *args* and *kw* are forwarded unchanged to the parent initializer.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Warn about deprecation.
        """
        # stacklevel=3 skips this helper and the method that called it, so the
        # warning is attributed to the code that used the deprecated class.
        warnings.warn((
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        ), DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then delegate to the parent implementation.

        *args* and *kw* are forwarded unchanged; the parent's result is returned
        as-is.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

418 

# Register `GitIgnorePattern` as "gitignore" for backward compatibility with
# v0.4.
util.register_pattern('gitignore', GitIgnorePattern)