Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 86%

160 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 07:17 +0000

1""" 

2This module implements Git's wildmatch pattern matching which itself is 

3derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" 

4files. 

5""" 

6 

7import re 

8import warnings 

9from typing import ( 

10 AnyStr, 

11 Optional, 

12 Tuple) 

13 

14from .. import util 

15from ..pattern import RegexPattern 

16 

# Byte-string inputs are decoded with this encoding before processing and
# the resulting text is encoded back with it, so callers get bytes in and
# bytes out.
_BYTES_ENCODING = 'latin1'
"""
The encoding to use when parsing a byte string pattern.
"""

# Name of the named capture group (``(?P<ps_d>/)``) that the generated
# regular expressions place around the slash separating a matched
# directory from whatever lies beneath it.
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""

27 

28 

class GitWildMatchPatternError(ValueError):
    """
    Error signalling that a git wildmatch pattern is invalid.

    Being a :class:`ValueError` subclass, it is also caught by handlers
    written for :class:`ValueError`.
    """

35 

36 

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git
    wildmatch pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`,
        or :data:`None`); and whether matched files should be included
        (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`). The regular expression has the same
        type (text or bytes) as *pattern*.

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the
        pattern is invalid (nothing is left after normalization, or a
        segment ends with a dangling escape character).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the untouched (decoded) pattern for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern
            # ends with backslash followed by a space, only strip from the
            # left so the escaped space survives.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        # Null-operations (neither include nor exclude files):
        # - a blank pattern;
        # - a comment, i.e. a pattern starting with a hash ('#'); escape the
        #   hash with a back-slash to match a literal hash (i.e., '\#');
        # - EDGE CASE: a single '/', which according to `git check-ignore`
        #   (v2.4.1) does not match any file.
        if not pattern or pattern.startswith('#') or pattern == '/':
            return None, None

        if pattern.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the
            # pattern (exclude instead of include). Escape the exclamation
            # mark with a back-slash to match a literal exclamation mark
            # (i.e., '\!').
            include = False
            # Remove leading exclamation mark.
            pattern = pattern[1:]
        else:
            include = True

        # Allow a regex override for edge cases that cannot be handled
        # through normalization.
        override_regex = None

        # Split pattern into segments.
        pattern_segs = pattern.split('/')

        # Normalize pattern to make processing easier.

        # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse
        # each sequence down to one double-asterisk. Iterate in reverse so
        # that deleting a segment does not shift the indices still to be
        # visited.
        for i in range(len(pattern_segs) - 1, 0, -1):
            if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                del pattern_segs[i]

        if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
            # EDGE CASE: The '**/' pattern should match everything except
            # individual files in the root directory. This case cannot be
            # adequately handled through normalization. Use the override.
            override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') will only match paths
            # directly on the root directory instead of any descendant
            # paths. So, remove empty first segment to make pattern relative
            # to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
            # A single pattern without a beginning slash ('/') will match
            # any descendant path. This is equivalent to "**/{pattern}". So,
            # prepend with double-asterisks to make pattern relative to
            # root.
            # EDGE CASE: This also holds for a single pattern with a
            # trailing slash (e.g. dir/).
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        # Otherwise:
        # EDGE CASE: A pattern without a beginning slash ('/') but
        # containing at least one prepended directory (e.g.
        # "dir/{pattern}") should not match "**/dir/{pattern}", according to
        # `git check-ignore` (v2.4.1). Nothing to normalize.

        if not pattern_segs:
            # After resolving the edge cases, we end up with no pattern at
            # all. This must be because the pattern is invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

        if not pattern_segs[-1] and len(pattern_segs) > 1:
            # A pattern ending with a slash ('/') will match all descendant
            # paths if it is a directory but not if it is a regular file.
            # This is equivalent to "{pattern}/**". So, set last segment to
            # a double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        if override_regex is None:
            # Build regular expression from pattern.

            # Suffix for a final segment that is not '**': the path may end
            # there (file or directory), or continue below it. E.g., "foo"
            # matches "foo", "foo/bar", "foo/bar/baz", etc.
            dir_suffix = f'(?:(?P<{_DIR_MARK}>/).*)?'

            output = ['^']
            need_slash = False
            end = len(pattern_segs) - 1
            for i, seg in enumerate(pattern_segs):
                if seg == '**':
                    if i == 0 and i == end:
                        # A pattern consisting solely of double-asterisks
                        # ('**') will match every path.
                        output.append('[^/]+' + dir_suffix)
                    elif i == 0:
                        # A normalized pattern beginning with
                        # double-asterisks ('**') will match any leading
                        # path segments.
                        output.append('(?:.+/)?')
                        need_slash = False
                    elif i == end:
                        # A normalized pattern ending with double-asterisks
                        # ('**') will match any trailing path segments.
                        output.append(f'(?P<{_DIR_MARK}>/).*')
                    else:
                        # A pattern with inner double-asterisks ('**') will
                        # match multiple (or zero) inner path segments.
                        output.append('(?:/.+)?')
                        need_slash = True

                else:
                    if need_slash:
                        output.append('/')

                    if seg == '*':
                        # Match single (non-empty) path segment.
                        output.append('[^/]+')
                    else:
                        # Match segment glob pattern.
                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                    if i == end:
                        output.append(dir_suffix)

                    need_slash = True

            output.append('$')
            regex = ''.join(output)

        else:
            # Use regex override.
            regex = override_regex

        if return_type is bytes:
            # Hand back the same string type the caller passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used
        by :meth:`pattern_to_regex` to translate a path segment glob pattern
        to its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape
        character (back-slash) that has nothing to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escaping = False
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escaping:
                # Previous character was a back-slash: take this one
                # literally.
                escaping = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escaping = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except
                # slashes), including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character
                # (except a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # almost directly as regex but we have to find where the
                # expression ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and pattern[j] in ('!', '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of
                # the expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket. Stop once we reach the end or find
                # it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one
                    # past the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated. POSIX
                        # declares that the regex bracket expression
                        # negation "[^...]" is undefined in a glob pattern,
                        # and Python's `fnmatch.translate()` escapes the
                        # caret ('^') as a literal. Git supports using a
                        # caret for negation, so maintain consistency with
                        # Git because that is the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so
                    # they are treated as literal back-slashes by regex as
                    # defined by POSIX. Also escape '[', '&', '~' and '|':
                    # Python's `re` reserves a leading '[' and the sequences
                    # '&&', '~~' and '||' inside a set for future nested-set
                    # / set-operation syntax and emits a `FutureWarning` for
                    # them. Escaped, they stay plain literals, so the set of
                    # matched characters is unchanged. The hyphen is left
                    # alone because it is needed for ranges.
                    expr += pattern[i:j].translate(str.maketrans({
                        '\\': '\\\\',
                        '[': '\\[',
                        '&': '\\&',
                        '~': '\\~',
                        '|': '\\|',
                    }))

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket
                    # as a bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escaping:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`), of the
        same type as *s*.

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        meta_characters = r"[]!*#?"

        # Prefix every meta character with a back-slash; copy the rest.
        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        return out_string

384 

# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)

386 

387 

class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated alias of :class:`GitWildMatchPattern`, kept only to maintain
    compatibility with v0.4. Constructing an instance or calling
    :meth:`pattern_to_regex` emits a :class:`DeprecationWarning` and then
    defers to the parent class.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize via the parent class
        with the arguments unchanged.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use "
            "GitWildMatchPattern ('gitwildmatch') instead."
        )
        # stacklevel=3 attributes the warning two frames above this helper,
        # i.e. to whoever called the method that invoked it.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then return the parent class's
        conversion of the pattern with the arguments unchanged.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

418 

# Register the deprecated `GitIgnorePattern` under the name "gitignore"
# for backward compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)