Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 16%

158 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:15 +0000

1""" 

2This module implements Git's wildmatch pattern matching which itself is 

3derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" 

4files. 

5""" 

6 

7import re 

8import warnings 

9from typing import ( 

10 AnyStr, 

11 Optional, 

12 Tuple) 

13 

14from .. import util 

15from ..pattern import RegexPattern 

16 

# Byte-string patterns are decoded with this encoding before they are
# converted, and the resulting regex is encoded back with it, so the
# caller gets the same string type it passed in.
_BYTES_ENCODING = 'latin1'
"""
The encoding to use when parsing a byte string pattern.
"""

# Interpolated into the generated regexes as the named group
# ``(?P<ps_d>/)``, which captures the slash that follows a matched
# directory name.
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""

27 

28 

class GitWildMatchPatternError(ValueError):
    """
    Raised when a git wildmatch pattern is invalid and cannot be
    converted into a regular expression.

    It derives from :class:`ValueError`, so handlers written for
    :class:`ValueError` also catch it.
    """

35 

36 

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git
    wildmatch pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`,
        or :data:`None`); and whether matched files should be included
        (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the
        pattern is invalid (e.g., it is empty after normalization, or it
        ends with a dangling escape character).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the untouched pattern for error messages.
        original_pattern = pattern

        # Strip surrounding whitespace, but keep a trailing space that is
        # escaped with a back-slash (e.g., 'foo\ ').
        pattern = cls._strip_pattern(pattern)

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment
            # (neither includes nor excludes files). Escape the hash with a
            # back-slash to match a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single
            # '/' does not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates the
                # pattern (exclude instead of include). Escape the exclamation
                # mark with a back-slash to match a literal exclamation mark
                # (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled
            # through normalization.
            override_regex = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences.
            # Collapse each sequence down to one double-asterisk. Iterate over
            # the segments in reverse and remove the duplicate double
            # asterisks as we go.
            for i in range(len(pattern_segs) - 1, 0, -1):
                prev = pattern_segs[i-1]
                seg = pattern_segs[i]
                if prev == '**' and seg == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except
                # individual files in the root directory. This case cannot be
                # adequately handled through normalization. Use the override.
                override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths
                # directly on the root directory instead of any descendant
                # paths. So, remove empty first segment to make pattern relative
                # to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match
                # any descendant path. This is equivalent to "**/{pattern}". So,
                # prepend with double-asterisks to make pattern relative to
                # root.
                # EDGE CASE: This also holds for a single pattern with a
                # trailing slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            else:
                # EDGE CASE: A pattern without a beginning slash ('/') but
                # contains at least one prepended directory (e.g.
                # "dir/{pattern}") should not match "**/dir/{pattern}",
                # according to `git check-ignore` (v2.4.1).
                pass

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern at
                # all. This must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all descendant
                # paths if it is a directory but not if it is a regular file.
                # This is equivalent to "{pattern}/**". So, set last segment to
                # a double-asterisk to include all descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks ('**')
                            # will match every path.
                            output.append(f'[^/]+(?:(?P<{_DIR_MARK}>/).*)?')
                        elif i == 0:
                            # A normalized pattern beginning with double-asterisks
                            # ('**') will match any leading path segments.
                            output.append('(?:.+/)?')
                            need_slash = False
                        elif i == end:
                            # A normalized pattern ending with double-asterisks ('**')
                            # will match any trailing path segments.
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            # A pattern with inner double-asterisks ('**') will match
                            # multiple (or zero) inner path segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    elif seg == '*':
                        # Match single path segment.
                        if need_slash:
                            output.append('/')

                        output.append('[^/]+')

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file
                            # or a directory (with paths underneath it). E.g., "foo"
                            # matches "foo", "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                    else:
                        # Match segment glob pattern.
                        if need_slash:
                            output.append('/')

                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file
                            # or a directory (with paths underneath it). E.g., "foo"
                            # matches "foo", "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # Hand back the same string type that was passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _strip_pattern(pattern: str) -> str:
        """
        Strip the surrounding whitespace from the pattern while keeping a
        trailing space that is escaped with a back-slash.

        Git ignores trailing spaces unless they are quoted with a
        back-slash. Blindly stripping 'foo\\ ' would leave 'foo\\' (a
        dangling escape character) and make a valid pattern invalid.

        *pattern* (:class:`str`) is the raw pattern.

        Returns the stripped pattern (:class:`str`).
        """
        stripped = pattern.strip()

        # The whitespace that was removed from the right side.
        trailing = pattern.lstrip()[len(stripped):]

        if trailing.startswith(' '):
            # An odd number of back-slashes right before the space means the
            # last back-slash escapes the space (an even number means the
            # back-slashes only escape each other).
            backslashes = len(stripped) - len(stripped.rstrip('\\'))
            if backslashes % 2:
                stripped += ' '

        return stripped

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to
        its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if the pattern ends with an escape
        character (back-slash) that has nothing to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        regex = ''
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                regex += re.escape(char)

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes),
                # including an empty string.
                regex += '[^/]*'

            elif char == '?':
                # Single-character wildcard. Match any single character (except
                # a slash).
                regex += '[^/]'

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i
                # Pass bracket expression negation.
                if j < end and pattern[j] == '!':
                    j += 1
                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1
                # Find closing bracket. Stop once we reach the end or find it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one past
                    # the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] == '!':
                        # Bracket expression needs to be negated.
                        expr += '^'
                        i += 1
                    elif pattern[i] == '^':
                        # POSIX declares that the regex bracket expression negation
                        # "[^...]" is undefined in a glob pattern. Python's
                        # `fnmatch.translate()` escapes the caret ('^') as a
                        # literal. To maintain consistency with undefined behavior,
                        # I am escaping the '^' as well.
                        expr += '\\^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so
                    # they are treated as literal back-slashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    regex += expr

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a
                    # bracket literal instead of as an expression.
                    regex += '\\['

            else:
                # Regular character, escape it for regex.
                regex += re.escape(char)

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return regex

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        meta_characters = r"[]!*#?"

        # Prefix every meta character with a back-slash.
        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string

373 

# Register `GitWildMatchPattern` under the name "gitwildmatch" through the
# package's `util.register_pattern()` helper (defined outside this module).
util.register_pattern('gitwildmatch', GitWildMatchPattern)

375 

376 

class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated name for :class:`GitWildMatchPattern`, kept only for
    compatibility with v0.4. Creating an instance or calling
    :meth:`pattern_to_regex` emits a :class:`DeprecationWarning`.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize the pattern with the
        given arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the deprecation warning for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use "
            "GitWildMatchPattern ('gitwildmatch') instead."
        )
        # A stack level of 3 skips this helper and the method that called
        # it, so the warning points at that method's caller.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then delegate to the inherited
        conversion and return its result.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

407 

# Register the deprecated `GitIgnorePattern` class under the name
# "gitignore" for backward compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)