Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 86%

1"""

2This module implements Git's wildmatch pattern matching which itself is

3derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore"

4files.

5"""

7import re

8import warnings

9from typing import (

10 AnyStr,

11 Optional,

12 Tuple)

14from .. import util

15from ..pattern import RegexPattern

17_BYTES_ENCODING = 'latin1'

18"""

19The encoding to use when parsing a byte string pattern.

20"""

22_DIR_MARK = 'ps_d'

23"""

24The regex group name for the directory marker. This is only used by

25:class:`GitIgnoreSpec`.

26"""

class GitWildMatchPatternError(ValueError):
    """
    Raised when a git wild match pattern is invalid.

    Being a :class:`ValueError` subclass, it is also caught by handlers
    written for :class:`ValueError`.
    """

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git
    wildmatch pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns the uncompiled regular expression (:class:`str`,
        :class:`bytes`, or :data:`None`); and whether matched files should be
        included (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`). The regular expression has the same
        string type as *pattern*.

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`.

        Raises :class:`GitWildMatchPatternError` if *pattern* is invalid,
        e.g. nothing is left after normalization or it ends with a dangling
        escape character.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            # Work on text internally; the result is encoded back at the end.
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Keep the untouched pattern for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash.
            # If a pattern that ends with backslash followed by a space,
            # only strip from left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment
            # (neither includes nor excludes files). Escape the hash with a
            # back-slash to match a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single
            # '/' does not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates
                # the pattern (exclude instead of include). Escape the
                # exclamation mark with a back-slash to match a literal
                # exclamation mark (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled
            # through normalization.
            override_regex = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences.
            # Collapse each sequence down to one double-asterisk. Iterate
            # over the segments in reverse and remove the duplicate double
            # asterisks as we go (reverse order keeps the remaining indices
            # valid while deleting).
            for i in range(len(pattern_segs) - 1, 0, -1):
                prev = pattern_segs[i - 1]
                seg = pattern_segs[i]
                if prev == '**' and seg == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except
                # individual files in the root directory. This case cannot be
                # adequately handled through normalization. Use the override.
                override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match
                # paths directly on the root directory instead of any
                # descendant paths. So, remove empty first segment to make
                # pattern relative to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match
                # any descendant path. This is equivalent to "**/{pattern}".
                # So, prepend with double-asterisks to make pattern relative
                # to root.
                # EDGE CASE: This also holds for a single pattern with a
                # trailing slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            else:
                # EDGE CASE: A pattern without a beginning slash ('/') but
                # contains at least one prepended directory (e.g.
                # "dir/{pattern}") should not match "**/dir/{pattern}",
                # according to `git check-ignore` (v2.4.1).
                pass

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern
                # at all. This must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all
                # descendant paths if it is a directory but not if it is a
                # regular file. This is equivalent to "{pattern}/**". So, set
                # last segment to a double-asterisk to include all
                # descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                # Whether a '/' must be emitted before the next non-'**'
                # segment.
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks
                            # ('**') will match every path.
                            output.append(f'[^/]+(?:(?P<{_DIR_MARK}>/).*)?')
                        elif i == 0:
                            # A normalized pattern beginning with
                            # double-asterisks ('**') will match any leading
                            # path segments.
                            output.append('(?:.+/)?')
                            need_slash = False
                        elif i == end:
                            # A normalized pattern ending with
                            # double-asterisks ('**') will match any trailing
                            # path segments.
                            output.append(f'(?P<{_DIR_MARK}>/).*')
                        else:
                            # A pattern with inner double-asterisks ('**')
                            # will match multiple (or zero) inner path
                            # segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    elif seg == '*':
                        # Match single path segment.
                        if need_slash:
                            output.append('/')

                        output.append('[^/]+')

                        if i == end:
                            # A pattern ending without a slash ('/') will
                            # match a file or a directory (with paths
                            # underneath it). E.g., "foo" matches "foo",
                            # "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                    else:
                        # Match segment glob pattern.
                        if need_slash:
                            output.append('/')

                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            # Surface translation failures as pattern errors
                            # while keeping the original cause attached.
                            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                        if i == end:
                            # A pattern ending without a slash ('/') will
                            # match a file or a directory (with paths
                            # underneath it). E.g., "foo" matches "foo",
                            # "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # Hand back the same string type that was passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used in
        :meth:`pattern_to_regex` to translate a path segment glob pattern to
        its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex fragments and join them once at the end.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except
                # slashes), including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character
                # (except a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and (pattern[j] == '!' or pattern[j] == '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of
                # the expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket. Stop once we reach the end or find
                # it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one
                    # past the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression
                        # negation "[^...]" is undefined in a glob pattern.
                        # Python's `fnmatch.translate()` escapes the caret
                        # ('^') as a literal. Git supports using a caret for
                        # negation. Maintain consistency with Git because
                        # that is the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape backslashes so
                    # they are treated as literal backslashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket
                    # as a bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`), with the
        same string type as *s*.

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        #
        # The back-slash is included because the pattern parser treats it as
        # the escape character: left as-is, a literal back-slash would
        # swallow the character after it (or invalidate the pattern when it
        # is the last character).
        meta_characters = "\\[]!*#?"

        out_string = "".join(
            "\\" + char if char in meta_characters else char
            for char in string
        )

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string

384

# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)

386

387

class GitIgnorePattern(GitWildMatchPattern):
    """
    Deprecated alias of :class:`GitWildMatchPattern`, retained only for
    compatibility with v0.4. Creating an instance or calling
    :meth:`pattern_to_regex` emits a :class:`DeprecationWarning`.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize like the parent class.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Issue the :class:`DeprecationWarning` for this class.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use "
            "GitWildMatchPattern ('gitwildmatch') instead."
        )
        # With stacklevel=3 the warning is attributed two frames above this
        # helper, i.e. to whoever called the public method that invoked it.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then delegate to the parent class.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

418

# Register the deprecated `GitIgnorePattern` under the legacy name
# "gitignore" for backward compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)