# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pathspec/patterns/gitwildmatch.py: 87%
"""
This module implements Git's wildmatch pattern matching which itself is derived
from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
"""

import re
import warnings
from typing import (
    AnyStr,
    Optional,  # Replaced by `X | None` in 3.10.
    Tuple)  # Replaced by `tuple` in 3.9.

from .. import util
from ..pattern import RegexPattern

16_BYTES_ENCODING = 'latin1'

17"""

18The encoding to use when parsing a byte string pattern.

19"""

21_DIR_MARK = 'ps_d'

22"""

23The regex group name for the directory marker. This is only used by

24:class:`GitIgnoreSpec`.

25"""

class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` indicates an invalid git wild match
    pattern.
    """

class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert
        into a regular expression.

        Returns the uncompiled regular expression (:class:`str`,
        :class:`bytes`, or :data:`None`); and whether matched files should be
        included (:data:`True`), excluded (:data:`False`), or if it is a
        null-operation (:data:`None`).

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`, and :class:`GitWildMatchPatternError` if the pattern
        is invalid (e.g. it normalizes to nothing or ends with a dangling
        escape character).
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern
            # ends with backslash followed by a space, only strip from left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment
            # (neither includes nor excludes files). Escape the hash with a
            # back-slash to match a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single
            # '/' does not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates
                # the pattern (exclude instead of include). Escape the
                # exclamation mark with a back-slash to match a literal
                # exclamation mark (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled
            # through normalization.
            override_regex = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Check whether the pattern is specifically a directory pattern
            # before normalization.
            is_dir_pattern = not pattern_segs[-1]

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences.
            # Collapse each sequence down to one double-asterisk. Iterate over
            # the segments in reverse and remove the duplicate double
            # asterisks as we go.
            for i in range(len(pattern_segs) - 1, 0, -1):
                if pattern_segs[i - 1] == '**' and pattern_segs[i] == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except
                # individual files in the root directory. This case cannot be
                # adequately handled through normalization. Use the override.
                override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths
                # directly on the root directory instead of any descendant
                # paths. So, remove empty first segment to make pattern
                # relative to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match
                # any descendant path. This is equivalent to "**/{pattern}".
                # So, prepend with double-asterisks to make pattern relative
                # to root.
                # - EDGE CASE: This also holds for a single pattern with a
                #   trailing slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            # Otherwise:
            # EDGE CASE: A pattern without a beginning slash ('/') but
            # contains at least one prepended directory (e.g.
            # "dir/{pattern}") should not match "**/dir/{pattern}", according
            # to `git check-ignore` (v2.4.1). Nothing to normalize.

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern at
                # all. This must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all
                # descendant paths if it is a directory but not if it is a
                # regular file. This is equivalent to "{pattern}/**". So, set
                # last segment to a double-asterisk to include all
                # descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                need_slash = False
                end = len(pattern_segs) - 1

                # A pattern ending without a slash ('/') will match a file or
                # a directory (with paths underneath it). E.g., "foo" matches
                # "foo", "foo/bar", "foo/bar/baz", etc. This optional suffix
                # is appended after the final non-'**' segment.
                dir_suffix = f'(?:(?P<{_DIR_MARK}>/).*)?'

                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks
                            # ('**') will match every path.
                            output.append('[^/]+(?:/.*)?')

                        elif i == 0:
                            # A normalized pattern beginning with
                            # double-asterisks ('**') will match any leading
                            # path segments.
                            output.append('(?:.+/)?')
                            need_slash = False

                        elif i == end:
                            # A normalized pattern ending with
                            # double-asterisks ('**') will match any trailing
                            # path segments.
                            if is_dir_pattern:
                                output.append(f'(?P<{_DIR_MARK}>/).*')
                            else:
                                output.append('/.*')

                        else:
                            # A pattern with inner double-asterisks ('**')
                            # will match multiple (or zero) inner path
                            # segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    else:
                        if need_slash:
                            output.append('/')

                        if seg == '*':
                            # Match single path segment. A lone asterisk must
                            # match at least one character, unlike an asterisk
                            # embedded in a longer segment glob.
                            output.append('[^/]+')
                        else:
                            # Match segment glob pattern.
                            try:
                                output.append(cls._translate_segment_glob(seg))
                            except ValueError as e:
                                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                        if i == end:
                            output.append(dir_suffix)

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor
            # excludes files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to
        its corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to
        # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except
                # slashes), including an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character
                # (except a slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning
                # exclamation mark, the whole bracket expression can be used
                # directly as regex, but we have to find where the expression
                # ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and (pattern[j] == '!' or pattern[j] == '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket. Stop once we reach the end or find it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one
                    # past the closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] == '!' or pattern[i] == '^':
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression
                        # negation "[^...]" is undefined in a glob pattern.
                        # Python's `fnmatch.translate()` escapes the caret
                        # ('^') as a literal. Git supports using a caret for
                        # negation. Maintain consistency with Git because that
                        # is the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape backslashes so
                    # they are treated as literal backslashes by regex as
                    # defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as
                    # a bracket literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you
        want to escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`).

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        # NOTE(review): The back-slash itself is not in this set, although
        # `_translate_segment_glob()` treats it as the escape character, so a
        # literal back-slash in *s* is passed through unescaped. Left as-is to
        # preserve existing output; confirm whether callers rely on this.
        meta_characters = r"[]!*#?"

        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string

util.register_pattern('gitwildmatch', GitWildMatchPattern)


class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated by
    :class:`GitWildMatchPattern`. This class only exists to maintain
    compatibility with v0.4.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize the pattern by forwarding all
        arguments to the parent class.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Warn about deprecation.
        """
        # The stack level of 3 attributes the warning to the code calling
        # `__init__()` or `pattern_to_regex()` (this helper is frame 1 and the
        # method calling it is frame 2).
        warnings.warn((
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        ), DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then convert the pattern by forwarding all
        arguments to the parent class and returning its result.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)


# Register `GitIgnorePattern` as "gitignore" for backward compatibility with
# v0.4.
util.register_pattern('gitignore', GitIgnorePattern)