1"""
This module provides the :mod:`re2` backend for :class:`~pathspec.gitignore.GitIgnoreSpec`.
3
4WARNING: The *pathspec._backends.re2* package is not part of the public API. Its
5contents and structure are likely to change.
6"""
7from __future__ import annotations
8
9from typing import (
10 Callable, # Replaced by `collections.abc.Callable` in 3.9.2.
11 Optional, # Replaced by `X | None` in 3.10.
12 Union) # Replaced by `X | Y` in 3.10.
13
14try:
15 import re2
16except ModuleNotFoundError:
17 re2 = None
18
19from pathspec.pattern import (
20 RegexPattern)
21from pathspec.patterns.gitignore.spec import (
22 GitIgnoreSpecPattern,
23 _BYTES_ENCODING,
24 _DIR_MARK_CG,
25 _DIR_MARK_OPT)
26from pathspec._typing import (
27 override) # Added in 3.12.
28
29from ._base import (
30 Re2RegexDat,
31 Re2RegexDebug)
32from .pathspec import (
33 Re2PsBackend)
34
35
class Re2GiBackend(Re2PsBackend):
    """
    The :class:`Re2GiBackend` class matches files for
    :class:`~pathspec.gitignore.GitIgnoreSpec` using the :mod:`re2` library.
    """

    @override
    @staticmethod
    def _init_set(
        debug: bool,
        patterns: dict[int, RegexPattern],
        regex_set: re2.Set,
        sort_indices: Optional[Callable[[list[int]], None]],
    ) -> list[Re2RegexDat]:
        """
        Populate and compile the re2 regex set from the patterns.

        *debug* (:class:`bool`) is whether each regex's data should also record
        the regular expression itself (as :class:`Re2RegexDebug`).

        *patterns* (:class:`dict`) maps pattern index (:class:`int`) to pattern
        (:class:`.RegexPattern`). Patterns whose *include* is :data:`None` are
        skipped.

        *regex_set* (:class:`re2.Set`) is the regex set to add the regular
        expressions to. It is compiled before returning.

        *sort_indices* (:class:`callable` or :data:`None`) optionally reorders
        the list of pattern indices in place. This is used during testing to
        ensure the order of patterns is not accidentally relied on.

        Returns a :class:`list` where each position is a regex id (:class:`int`)
        and each element is the data for that regex (:class:`Re2RegexDat`).
        """
        # Determine the order in which patterns are added.
        ordered_indices = list(patterns)
        if sort_indices is not None:
            sort_indices(ordered_indices)

        # Build the per-regex data while adding each regex to the set. Entries
        # are appended in the same order as the regexes are added.
        entries: list[Re2RegexDat] = []
        for pattern_index in ordered_indices:
            pattern = patterns[pattern_index]
            if pattern.include is None:
                continue

            assert isinstance(pattern, RegexPattern), pattern
            raw_regex = pattern.regex.pattern

            variants: list[tuple[Union[str, bytes], bool]] = []
            if isinstance(pattern, GitIgnoreSpecPattern):
                # GitIgnoreSpecPattern marks directories with a capture group.
                # Re2 supports capture groups, but they cannot be utilized when
                # using `re2.Set`, so the regex is rewritten without it.
                text: str
                if isinstance(raw_regex, str):
                    text = raw_regex
                else:
                    assert isinstance(raw_regex, bytes), raw_regex
                    text = raw_regex.decode(_BYTES_ENCODING)

                if _DIR_MARK_CG in text:
                    if text.endswith(_DIR_MARK_OPT):
                        # The directory marker is optional: emit one regex for
                        # the directory case and one for the file case.
                        stem = text[:-len(_DIR_MARK_OPT)]
                        variants = [
                            (f'{stem}/', True),
                            (f'{stem}$', False),
                        ]
                    else:
                        # The directory marker is required: replace the capture
                        # group with a plain slash.
                        variants = [(text.replace(_DIR_MARK_CG, '/'), True)]

            if not variants:
                # Nothing to rewrite; use the pattern's regex unchanged.
                variants = [(raw_regex, False)]

            for variant_regex, is_dir_pattern in variants:
                if debug:
                    entry = Re2RegexDebug(
                        include=pattern.include,
                        index=pattern_index,
                        is_dir_pattern=is_dir_pattern,
                        regex=variant_regex,
                    )
                else:
                    entry = Re2RegexDat(
                        include=pattern.include,
                        index=pattern_index,
                        is_dir_pattern=is_dir_pattern,
                    )

                entries.append(entry)
                regex_set.Add(variant_regex)

        # Compile patterns.
        regex_set.Compile()
        return entries

    @override
    def match_file(self, file: str) -> tuple[Optional[bool], Optional[int]]:
        """
        Match the file against the compiled regex set.

        *file* (:class:`str`) is the normalized file path to check.

        Returns a :class:`tuple` containing whether to include *file*
        (:class:`bool` or :data:`None`), and the index of the selected matching
        pattern (:class:`int` or :data:`None`). Both are :data:`None` when no
        regex matched.
        """
        matched_ids: Optional[list[int]] = self._set.Match(file)
        if not matched_ids:
            return (None, None)

        best_include: Optional[bool] = None
        best_index: int = -1
        best_priority = -1

        regex_data = self._regex_data
        for regex_id in matched_ids:
            dat = regex_data[regex_id]

            # File patterns (2) outrank directory patterns (1).
            is_dir_pattern = dat.is_dir_pattern
            priority = 1 if is_dir_pattern else 2

            include = dat.include
            index = dat.index

            # WARNING: According to the documentation on `RE2::Set::Match()`,
            # there is no guarantee matches will be produced in order!
            #
            # NOTE(review): when an including directory pattern has a higher
            # index than a matching file pattern, the winner appears to depend
            # on the order of *matched_ids* -- confirm whether that is intended.
            if priority > best_priority:
                replace = True
            elif index > best_index:
                # A later pattern wins over an equal-priority match, and a later
                # including directory pattern wins over any current match.
                replace = bool(
                    (include and is_dir_pattern)
                    or priority == best_priority
                )
            else:
                replace = False

            if replace:
                best_include = include
                best_index = index
                best_priority = priority

        assert best_index != -1, (best_index, best_include, best_priority)
        return (best_include, best_index)