"""
This module provides the :mod:`hyperscan` backend for :class:`~pathspec.gitignore.GitIgnoreSpec`.

WARNING: The *pathspec._backends.hyperscan* package is not part of the public
API. Its contents and structure are likely to change.
"""
7from __future__ import annotations
8
9from collections.abc import (
10 Callable,
11 Sequence)
12from typing import (
13 Any,
14 Optional, # Replaced by `X | None` in 3.10.
15 Union) # Replaced by `X | Y` in 3.10.
16
17try:
18 import hyperscan
19except ModuleNotFoundError:
20 hyperscan = None
21
22from ...pattern import (
23 RegexPattern)
24from ...patterns.gitwildmatch import (
25 GitWildMatchPattern,
26 _BYTES_ENCODING,
27 _DIR_MARK_CG,
28 _DIR_MARK_OPT)
29from ..._typing import (
30 override) # Added in 3.12.
31
32from ._base import (
33 HS_FLAGS,
34 HyperscanExprDat,
35 HyperscanExprDebug)
36from .pathspec import (
37 HyperscanPsBackend)
38
39
class HyperscanGiBackend(HyperscanPsBackend):
    """
    The :class:`HyperscanGiBackend` class is the :mod:`hyperscan`
    implementation used by :class:`~pathspec.gitignore.GitIgnoreSpec`. The
    Hyperscan database uses block mode for matching files.
    """

    # Change type hint.
    _out: tuple[Optional[bool], int, int]

    def __init__(
        self,
        patterns: Sequence[RegexPattern],
        *,
        _debug_exprs: Optional[bool] = None,
        _test_sort: Optional[Callable[[list], None]] = None,
    ) -> None:
        """
        Initialize the :class:`HyperscanGiBackend` instance.

        *patterns* (:class:`~collections.abc.Sequence` of
        :class:`.RegexPattern`) contains the compiled patterns.

        *_debug_exprs* and *_test_sort* are forwarded unchanged to the base
        class initializer.
        """
        super().__init__(patterns, _debug_exprs=_debug_exprs, _test_sort=_test_sort)

        self._out = (None, -1, 0)
        """
        *_out* (:class:`tuple`) stores the current match:

        - *0* (:class:`bool` or :data:`None`) is the match include.

        - *1* (:class:`int`) is the match index (``-1`` when nothing matched).

        - *2* (:class:`int`) is the match priority: ``0`` when nothing matched,
          ``1`` for a directory pattern match, and ``2`` for a file pattern
          match.
        """

    @override
    @staticmethod
    def _init_db(
        db: hyperscan.Database,
        debug: bool,
        patterns: list[tuple[int, RegexPattern]],
        sort_ids: Optional[Callable[[list[int]], None]],
    ) -> list[HyperscanExprDat]:
        """
        Create the Hyperscan database from the given patterns.

        *db* (:class:`hyperscan.Database`) is the Hyperscan database.

        *debug* (:class:`bool`) is whether to include additional debugging
        information for the expressions.

        *patterns* (:class:`list` of :class:`tuple`) contains each pattern
        index (:class:`int`) paired with its pattern (:class:`.RegexPattern`).
        Patterns whose *include* is :data:`None` are skipped.

        *sort_ids* (:class:`callable` or :data:`None`) is a function used to sort
        the compiled expression ids. This is used during testing to ensure the order
        of expressions is not accidentally relied on.

        Returns a :class:`list` indexed by expression id (:class:`int`) to its data
        (:class:`HyperscanExprDat`).
        """
        # Prepare patterns.
        expr_data: list[HyperscanExprDat] = []
        exprs: list[bytes] = []
        for pattern_index, pattern in patterns:
            if pattern.include is None:
                continue

            # Encode regex.
            assert isinstance(pattern, RegexPattern), pattern
            regex = pattern.regex.pattern

            # Encoding used to turn a rewritten (str) regex back into bytes. A
            # regex that started out as bytes must be re-encoded with the same
            # codec it was decoded with, otherwise non-ASCII bytes would be
            # transcoded and no longer equal the original expression.
            str_encoding = 'utf8'

            use_regexes: list[tuple[Union[str, bytes], bool]] = []
            if isinstance(pattern, GitWildMatchPattern):
                # GitWildMatch uses capture groups for its directory marker but
                # Hyperscan does not support capture groups. Handle this scenario.
                regex_str: str
                if isinstance(regex, str):
                    regex_str = regex
                else:
                    assert isinstance(regex, bytes), regex
                    regex_str = regex.decode(_BYTES_ENCODING)
                    str_encoding = _BYTES_ENCODING

                if _DIR_MARK_CG in regex_str:
                    # Found directory marker.
                    if regex_str.endswith(_DIR_MARK_OPT):
                        # Regex has optional directory marker. Split regex into
                        # directory and file variants.
                        base_regex = regex_str[:-len(_DIR_MARK_OPT)]
                        use_regexes.append((f'{base_regex}/', True))
                        use_regexes.append((f'{base_regex}$', False))
                    else:
                        # Remove capture group.
                        base_regex = regex_str.replace(_DIR_MARK_CG, '/')
                        use_regexes.append((base_regex, True))

            if not use_regexes:
                # No special case for regex.
                use_regexes.append((regex, False))

            for use_regex, is_dir_pattern in use_regexes:
                if isinstance(use_regex, bytes):
                    # Bytes regexes are passed to Hyperscan untouched.
                    regex_bytes = use_regex
                else:
                    assert isinstance(use_regex, str), use_regex
                    regex_bytes = use_regex.encode(str_encoding)

                if debug:
                    expr_data.append(HyperscanExprDebug(
                        include=pattern.include,
                        index=pattern_index,
                        is_dir_pattern=is_dir_pattern,
                        regex=use_regex,
                    ))
                else:
                    expr_data.append(HyperscanExprDat(
                        include=pattern.include,
                        index=pattern_index,
                        is_dir_pattern=is_dir_pattern,
                    ))

                exprs.append(regex_bytes)

        # Sort expressions. The ids are positions into *expr_data*, so only the
        # expressions are reordered to stay paired with their (shuffled) ids.
        ids = list(range(len(exprs)))
        if sort_ids is not None:
            sort_ids(ids)
            exprs = [exprs[expr_id] for expr_id in ids]

        # Compile patterns.
        db.compile(
            expressions=exprs,
            ids=ids,
            elements=len(exprs),
            flags=HS_FLAGS,
        )
        return expr_data

    @override
    def match_file(self, file: str) -> tuple[Optional[bool], Optional[int]]:
        """
        Check the file against the patterns.

        *file* (:class:`str`) is the normalized file path to check.

        Returns a :class:`tuple` containing whether to include *file* (:class:`bool`
        or :data:`None`), and the index of the last matched pattern (:class:`int` or
        :data:`None`).
        """
        # NOTICE: According to benchmarking, a method callback is 13% faster than
        # using a closure here.
        # Reset the match state; the callback updates it during the scan.
        self._out = (None, -1, 0)
        self._db.scan(file.encode('utf8'), match_event_handler=self.__on_match)

        out_include, out_index = self._out[:2]
        if out_index == -1:
            # The sentinel index was never replaced: nothing matched.
            out_index = None

        return (out_include, out_index)

    @override
    def __on_match(
        self,
        expr_id: int,
        _from: int,
        _to: int,
        _flags: int,
        _context: Any,
    ) -> Optional[bool]:
        """
        Called on each match. Updates *_out* when the matched expression
        supersedes the currently recorded match.

        *expr_id* (:class:`int`) is the expression id (index) of the matched
        pattern.

        The remaining arguments are unused. Always returns :data:`None`.
        """
        expr_dat = self._expr_data[expr_id]

        is_dir_pattern = expr_dat.is_dir_pattern
        if is_dir_pattern:
            # Pattern matched by a directory pattern.
            priority = 1
        else:
            # Pattern matched by a file pattern.
            priority = 2

        # WARNING: Hyperscan does not guarantee matches will be produced in order!
        include = expr_dat.include
        index = expr_dat.index
        prev_index = self._out[1]
        prev_priority = self._out[2]
        if (
            (include and is_dir_pattern and index > prev_index)
            or (priority == prev_priority and index > prev_index)
            or priority > prev_priority
        ):
            self._out = (include, index, priority)