1"""
This module provides the :mod:`re2` backend for :class:`~pathspec.gitignore.GitIgnoreSpec`.
3
4WARNING: The *pathspec._backends.re2* package is not part of the public API. Its
5contents and structure are likely to change.
6"""
7from __future__ import annotations
8
9from collections.abc import (
10 Callable)
11from typing import (
12 Optional, # Replaced by `X | None` in 3.10.
13 Union) # Replaced by `X | Y` in 3.10.
14
15try:
16 import re2
17except ModuleNotFoundError:
18 re2 = None
19
20from ...pattern import (
21 RegexPattern)
22from ...patterns.gitwildmatch import (
23 GitWildMatchPattern,
24 _BYTES_ENCODING,
25 _DIR_MARK_CG,
26 _DIR_MARK_OPT)
27from ..._typing import (
28 override) # Added in 3.12.
29
30from ._base import (
31 Re2RegexDat,
32 Re2RegexDebug)
33from .pathspec import (
34 Re2PsBackend)
35
36
class Re2GiBackend(Re2PsBackend):
    """
    The :class:`Re2GiBackend` class is the :mod:`re2` implementation used by
    :class:`~pathspec.gitignore.GitIgnoreSpec` for matching files.
    """

    @override
    @staticmethod
    def _init_set(
        debug: bool,
        patterns: dict[int, RegexPattern],
        regex_set: re2.Set,
        sort_indices: Optional[Callable[[list[int]], None]],
    ) -> list[Re2RegexDat]:
        """
        Create the re2 regex set.

        *debug* (:class:`bool`) is whether to include additional debugging
        information for the regular expressions.

        *patterns* (:class:`dict`) maps pattern index (:class:`int`) to pattern
        (:class:`.RegexPattern`). Patterns whose *include* is :data:`None` are
        skipped and never added to the set.

        *regex_set* (:class:`re2.Set`) is the regex set. Every generated regex
        is added to it, and it is compiled before returning.

        *sort_indices* (:class:`callable` or :data:`None`) is a function used to
        sort the patterns by index. This is used during testing to ensure the
        order of patterns is not accidentally relied on.

        Returns a :class:`list` indexed by regex id (:class:`int`) to its data
        (:class:`Re2RegexDat`). A single pattern may contribute two entries
        (a directory variant followed by a file variant) which share the same
        pattern index.
        """
        # Sort patterns.
        indices = list(patterns.keys())
        if sort_indices is not None:
            sort_indices(indices)

        # Prepare patterns.
        regex_data: list[Re2RegexDat] = []
        for pattern_index in indices:
            pattern = patterns[pattern_index]
            if pattern.include is None:
                # Nothing to include or exclude, so there is nothing to match.
                continue

            assert isinstance(pattern, RegexPattern), pattern
            regex = pattern.regex.pattern

            # Each entry is (regex to add to the set, whether it is the
            # directory variant of the pattern).
            use_regexes: list[tuple[Union[str, bytes], bool]] = []
            if isinstance(pattern, GitWildMatchPattern):
                # GitWildMatch uses capture groups for its directory marker.
                # Re2 supports capture groups, but they cannot be utilized when
                # using `re2.Set`. Handle this scenario.
                regex_str: str
                if isinstance(regex, str):
                    regex_str = regex
                else:
                    assert isinstance(regex, bytes), regex
                    regex_str = regex.decode(_BYTES_ENCODING)

                if _DIR_MARK_CG in regex_str:
                    # Found directory marker.
                    if regex_str.endswith(_DIR_MARK_OPT):
                        # Regex has optional directory marker. Split regex into
                        # directory and file variants. The directory variant is
                        # added first so it receives the lower regex id.
                        base_regex = regex_str[:-len(_DIR_MARK_OPT)]
                        use_regexes.append((f'{base_regex}/', True))
                        use_regexes.append((f'{base_regex}$', False))
                    else:
                        # Remove capture group.
                        base_regex = regex_str.replace(_DIR_MARK_CG, '/')
                        use_regexes.append((base_regex, True))

            if not use_regexes:
                # No special case for regex.
                use_regexes.append((regex, False))

            # Use a distinct loop variable so the pattern's own regex is not
            # rebound while iterating over its variants.
            for use_regex, is_dir_pattern in use_regexes:
                if debug:
                    regex_data.append(Re2RegexDebug(
                        include=pattern.include,
                        index=pattern_index,
                        is_dir_pattern=is_dir_pattern,
                        regex=use_regex,
                    ))
                else:
                    regex_data.append(Re2RegexDat(
                        include=pattern.include,
                        index=pattern_index,
                        is_dir_pattern=is_dir_pattern,
                    ))

                regex_set.Add(use_regex)

        # Compile patterns.
        regex_set.Compile()
        return regex_data

    @override
    def match_file(self, file: str) -> tuple[Optional[bool], Optional[int]]:
        """
        Check the file against the patterns.

        *file* (:class:`str`) is the normalized file path to check.

        Returns a :class:`tuple` containing whether to include *file*
        (:class:`bool` or :data:`None`), and the index of the last matched
        pattern (:class:`int` or :data:`None`). Both are :data:`None` when no
        pattern matches. The result does not depend on the order in which the
        regex set reports its matches.
        """
        # Find all matches.
        match_ids: Optional[list[int]] = self._set.Match(file)
        if not match_ids:
            return (None, None)

        # WARNING: According to the documentation on `RE2::Set::Match()`, there
        # is no guarantee matches will be produced in order! Regex ids also do
        # not have to follow pattern order. Visit the matches in ascending
        # pattern index so the outcome is the same for every ordering. When a
        # pattern matched through both of its variants, visit the directory
        # variant first so the file variant takes precedence.
        regex_data = self._regex_data
        matches = sorted(
            (regex_data[regex_id] for regex_id in match_ids),
            key=lambda dat: (dat.index, not dat.is_dir_pattern),
        )

        out_include: Optional[bool] = None
        out_index: int = -1
        out_priority = -1

        for regex_dat in matches:
            include = regex_dat.include
            is_dir_pattern = regex_dat.is_dir_pattern

            # A match by a directory pattern has priority 1, a match by a file
            # pattern has priority 2.
            priority = 1 if is_dir_pattern else 2

            # Because matches are visited in pattern order, a later match wins
            # when its priority is at least the current one. A later directory
            # pattern that includes the file wins regardless of priority.
            if (include and is_dir_pattern) or priority >= out_priority:
                out_include = include
                out_index = regex_dat.index
                out_priority = priority

        assert out_index != -1, (out_index, out_include, out_priority)
        return (out_include, out_index)