Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/atheris/function_hooks.py: 25%
159 statements
« prev ^ index » next coverage.py v7.0.5, created at 2023-01-17 06:13 +0000
« prev ^ index » next coverage.py v7.0.5, created at 2023-01-17 06:13 +0000
1# Copyright 2021 Google LLC
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""Provides Atheris instrumentation hooks for particular functions like regex."""
16import re
17import sre_parse
18import sys
19from typing import Set, Any, Pattern, List, Match, Optional, Iterator, Union, Callable, AnyStr
21# mypy does not like the implicit re-export of the constants available in
22# sre_parse, and also does not support ignoring whole blocks of code. Rather
23# than having a whole-file ignore, or interrupting every line of every statement
24# below with an ignore, we make aliases here and put the ignores on them.
25_ANY = sre_parse.ANY # type: ignore[attr-defined]
26_ASSERT = sre_parse.ASSERT # type: ignore[attr-defined]
27_ASSERT_NOT = sre_parse.ASSERT_NOT # type: ignore[attr-defined]
28_BRANCH = sre_parse.BRANCH # type: ignore[attr-defined]
29_CATEGORY = sre_parse.CATEGORY # type: ignore[attr-defined]
30_CATEGORY_DIGIT = sre_parse.CATEGORY_DIGIT # type: ignore[attr-defined]
31_CATEGORY_NOT_DIGIT = sre_parse.CATEGORY_NOT_DIGIT # type: ignore[attr-defined]
32_CATEGORY_SPACE = sre_parse.CATEGORY_SPACE # type: ignore[attr-defined]
33_CATEGORY_NOT_SPACE = sre_parse.CATEGORY_NOT_SPACE # type: ignore[attr-defined]
34_CATEGORY_WORD = sre_parse.CATEGORY_WORD # type: ignore[attr-defined]
35_CATEGORY_NOT_WORD = sre_parse.CATEGORY_NOT_WORD # type: ignore[attr-defined]
36_IN = sre_parse.IN # type: ignore[attr-defined]
37_LITERAL = sre_parse.LITERAL # type: ignore[attr-defined]
38_MAX_REPEAT = sre_parse.MAX_REPEAT # type: ignore[attr-defined]
39_MIN_REPEAT = sre_parse.MIN_REPEAT # type: ignore[attr-defined]
40_NEGATE = sre_parse.NEGATE # type: ignore[attr-defined]
41_RANGE = sre_parse.RANGE # type: ignore[attr-defined]
42_SUBPATTERN = sre_parse.SUBPATTERN # type: ignore[attr-defined]
def to_correct_type(to_convert: Union[str, bytes],
                    return_type: Callable[[], AnyStr]) -> AnyStr:
  """Converts `to_convert` to `return_type`, which must be `str` or `bytes`.

  Args:
    to_convert: The text or byte string to convert.
    return_type: The target type; either the `str` or the `bytes` class.

  Returns:
    `to_convert` unchanged if it already has the requested type; otherwise its
    UTF-8 decoding (bytes -> str) or its UTF-8 encoding (str -> bytes).

  Raises:
    TypeError: If `return_type` is neither `str` nor `bytes`.
    UnicodeDecodeError: If bytes that are not valid UTF-8 are converted to str.
  """
  if return_type not in (str, bytes):
    # An f-string is used because mixing a `{}` placeholder with the `%`
    # operator fails while formatting and hides the intended message.
    raise TypeError(
        f"Expected `return_type` to be str or bytes, got {return_type}")

  if isinstance(to_convert, return_type):
    return to_convert

  if isinstance(to_convert, bytes):
    # str(b"x") would return the repr "b'x'"; decode to get the actual text.
    return to_convert.decode("utf-8")

  return bytes(to_convert, "utf-8")
def _code_point_as(code: int, return_type: Callable[[], AnyStr]) -> AnyStr:
  """Returns the single character (or byte) with ordinal `code`."""
  if return_type == str:
    return chr(code)
  if return_type == bytes:
    # Endianness does not matter because there's just a single byte.
    return code.to_bytes(1, "big")
  raise TypeError(
      f"Expected return_type to be `str` or `bytes`, got {return_type}")


def _category_contains(category: Any, char: str) -> Optional[bool]:
  """Returns whether ASCII `char` is in `category`; None if unsupported."""
  if category == _CATEGORY_DIGIT:
    return char.isdigit()
  if category == _CATEGORY_NOT_DIGIT:
    return not char.isdigit()
  if category == _CATEGORY_SPACE:
    return char.isspace()
  if category == _CATEGORY_NOT_SPACE:
    return not char.isspace()
  is_word = char.isalnum() or char == "_"
  if category == _CATEGORY_WORD:
    return is_word
  if category == _CATEGORY_NOT_WORD:
    return not is_word
  return None


def gen_match_recursive(ops: Any,
                        return_type: Callable[[], AnyStr] = str,
                        respect_lookarounds: bool = False) -> AnyStr:
  """Returns a matching string given a regex expression.

  Args:
    ops: The parsed regex: a sequence of `(opcode, argument)` tuples as
      returned by `sre_parse.parse`, or a plain list of such tuples.
    return_type: `str` or `bytes`; the type of the generated match.
    respect_lookarounds: Whether to emit the contents of a lookahead that is
      the last op, or of a lookbehind that is the first op. Recursive calls
      never pass this flag, so nested lookarounds are always skipped.

  Returns:
    The generated candidate match. Ops that are not supported only produce a
    warning on stderr and contribute nothing to the result.

  Raises:
    TypeError: If a character has to be produced and `return_type` is neither
      `str` nor `bytes`.
  """
  # TODO(cffsmith): This generator is *not* feature complete.

  # Candidates to pick from when a negated character class must be satisfied.
  available_characters = set([chr(x) for x in range(0x20, 0x7e)] + ["\t", "\n"])

  literals = return_type()

  for index, tup in enumerate(ops):
    op = tup[0]

    if op == _LITERAL:
      literals += _code_point_as(tup[1], return_type)

    elif op == _ANY:
      # Convert explicitly: appending a str to bytes raises a TypeError.
      literals += to_correct_type("a", return_type)

    elif op == _BRANCH:
      # just generate the first branch
      literals += gen_match_recursive(tup[1][1][0], return_type)

    elif op == _NEGATE:
      sys.stderr.write("WARNING: We did not expect a NEGATE op here; is " +
                       "there an invalid RegEx somewhere?\n")

    elif op == _RANGE:
      # Use the upper bound of the range. Build the value from its ordinal so
      # that bytes patterns get exactly one byte even above 0x7f.
      literals += _code_point_as(tup[1][1], return_type)

    elif op == _IN:
      members = tup[1]
      # Check if this class is negated.
      if members[0][0] != _NEGATE:
        # Take the first one that is actually in the class
        literals += gen_match_recursive([members[0]], return_type)
      else:
        # Collect everything the class forbids.
        excluded = set()
        for member in members[1:]:
          if member[0] == _LITERAL:
            excluded.add(chr(member[1]))
          elif member[0] == _RANGE:
            low, high = member[1]
            excluded.update(chr(c) for c in range(low, high + 1))
          elif (member[0] == _CATEGORY and
                _category_contains(member[1], "a") is not None):
            excluded.update(c for c in available_characters
                            if _category_contains(member[1], c))
          else:
            sys.stderr.write("WARNING: Encountered non literal in character " +
                             "class, cannot instrument RegEx!\n")
        allowed = available_characters - excluded
        if not allowed:
          sys.stderr.write("WARNING: This character set does not seem to " +
                           "allow any characters, cannot instrument RegEx!\n")
        else:
          # min() keeps the result reproducible; the iteration order of a set
          # of strings can differ from one interpreter run to the next.
          literals += to_correct_type(min(allowed), return_type)

    elif op == _SUBPATTERN:
      literals += gen_match_recursive(tup[1][3], return_type)

    elif op == _MAX_REPEAT or op == _MIN_REPEAT:
      # The minimum amount of repetitions we need to fulfill the pattern.
      # This refers to the distinction between `*` and `+`, not between greedy
      # (the default) matching vs non-greedy repeat matching with `.*?`, which
      # is represented by _MAX_REPEAT vs _MIN_REPEAT.
      minimum = tup[1][0]
      literals += gen_match_recursive(tup[1][2], return_type) * minimum

    elif op == _ASSERT_NOT:
      sys.stderr.write(
          "WARNING: found negative lookahead or negative lookbehind, "
          "which are currently unsupported due to NP Completeness.\n")

    elif op == _ASSERT:
      if not respect_lookarounds:
        sys.stderr.write(
            "WARNING: Found lookahead or lookbehind in the middle of a regex, "
            "ignoring due to NP Completeness.\n")
        continue

      # Use the loop position rather than searching for `tup`: a search finds
      # the first *equal* op, which is wrong when the same lookaround occurs
      # more than once.
      is_lookahead = tup[1][0] > 0
      is_beginning = index == 0
      is_end = index == len(ops) - 1
      if is_lookahead and is_end:
        literals += gen_match_recursive(tup[1][1], return_type)
      elif not is_lookahead and is_beginning:
        literals = gen_match_recursive(tup[1][1], return_type) + literals

    elif op == _CATEGORY:
      # For how each of these is encoded, see
      # https://github.com/python/cpython/blob/main/Lib/sre_parse.py#L42
      category = tup[1]
      # start with a string, we'll do the type conversion later.
      ch = ""
      # A single if/elif chain, so that a digit category does not fall through
      # to the "unsupported" warning below.
      if category == _CATEGORY_DIGIT:
        ch = "0"
      elif category == _CATEGORY_NOT_DIGIT:
        ch = "a"
      elif category == _CATEGORY_SPACE:
        ch = " "
      elif category == _CATEGORY_NOT_SPACE:
        ch = "a"
      elif category == _CATEGORY_WORD:
        ch = "a"
      elif category == _CATEGORY_NOT_WORD:
        ch = " "
      else:
        sys.stderr.write("WARNING: Unsupported RegEx category, " +
                         "cannot instrument RegEx!\n")

      literals += to_correct_type(ch, return_type)

    else:
      sys.stderr.write(f"WARNING: Encountered non-handled RegEx op: {tup[0]}" +
                       ", cannot instrument RegEx\n")

  return literals
def gen_match(pattern: AnyStr) -> AnyStr:
  """Parses `pattern` and generates a candidate match of the same type.

  Lookarounds at the very beginning or end of the pattern are respected.
  """
  return gen_match_recursive(
      sre_parse.parse(pattern), type(pattern), respect_lookarounds=True)
def hook_re_module() -> None:
  """Adds Atheris instrumentation hooks to the `re` module.

  Replaces `re._compile` with a wrapper that generates a candidate match for
  every string/bytes pattern and returns the compiled pattern wrapped in an
  `AtherisPatternProxy`.
  """
  # Maps each pattern seen so far to the match generated for it.
  pattern_gen_map = {}

  original_compile_func = re._compile  # type: ignore[attr-defined]

  def _compile_hook(
      pattern: Union[str, bytes, Pattern, "AtherisPatternProxy"],
      flags: int) -> Union["AtherisPatternProxy", Pattern]:
    """Overrides re._compile."""

    if isinstance(pattern, AtherisPatternProxy):
      # The module-level `re` functions pass precompiled patterns through
      # `_compile` as well. Let the original function validate `flags` against
      # the wrapped pattern, then hand the existing proxy back.
      original_compile_func(pattern.re_obj, flags)
      return pattern

    if not isinstance(pattern, (str, bytes)):
      # E.g. a pattern compiled before the hook was installed: there is
      # nothing to parse, so defer to the original implementation.
      return original_compile_func(pattern, flags)

    # Create the `re.Pattern` object. We will wrap this in a proxy later on.
    re_object = original_compile_func(pattern, flags)

    if pattern not in pattern_gen_map:
      generated = gen_match(pattern)

      try:
        if re_object.search(generated) is None:
          sys.stderr.write(f"ERROR: generated match '{generated}' did not "
                           f"match the RegEx pattern '{pattern}'!\n")
      except Exception as e:  # pylint: disable=broad-except
        sys.stderr.write("Could not check the generated match against the " +
                         f"RegEx pattern: {e}\n")
      pattern_gen_map[pattern] = generated
    else:
      generated = pattern_gen_map[pattern]

    # Return the wrapped `re.Pattern` object.
    return AtherisPatternProxy(re_object, generated)

  # actually hook the `_compile` function now
  # pylint: disable=protected-access
  re._compile = _compile_hook  # type: ignore[attr-defined]
  # pylint: enable=protected-access
class EnabledHooks:
  """Manages the set of enabled hooks."""

  def __init__(self) -> None:
    self._enabled_hooks: Set[str] = set()

  def add(self, hook: str) -> None:
    """Enables `hook` unless it is already enabled.

    Hook names are case-insensitive. Enabling "regex" installs the `re` module
    hooks; any other name is only recorded.

    Args:
      hook: Name of the hook to enable.
    """
    hook = hook.lower()
    # Test membership on the set directly; copying it to a list first only
    # turns a constant-time lookup into a linear scan.
    if hook in self._enabled_hooks:
      return
    if hook == "regex":
      hook_re_module()
    self._enabled_hooks.add(hook)


enabled_hooks = EnabledHooks()
class AtherisPatternProxy:
  """Proxy routing regex functions through Atheris tracing equivalents.

  This is a simple proxy where we can hook into various regex
  functions. This ensures that the tracing happens on each call to
  `match`, `search`, etc.

  This can be observable by users who call `compile` and then check
  if the object is actually a `re.Pattern` object.

  Unfortunately it is not possible to change the functions on the
  `re.Pattern` object itself as the functions are not writable.
  (One could try to bypass this but it would need unsafe usage from
  ctypes and probably won't be version agnostic)

  The traced methods accept the same optional `pos` and `endpos` arguments as
  their `re.Pattern` counterparts. Everything else is forwarded untraced to
  the wrapped pattern.
  """

  # Importing at the top will not work. TODO(b/207008147): Why does it fail?
  # pylint: disable=g-import-not-at-top

  def __init__(self, re_obj: Pattern, generated: Union[str, bytes]) -> None:
    # The real compiled pattern all calls are forwarded to.
    self.re_obj = re_obj
    # The generated match that is passed to the Atheris tracing function.
    self.generated = generated

  def _trace(self) -> None:
    """Passes the generated match and the wrapped pattern to Atheris."""
    from atheris import _trace_regex_match  # type: ignore[import]
    _trace_regex_match(self.generated, self.re_obj)

  def search(self,
             string: AnyStr,
             pos: int = 0,
             endpos: int = sys.maxsize) -> Optional[Match[AnyStr]]:
    """Traces the call, then forwards to `re.Pattern.search`."""
    self._trace()
    return self.re_obj.search(string, pos, endpos)

  def match(self,
            string: AnyStr,
            pos: int = 0,
            endpos: int = sys.maxsize) -> Optional[Match[AnyStr]]:
    """Traces the call, then forwards to `re.Pattern.match`."""
    self._trace()
    return self.re_obj.match(string, pos, endpos)

  def fullmatch(self,
                string: AnyStr,
                pos: int = 0,
                endpos: int = sys.maxsize) -> Optional[Match[AnyStr]]:
    """Traces the call, then forwards to `re.Pattern.fullmatch`."""
    self._trace()
    return self.re_obj.fullmatch(string, pos, endpos)

  def findall(self,
              string: AnyStr,
              pos: int = 0,
              endpos: int = sys.maxsize) -> List[Any]:
    """Traces the call, then forwards to `re.Pattern.findall`."""
    self._trace()
    return self.re_obj.findall(string, pos, endpos)

  def finditer(self,
               string: AnyStr,
               pos: int = 0,
               endpos: int = sys.maxsize) -> Iterator[Match[AnyStr]]:
    """Traces the call, then forwards to `re.Pattern.finditer`."""
    self._trace()
    return self.re_obj.finditer(string, pos, endpos)

  def __getattr__(self, attr: str) -> Any:
    # Only called when normal lookup fails. If `re_obj` itself is missing
    # (e.g. the instance was created without running `__init__`), looking it
    # up below would re-enter this method forever.
    if attr == "re_obj":
      raise AttributeError(attr)
    return getattr(self.re_obj, attr)

  # pylint: enable=g-import-not-at-top