Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/atheris/function_hooks.py: 25%

159 statements  

coverage.py v7.0.5, created at 2023-01-17 06:13 +0000

# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Provides Atheris instrumentation hooks for particular functions like regex."""

import re
import sre_parse
import sys
from typing import Set, Any, Pattern, List, Match, Optional, Iterator, Union, Callable, AnyStr

# mypy does not like the implicit re-export of the constants available in
# sre_parse, and it does not support ignoring blocks of code. Rather than
# using a whole-file ignore, or interrupting every statement below with an
# ignore, we make aliases here and ignore them once.

_ANY = sre_parse.ANY  # type: ignore[attr-defined]
_ASSERT = sre_parse.ASSERT  # type: ignore[attr-defined]
_ASSERT_NOT = sre_parse.ASSERT_NOT  # type: ignore[attr-defined]
_BRANCH = sre_parse.BRANCH  # type: ignore[attr-defined]
_CATEGORY = sre_parse.CATEGORY  # type: ignore[attr-defined]
_CATEGORY_DIGIT = sre_parse.CATEGORY_DIGIT  # type: ignore[attr-defined]
_CATEGORY_NOT_DIGIT = sre_parse.CATEGORY_NOT_DIGIT  # type: ignore[attr-defined]
_CATEGORY_SPACE = sre_parse.CATEGORY_SPACE  # type: ignore[attr-defined]
_CATEGORY_NOT_SPACE = sre_parse.CATEGORY_NOT_SPACE  # type: ignore[attr-defined]
_CATEGORY_WORD = sre_parse.CATEGORY_WORD  # type: ignore[attr-defined]
_CATEGORY_NOT_WORD = sre_parse.CATEGORY_NOT_WORD  # type: ignore[attr-defined]
_IN = sre_parse.IN  # type: ignore[attr-defined]
_LITERAL = sre_parse.LITERAL  # type: ignore[attr-defined]
_MAX_REPEAT = sre_parse.MAX_REPEAT  # type: ignore[attr-defined]
_MIN_REPEAT = sre_parse.MIN_REPEAT  # type: ignore[attr-defined]
_NEGATE = sre_parse.NEGATE  # type: ignore[attr-defined]
_RANGE = sre_parse.RANGE  # type: ignore[attr-defined]
_SUBPATTERN = sre_parse.SUBPATTERN  # type: ignore[attr-defined]
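
# Illustrative sketch, not part of the original module: the aliases above name
# the opcodes in the (op, args) tuples produced by `sre_parse.parse`, which is
# exactly what `gen_match_recursive` below walks. The helper name
# `_dump_parsed_pattern` is hypothetical.
def _dump_parsed_pattern(pattern: Union[str, bytes]) -> None:
  """Prints the parsed op stream, e.g. "a+b" yields a MAX_REPEAT then LITERAL 98."""
  for op, args in sre_parse.parse(pattern):
    print(op, args)
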

def to_correct_type(to_convert: Union[str, bytes],
                    return_type: Callable[[], AnyStr]) -> AnyStr:
  if return_type != str and return_type != bytes:
    raise TypeError(
        f"Expected `return_type` to be str or bytes, got {return_type}")
  if (isinstance(to_convert, bytes) and
      return_type == bytes) or (isinstance(to_convert, str) and
                                return_type == str):
    return to_convert
  elif isinstance(to_convert, bytes):
    # Decode instead of calling str() on the bytes object, which would yield
    # the repr "b'...'" rather than the underlying text.
    return str(to_convert, "utf-8")
  else:
    return bytes(to_convert, "utf-8")
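
# Illustrative usage sketch, not part of the original module: the generator
# below builds candidate characters as str and relies on this helper to land
# in the caller's pattern type. The helper name `_example_to_correct_type` is
# hypothetical.
def _example_to_correct_type() -> None:
  assert to_correct_type("a", str) == "a"
  assert to_correct_type("a", bytes) == b"a"
  # With the UTF-8 decode above, bytes input round-trips back to text.
  assert to_correct_type(b"a", str) == "a"
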

def gen_match_recursive(ops: Any,
                        return_type: Callable[[], AnyStr] = str,
                        respect_lookarounds: bool = False) -> AnyStr:
  """Returns a matching string given a regex expression."""
  # TODO(cffsmith): This generator is *not* feature complete.

  available_characters = set([chr(x) for x in range(0x20, 0x7e)] + ["\t", "\n"])

  literals = return_type()

  for tup in ops:
    if tup[0] == _LITERAL:
      val = tup[1]
      if return_type == str:
        literals += chr(val)

      elif return_type == bytes:
        # Endianness does not matter because there's just a single byte.
        literals += val.to_bytes(1, "big")
      else:
        raise TypeError(
            f"Expected return_type to be `str` or `bytes`, got {return_type}")

    elif tup[0] == _ANY:
      # Convert so that this also works when generating a bytes match.
      literals += to_correct_type("a", return_type)

    elif tup[0] == _BRANCH:
      # Just generate the first branch.
      literals += gen_match_recursive(tup[1][1][0], return_type)

    elif tup[0] == _NEGATE:
      sys.stderr.write("WARNING: We did not expect a NEGATE op here; is " +
                       "there an invalid RegEx somewhere?\n")

    elif tup[0] == _RANGE:
      literals += to_correct_type(chr(tup[1][1]), return_type)

    elif tup[0] == _IN:
      # Check if this class is negated.
      negated = tup[1][0][0] == _NEGATE
      # Take the first one that is actually in the class.
      if not negated:
        literals += gen_match_recursive([tup[1][0]], return_type)
      else:
        char_set = set()
        # Grab all literals and ranges excluded by this class.
        for t in tup[1][1:]:
          if t[0] == _LITERAL:
            char_set.add(chr(t[1]))
          elif t[0] == _RANGE:
            char_set |= set(chr(c) for c in range(t[1][0], t[1][1] + 1))
          else:
            sys.stderr.write("WARNING: Encountered non-literal in character " +
                             "class, cannot instrument RegEx!\n")
            continue
        allowed = available_characters - char_set
        if not allowed:
          sys.stderr.write("WARNING: This character set does not seem to " +
                           "allow any characters, cannot instrument RegEx!\n")
        else:
          literals += to_correct_type(list(allowed)[0], return_type)

    elif tup[0] == _SUBPATTERN:
      literals += gen_match_recursive(tup[1][3], return_type)

    elif tup[0] == _MAX_REPEAT or tup[0] == _MIN_REPEAT:
      # The minimum number of repetitions needed to fulfill the pattern.
      # This is the distinction between `*` and `+`, not the one between greedy
      # (the default) and non-greedy matching with `.*?`, which is what
      # _MAX_REPEAT vs _MIN_REPEAT encodes.
      minimum = tup[1][0]
      literals += gen_match_recursive(tup[1][2], return_type) * minimum

    elif tup[0] == _ASSERT_NOT:
      sys.stderr.write(
          "WARNING: Found negative lookahead or negative lookbehind, "
          "which are currently unsupported due to NP-completeness.\n")
    elif tup[0] == _ASSERT:
      if not respect_lookarounds:
        sys.stderr.write(
            "WARNING: Found lookahead or lookbehind in the middle of a regex, "
            "ignoring due to NP-completeness.\n")
        continue

      is_lookahead = tup[1][0] > 0
      is_beginning = ops.data.index(tup) == 0
      is_end = ops.data.index(tup) == len(ops) - 1
      if is_lookahead and is_end:
        literals += gen_match_recursive(tup[1][1], return_type)
      elif not is_lookahead and is_beginning:
        literals = gen_match_recursive(tup[1][1], return_type) + literals

    elif tup[0] == _CATEGORY:
      # For how each of these is encoded, see
      # https://github.com/python/cpython/blob/main/Lib/sre_parse.py#L42
      category = tup[1]
      # Start with a string; we'll do the type conversion later.
      ch = ""
      if category == _CATEGORY_DIGIT:
        ch = "0"
      elif category == _CATEGORY_NOT_DIGIT:
        ch = "a"
      elif category == _CATEGORY_SPACE:
        ch = " "
      elif category == _CATEGORY_NOT_SPACE:
        ch = "a"
      elif category == _CATEGORY_WORD:
        ch = "a"
      elif category == _CATEGORY_NOT_WORD:
        ch = " "
      else:
        sys.stderr.write("WARNING: Unsupported RegEx category, " +
                         "cannot instrument RegEx!\n")

      literals += to_correct_type(ch, return_type)

    else:
      sys.stderr.write(f"WARNING: Encountered non-handled RegEx op: {tup[0]}" +
                       ", cannot instrument RegEx\n")

  return literals
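
# Illustrative sketch, not part of the original module: a direct call on a
# parsed op stream. For a negated class the generator picks a printable
# character outside the class; for a branch it takes the first alternative.
# The helper name `_example_gen_match_recursive` is hypothetical.
def _example_gen_match_recursive() -> None:
  ops = sre_parse.parse(r"[^x]|y")
  candidate = gen_match_recursive(ops, str, respect_lookarounds=True)
  # Whichever single character was chosen, it must satisfy the pattern.
  assert re.fullmatch(r"[^x]|y", candidate) is not None
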

def gen_match(pattern: AnyStr) -> AnyStr:
  pat = sre_parse.parse(pattern)
  return gen_match_recursive(pat, type(pattern), respect_lookarounds=True)
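
# Illustrative sketch, not part of the original module: `gen_match` should
# produce an input that the given pattern accepts, for both str and bytes
# patterns. The helper name `_example_gen_match` is hypothetical and only
# spot-checks that round trip on a few simple patterns.
def _example_gen_match() -> None:
  for pattern in (r"abc", r"[0-9]+-[a-f]", r"foo(bar|baz)"):
    assert re.search(pattern, gen_match(pattern)) is not None
  assert re.search(b"ab+c", gen_match(b"ab+c")) is not None
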

def hook_re_module() -> None:
  """Adds Atheris instrumentation hooks to the `re` module."""
  pattern_gen_map = {}

  original_compile_func = re._compile  # type: ignore[attr-defined]

  def _compile_hook(pattern: AnyStr, flags: int) -> "AtherisPatternProxy":
    """Overrides re._compile."""

    generated: AnyStr  # pytype: disable=invalid-annotation # enable-bare-annotations
    if pattern not in pattern_gen_map:
      generated = gen_match(pattern)

      try:
        if original_compile_func(pattern, flags).search(generated) is None:
          sys.stderr.write(f"ERROR: generated match '{generated}' did not " +
                           f"match the RegEx pattern '{pattern}'!\n")
      except Exception as e:  # pylint: disable=broad-except
        sys.stderr.write("Could not check the generated match against the " +
                         f"RegEx pattern: {e}\n")
      pattern_gen_map[pattern] = generated

    else:
      generated = pattern_gen_map[pattern]

    # Create the `re.Pattern` object. We will wrap this in a proxy later on.
    re_object = original_compile_func(pattern, flags)

    # Return the wrapped `re.Pattern` object.
    return AtherisPatternProxy(re_object, generated)

  # Actually hook the `_compile` function now.
  # pylint: disable=protected-access
  re._compile = _compile_hook  # type: ignore[attr-defined]
  # pylint: enable=protected-access
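
# Illustrative sketch, not part of the original module: once `hook_re_module`
# has replaced `re._compile`, compiling a pattern hands back an
# `AtherisPatternProxy` that still behaves like the underlying `re.Pattern`.
# The helper name `_example_hook_re_module` is hypothetical.
def _example_hook_re_module() -> None:
  hook_re_module()
  proxy = re.compile(r"user-[0-9]+")
  assert isinstance(proxy, AtherisPatternProxy)
  # Plain attribute access falls through to the wrapped pattern object.
  assert proxy.pattern == r"user-[0-9]+"
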

class EnabledHooks:
  """Manages the set of enabled hooks."""

  def __init__(self) -> None:
    self._enabled_hooks: Set[str] = set()

  def add(self, hook: str) -> None:
    hook = hook.lower()
    if hook not in self._enabled_hooks:
      if hook == "regex":
        hook_re_module()
        self._enabled_hooks.add(hook)


enabled_hooks = EnabledHooks()
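
# Illustrative sketch, not part of the original module: the module-level
# `enabled_hooks` registry is how callers switch individual hooks on. Names
# are matched case-insensitively, and asking for an already-enabled hook is a
# no-op. The helper name `_example_enable_regex_hook` is hypothetical.
def _example_enable_regex_hook() -> None:
  enabled_hooks.add("RegEx")  # installs the `re` hook via hook_re_module()
  enabled_hooks.add("regex")  # already enabled, so nothing happens
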

class AtherisPatternProxy:
  """Proxy routing regex functions through Atheris tracing equivalents.

  This is a simple proxy where we can hook into various regex
  functions. This ensures that the tracing happens on each call to
  `match`, `search`, etc.

  This may be observable to users who call `compile` and then check
  whether the object is actually a `re.Pattern` object.

  Unfortunately, it is not possible to replace the functions on the
  `re.Pattern` object itself, as they are not writable. (One could try
  to bypass this, but it would require unsafe use of ctypes and would
  probably not be version-agnostic.)
  """

  # Importing at the top will not work. TODO(b/207008147): Why does it fail?
  # pylint: disable=g-import-not-at-top

  def __init__(self, re_obj: Pattern, generated: str) -> None:
    self.re_obj = re_obj
    self.generated = generated

  def search(self, string: str) -> Optional[Match[Any]]:
    from atheris import _trace_regex_match  # type: ignore[import]
    _trace_regex_match(self.generated, self.re_obj)
    return self.re_obj.search(string)

  def match(self, string: str) -> Optional[Match[Any]]:
    from atheris import _trace_regex_match  # type: ignore[import]
    _trace_regex_match(self.generated, self.re_obj)
    return self.re_obj.match(string)

  def fullmatch(self, string: str) -> Optional[Match[str]]:
    from atheris import _trace_regex_match  # type: ignore[import]
    _trace_regex_match(self.generated, self.re_obj)
    return self.re_obj.fullmatch(string)

  def findall(self, string: str) -> List[str]:
    from atheris import _trace_regex_match  # type: ignore[import]
    _trace_regex_match(self.generated, self.re_obj)
    return self.re_obj.findall(string)

  def finditer(self, string: str) -> Iterator[Match[str]]:
    from atheris import _trace_regex_match  # type: ignore[import]
    _trace_regex_match(self.generated, self.re_obj)
    return self.re_obj.finditer(string)

  def __getattr__(self, attr: str) -> Any:
    return getattr(self.re_obj, attr)

  # pylint: enable=g-import-not-at-top
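
# Illustrative sketch, not part of the original module: the proxy forwards any
# attribute it does not define to the wrapped `re.Pattern` via `__getattr__`,
# so things like `groups` and `sub` keep working even though only the matching
# entry points above are traced. The helper name
# `_example_pattern_proxy_delegation` is hypothetical.
def _example_pattern_proxy_delegation() -> None:
  compiled = re.compile(r"(a)(b)")  # may already be a proxy if `re` is hooked
  proxy = AtherisPatternProxy(compiled, "ab")
  assert proxy.groups == 2  # delegated to the wrapped pattern
  assert proxy.sub("x", "ab") == "x"  # also delegated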