Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/computation/parsing.py: 28%



1""" 

2:func:`~pandas.eval` source string parsing functions 

3""" 

4from __future__ import annotations 

5 

6from io import StringIO 

7from keyword import iskeyword 

8import token 

9import tokenize 

10from typing import TYPE_CHECKING 

11 

12if TYPE_CHECKING: 

13 from collections.abc import ( 

14 Hashable, 

15 Iterator, 

16 ) 

17 

18# A token value Python's tokenizer probably will never use. 

19BACKTICK_QUOTED_STRING = 100 

20 

21 

def create_valid_python_identifier(name: str) -> str:
    """
    Create a valid Python identifier from any string.

    Check if the name contains any special characters. If it does, the
    special characters are replaced by a special string and a prefix is
    added.

    Raises
    ------
    SyntaxError
        If the returned name is not a valid Python identifier, raise an
        exception. This can happen if there is a hashtag in the name, as
        the tokenizer will then terminate and not find the backtick. It
        can also happen for characters that fall outside the range
        (U+0001..U+007F).
    """
    if name.isidentifier() and not iskeyword(name):
        return name

    # Create a dict with the special characters and their replacement strings.
    # tokenize.EXACT_TOKEN_TYPES contains these special characters;
    # token.tok_name contains a readable description of each replacement.
    special_characters_replacements = {
        char: f"_{token.tok_name[tokval]}_"
        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()
    }
    special_characters_replacements.update(
        {
            " ": "_",
            "?": "_QUESTIONMARK_",
            "!": "_EXCLAMATIONMARK_",
            "$": "_DOLLARSIGN_",
            "€": "_EUROSIGN_",
            "°": "_DEGREESIGN_",
            # Including quotes works, but there are exceptions.
            "'": "_SINGLEQUOTE_",
            '"': "_DOUBLEQUOTE_",
            # Currently not possible. Terminates the parser, which then
            # won't find the closing backtick.
            # "#": "_HASH_",
        }
    )

    name = "".join([special_characters_replacements.get(char, char) for char in name])
    name = f"BACKTICK_QUOTED_STRING_{name}"

    if not name.isidentifier():
        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

    return name
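# Illustrative example, not part of the original module: spaces collapse to a
# single underscore and punctuation is spelled out via the replacement table
# above, so a column name like "my col!" becomes a legal identifier.
#
#     >>> create_valid_python_identifier("my col!")
#     'BACKTICK_QUOTED_STRING_my_col_EXCLAMATIONMARK_'
#     >>> create_valid_python_identifier("a+b")  # "+" comes from EXACT_TOKEN_TYPES
#     'BACKTICK_QUOTED_STRING_a_PLUS_b'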

def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Clean up a column name if surrounded by backticks.

    Backtick quoted strings are indicated by a certain tokval value. If a
    string is a backtick quoted token, it is processed by
    :func:`create_valid_python_identifier` so that the parser can find this
    string when the query is executed. In this case the tok gets the NAME
    tokval.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tok : Tuple[int, str]
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    if toknum == BACKTICK_QUOTED_STRING:
        return tokenize.NAME, create_valid_python_identifier(tokval)
    return toknum, tokval
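# Illustrative example, not part of the original module: a backtick-quoted
# token is rewritten to a NAME token (tokenize.NAME == 1); any other token
# passes through unchanged.
#
#     >>> clean_backtick_quoted_toks((BACKTICK_QUOTED_STRING, "my col!"))
#     (1, 'BACKTICK_QUOTED_STRING_my_col_EXCLAMATIONMARK_')
#     >>> clean_backtick_quoted_toks((tokenize.NUMBER, "5"))
#     (2, '5')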

def clean_column_name(name: Hashable) -> Hashable:
    """
    Function to emulate the cleaning of a backtick quoted name.

    The purpose of this function is to see what happens to the name of an
    identifier if it goes through the process of being parsed as Python code
    inside a backtick quoted string and then being cleaned
    (stripped of any special characters).

    Parameters
    ----------
    name : hashable
        Name to be cleaned.

    Returns
    -------
    name : hashable
        Returns the name after tokenizing and cleaning.

    Notes
    -----
    For some cases, a name cannot be converted to a valid Python identifier.
    In that case :func:`tokenize_string` raises a SyntaxError and we just
    return the name unmodified.

    If such a name is used in the query string (which makes the query call
    impossible), an error is raised by :func:`tokenize_backtick_quoted_string`
    instead, which is not caught and propagates to the user level.
    """
    try:
        tokenized = tokenize_string(f"`{name}`")
        tokval = next(tokenized)[1]
        return create_valid_python_identifier(tokval)
    except SyntaxError:
        return name
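# Illustrative example, not part of the original module: a "#" in a column
# name cannot be tokenized (the rest of the line becomes a comment, so the
# closing backtick is never seen), and the resulting SyntaxError is swallowed
# so the name comes back unchanged.
#
#     >>> clean_column_name("my col!")
#     'BACKTICK_QUOTED_STRING_my_col_EXCLAMATIONMARK_'
#     >>> clean_column_name("a#b")
#     'a#b'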

def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Create a token from a backtick quoted string.

    Moves the token_generator forward until right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string (Tuple[int, str]).
        The generator is at the first token after the opening backtick (`).

    source : str
        The Python source code string.

    string_start : int
        The start of the backtick quoted string inside the source string.

    Returns
    -------
    tok : Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            string_end = start[1]
            break

    # If no closing backtick is found, string_end is never bound and the
    # line below raises; the caller converts that into a SyntaxError.
    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
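# Illustrative sketch, not part of the original module: driving the helper by
# hand with a generator positioned just past an opening backtick. The opening
# backtick comes through as an ERRORTOKEN, which we consume first.
#
#     >>> gen = tokenize.generate_tokens(StringIO("`my col` > 5").readline)
#     >>> next(gen)[1]  # consume the opening backtick
#     '`'
#     >>> tokenize_backtick_quoted_string(gen, "`my col` > 5", string_start=1)
#     (100, 'my col')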

def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval
        (Tuple[int, str]).
    """
    line_reader = StringIO(source).readline
    token_generator = tokenize.generate_tokens(line_reader)

    # Loop over all tokens until a backtick (`) is found.
    # Then, take all tokens until the next backtick to form a backtick
    # quoted string.
    for toknum, tokval, start, _, _ in token_generator:
        if tokval == "`":
            try:
                yield tokenize_backtick_quoted_string(
                    token_generator, source, string_start=start[1] + 1
                )
            except Exception as err:
                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
        else:
            yield toknum, tokval
198 yield toknum, tokval