
1""" 

2:func:`~pandas.eval` source string parsing functions 

3""" 

4from __future__ import annotations 

5 

6from io import StringIO 

7from keyword import iskeyword 

8import token 

9import tokenize 

10from typing import ( 

11 Hashable, 

12 Iterator, 

13) 

14 

15# A token value Python's tokenizer probably will never use. 

16BACKTICK_QUOTED_STRING = 100 

17 

18 

def create_valid_python_identifier(name: str) -> str:
    """
    Create a valid Python identifier from any string.

    Check if the name contains any special characters. If it does, the
    special characters are replaced by a special string and a prefix is
    added.

    Raises
    ------
    SyntaxError
        If the returned name is not a valid Python identifier, raise an
        exception. This can happen if there is a hashtag in the name, as
        the tokenizer will then terminate and not find the backtick.
        It also happens for characters that fall outside the range
        (U+0001..U+007F).
    """
    if name.isidentifier() and not iskeyword(name):
        return name

    # Create a dict with the special characters and their replacement string.
    # EXACT_TOKEN_TYPES contains these special characters.
    # token.tok_name contains a readable description of the replacement string.
    special_characters_replacements = {
        char: f"_{token.tok_name[tokval]}_"
        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()
    }
    special_characters_replacements.update(
        {
            " ": "_",
            "?": "_QUESTIONMARK_",
            "!": "_EXCLAMATIONMARK_",
            "$": "_DOLLARSIGN_",
            "€": "_EUROSIGN_",
            "°": "_DEGREESIGN_",
            # Including quotes works, but there are exceptions.
            "'": "_SINGLEQUOTE_",
            '"': "_DOUBLEQUOTE_",
            # Currently not possible. Terminates the parser and won't find
            # the backtick.
            # "#": "_HASH_",
        }
    )

    name = "".join([special_characters_replacements.get(char, char) for char in name])
    name = f"BACKTICK_QUOTED_STRING_{name}"

    if not name.isidentifier():
        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

    return name
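
# Illustrative sketch (not part of the original module): how the mangling
# behaves for a few names, given the replacement table above. Outputs were
# derived from the code and assume CPython 3.8.
#
#   >>> create_valid_python_identifier("valid_name")
#   'valid_name'
#   >>> create_valid_python_identifier("my column!")
#   'BACKTICK_QUOTED_STRING_my_column_EXCLAMATIONMARK_'
#   >>> create_valid_python_identifier("a+b")
#   'BACKTICK_QUOTED_STRING_a_PLUS_b'
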

def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Clean up a column name if surrounded by backticks.

    Backtick quoted strings are indicated by a certain tokval value. If a
    string is a backtick quoted token it will be processed by
    :func:`create_valid_python_identifier` so that the parser can find this
    string when the query is executed.
    In this case the tok will get the NAME tokval.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tok : Tuple[int, str]
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    if toknum == BACKTICK_QUOTED_STRING:
        return tokenize.NAME, create_valid_python_identifier(tokval)
    return toknum, tokval
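
# Illustrative sketch (not part of the original module): tokens tagged with
# the sentinel BACKTICK_QUOTED_STRING value are rewritten into NAME tokens;
# everything else passes through unchanged. Token numbers (NAME == 1,
# NUMBER == 2) are as in CPython 3.8.
#
#   >>> clean_backtick_quoted_toks((BACKTICK_QUOTED_STRING, "my column"))
#   (1, 'BACKTICK_QUOTED_STRING_my_column')
#   >>> clean_backtick_quoted_toks((tokenize.NUMBER, "42"))
#   (2, '42')
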

def clean_column_name(name: Hashable) -> Hashable:
    """
    Function to emulate the cleaning of a backtick quoted name.

    The purpose of this function is to see what happens to the name of an
    identifier if it goes through the process of being parsed as Python code
    inside a backtick quoted string and then being cleaned
    (removed of any special characters).

    Parameters
    ----------
    name : hashable
        Name to be cleaned.

    Returns
    -------
    name : hashable
        Returns the name after tokenizing and cleaning.

    Notes
    -----
    In some cases, a name cannot be converted to a valid Python identifier.
    In that case :func:`tokenize_string` raises a SyntaxError, and we just
    return the name unmodified.

    If this name was used in the query string (this makes the query call
    impossible), an error will be raised by
    :func:`tokenize_backtick_quoted_string` instead, which is not caught and
    propagates to the user level.
    """
    try:
        tokenized = tokenize_string(f"`{name}`")
        tokval = next(tokenized)[1]
        return create_valid_python_identifier(tokval)
    except SyntaxError:
        return name
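
# Illustrative sketch (not part of the original module): a name with special
# characters is mangled, while a name the tokenizer cannot handle (such as one
# containing '#', which starts a comment and hides the closing backtick) is
# returned unchanged because the SyntaxError is swallowed.
#
#   >>> clean_column_name("total $")
#   'BACKTICK_QUOTED_STRING_total__DOLLARSIGN_'
#   >>> clean_column_name("a # b")
#   'a # b'
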

def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Creates a token from a backtick quoted string.

    Moves the token_generator forwards till right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string (Tuple[int, str]).
        The generator is at the first token after the backtick (`)

    source : str
        The Python source code string.

    string_start : int
        This is the start of the backtick quoted string inside the source string.

    Returns
    -------
    tok : Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            string_end = start[1]
            break

    # If no closing backtick is found, string_end stays unbound here and the
    # resulting error is converted into a SyntaxError by the caller.
    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
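
# Illustrative sketch (not part of the original module): driving the helper by
# hand with a token generator positioned just past the opening backtick.
#
#   >>> gen = tokenize.generate_tokens(StringIO("`my col`").readline)
#   >>> next(gen)[1]  # consume the opening backtick (an ERRORTOKEN)
#   '`'
#   >>> tokenize_backtick_quoted_string(gen, "`my col`", string_start=1)
#   (100, 'my col')
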

def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
    """
    line_reader = StringIO(source).readline
    token_generator = tokenize.generate_tokens(line_reader)

    # Loop over all tokens till a backtick (`) is found.
    # Then, take all tokens till the next backtick to form a backtick quoted string.
    for toknum, tokval, start, _, _ in token_generator:
        if tokval == "`":
            try:
                yield tokenize_backtick_quoted_string(
                    token_generator, source, string_start=start[1] + 1
                )
            except Exception as err:
                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
        else:
            yield toknum, tokval
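
# Illustrative sketch (not part of the original module): tokenizing a query
# string with a backtick quoted column name. The quoted part surfaces as a
# single token carrying the sentinel value 100.
#
#   >>> next(tokenize_string("`my col` > 5"))
#   (100, 'my col')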