1"""
2:func:`~pandas.eval` source string parsing functions
3"""
4from __future__ import annotations
5
6from io import StringIO
7from keyword import iskeyword
8import token
9import tokenize
10from typing import (
11 Hashable,
12 Iterator,
13)
14
15# A token value Python's tokenizer probably will never use.
16BACKTICK_QUOTED_STRING = 100
17
18
19def create_valid_python_identifier(name: str) -> str:
20 """
21 Create valid Python identifiers from any string.
22
23 Check if name contains any special characters. If it contains any
24 special characters, the special characters will be replaced by
25 a special string and a prefix is added.
26
27 Raises
28 ------
29 SyntaxError
30 If the returned name is not a Python valid identifier, raise an exception.
31 This can happen if there is a hashtag in the name, as the tokenizer will
32 than terminate and not find the backtick.
33 But also for characters that fall out of the range of (U+0001..U+007F).
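
    Examples
    --------
    A sketch of the expected mangling; the replacement strings come from the
    mappings built in the function body below (e.g. ``'`` maps to
    ``_SINGLEQUOTE_`` and a space maps to ``_``):

    >>> create_valid_python_identifier("already_valid")
    'already_valid'
    >>> create_valid_python_identifier("1st place")
    'BACKTICK_QUOTED_STRING_1st_place'
    >>> create_valid_python_identifier("it's")
    'BACKTICK_QUOTED_STRING_it_SINGLEQUOTE_s'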
    """
    if name.isidentifier() and not iskeyword(name):
        return name

    # Create a dict with the special characters and their replacement string.
    # EXACT_TOKEN_TYPES contains these special characters;
    # token.tok_name contains a readable description of the replacement string.
    special_characters_replacements = {
        char: f"_{token.tok_name[tokval]}_"
        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()
    }
    special_characters_replacements.update(
        {
            " ": "_",
            "?": "_QUESTIONMARK_",
            "!": "_EXCLAMATIONMARK_",
            "$": "_DOLLARSIGN_",
            "€": "_EUROSIGN_",
            "°": "_DEGREESIGN_",
            # Including quotes works, but there are exceptions.
            "'": "_SINGLEQUOTE_",
            '"': "_DOUBLEQUOTE_",
            # Currently not possible. Terminates parser and won't find backtick.
            # "#": "_HASH_",
        }
    )

    name = "".join([special_characters_replacements.get(char, char) for char in name])
    name = f"BACKTICK_QUOTED_STRING_{name}"

    if not name.isidentifier():
        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

    return name


def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Clean up a column name if surrounded by backticks.

    Backtick quoted strings are indicated by a certain toknum value. If a
    token is such a backtick quoted string, it is processed by
    :func:`create_valid_python_identifier` so that the parser can find this
    string when the query is executed.
    In this case the token gets the NAME toknum.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tok : Tuple[int, str]
        Either the input token or the replacement values
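
    Examples
    --------
    A sketch of both branches; in CPython ``tokenize.NAME`` is ``1``, and
    ``BACKTICK_QUOTED_STRING`` is defined above as ``100``:

    >>> clean_backtick_quoted_toks((BACKTICK_QUOTED_STRING, "col name"))
    (1, 'BACKTICK_QUOTED_STRING_col_name')
    >>> clean_backtick_quoted_toks((tokenize.NAME, "col_name"))
    (1, 'col_name')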
    """
    toknum, tokval = tok
    if toknum == BACKTICK_QUOTED_STRING:
        return tokenize.NAME, create_valid_python_identifier(tokval)
    return toknum, tokval


def clean_column_name(name: Hashable) -> Hashable:
    """
    Function to emulate the cleaning of a backtick quoted name.

    The purpose of this function is to see what happens to the name of an
    identifier when it goes through the process of being parsed as Python
    code inside a backtick quoted string and is then cleaned (i.e. has its
    special characters removed).

    Parameters
    ----------
    name : hashable
        Name to be cleaned.

    Returns
    -------
    name : hashable
        Returns the name after tokenizing and cleaning.

    Notes
    -----
    In some cases, a name cannot be converted to a valid Python identifier.
    In those cases :func:`tokenize_string` raises a SyntaxError and the name
    is returned unmodified.

    If such a name is used in the query string (which makes the query call
    impossible), an error is raised by :func:`tokenize_backtick_quoted_string`
    instead; that error is not caught and propagates to the user level.
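
    Examples
    --------
    A sketch of the expected round trip (the exact handling of backticks by
    the tokenizer can differ between Python versions):

    >>> clean_column_name("col name")
    'BACKTICK_QUOTED_STRING_col_name'
    >>> clean_column_name("col_name")
    'col_name'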
    """
    try:
        tokenized = tokenize_string(f"`{name}`")
        tokval = next(tokenized)[1]
        return create_valid_python_identifier(tokval)
    except SyntaxError:
        return name


def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Create a token from a backtick quoted string.

    Moves the token_generator forward to just after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string (Tuple[int, str]).
        The generator is at the first token after the opening backtick (`).

    source : str
        The Python source code string.

    string_start : int
        The start of the backtick quoted string inside the source string.

    Returns
    -------
    tok : Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).
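
    Examples
    --------
    A minimal sketch: advance a raw token generator past the opening backtick
    by hand, then let this function collect the quoted name (the exact
    handling of backticks by the tokenizer can differ between Python
    versions):

    >>> from io import StringIO
    >>> source = "`col name` + 1"
    >>> gen = tokenize.generate_tokens(StringIO(source).readline)
    >>> next(gen)[1]  # Consume the opening backtick.
    '`'
    >>> tokenize_backtick_quoted_string(gen, source, string_start=1)
    (100, 'col name')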
    """
    # Scan forward until the closing backtick; its column marks the end of
    # the backtick quoted string inside the source.
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            string_end = start[1]
            break

    return BACKTICK_QUOTED_STRING, source[string_start:string_end]


def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval
        (Tuple[int, str]).
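
    Examples
    --------
    A sketch of the expected behavior: the backtick quoted name is collapsed
    into a single token whose toknum is BACKTICK_QUOTED_STRING (100):

    >>> next(tokenize_string("`col name` + 1"))
    (100, 'col name')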
    """
    line_reader = StringIO(source).readline
    token_generator = tokenize.generate_tokens(line_reader)

    # Loop over all tokens until a backtick (`) is found.
    # Then, take all tokens until the next backtick to form a backtick
    # quoted string.
    for toknum, tokval, start, _, _ in token_generator:
        if tokval == "`":
            try:
                yield tokenize_backtick_quoted_string(
                    token_generator, source, string_start=start[1] + 1
                )
            except Exception as err:
                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
        else:
            yield toknum, tokval