/src/cpython/Python/suggestions.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "Python.h" |
2 | | #include "pycore_code.h" // _PyCode_GetVarnames() |
3 | | #include "pycore_frame.h" |
4 | | #include "pycore_pyerrors.h" // export _Py_UTF8_Edit_Cost() |
5 | | #include "pycore_runtime.h" // _Py_ID() |
6 | | #include "pycore_unicodeobject.h" // _PyUnicode_Equal() |
7 | | |
8 | 0 | #define MAX_CANDIDATE_ITEMS 750 |
9 | 0 | #define MAX_STRING_SIZE 40 |
10 | | |
11 | 0 | #define MOVE_COST 2 |
12 | 0 | #define CASE_COST 1 |
13 | | |
14 | 0 | #define LEAST_FIVE_BITS(n) ((n) & 31) |
15 | | |
16 | | static inline int |
17 | | substitution_cost(char a, char b) |
18 | 0 | { |
19 | 0 | if (LEAST_FIVE_BITS(a) != LEAST_FIVE_BITS(b)) { |
20 | | // Not the same, not a case flip. |
21 | 0 | return MOVE_COST; |
22 | 0 | } |
23 | 0 | if (a == b) { |
24 | 0 | return 0; |
25 | 0 | } |
26 | 0 | if ('A' <= a && a <= 'Z') { |
27 | 0 | a += ('a' - 'A'); |
28 | 0 | } |
29 | 0 | if ('A' <= b && b <= 'Z') { |
30 | 0 | b += ('a' - 'A'); |
31 | 0 | } |
32 | 0 | if (a == b) { |
33 | 0 | return CASE_COST; |
34 | 0 | } |
35 | 0 | return MOVE_COST; |
36 | 0 | } |
37 | | |
38 | | /* Calculate the Levenshtein distance between string1 and string2 */ |
39 | | static Py_ssize_t |
40 | | levenshtein_distance(const char *a, size_t a_size, |
41 | | const char *b, size_t b_size, |
42 | | size_t max_cost, size_t *buffer) |
43 | 0 | { |
44 | | // Both strings are the same (by identity) |
45 | 0 | if (a == b) { |
46 | 0 | return 0; |
47 | 0 | } |
48 | | |
49 | | // Trim away common affixes. |
50 | 0 | while (a_size && b_size && a[0] == b[0]) { |
51 | 0 | a++; a_size--; |
52 | 0 | b++; b_size--; |
53 | 0 | } |
54 | 0 | while (a_size && b_size && a[a_size-1] == b[b_size-1]) { |
55 | 0 | a_size--; |
56 | 0 | b_size--; |
57 | 0 | } |
58 | 0 | if (a_size == 0 || b_size == 0) { |
59 | 0 | return (a_size + b_size) * MOVE_COST; |
60 | 0 | } |
61 | 0 | if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) { |
62 | 0 | return max_cost + 1; |
63 | 0 | } |
64 | | |
65 | | // Prefer shorter buffer |
66 | 0 | if (b_size < a_size) { |
67 | 0 | const char *t = a; a = b; b = t; |
68 | 0 | size_t t_size = a_size; a_size = b_size; b_size = t_size; |
69 | 0 | } |
70 | | |
71 | | // quick fail when a match is impossible. |
72 | 0 | if ((b_size - a_size) * MOVE_COST > max_cost) { |
73 | 0 | return max_cost + 1; |
74 | 0 | } |
75 | | |
76 | | // Instead of producing the whole traditional len(a)-by-len(b) |
77 | | // matrix, we can update just one row in place. |
78 | | // Initialize the buffer row |
79 | 0 | size_t tmp = MOVE_COST; |
80 | 0 | for (size_t i = 0; i < a_size; i++) { |
81 | | // cost from b[:0] to a[:i+1] |
82 | 0 | buffer[i] = tmp; |
83 | 0 | tmp += MOVE_COST; |
84 | 0 | } |
85 | |
|
86 | 0 | size_t result = 0; |
87 | 0 | for (size_t b_index = 0; b_index < b_size; b_index++) { |
88 | 0 | char code = b[b_index]; |
89 | | // cost(b[:b_index], a[:0]) == b_index * MOVE_COST |
90 | 0 | size_t distance = result = b_index * MOVE_COST; |
91 | 0 | size_t minimum = SIZE_MAX; |
92 | 0 | for (size_t index = 0; index < a_size; index++) { |
93 | | |
94 | | // cost(b[:b_index+1], a[:index+1]) = min( |
95 | | // // 1) substitute |
96 | | // cost(b[:b_index], a[:index]) |
97 | | // + substitution_cost(b[b_index], a[index]), |
98 | | // // 2) delete from b |
99 | | // cost(b[:b_index], a[:index+1]) + MOVE_COST, |
100 | | // // 3) delete from a |
101 | | // cost(b[:b_index+1], a[index]) + MOVE_COST |
102 | | // ) |
103 | | |
104 | | // 1) Previous distance in this row is cost(b[:b_index], a[:index]) |
105 | 0 | size_t substitute = distance + substitution_cost(code, a[index]); |
106 | | // 2) cost(b[:b_index], a[:index+1]) from previous row |
107 | 0 | distance = buffer[index]; |
108 | | // 3) existing result is cost(b[:b_index+1], a[index]) |
109 | |
|
110 | 0 | size_t insert_delete = Py_MIN(result, distance) + MOVE_COST; |
111 | 0 | result = Py_MIN(insert_delete, substitute); |
112 | | |
113 | | // cost(b[:b_index+1], a[:index+1]) |
114 | 0 | buffer[index] = result; |
115 | 0 | if (result < minimum) { |
116 | 0 | minimum = result; |
117 | 0 | } |
118 | 0 | } |
119 | 0 | if (minimum > max_cost) { |
120 | | // Everything in this row is too big, so bail early. |
121 | 0 | return max_cost + 1; |
122 | 0 | } |
123 | 0 | } |
124 | 0 | return result; |
125 | 0 | } |
126 | | |
127 | | PyObject * |
128 | | _Py_CalculateSuggestions(PyObject *dir, |
129 | | PyObject *name) |
130 | 0 | { |
131 | 0 | assert(!PyErr_Occurred()); |
132 | 0 | assert(PyList_CheckExact(dir)); |
133 | |
|
134 | 0 | Py_ssize_t dir_size = PyList_GET_SIZE(dir); |
135 | 0 | if (dir_size >= MAX_CANDIDATE_ITEMS) { |
136 | 0 | return NULL; |
137 | 0 | } |
138 | | |
139 | 0 | Py_ssize_t suggestion_distance = PY_SSIZE_T_MAX; |
140 | 0 | PyObject *suggestion = NULL; |
141 | 0 | Py_ssize_t name_size; |
142 | 0 | const char *name_str = PyUnicode_AsUTF8AndSize(name, &name_size); |
143 | 0 | if (name_str == NULL) { |
144 | 0 | return NULL; |
145 | 0 | } |
146 | 0 | size_t *buffer = PyMem_New(size_t, MAX_STRING_SIZE); |
147 | 0 | if (buffer == NULL) { |
148 | 0 | return PyErr_NoMemory(); |
149 | 0 | } |
150 | 0 | for (Py_ssize_t i = 0; i < dir_size; ++i) { |
151 | 0 | PyObject *item = PyList_GET_ITEM(dir, i); |
152 | 0 | if (_PyUnicode_Equal(name, item)) { |
153 | 0 | continue; |
154 | 0 | } |
155 | 0 | Py_ssize_t item_size; |
156 | 0 | const char *item_str = PyUnicode_AsUTF8AndSize(item, &item_size); |
157 | 0 | if (item_str == NULL) { |
158 | 0 | PyMem_Free(buffer); |
159 | 0 | return NULL; |
160 | 0 | } |
161 | | // No more than 1/3 of the involved characters should need changed. |
162 | 0 | Py_ssize_t max_distance = (name_size + item_size + 3) * MOVE_COST / 6; |
163 | | // Don't take matches we've already beaten. |
164 | 0 | max_distance = Py_MIN(max_distance, suggestion_distance - 1); |
165 | 0 | Py_ssize_t current_distance = |
166 | 0 | levenshtein_distance(name_str, name_size, item_str, |
167 | 0 | item_size, max_distance, buffer); |
168 | 0 | if (current_distance > max_distance) { |
169 | 0 | continue; |
170 | 0 | } |
171 | 0 | if (!suggestion || current_distance < suggestion_distance) { |
172 | 0 | suggestion = item; |
173 | 0 | suggestion_distance = current_distance; |
174 | 0 | } |
175 | 0 | } |
176 | 0 | PyMem_Free(buffer); |
177 | 0 | return Py_XNewRef(suggestion); |
178 | 0 | } |
179 | | |
180 | | Py_ssize_t |
181 | | _Py_UTF8_Edit_Cost(PyObject *a, PyObject *b, Py_ssize_t max_cost) |
182 | 0 | { |
183 | 0 | assert(PyUnicode_Check(a) && PyUnicode_Check(b)); |
184 | 0 | Py_ssize_t size_a, size_b; |
185 | 0 | const char *utf8_a = PyUnicode_AsUTF8AndSize(a, &size_a); |
186 | 0 | if (utf8_a == NULL) { |
187 | 0 | return -1; |
188 | 0 | } |
189 | 0 | const char *utf8_b = PyUnicode_AsUTF8AndSize(b, &size_b); |
190 | 0 | if (utf8_b == NULL) { |
191 | 0 | return -1; |
192 | 0 | } |
193 | 0 | if (max_cost == -1) { |
194 | 0 | max_cost = MOVE_COST * Py_MAX(size_a, size_b); |
195 | 0 | } |
196 | 0 | size_t *buffer = PyMem_New(size_t, MAX_STRING_SIZE); |
197 | 0 | if (buffer == NULL) { |
198 | 0 | PyErr_NoMemory(); |
199 | 0 | return -1; |
200 | 0 | } |
201 | 0 | Py_ssize_t res = levenshtein_distance(utf8_a, size_a, |
202 | 0 | utf8_b, size_b, max_cost, buffer); |
203 | 0 | PyMem_Free(buffer); |
204 | 0 | return res; |
205 | 0 | } |
206 | | |