Coverage Report

Created: 2025-07-04 06:49

/src/cpython/Python/suggestions.c
Line
Count
Source (jump to first uncovered line)
1
#include "Python.h"
2
#include "pycore_code.h"          // _PyCode_GetVarnames()
3
#include "pycore_frame.h"
4
#include "pycore_pyerrors.h"      // export _Py_UTF8_Edit_Cost()
5
#include "pycore_runtime.h"       // _Py_ID()
6
#include "pycore_unicodeobject.h" // _PyUnicode_Equal()
7
8
0
#define MAX_CANDIDATE_ITEMS 750
9
0
#define MAX_STRING_SIZE 40
10
11
0
#define MOVE_COST 2
12
0
#define CASE_COST 1
13
14
0
#define LEAST_FIVE_BITS(n) ((n) & 31)
15
16
static inline int
17
substitution_cost(char a, char b)
18
0
{
19
0
    if (LEAST_FIVE_BITS(a) != LEAST_FIVE_BITS(b)) {
20
        // Not the same, not a case flip.
21
0
        return MOVE_COST;
22
0
    }
23
0
    if (a == b) {
24
0
        return 0;
25
0
    }
26
0
    if ('A' <= a && a <= 'Z') {
27
0
        a += ('a' - 'A');
28
0
    }
29
0
    if ('A' <= b && b <= 'Z') {
30
0
        b += ('a' - 'A');
31
0
    }
32
0
    if (a == b) {
33
0
        return CASE_COST;
34
0
    }
35
0
    return MOVE_COST;
36
0
}
37
38
/* Calculate the Levenshtein distance between string1 and string2 */
39
static Py_ssize_t
40
levenshtein_distance(const char *a, size_t a_size,
41
                     const char *b, size_t b_size,
42
                     size_t max_cost, size_t *buffer)
43
0
{
44
    // Both strings are the same (by identity)
45
0
    if (a == b) {
46
0
        return 0;
47
0
    }
48
49
    // Trim away common affixes.
50
0
    while (a_size && b_size && a[0] == b[0]) {
51
0
        a++; a_size--;
52
0
        b++; b_size--;
53
0
    }
54
0
    while (a_size && b_size && a[a_size-1] == b[b_size-1]) {
55
0
        a_size--;
56
0
        b_size--;
57
0
    }
58
0
    if (a_size == 0 || b_size == 0) {
59
0
        return (a_size + b_size) * MOVE_COST;
60
0
    }
61
0
    if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) {
62
0
        return max_cost + 1;
63
0
    }
64
65
    // Prefer shorter buffer
66
0
    if (b_size < a_size) {
67
0
        const char *t = a; a = b; b = t;
68
0
        size_t t_size = a_size; a_size = b_size; b_size = t_size;
69
0
    }
70
71
    // quick fail when a match is impossible.
72
0
    if ((b_size - a_size) * MOVE_COST > max_cost) {
73
0
        return max_cost + 1;
74
0
    }
75
76
    // Instead of producing the whole traditional len(a)-by-len(b)
77
    // matrix, we can update just one row in place.
78
    // Initialize the buffer row
79
0
    size_t tmp = MOVE_COST;
80
0
    for (size_t i = 0; i < a_size; i++) {
81
        // cost from b[:0] to a[:i+1]
82
0
        buffer[i] = tmp;
83
0
        tmp += MOVE_COST;
84
0
    }
85
86
0
    size_t result = 0;
87
0
    for (size_t b_index = 0; b_index < b_size; b_index++) {
88
0
        char code = b[b_index];
89
        // cost(b[:b_index], a[:0]) == b_index * MOVE_COST
90
0
        size_t distance = result = b_index * MOVE_COST;
91
0
        size_t minimum = SIZE_MAX;
92
0
        for (size_t index = 0; index < a_size; index++) {
93
94
            // cost(b[:b_index+1], a[:index+1]) = min(
95
            //     // 1) substitute
96
            //     cost(b[:b_index], a[:index])
97
            //         + substitution_cost(b[b_index], a[index]),
98
            //     // 2) delete from b
99
            //     cost(b[:b_index], a[:index+1]) + MOVE_COST,
100
            //     // 3) delete from a
101
            //     cost(b[:b_index+1], a[index]) + MOVE_COST
102
            // )
103
104
            // 1) Previous distance in this row is cost(b[:b_index], a[:index])
105
0
            size_t substitute = distance + substitution_cost(code, a[index]);
106
            // 2) cost(b[:b_index], a[:index+1]) from previous row
107
0
            distance = buffer[index];
108
            // 3) existing result is cost(b[:b_index+1], a[index])
109
110
0
            size_t insert_delete = Py_MIN(result, distance) + MOVE_COST;
111
0
            result = Py_MIN(insert_delete, substitute);
112
113
            // cost(b[:b_index+1], a[:index+1])
114
0
            buffer[index] = result;
115
0
            if (result < minimum) {
116
0
                minimum = result;
117
0
            }
118
0
        }
119
0
        if (minimum > max_cost) {
120
            // Everything in this row is too big, so bail early.
121
0
            return max_cost + 1;
122
0
        }
123
0
    }
124
0
    return result;
125
0
}
126
127
PyObject *
128
_Py_CalculateSuggestions(PyObject *dir,
129
                      PyObject *name)
130
0
{
131
0
    assert(!PyErr_Occurred());
132
0
    assert(PyList_CheckExact(dir));
133
134
0
    Py_ssize_t dir_size = PyList_GET_SIZE(dir);
135
0
    if (dir_size >= MAX_CANDIDATE_ITEMS) {
136
0
        return NULL;
137
0
    }
138
139
0
    Py_ssize_t suggestion_distance = PY_SSIZE_T_MAX;
140
0
    PyObject *suggestion = NULL;
141
0
    Py_ssize_t name_size;
142
0
    const char *name_str = PyUnicode_AsUTF8AndSize(name, &name_size);
143
0
    if (name_str == NULL) {
144
0
        return NULL;
145
0
    }
146
0
    size_t *buffer = PyMem_New(size_t, MAX_STRING_SIZE);
147
0
    if (buffer == NULL) {
148
0
        return PyErr_NoMemory();
149
0
    }
150
0
    for (Py_ssize_t i = 0; i < dir_size; ++i) {
151
0
        PyObject *item = PyList_GET_ITEM(dir, i);
152
0
        if (_PyUnicode_Equal(name, item)) {
153
0
            continue;
154
0
        }
155
0
        Py_ssize_t item_size;
156
0
        const char *item_str = PyUnicode_AsUTF8AndSize(item, &item_size);
157
0
        if (item_str == NULL) {
158
0
            PyMem_Free(buffer);
159
0
            return NULL;
160
0
        }
161
        // No more than 1/3 of the involved characters should need changed.
162
0
        Py_ssize_t max_distance = (name_size + item_size + 3) * MOVE_COST / 6;
163
        // Don't take matches we've already beaten.
164
0
        max_distance = Py_MIN(max_distance, suggestion_distance - 1);
165
0
        Py_ssize_t current_distance =
166
0
            levenshtein_distance(name_str, name_size, item_str,
167
0
                                 item_size, max_distance, buffer);
168
0
        if (current_distance > max_distance) {
169
0
            continue;
170
0
        }
171
0
        if (!suggestion || current_distance < suggestion_distance) {
172
0
            suggestion = item;
173
0
            suggestion_distance = current_distance;
174
0
        }
175
0
    }
176
0
    PyMem_Free(buffer);
177
0
    return Py_XNewRef(suggestion);
178
0
}
179
180
Py_ssize_t
181
_Py_UTF8_Edit_Cost(PyObject *a, PyObject *b, Py_ssize_t max_cost)
182
0
{
183
0
    assert(PyUnicode_Check(a) && PyUnicode_Check(b));
184
0
    Py_ssize_t size_a, size_b;
185
0
    const char *utf8_a = PyUnicode_AsUTF8AndSize(a, &size_a);
186
0
    if (utf8_a == NULL) {
187
0
        return -1;
188
0
    }
189
0
    const char *utf8_b = PyUnicode_AsUTF8AndSize(b, &size_b);
190
0
    if (utf8_b == NULL) {
191
0
        return -1;
192
0
    }
193
0
    if (max_cost == -1) {
194
0
        max_cost = MOVE_COST * Py_MAX(size_a, size_b);
195
0
    }
196
0
    size_t *buffer = PyMem_New(size_t, MAX_STRING_SIZE);
197
0
    if (buffer == NULL) {
198
0
        PyErr_NoMemory();
199
0
        return -1;
200
0
    }
201
0
    Py_ssize_t res = levenshtein_distance(utf8_a, size_a,
202
0
                                    utf8_b, size_b, max_cost, buffer);
203
0
    PyMem_Free(buffer);
204
0
    return res;
205
0
}
206