/src/gdal/port/cpl_levenshtein.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // SPDX-License-Identifier: MIT |
2 | | // Copyright (c) 2019, Guilherme Agostinelli |
3 | | // Ported from https://github.com/guilhermeagostinelli/levenshtein/blob/master/levenshtein.cpp |
4 | | |
5 | | #include "cpl_levenshtein.h" |
6 | | |
7 | | #include <algorithm> |
8 | | #include <cstring> |
9 | | #include <limits> |
10 | | #include <vector> |
11 | | |
12 | | /** Computes the Levenshtein distance between 2 words. |
13 | | * |
14 | | * If transpositionAllowed = true, then it is the Damerau-Levenshtein distance |
15 | | * |
16 | | * Return SIZE_T_MAX in case of error. |
17 | | */ |
18 | | size_t CPLLevenshteinDistance(const char *word1, const char *word2, |
19 | | bool transpositionAllowed) |
20 | 0 | { |
21 | 0 | const size_t size1 = strlen(word1); |
22 | 0 | const size_t size2 = strlen(word2); |
23 | | |
24 | | // If one of the words has zero length, the distance is equal to the size of the other word. |
25 | 0 | if (size1 == 0) |
26 | 0 | return size2; |
27 | 0 | if (size2 == 0) |
28 | 0 | return size1; |
29 | | |
30 | | // Would cause enormous amount of memory to be allocated |
31 | 0 | if (size1 >= 32768 || size2 >= 32768) |
32 | 0 | { |
33 | 0 | return strcmp(word1, word2) == 0 ? 0 |
34 | 0 | : std::numeric_limits<size_t>::max(); |
35 | 0 | } |
36 | | |
37 | | // Verification matrix i.e. 2D array which will store the calculated distance. |
38 | 0 | const size_t dimFastSize = size2 + 1; |
39 | 0 | std::vector<unsigned short> verif; |
40 | 0 | try |
41 | 0 | { |
42 | 0 | verif.resize((size1 + 1) * dimFastSize); |
43 | 0 | } |
44 | 0 | catch (const std::exception &) |
45 | 0 | { |
46 | 0 | return std::numeric_limits<size_t>::max(); |
47 | 0 | } |
48 | | |
49 | 0 | const auto verifVal = [&verif, dimFastSize](size_t i, |
50 | 0 | size_t j) -> unsigned short & |
51 | 0 | { return verif[i * dimFastSize + j]; }; |
52 | | |
53 | | // Sets the first row and the first column of the verification matrix with the numerical order from 0 to the length of each word. |
54 | 0 | for (size_t i = 0; i <= size1; i++) |
55 | 0 | verifVal(i, 0) = static_cast<unsigned short>(i); |
56 | 0 | for (size_t j = 0; j <= size2; j++) |
57 | 0 | verifVal(0, j) = static_cast<unsigned short>(j); |
58 | | |
59 | | // Verification step / matrix filling. |
60 | 0 | for (size_t i = 1; i <= size1; i++) |
61 | 0 | { |
62 | 0 | for (size_t j = 1; j <= size2; j++) |
63 | 0 | { |
64 | | // Sets the modification cost. |
65 | | // 0 means no modification (i.e. equal letters) and 1 means that a modification is needed (i.e. unequal letters). |
66 | 0 | const int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1; |
67 | | |
68 | | // Sets the current position of the matrix as the minimum value between a (deletion), b (insertion) and c (substitution). |
69 | | // a = the upper adjacent value plus 1: verif[i - 1][j] + 1 |
70 | | // b = the left adjacent value plus 1: verif[i][j - 1] + 1 |
71 | | // c = the upper left adjacent value plus the modification cost: verif[i - 1][j - 1] + cost |
72 | 0 | verifVal(i, j) = std::min( |
73 | 0 | std::min(static_cast<unsigned short>(verifVal(i - 1, j) + 1), |
74 | 0 | static_cast<unsigned short>(verifVal(i, j - 1) + 1)), |
75 | 0 | static_cast<unsigned short>(verifVal(i - 1, j - 1) + cost)); |
76 | | |
77 | | // Cf https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance |
78 | | // (note that in the Wikipedia page, a[] and b[] are indexed from 1... |
79 | 0 | if (transpositionAllowed && i > 1 && j > 1 && |
80 | 0 | word1[i - 1] == word2[j - 2] && word1[i - 2] == word2[j - 1]) |
81 | 0 | { |
82 | 0 | verifVal(i, j) = std::min( |
83 | 0 | verifVal(i, j), |
84 | 0 | static_cast<unsigned short>(verifVal(i - 2, j - 2) + 1)); |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } |
88 | | |
89 | | // The last position of the matrix will contain the Levenshtein distance. |
90 | 0 | return verifVal(size1, size2); |
91 | 0 | } |