/src/gdal/port/cpl_levenshtein.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // SPDX-License-Identifier: MIT  | 
2  |  | // Copyright (c) 2019, Guilherme Agostinelli  | 
3  |  | // Ported from https://github.com/guilhermeagostinelli/levenshtein/blob/master/levenshtein.cpp  | 
4  |  |  | 
5  |  | #include "cpl_levenshtein.h"  | 
6  |  |  | 
7  |  | #include <algorithm>  | 
8  |  | #include <cstring>  | 
9  |  | #include <limits>  | 
10  |  | #include <vector>  | 
11  |  |  | 
12  |  | /** Computes the Levenshtein distance between 2 words.  | 
13  |  |  *  | 
14  |  |  * If transpositionAllowed = true, then it is the Damerau-Levenshtein distance  | 
15  |  |  *  | 
16  |  |  * Return SIZE_T_MAX in case of error.  | 
17  |  |  */  | 
18  |  | size_t CPLLevenshteinDistance(const char *word1, const char *word2,  | 
19  |  |                               bool transpositionAllowed)  | 
20  | 0  | { | 
21  | 0  |     const size_t size1 = strlen(word1);  | 
22  | 0  |     const size_t size2 = strlen(word2);  | 
23  |  |  | 
24  |  |     // If one of the words has zero length, the distance is equal to the size of the other word.  | 
25  | 0  |     if (size1 == 0)  | 
26  | 0  |         return size2;  | 
27  | 0  |     if (size2 == 0)  | 
28  | 0  |         return size1;  | 
29  |  |  | 
30  |  |     // Would cause enormous amount of memory to be allocated  | 
31  | 0  |     if (size1 >= 32768 || size2 >= 32768)  | 
32  | 0  |     { | 
33  | 0  |         return strcmp(word1, word2) == 0 ? 0  | 
34  | 0  |                                          : std::numeric_limits<size_t>::max();  | 
35  | 0  |     }  | 
36  |  |  | 
37  |  |     // Verification matrix i.e. 2D array which will store the calculated distance.  | 
38  | 0  |     const size_t dimFastSize = size2 + 1;  | 
39  | 0  |     std::vector<unsigned short> verif;  | 
40  | 0  |     try  | 
41  | 0  |     { | 
42  | 0  |         verif.resize((size1 + 1) * dimFastSize);  | 
43  | 0  |     }  | 
44  | 0  |     catch (const std::exception &)  | 
45  | 0  |     { | 
46  | 0  |         return std::numeric_limits<size_t>::max();  | 
47  | 0  |     }  | 
48  |  |  | 
49  | 0  |     const auto verifVal = [&verif, dimFastSize](size_t i,  | 
50  | 0  |                                                 size_t j) -> unsigned short &  | 
51  | 0  |     { return verif[i * dimFastSize + j]; }; | 
52  |  |  | 
53  |  |     // Sets the first row and the first column of the verification matrix with the numerical order from 0 to the length of each word.  | 
54  | 0  |     for (size_t i = 0; i <= size1; i++)  | 
55  | 0  |         verifVal(i, 0) = static_cast<unsigned short>(i);  | 
56  | 0  |     for (size_t j = 0; j <= size2; j++)  | 
57  | 0  |         verifVal(0, j) = static_cast<unsigned short>(j);  | 
58  |  |  | 
59  |  |     // Verification step / matrix filling.  | 
60  | 0  |     for (size_t i = 1; i <= size1; i++)  | 
61  | 0  |     { | 
62  | 0  |         for (size_t j = 1; j <= size2; j++)  | 
63  | 0  |         { | 
64  |  |             // Sets the modification cost.  | 
65  |  |             // 0 means no modification (i.e. equal letters) and 1 means that a modification is needed (i.e. unequal letters).  | 
66  | 0  |             const int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1;  | 
67  |  |  | 
68  |  |             // Sets the current position of the matrix as the minimum value between a (deletion), b (insertion) and c (substitution).  | 
69  |  |             // a = the upper adjacent value plus 1: verif[i - 1][j] + 1  | 
70  |  |             // b = the left adjacent value plus 1: verif[i][j - 1] + 1  | 
71  |  |             // c = the upper left adjacent value plus the modification cost: verif[i - 1][j - 1] + cost  | 
72  | 0  |             verifVal(i, j) = std::min(  | 
73  | 0  |                 std::min(static_cast<unsigned short>(verifVal(i - 1, j) + 1),  | 
74  | 0  |                          static_cast<unsigned short>(verifVal(i, j - 1) + 1)),  | 
75  | 0  |                 static_cast<unsigned short>(verifVal(i - 1, j - 1) + cost));  | 
76  |  |  | 
77  |  |             // Cf https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance  | 
78  |  |             // (note that in the Wikipedia page, a[] and b[] are indexed from 1...  | 
79  | 0  |             if (transpositionAllowed && i > 1 && j > 1 &&  | 
80  | 0  |                 word1[i - 1] == word2[j - 2] && word1[i - 2] == word2[j - 1])  | 
81  | 0  |             { | 
82  | 0  |                 verifVal(i, j) = std::min(  | 
83  | 0  |                     verifVal(i, j),  | 
84  | 0  |                     static_cast<unsigned short>(verifVal(i - 2, j - 2) + 1));  | 
85  | 0  |             }  | 
86  | 0  |         }  | 
87  | 0  |     }  | 
88  |  |  | 
89  |  |     // The last position of the matrix will contain the Levenshtein distance.  | 
90  | 0  |     return verifVal(size1, size2);  | 
91  | 0  | }  |