Coverage Report

Created: 2025-09-05 08:05

/src/duckdb/third_party/jaro_winkler/jaro_winkler.hpp
Line
Count
Source (jump to first uncovered line)
1
/* SPDX-License-Identifier: MIT */
2
/* Copyright © 2022 Max Bachmann */
3
4
#pragma once
5
#include "details/common.hpp"
6
#include "details/jaro_impl.hpp"
7
8
#include <stdexcept>
9
10
namespace duckdb_jaro_winkler {
11
12
/**
13
 * @defgroup jaro_winkler jaro_winkler
14
 * @{
15
 */
16
17
/**
18
 * @brief Calculates the jaro winkler similarity
19
 *
20
 * @tparam Sentence1 This is a string that can be converted to
21
 * basic_string_view<char_type>
22
 * @tparam Sentence2 This is a string that can be converted to
23
 * basic_string_view<char_type>
24
 *
25
 * @param s1
26
 *   string to compare with s2 (for type info check Template parameters above)
27
 * @param s2
28
 *   string to compare with s1 (for type info check Template parameters above)
29
 * @param prefix_weight
30
 *   Weight used for the common prefix of the two strings.
31
 *   Has to be between 0 and 0.25. Default is 0.1.
32
 * @param score_cutoff
33
 *   Optional argument for a score threshold as a float between 0 and 100.
34
 *   For similarity < score_cutoff 0 is returned instead. Default is 0,
35
 *   which deactivates this behaviour.
36
 *
37
 * @return jaro winkler similarity between s1 and s2
38
 *   as a float between 0 and 100
39
 */
40
template <typename InputIt1, typename InputIt2>
41
typename std::enable_if<
42
    common::is_iterator<InputIt1>::value && common::is_iterator<InputIt2>::value, double>::type
43
jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
44
                        double prefix_weight = 0.1, double score_cutoff = 0.0)
45
4.93M
{
46
4.93M
    if (prefix_weight < 0.0 || prefix_weight > 0.25) {
47
0
        throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
48
0
    }
49
50
4.93M
    return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight,
51
4.93M
                                           score_cutoff);
52
4.93M
}
53
54
/**
 * @brief Range convenience overload of jaro_winkler_similarity
 *
 * Obtains the iterator pairs of s1 and s2 through std::begin / std::end and
 * hands them, together with prefix_weight and score_cutoff, to the
 * iterator based overload.
 *
 * @param s1 first sequence; must be usable with std::begin / std::end
 * @param s2 second sequence; must be usable with std::begin / std::end
 * @param prefix_weight forwarded unchanged. Default is 0.1.
 * @param score_cutoff forwarded unchanged. Default is 0.
 *
 * @return whatever the iterator based overload returns
 */
template <typename S1, typename S2>
double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1,
                               double score_cutoff = 0.0)
{
    const auto lhs_begin = std::begin(s1);
    const auto lhs_end = std::end(s1);
    const auto rhs_begin = std::begin(s2);
    const auto rhs_end = std::end(s2);

    return jaro_winkler_similarity(lhs_begin, lhs_end, rhs_begin, rhs_end, prefix_weight,
                                   score_cutoff);
}
61
62
template <typename CharT1>
63
struct CachedJaroWinklerSimilarity {
64
    template <typename InputIt1>
65
    CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1)
66
0
        : s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_)
67
0
    {
68
0
        if (prefix_weight < 0.0 || prefix_weight > 0.25) {
69
0
            throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
70
0
        }
71
0
    }
72
73
    template <typename S1>
74
    CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1)
75
0
        : CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_)
76
0
    {}
77
78
    template <typename InputIt2>
79
    double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
80
0
    {
81
0
        return detail::jaro_winkler_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
82
0
                                               prefix_weight, score_cutoff);
83
0
    }
84
85
    template <typename S2>
86
    double similarity(const S2& s2, double score_cutoff = 0) const
87
    {
88
        return similarity(std::begin(s2), std::end(s2), score_cutoff);
89
    }
90
91
    template <typename InputIt2>
92
    double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
93
    {
94
        return similarity(first2, last2, score_cutoff);
95
    }
96
97
    template <typename S2>
98
    double normalized_similarity(const S2& s2, double score_cutoff = 0) const
99
    {
100
        return similarity(s2, score_cutoff);
101
    }
102
103
private:
104
    std::basic_string<CharT1> s1;
105
    common::BlockPatternMatchVector PM;
106
107
    double prefix_weight;
108
};
109
110
/**
111
 * @brief Calculates the jaro similarity
112
 *
113
 * @tparam Sentence1 This is a string that can be converted to
114
 * basic_string_view<char_type>
115
 * @tparam Sentence2 This is a string that can be converted to
116
 * basic_string_view<char_type>
117
 *
118
 * @param s1
119
 *   string to compare with s2 (for type info check Template parameters above)
120
 * @param s2
121
 *   string to compare with s1 (for type info check Template parameters above)
122
 * @param score_cutoff
123
 *   Optional argument for a score threshold as a float between 0 and 100.
124
 *   For similarity < score_cutoff 0 is returned instead. Default is 0,
125
 *   which deactivates this behaviour.
126
 *
127
 * @return jaro similarity between s1 and s2
128
 *   as a float between 0 and 100
129
 */
130
template <typename InputIt1, typename InputIt2>
131
double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
132
                       double score_cutoff = 0.0)
133
0
{
134
0
    return detail::jaro_similarity(first1, last1, first2, last2, score_cutoff);
135
0
}
136
137
/**
 * @brief Range convenience overload of jaro_similarity
 *
 * Obtains the iterator pairs of s1 and s2 through std::begin / std::end and
 * hands them, together with score_cutoff, to the iterator based overload.
 *
 * @param s1 first sequence; must be usable with std::begin / std::end
 * @param s2 second sequence; must be usable with std::begin / std::end
 * @param score_cutoff forwarded unchanged. Default is 0.
 *
 * @return whatever the iterator based overload returns
 */
template <typename S1, typename S2>
double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0)
{
    const auto lhs_begin = std::begin(s1);
    const auto lhs_end = std::end(s1);
    const auto rhs_begin = std::begin(s2);
    const auto rhs_end = std::end(s2);

    return jaro_similarity(lhs_begin, lhs_end, rhs_begin, rhs_end, score_cutoff);
}
143
144
template <typename CharT1>
145
struct CachedJaroSimilarity {
146
    template <typename InputIt1>
147
0
    CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1)
148
0
    {}
149
150
    template <typename S1>
151
0
    CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_))
152
0
    {}
153
154
    template <typename InputIt2>
155
    double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
156
0
    {
157
0
        return detail::jaro_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
158
0
                                       score_cutoff);
159
0
    }
160
161
    template <typename S2>
162
    double similarity(const S2& s2, double score_cutoff = 0) const
163
    {
164
        return similarity(std::begin(s2), std::end(s2), score_cutoff);
165
    }
166
167
    template <typename InputIt2>
168
    double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
169
    {
170
        return similarity(first2, last2, score_cutoff);
171
    }
172
173
    template <typename S2>
174
    double normalized_similarity(const S2& s2, double score_cutoff = 0) const
175
    {
176
        return similarity(s2, score_cutoff);
177
    }
178
179
private:
180
    std::basic_string<CharT1> s1;
181
    common::BlockPatternMatchVector PM;
182
};
183
184
/**@}*/
185
186
} // namespace duckdb_jaro_winkler