/src/duckdb/third_party/jaro_winkler/jaro_winkler.hpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* SPDX-License-Identifier: MIT */ |
2 | | /* Copyright © 2022 Max Bachmann */ |
3 | | |
4 | | #pragma once |
5 | | #include "details/common.hpp" |
6 | | #include "details/jaro_impl.hpp" |
7 | | |
8 | | #include <stdexcept> |
9 | | |
10 | | namespace duckdb_jaro_winkler { |
11 | | |
12 | | /** |
13 | | * @defgroup jaro_winkler jaro_winkler |
14 | | * @{ |
15 | | */ |
16 | | |
17 | | /** |
18 | | * @brief Calculates the jaro winkler similarity |
19 | | * |
20 | | * @tparam Sentence1 This is a string that can be converted to |
21 | | * basic_string_view<char_type> |
22 | | * @tparam Sentence2 This is a string that can be converted to |
23 | | * basic_string_view<char_type> |
24 | | * |
25 | | * @param s1 |
26 | | * string to compare with s2 (for type info check Template parameters above) |
27 | | * @param s2 |
28 | | * string to compare with s1 (for type info check Template parameters above) |
29 | | * @param prefix_weight |
30 | | * Weight used for the common prefix of the two strings. |
31 | | * Has to be between 0 and 0.25. Default is 0.1. |
32 | | * @param score_cutoff |
33 | | * Optional argument for a score threshold as a float between 0 and 100. |
34 | | * For similarity < score_cutoff 0 is returned instead. Default is 0, |
35 | | * which deactivates this behaviour. |
36 | | * |
37 | | * @return jaro winkler similarity between s1 and s2 |
38 | | * as a float between 0 and 100 |
39 | | */ |
40 | | template <typename InputIt1, typename InputIt2> |
41 | | typename std::enable_if< |
42 | | common::is_iterator<InputIt1>::value && common::is_iterator<InputIt2>::value, double>::type |
43 | | jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, |
44 | | double prefix_weight = 0.1, double score_cutoff = 0.0) |
45 | 4.93M | { |
46 | 4.93M | if (prefix_weight < 0.0 || prefix_weight > 0.25) { |
47 | 0 | throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25"); |
48 | 0 | } |
49 | | |
50 | 4.93M | return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight, |
51 | 4.93M | score_cutoff); |
52 | 4.93M | } |
53 | | |
/**
 * @brief Range convenience overload of jaro_winkler_similarity.
 *
 * Obtains iterators for both arguments with std::begin / std::end and hands
 * them, together with prefix_weight and score_cutoff, to the iterator-based
 * overload.
 *
 * @param s1 first sequence; must be usable with std::begin / std::end
 * @param s2 second sequence; must be usable with std::begin / std::end
 * @param prefix_weight forwarded unchanged. Default is 0.1.
 * @param score_cutoff forwarded unchanged. Default is 0.
 *
 * @return the result of the iterator-based overload
 */
template <typename S1, typename S2>
double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1,
                               double score_cutoff = 0.0)
{
    auto first1 = std::begin(s1);
    auto last1 = std::end(s1);
    auto first2 = std::begin(s2);
    auto last2 = std::end(s2);

    return jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight, score_cutoff);
}
61 | | |
62 | | template <typename CharT1> |
63 | | struct CachedJaroWinklerSimilarity { |
64 | | template <typename InputIt1> |
65 | | CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1) |
66 | 0 | : s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_) |
67 | 0 | { |
68 | 0 | if (prefix_weight < 0.0 || prefix_weight > 0.25) { |
69 | 0 | throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25"); |
70 | 0 | } |
71 | 0 | } |
72 | | |
73 | | template <typename S1> |
74 | | CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1) |
75 | 0 | : CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_) |
76 | 0 | {} |
77 | | |
78 | | template <typename InputIt2> |
79 | | double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
80 | 0 | { |
81 | 0 | return detail::jaro_winkler_similarity(PM, std::begin(s1), std::end(s1), first2, last2, |
82 | 0 | prefix_weight, score_cutoff); |
83 | 0 | } |
84 | | |
85 | | template <typename S2> |
86 | | double similarity(const S2& s2, double score_cutoff = 0) const |
87 | | { |
88 | | return similarity(std::begin(s2), std::end(s2), score_cutoff); |
89 | | } |
90 | | |
91 | | template <typename InputIt2> |
92 | | double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
93 | | { |
94 | | return similarity(first2, last2, score_cutoff); |
95 | | } |
96 | | |
97 | | template <typename S2> |
98 | | double normalized_similarity(const S2& s2, double score_cutoff = 0) const |
99 | | { |
100 | | return similarity(s2, score_cutoff); |
101 | | } |
102 | | |
103 | | private: |
104 | | std::basic_string<CharT1> s1; |
105 | | common::BlockPatternMatchVector PM; |
106 | | |
107 | | double prefix_weight; |
108 | | }; |
109 | | |
110 | | /** |
111 | | * @brief Calculates the jaro similarity |
112 | | * |
113 | | * @tparam Sentence1 This is a string that can be converted to |
114 | | * basic_string_view<char_type> |
115 | | * @tparam Sentence2 This is a string that can be converted to |
116 | | * basic_string_view<char_type> |
117 | | * |
118 | | * @param s1 |
119 | | * string to compare with s2 (for type info check Template parameters above) |
120 | | * @param s2 |
121 | | * string to compare with s1 (for type info check Template parameters above) |
122 | | * @param score_cutoff |
123 | | * Optional argument for a score threshold as a float between 0 and 100. |
124 | | * For similarity < score_cutoff 0 is returned instead. Default is 0, |
125 | | * which deactivates this behaviour. |
126 | | * |
127 | | * @return jaro similarity between s1 and s2 |
128 | | * as a float between 0 and 100 |
129 | | */ |
130 | | template <typename InputIt1, typename InputIt2> |
131 | | double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, |
132 | | double score_cutoff = 0.0) |
133 | 0 | { |
134 | 0 | return detail::jaro_similarity(first1, last1, first2, last2, score_cutoff); |
135 | 0 | } |
136 | | |
/**
 * @brief Range convenience overload of jaro_similarity.
 *
 * Obtains iterators for both arguments with std::begin / std::end and hands
 * them, together with score_cutoff, to the iterator-based overload.
 *
 * @param s1 first sequence; must be usable with std::begin / std::end
 * @param s2 second sequence; must be usable with std::begin / std::end
 * @param score_cutoff forwarded unchanged. Default is 0.
 *
 * @return the result of the iterator-based overload
 */
template <typename S1, typename S2>
double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0)
{
    auto first1 = std::begin(s1);
    auto last1 = std::end(s1);
    auto first2 = std::begin(s2);
    auto last2 = std::end(s2);

    return jaro_similarity(first1, last1, first2, last2, score_cutoff);
}
143 | | |
144 | | template <typename CharT1> |
145 | | struct CachedJaroSimilarity { |
146 | | template <typename InputIt1> |
147 | 0 | CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1) |
148 | 0 | {} |
149 | | |
150 | | template <typename S1> |
151 | 0 | CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_)) |
152 | 0 | {} |
153 | | |
154 | | template <typename InputIt2> |
155 | | double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
156 | 0 | { |
157 | 0 | return detail::jaro_similarity(PM, std::begin(s1), std::end(s1), first2, last2, |
158 | 0 | score_cutoff); |
159 | 0 | } |
160 | | |
161 | | template <typename S2> |
162 | | double similarity(const S2& s2, double score_cutoff = 0) const |
163 | | { |
164 | | return similarity(std::begin(s2), std::end(s2), score_cutoff); |
165 | | } |
166 | | |
167 | | template <typename InputIt2> |
168 | | double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
169 | | { |
170 | | return similarity(first2, last2, score_cutoff); |
171 | | } |
172 | | |
173 | | template <typename S2> |
174 | | double normalized_similarity(const S2& s2, double score_cutoff = 0) const |
175 | | { |
176 | | return similarity(s2, score_cutoff); |
177 | | } |
178 | | |
179 | | private: |
180 | | std::basic_string<CharT1> s1; |
181 | | common::BlockPatternMatchVector PM; |
182 | | }; |
183 | | |
184 | | /**@}*/ |
185 | | |
186 | | } // namespace duckdb_jaro_winkler |