Coverage Report

Created: 2026-03-31 07:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/third_party/jaro_winkler/details/common.hpp
Line
Count
Source
1
/* SPDX-License-Identifier: MIT */
2
/* Copyright © 2022 Max Bachmann */
3
4
#pragma once
5
#include <algorithm>
6
#include <array>
7
#include <cassert>
8
#include <cmath>
9
#include <cstdint>
10
#include <cstring>
11
#include <iterator>
12
#include <type_traits>
13
#include <vector>
14
15
namespace duckdb_jaro_winkler {
16
17
namespace common {
18
19
/**
20
 * @defgroup Common Common
21
 * Common utilities shared among multiple functions
22
 * @{
23
 */
24
25
/* taken from https://stackoverflow.com/a/30766365/11335032 */
26
template <typename T>
27
struct is_iterator {
28
    static char test(...);
29
30
    template <typename U, typename = typename std::iterator_traits<U>::difference_type,
31
              typename = typename std::iterator_traits<U>::pointer,
32
              typename = typename std::iterator_traits<U>::reference,
33
              typename = typename std::iterator_traits<U>::value_type,
34
              typename = typename std::iterator_traits<U>::iterator_category>
35
    static long test(U&&);
36
37
    constexpr static bool value = std::is_same<decltype(test(std::declval<T>())), long>::value;
38
};
39
40
constexpr double result_cutoff(double result, double score_cutoff)
41
7.73M
{
42
7.73M
    return (result >= score_cutoff) ? result : 0;
43
7.73M
}
44
45
template <typename T, typename U>
46
T ceildiv(T a, U divisor)
47
76.3k
{
48
76.3k
    return static_cast<T>(a / divisor) + static_cast<T>((a % divisor) != 0);
49
76.3k
}
50
51
/**
52
 * Removes common prefix of two string views // todo
53
 */
54
template <typename InputIt1, typename InputIt2>
55
int64_t remove_common_prefix(InputIt1& first1, InputIt1 last1, InputIt2& first2, InputIt2 last2)
56
5.04M
{
57
  // DuckDB passes a raw pointer, but this gives compile errors for std::
58
5.04M
  int64_t len1 = std::distance(first1, last1);
59
5.04M
  int64_t len2 = std::distance(first2, last2);
60
5.04M
  const int64_t max_comparisons = std::min<int64_t>(len1, len2);
61
5.04M
  int64_t prefix;
62
6.35M
  for (prefix = 0; prefix < max_comparisons; prefix++) {
63
6.19M
    if (first1[prefix] != first2[prefix]) {
64
4.88M
      break;
65
4.88M
    }
66
6.19M
  }
67
68
//    int64_t prefix = static_cast<int64_t>(
69
//        std::distance(first1, std::mismatch(first1, last1, first2, last2).first));
70
5.04M
    first1 += prefix;
71
5.04M
    first2 += prefix;
72
5.04M
    return prefix;
73
5.04M
}
74
75
struct BitvectorHashmap {
76
    struct MapElem {
77
        uint64_t key = 0;
78
        uint64_t value = 0;
79
    };
80
81
4.88M
    BitvectorHashmap() : m_map()
82
4.88M
    {}
83
84
    template <typename CharT>
85
    void insert(CharT key, int64_t pos)
86
    {
87
        insert_mask(key, 1ull << pos);
88
    }
89
90
    template <typename CharT>
91
    void insert_mask(CharT key, uint64_t mask)
92
38.7k
    {
93
38.7k
        uint64_t i = lookup(static_cast<uint64_t>(key));
94
38.7k
        m_map[i].key = static_cast<uint64_t>(key);
95
38.7k
        m_map[i].value |= mask;
96
38.7k
    }
97
98
    template <typename CharT>
99
    uint64_t get(CharT key) const
100
3.37M
    {
101
3.37M
        return m_map[lookup(static_cast<uint64_t>(key))].value;
102
3.37M
    }
103
104
private:
105
    /**
106
     * lookup key inside the hashmap using a similar collision resolution
107
     * strategy to CPython and Ruby
108
     */
109
    uint64_t lookup(uint64_t key) const
110
3.41M
    {
111
3.41M
        uint64_t i = key % 128;
112
113
3.41M
        if (!m_map[i].value || m_map[i].key == key) {
114
3.41M
            return i;
115
3.41M
        }
116
117
0
        uint64_t perturb = key;
118
0
        while (true) {
119
0
            i = ((i * 5) + perturb + 1) % 128;
120
0
            if (!m_map[i].value || m_map[i].key == key) {
121
0
                return i;
122
0
            }
123
124
0
            perturb >>= 5;
125
0
        }
126
0
    }
127
128
    std::array<MapElem, 128> m_map;
129
};
130
131
struct PatternMatchVector {
132
    struct MapElem {
133
        uint64_t key = 0;
134
        uint64_t value = 0;
135
    };
136
137
    PatternMatchVector() : m_map(), m_extendedAscii()
138
0
    {}
139
140
    template <typename InputIt1>
141
4.85M
    PatternMatchVector(InputIt1 first, InputIt1 last) : m_map(), m_extendedAscii()
142
4.85M
    {
143
4.85M
        insert(first, last);
144
4.85M
    }
145
146
    template <typename InputIt1>
147
    void insert(InputIt1 first, InputIt1 last)
148
4.85M
    {
149
4.85M
        uint64_t mask = 1;
150
45.9M
        for (int64_t i = 0; i < std::distance(first, last); ++i) {
151
41.0M
            auto key = first[i];
152
41.0M
            if (key >= 0 && key <= 255) {
153
41.0M
                m_extendedAscii[static_cast<size_t>(key)] |= mask;
154
41.0M
            }
155
14.8k
            else {
156
14.8k
                m_map.insert_mask(key, mask);
157
14.8k
            }
158
41.0M
            mask <<= 1;
159
41.0M
        }
160
4.85M
    }
161
162
    template <typename CharT>
163
    void insert(CharT key, int64_t pos)
164
    {
165
        uint64_t mask = 1ull << pos;
166
        if (key >= 0 && key <= 255) {
167
            m_extendedAscii[key] |= mask;
168
        }
169
        else {
170
            m_map.insert_mask(key, mask);
171
        }
172
    }
173
174
    template <typename CharT>
175
    uint64_t get(CharT key) const
176
37.5M
    {
177
37.5M
        if (key >= 0 && key <= 255) {
178
35.2M
            return m_extendedAscii[static_cast<size_t>(key)];
179
35.2M
        }
180
2.31M
        else {
181
2.31M
            return m_map.get(key);
182
2.31M
        }
183
37.5M
    }
184
185
    /**
186
     * combat func for BlockPatternMatchVector
187
     */
188
    template <typename CharT>
189
    uint64_t get(int64_t block, CharT key) const
190
    {
191
        (void)block;
192
        assert(block == 0);
193
        return get(key);
194
    }
195
196
private:
197
    BitvectorHashmap m_map;
198
    std::array<uint64_t, 256> m_extendedAscii;
199
};
200
201
struct BlockPatternMatchVector {
202
    BlockPatternMatchVector() : m_block_count(0)
203
0
    {}
204
205
    template <typename InputIt1>
206
25.4k
    BlockPatternMatchVector(InputIt1 first, InputIt1 last) : m_block_count(0)
207
25.4k
    {
208
25.4k
        insert(first, last);
209
25.4k
    }
duckdb_jaro_winkler::common::BlockPatternMatchVector::BlockPatternMatchVector<char const*>(char const*, char const*)
Line
Count
Source
206
25.4k
    BlockPatternMatchVector(InputIt1 first, InputIt1 last) : m_block_count(0)
207
25.4k
    {
208
25.4k
        insert(first, last);
209
25.4k
    }
Unexecuted instantiation: duckdb_jaro_winkler::common::BlockPatternMatchVector::BlockPatternMatchVector<std::__1::__wrap_iter<char const*> >(std::__1::__wrap_iter<char const*>, std::__1::__wrap_iter<char const*>)
210
211
    template <typename CharT>
212
    void insert(int64_t block, CharT key, int pos)
213
537k
    {
214
537k
        uint64_t mask = 1ull << pos;
215
216
537k
        assert(block < m_block_count);
217
537k
        if (key >= 0 && key <= 255) {
218
513k
            m_extendedAscii[static_cast<size_t>(key * m_block_count + block)] |= mask;
219
513k
        }
220
23.9k
        else {
221
23.9k
            m_map[static_cast<size_t>(block)].insert_mask(key, mask);
222
23.9k
        }
223
537k
    }
224
225
    template <typename InputIt1>
226
    void insert(InputIt1 first, InputIt1 last)
227
25.4k
    {
228
25.4k
        int64_t len = std::distance(first, last);
229
25.4k
        m_block_count = ceildiv(len, 64);
230
25.4k
        m_map.resize(static_cast<size_t>(m_block_count));
231
25.4k
        m_extendedAscii.resize(static_cast<size_t>(m_block_count * 256));
232
233
562k
        for (int64_t i = 0; i < len; ++i) {
234
537k
            int64_t block = i / 64;
235
537k
            int64_t pos = i % 64;
236
537k
            insert(block, first[i], static_cast<int>(pos));
237
537k
        }
238
25.4k
    }
void duckdb_jaro_winkler::common::BlockPatternMatchVector::insert<char const*>(char const*, char const*)
Line
Count
Source
227
25.4k
    {
228
25.4k
        int64_t len = std::distance(first, last);
229
25.4k
        m_block_count = ceildiv(len, 64);
230
25.4k
        m_map.resize(static_cast<size_t>(m_block_count));
231
25.4k
        m_extendedAscii.resize(static_cast<size_t>(m_block_count * 256));
232
233
562k
        for (int64_t i = 0; i < len; ++i) {
234
537k
            int64_t block = i / 64;
235
537k
            int64_t pos = i % 64;
236
537k
            insert(block, first[i], static_cast<int>(pos));
237
537k
        }
238
25.4k
    }
Unexecuted instantiation: void duckdb_jaro_winkler::common::BlockPatternMatchVector::insert<std::__1::__wrap_iter<char const*> >(std::__1::__wrap_iter<char const*>, std::__1::__wrap_iter<char const*>)
239
240
    /**
241
     * combat func for PatternMatchVector
242
     */
243
    template <typename CharT>
244
    uint64_t get(CharT key) const
245
0
    {
246
0
        return get(0, key);
247
0
    }
248
249
    template <typename CharT>
250
    uint64_t get(int64_t block, CharT key) const
251
4.76M
    {
252
4.76M
        assert(block < m_block_count);
253
4.76M
        if (key >= 0 && key <= 255) {
254
3.70M
            return m_extendedAscii[static_cast<size_t>(key * m_block_count + block)];
255
3.70M
        }
256
1.05M
        else {
257
1.05M
            return m_map[static_cast<size_t>(block)].get(key);
258
1.05M
        }
259
4.76M
    }
260
261
private:
262
    std::vector<BitvectorHashmap> m_map;
263
    std::vector<uint64_t> m_extendedAscii;
264
    int64_t m_block_count;
265
};
266
267
/**@}*/
268
269
} // namespace common
270
} // namespace duckdb_jaro_winkler