Coverage Report

Created: 2022-08-24 06:40

/src/duckdb/third_party/jaro_winkler/details/common.hpp
Line
Count
Source (jump to first uncovered line)
1
/* SPDX-License-Identifier: MIT */
2
/* Copyright © 2022 Max Bachmann */
3
4
#pragma once
5
#include <algorithm>
6
#include <array>
7
#include <cassert>
8
#include <cmath>
9
#include <cstdint>
10
#include <cstring>
11
#include <iterator>
12
#include <type_traits>
13
#include <vector>
14
15
namespace duckdb_jaro_winkler {
16
17
namespace common {
18
19
/**
20
 * @defgroup Common Common
21
 * Common utilities shared among multiple functions
22
 * @{
23
 */
24
25
/* taken from https://stackoverflow.com/a/30766365/11335032 */
26
template <typename T>
27
struct is_iterator {
28
    static char test(...);
29
30
    template <typename U, typename = typename std::iterator_traits<U>::difference_type,
31
              typename = typename std::iterator_traits<U>::pointer,
32
              typename = typename std::iterator_traits<U>::reference,
33
              typename = typename std::iterator_traits<U>::value_type,
34
              typename = typename std::iterator_traits<U>::iterator_category>
35
    static long test(U&&);
36
37
    constexpr static bool value = std::is_same<decltype(test(std::declval<T>())), long>::value;
38
};
39
40
constexpr double result_cutoff(double result, double score_cutoff)
41
0
{
42
0
    return (result >= score_cutoff) ? result : 0;
43
0
}
44
45
template <typename T, typename U>
46
T ceildiv(T a, U divisor)
47
0
{
48
0
    return static_cast<T>(a / divisor) + static_cast<T>((a % divisor) != 0);
49
0
}
50
51
/**
52
 * Removes common prefix of two string views // todo
53
 */
54
template <typename InputIt1, typename InputIt2>
55
int64_t remove_common_prefix(InputIt1& first1, InputIt1 last1, InputIt2& first2, InputIt2 last2)
56
0
{
57
  // DuckDB passes a raw pointer, but this gives compile errors for std::
58
0
  int64_t len1 = std::distance(first1, last1);
59
0
  int64_t len2 = std::distance(first2, last2);
60
0
  const int64_t max_comparisons = std::min<int64_t>(len1, len2);
61
0
  int64_t prefix;
62
0
  for (prefix = 0; prefix < max_comparisons; prefix++) {
63
0
    if (first1[prefix] != first2[prefix]) {
64
0
      break;
65
0
    }
66
0
  }
67
68
//    int64_t prefix = static_cast<int64_t>(
69
//        std::distance(first1, std::mismatch(first1, last1, first2, last2).first));
70
0
    first1 += prefix;
71
0
    first2 += prefix;
72
0
    return prefix;
73
0
}
74
75
struct BitvectorHashmap {
76
    struct MapElem {
77
        uint64_t key = 0;
78
        uint64_t value = 0;
79
    };
80
81
    BitvectorHashmap() : m_map()
82
0
    {}
83
84
    template <typename CharT>
85
    void insert(CharT key, int64_t pos)
86
    {
87
        insert_mask(key, 1ull << pos);
88
    }
89
90
    template <typename CharT>
91
    void insert_mask(CharT key, uint64_t mask)
92
0
    {
93
0
        uint64_t i = lookup(static_cast<uint64_t>(key));
94
0
        m_map[i].key = key;
95
0
        m_map[i].value |= mask;
96
0
    }
97
98
    template <typename CharT>
99
    uint64_t get(CharT key) const
100
0
    {
101
0
        return m_map[lookup(static_cast<uint64_t>(key))].value;
102
0
    }
103
104
private:
105
    /**
106
     * lookup key inside the hashmap using a similar collision resolution
107
     * strategy to CPython and Ruby
108
     */
109
    uint64_t lookup(uint64_t key) const
110
0
    {
111
0
        uint64_t i = key % 128;
112
113
0
        if (!m_map[i].value || m_map[i].key == key) {
114
0
            return i;
115
0
        }
116
117
0
        uint64_t perturb = key;
118
0
        while (true) {
119
0
            i = ((i * 5) + perturb + 1) % 128;
120
0
            if (!m_map[i].value || m_map[i].key == key) {
121
0
                return i;
122
0
            }
123
124
0
            perturb >>= 5;
125
0
        }
126
0
    }
127
128
    std::array<MapElem, 128> m_map;
129
};
130
131
struct PatternMatchVector {
132
    struct MapElem {
133
        uint64_t key = 0;
134
        uint64_t value = 0;
135
    };
136
137
    PatternMatchVector() : m_map(), m_extendedAscii()
138
0
    {}
139
140
    template <typename InputIt1>
141
    PatternMatchVector(InputIt1 first, InputIt1 last) : m_map(), m_extendedAscii()
142
0
    {
143
0
        insert(first, last);
144
0
    }
145
146
    template <typename InputIt1>
147
    void insert(InputIt1 first, InputIt1 last)
148
0
    {
149
0
        uint64_t mask = 1;
150
0
        for (int64_t i = 0; i < std::distance(first, last); ++i) {
151
0
            auto key = first[i];
152
0
            if (key >= 0 && key <= 255) {
153
0
                m_extendedAscii[key] |= mask;
154
0
            }
155
0
            else {
156
0
                m_map.insert_mask(key, mask);
157
0
            }
158
0
            mask <<= 1;
159
0
        }
160
0
    }
161
162
    template <typename CharT>
163
    void insert(CharT key, int64_t pos)
164
    {
165
        uint64_t mask = 1ull << pos;
166
        if (key >= 0 && key <= 255) {
167
            m_extendedAscii[key] |= mask;
168
        }
169
        else {
170
            m_map.insert_mask(key, mask);
171
        }
172
    }
173
174
    template <typename CharT>
175
    uint64_t get(CharT key) const
176
0
    {
177
0
        if (key >= 0 && key <= 255) {
178
0
            return m_extendedAscii[key];
179
0
        }
180
0
        else {
181
0
            return m_map.get(key);
182
0
        }
183
0
    }
184
185
    /**
186
     * combat func for BlockPatternMatchVector
187
     */
188
    template <typename CharT>
189
    uint64_t get(int64_t block, CharT key) const
190
    {
191
        (void)block;
192
        assert(block == 0);
193
        return get(key);
194
    }
195
196
private:
197
    BitvectorHashmap m_map;
198
    std::array<uint64_t, 256> m_extendedAscii;
199
};
200
201
struct BlockPatternMatchVector {
202
    BlockPatternMatchVector() : m_block_count(0)
203
0
    {}
204
205
    template <typename InputIt1>
206
    BlockPatternMatchVector(InputIt1 first, InputIt1 last) : m_block_count(0)
207
0
    {
208
0
        insert(first, last);
209
0
    }
Unexecuted instantiation: duckdb_jaro_winkler::common::BlockPatternMatchVector::BlockPatternMatchVector<std::__1::__wrap_iter<char const*> >(std::__1::__wrap_iter<char const*>, std::__1::__wrap_iter<char const*>)
Unexecuted instantiation: duckdb_jaro_winkler::common::BlockPatternMatchVector::BlockPatternMatchVector<char const*>(char const*, char const*)
210
211
    template <typename CharT>
212
    void insert(int64_t block, CharT key, int pos)
213
0
    {
214
0
        uint64_t mask = 1ull << pos;
215
216
0
        assert(block < m_block_count);
217
0
        if (key >= 0 && key <= 255) {
218
0
            m_extendedAscii[key * m_block_count + block] |= mask;
219
0
        }
220
0
        else {
221
0
            m_map[block].insert_mask(key, mask);
222
0
        }
223
0
    }
224
225
    template <typename InputIt1>
226
    void insert(InputIt1 first, InputIt1 last)
227
0
    {
228
0
        int64_t len = std::distance(first, last);
229
0
        m_block_count = ceildiv(len, 64);
230
0
        m_map.resize(m_block_count);
231
0
        m_extendedAscii.resize(m_block_count * 256);
232
233
0
        for (int64_t i = 0; i < len; ++i) {
234
0
            int64_t block = i / 64;
235
0
            int64_t pos = i % 64;
236
0
            insert(block, first[i], pos);
237
0
        }
238
0
    }
Unexecuted instantiation: void duckdb_jaro_winkler::common::BlockPatternMatchVector::insert<std::__1::__wrap_iter<char const*> >(std::__1::__wrap_iter<char const*>, std::__1::__wrap_iter<char const*>)
Unexecuted instantiation: void duckdb_jaro_winkler::common::BlockPatternMatchVector::insert<char const*>(char const*, char const*)
239
240
    /**
241
     * combat func for PatternMatchVector
242
     */
243
    template <typename CharT>
244
    uint64_t get(CharT key) const
245
0
    {
246
0
        return get(0, key);
247
0
    }
248
249
    template <typename CharT>
250
    uint64_t get(int64_t block, CharT key) const
251
0
    {
252
0
        assert(block < m_block_count);
253
0
        if (key >= 0 && key <= 255) {
254
0
            return m_extendedAscii[key * m_block_count + block];
255
0
        }
256
0
        else {
257
0
            return m_map[block].get(key);
258
0
        }
259
0
    }
260
261
private:
262
    std::vector<BitvectorHashmap> m_map;
263
    std::vector<uint64_t> m_extendedAscii;
264
    int64_t m_block_count;
265
};
266
267
/**@}*/
268
269
} // namespace common
270
} // namespace duckdb_jaro_winkler