/src/rocksdb/util/ribbon_alg.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | |
6 | | #pragma once |
7 | | |
8 | | #include <array> |
9 | | #include <memory> |
10 | | |
11 | | #include "rocksdb/rocksdb_namespace.h" |
12 | | #include "util/math128.h" |
13 | | |
14 | | namespace ROCKSDB_NAMESPACE { |
15 | | |
16 | | namespace ribbon { |
17 | | |
18 | | // RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly) |
19 | | // |
20 | | // ribbon_alg.h: generic versions of core algorithms. |
21 | | // |
22 | | // Ribbon is a Perfect Hash Static Function construction useful as a compact |
23 | | // static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear |
24 | | // system construction that approximates a Band Matrix with hashing, |
25 | | // (b) an incremental, on-the-fly Gaussian Elimination algorithm that is |
26 | | // remarkably efficient and adaptable at constructing an upper-triangular |
27 | | // band matrix from a set of band-approximating inputs from (a), and |
28 | | // (c) a storage layout that is fast and adaptable as a filter. |
29 | | // |
30 | | // Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices |
31 | | // with One Short Random Block per Row, with Applications" by Stefan |
32 | | // Walzer and Martin Dietzfelbinger ("DW paper") |
33 | | // (b) developed by Peter C. Dillinger, though not the first on-the-fly |
34 | | // GE algorithm. See "On the fly Gaussian Elimination for LT codes" by |
35 | | // Bioglio, Grangetto, Gaeta, and Sereno. |
36 | | // (c) see "interleaved" solution storage below. |
37 | | // |
38 | | // See ribbon_impl.h for high-level behavioral summary. This file focuses |
39 | | // on the core design details. |
40 | | // |
41 | | // ###################################################################### |
42 | | // ################# PHSF -> static filter reduction #################### |
43 | | // |
44 | | // A Perfect Hash Static Function is a data structure representing a |
45 | | // map from anything hashable (a "key") to values of some fixed size. |
46 | | // Crucially, it is allowed to return garbage values for anything not in |
47 | | // the original set of map keys, and it is a "static" structure: entries |
48 | | // cannot be added or deleted after construction. PHSFs representing n |
49 | | // mappings to b-bit values (assume uniformly distributed) require at least |
50 | | // n * b bits to represent, or at least b bits per entry. We typically |
51 | | // describe the compactness of a PHSF by typical bits per entry as some |
52 | | // function of b. For example, the MWHC construction (k=3 "peeling") |
53 | | // requires about 1.0222*b and a variant called Xor+ requires about |
54 | | // 1.08*b + 0.5 bits per entry. |
55 | | // |
56 | | // With more hashing, a PHSF can over-approximate a set as a Bloom filter |
57 | | // does, with no FN queries and predictable false positive (FP) query |
58 | | // rate. Instead of the user providing a value to map each input key to, |
59 | | // a hash function provides the value. Keys in the original set will |
60 | | // return a positive membership query because the underlying PHSF returns |
61 | | // the same value as hashing the key. When a key is not in the original set, |
62 | | // the PHSF returns a "garbage" value, which is only equal to the key's |
63 | | // hash with (false positive) probability 1 in 2^b. |
64 | | // |
65 | | // For a matching false positive rate, standard Bloom filters require |
66 | | // 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h) |
67 | | // require a bit more, around 1.5*b bits per entry. Thus, a Bloom |
68 | | // alternative could save up to or nearly 1/3rd of memory and storage |
69 | | // that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter |
70 | | // is dynamic.) |
71 | | // |
72 | | // Recommended reading: |
73 | | // "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters" |
74 | | // by Graf and Lemire |
75 | | // First three sections of "Fast Scalable Construction of (Minimal |
76 | | // Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna |
77 | | // |
78 | | // ###################################################################### |
79 | | // ################## PHSF vs. hash table vs. Bloom ##################### |
80 | | // |
81 | | // You can think of traditional hash tables and related filter variants |
82 | | // such as Cuckoo filters as utilizing an "OR" construction: a hash |
83 | | // function associates a key with some slots and the data is returned if |
84 | | // the data is found in any one of those slots. The collision resolution |
85 | | // is visible in the final data structure and requires extra information. |
86 | | // For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and |
87 | | // Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data |
88 | | // structure associates each input key with data in one slot, the |
89 | | // structure implicitly constructs a (near-)minimal (near-)perfect hash |
90 | | // (MPH) of the keys, which requires at least 1.44 bits per key to |
91 | | // represent. This is why approaches with visible collision resolution |
92 | | // have a fixed + 1.5 or more in storage overhead per entry, often in |
93 | | // addition to an overhead multiplier on b. |
94 | | // |
95 | | // By contrast Bloom filters utilize an "AND" construction: a query only |
96 | | // returns true if all bit positions associated with a key are set to 1. |
97 | | // There is no collision resolution, so Bloom filters do not suffer a |
98 | | // fixed bits per entry overhead like the above structures. |
99 | | // |
100 | | // PHSFs typically use a bitwise XOR construction: the data you want is |
101 | | // not in a single slot, but in a linear combination of several slots. |
102 | | // For static data, this gives the best of "AND" and "OR" constructions: |
103 | | // avoids the +1.44 or more fixed overhead by not approximating a MPH and |
104 | | // can do much better than Bloom's 1.44 factor on b with collision |
105 | | // resolution, which here is done ahead of time and invisible at query |
106 | | // time. |
107 | | // |
108 | | // ###################################################################### |
109 | | // ######################## PHSF construction ########################### |
110 | | // |
111 | | // For a typical PHSF, construction is solving a linear system of |
112 | | // equations, typically in GF(2), which is to say that values are boolean |
113 | | // and XOR serves both as addition and subtraction. We can use matrices to |
114 | | // represent the problem: |
115 | | // |
116 | | // C * S = R |
117 | | // (n x m) (m x b) (n x b) |
118 | | // where C = coefficients, S = solution, R = results |
119 | | // and solving for S given C and R. |
120 | | // |
121 | | // Note that C and R each have n rows, one for each input entry for the |
122 | | // PHSF. A row in C is given by a hash function on the PHSF input key, |
123 | | // and the corresponding row in R is the b-bit value to associate with |
124 | | // that input key. (In a filter, rows of R are given by another hash |
125 | | // function on the input key.) |
126 | | // |
127 | | // On solving, the matrix S (solution) is the final PHSF data, as it |
128 | | // maps any row from the original C to its corresponding desired result |
129 | | // in R. We just have to hash our query inputs and compute a linear |
130 | | // combination of rows in S. |
131 | | // |
132 | | // In theory, we could choose m = n and let a hash function associate |
133 | | // each input key with random rows in C. A solution exists with high |
134 | | // probability, and uses essentially minimum space, b bits per entry |
135 | | // (because we set m = n) but this has terrible scaling, something |
136 | | // like O(n^2) space and O(n^3) time during construction (Gaussian |
137 | | // elimination) and O(n) query time. But computational efficiency is |
138 | | // key, and the core of this is avoiding scanning all of S to answer |
139 | | // each query. |
140 | | // |
141 | | // The traditional approach (MWHC, aka Xor filter) starts with setting |
142 | | // only some small fixed number of columns (typically k=3) to 1 for each |
143 | | // row of C, with remaining entries implicitly 0. This is implemented as |
144 | | // three hash functions over [0,m), and S can be implemented as a vector |
145 | | // of b-bit values. Now, a query only involves looking up k rows |
146 | | // (values) in S and computing their bitwise XOR. Additionally, this |
147 | | // construction can use a linear time algorithm called "peeling" for |
148 | | // finding a solution in many cases where one exists, but peeling |
149 | | // generally requires a larger space overhead factor in the solution |
150 | | // (m/n) than is required with Gaussian elimination. |
151 | | // |
152 | | // Recommended reading: |
153 | | // "Peeling Close to the Orientability Threshold - Spatial Coupling in |
154 | | // Hashing-Based Data Structures" by Stefan Walzer |
155 | | // |
156 | | // ###################################################################### |
157 | | // ##################### Ribbon PHSF construction ####################### |
158 | | // |
159 | | // Ribbon constructs coefficient rows essentially the same as in the |
160 | | // Walzer/Dietzfelbinger paper cited above: for some chosen fixed width |
161 | | // r (kCoeffBits in code), each key is hashed to a starting column in |
162 | | // [0, m - r] (GetStart() in code) and an r-bit sequence of boolean |
163 | | // coefficients (GetCoeffRow() in code). If you sort the rows by start, |
164 | | // the C matrix would look something like this: |
165 | | // |
166 | | // [####00000000000000000000] |
167 | | // [####00000000000000000000] |
168 | | // [000####00000000000000000] |
169 | | // [0000####0000000000000000] |
170 | | // [0000000####0000000000000] |
171 | | // [000000000####00000000000] |
172 | | // [000000000####00000000000] |
173 | | // [0000000000000####0000000] |
174 | | // [0000000000000000####0000] |
175 | | // [00000000000000000####000] |
176 | | // [00000000000000000000####] |
177 | | // |
178 | | // where each # could be a 0 or 1, chosen uniformly by a hash function. |
179 | | // (Except we typically set the start column value to 1.) This scheme |
180 | | // uses hashing to approximate a band matrix, and it has a solution iff |
181 | | // it reduces to an upper-triangular boolean r-band matrix, like this: |
182 | | // |
183 | | // [1###00000000000000000000] |
184 | | // [01##00000000000000000000] |
185 | | // [000000000000000000000000] |
186 | | // [0001###00000000000000000] |
187 | | // [000000000000000000000000] |
188 | | // [000001##0000000000000000] |
189 | | // [000000000000000000000000] |
190 | | // [00000001###0000000000000] |
191 | | // [000000001###000000000000] |
192 | | // [0000000001##000000000000] |
193 | | // ... |
194 | | // [00000000000000000000001#] |
195 | | // [000000000000000000000001] |
196 | | // |
197 | | // where we have expanded to an m x m matrix by filling with rows of |
198 | | // all zeros as needed. As in Gaussian elimination, this form is ready for |
199 | | // generating a solution through back-substitution. |
200 | | // |
201 | | // The awesome thing about the Ribbon construction (from the DW paper) is |
202 | | // how row reductions keep each row representable as a start column and |
203 | | // r coefficients, because row reductions are only needed when two rows |
204 | | // have the same number of leading zero columns. Thus, the combination |
205 | | // of those rows, the bitwise XOR of the r-bit coefficient rows, cancels |
206 | | // out the leading 1s, so starts (at least) one column later and only |
207 | | // needs (at most) r - 1 coefficients. |
208 | | // |
209 | | // ###################################################################### |
210 | | // ###################### Ribbon PHSF scalability ####################### |
211 | | // |
212 | | // Although more practical detail is in ribbon_impl.h, it's worth |
213 | | // understanding some of the overall benefits and limitations of the |
214 | | // Ribbon PHSFs. |
215 | | // |
216 | | // High-end scalability is a primary issue for Ribbon PHSFs, because in |
217 | | // a single Ribbon linear system with fixed r and fixed m/n ratio, the |
218 | | // solution probability approaches zero as n approaches infinity. |
219 | | // For a given n, solution probability improves with larger r and larger |
220 | | // m/n. |
221 | | // |
222 | | // By contrast, peeling-based PHSFs have somewhat worse storage ratio |
223 | | // or solution probability for small n (less than ~1000). This is |
224 | | // especially true with spatial-coupling, where benefits are only |
225 | | // notable for n on the order of 100k or 1m or more. |
226 | | // |
227 | | // To make best use of current hardware, r=128 seems to be closest to |
228 | | // a "generally good" choice for Ribbon, at least in RocksDB where SST |
229 | | // Bloom filters typically hold around 10-100k keys, and almost always |
230 | | // less than 10m keys. r=128 ribbon has a high chance of encoding success |
231 | | // (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05) |
232 | | // for roughly 10k - 10m keys in a single linear system. r=64 only scales |
233 | | // up to about 10k keys with the same storage overhead. Construction and |
234 | | // access times for r=128 are similar to r=64. r=128 tracks nearly |
235 | | // twice as much data during construction, but in most cases we expect |
236 | | // the scalability benefits of r=128 vs. r=64 to make it preferred. |
237 | | // |
238 | | // A natural approach to scaling Ribbon beyond ~10m keys is splitting |
239 | | // (or "sharding") the inputs into multiple linear systems with their |
240 | | // own hash seeds. This can also help to control peak memory consumption. |
241 | | // TODO: much more to come |
242 | | // |
243 | | // ###################################################################### |
244 | | // #################### Ribbon on-the-fly banding ####################### |
245 | | // |
246 | | // "Banding" is what we call the process of reducing the inputs to an |
247 | | // upper-triangular r-band matrix ready for finishing a solution with |
248 | | // back-substitution. Although the DW paper presents an algorithm for |
249 | | // this ("SGauss"), the awesome properties of their construction enable |
250 | | // an even simpler, faster, and more backtrackable algorithm. In simplest |
251 | | // terms, the SGauss algorithm requires sorting the inputs by start |
252 | | // columns, but it's possible to make Gaussian elimination resemble hash |
253 | | // table insertion! |
254 | | // |
255 | | // The enhanced algorithm is based on these observations: |
256 | | // - When processing a coefficient row with first 1 in column j, |
257 | | // - If it's the first at column j to be processed, it can be part of |
258 | | // the banding at row j. (And that decision never overwritten, with |
259 | | // no loss of generality!) |
260 | | // - Else, it can be combined with existing row j and re-processed, |
261 | | // which will look for a later "empty" row or reach "no solution". |
262 | | // |
263 | | // We call our banding algorithm "incremental" and "on-the-fly" because |
264 | | // (like hash table insertion) we are "finished" after each input |
265 | | // processed, with respect to all inputs processed so far. Although the |
266 | | // band matrix is an intermediate step to the solution structure, we have |
267 | | // eliminated intermediate steps and unnecessary data tracking for |
268 | | // banding. |
269 | | // |
270 | | // Building on "incremental" and "on-the-fly", the banding algorithm is |
271 | | // easily backtrackable because no (non-empty) rows are overwritten in |
272 | | // the banding. Thus, if we want to "try" adding an additional set of |
273 | | // inputs to the banding, we only have to record which rows were written |
274 | | // in order to efficiently backtrack to our state before considering |
275 | | // the additional set. (TODO: how this can mitigate scalability and |
276 | | // reach sub-1% overheads) |
277 | | // |
278 | | // Like in a linear-probed hash table, as the occupancy approaches and |
279 | | // surpasses 90-95%, collision resolution dominates the construction |
280 | | // time. (Ribbon doesn't usually pay at query time; see solution |
281 | | // storage below.) This means that we can speed up construction time |
282 | | // by using a higher m/n ratio, up to negative returns around 1.2. |
283 | | // At m/n ~= 1.2, which still saves memory substantially vs. Bloom |
284 | | // filter's 1.5, construction speed (including back-substitution) is not |
285 | | // far from sorting speed, but still a few times slower than cache-local |
286 | | // Bloom construction speed. |
287 | | // |
288 | | // Back-substitution from an upper-triangular boolean band matrix is |
289 | | // especially fast and easy. All the memory accesses are sequential or at |
290 | | // least local, no random. If the number of result bits (b) is a |
291 | | // compile-time constant, the back-substitution state can even be tracked |
292 | | // in CPU registers. Regardless of the solution representation, we prefer |
293 | | // column-major representation for tracking back-substitution state, as |
294 | | // r (the band width) will typically be much larger than b (result bits |
295 | | // or columns), so better to handle r-bit values b times (per solution |
296 | | // row) than b-bit values r times. |
297 | | // |
298 | | // ###################################################################### |
299 | | // ##################### Ribbon solution storage ######################## |
300 | | // |
301 | | // Row-major layout is typical for boolean (bit) matrices, including for |
302 | | // MWHC (Xor) filters where a query combines k b-bit values, and k is |
303 | | // typically smaller than b. Even for k=4 and b=2, at least k=4 random |
304 | | // look-ups are required regardless of layout. |
305 | | // |
306 | | // Ribbon PHSFs are quite different, however, because |
307 | | // (a) all of the solution rows relevant to a query are within a single |
308 | | // range of r rows, and |
309 | | // (b) the number of solution rows involved (r/2 on average, or r if |
310 | | // avoiding conditional accesses) is typically much greater than |
311 | | // b, the number of solution columns. |
312 | | // |
313 | | // Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead |
314 | | // by processing (up to) r entries of b bits each, where b is typically |
315 | | // less than 10 for filter applications. |
316 | | // |
317 | | // Column-major layout has poor locality because of accessing up to b |
318 | | // memory locations in different pages (and obviously cache lines). Note |
319 | | // that negative filter queries do not typically need to access all |
320 | | // solution columns, as they can return when a mismatch is found in any |
321 | | // result/solution column. This optimization doesn't always pay off on |
322 | | // recent hardware, where the penalty for unpredictable conditional |
323 | | // branching can exceed the penalty for unnecessary work, but the |
324 | | // optimization is essentially unavailable with row-major layout. |
325 | | // |
326 | | // The best compromise seems to be interleaving column-major on the small |
327 | | // scale with row-major on the large scale. For example, let a solution |
328 | | // "block" be r rows column-major encoded as b r-bit values in sequence. |
329 | | // Each query accesses (up to) 2 adjacent blocks, which will typically |
330 | | // span 1-3 cache lines in adjacent memory. We get very close to the same |
331 | | // locality as row-major, but with much faster reconstruction of each |
332 | | // result column, at least for filter applications where b is relatively |
333 | | // small and negative queries can return early. |
334 | | // |
335 | | // ###################################################################### |
336 | | // ###################### Fractional result bits ######################## |
337 | | // |
338 | | // Bloom filters have great flexibility that alternatives mostly do not |
339 | | // have. One of those flexibilities is in utilizing any ratio of data |
340 | | // structure bits per key. With a typical memory allocator like jemalloc, |
341 | | // this flexibility can save roughly 10% of the filters' footprint in |
342 | | // DRAM by rounding up and down filter sizes to minimize memory internal |
343 | | // fragmentation (see optimize_filters_for_memory RocksDB option). |
344 | | // |
345 | | // At first glance, PHSFs only offer a whole number of bits per "slot" |
346 | | // (m rather than number of keys n), but coefficient locality in the |
347 | | // Ribbon construction makes fractional bits/key quite possible and |
348 | | // attractive for filter applications. This works by a prefix of the |
349 | | // structure using b-1 solution columns and the rest using b solution |
350 | | // columns. See InterleavedSolutionStorage below for more detail. |
351 | | // |
352 | | // Because false positive rates are non-linear in bits/key, this approach |
353 | | // is not quite optimal in terms of information theory. In common cases, |
354 | | // we see additional space overhead up to about 1.5% vs. theoretical |
355 | | // optimal to achieve the same FP rate. We consider this a quite acceptable |
356 | | // overhead for very efficiently utilizing space that might otherwise be |
357 | | // wasted. |
358 | | // |
359 | | // This property of Ribbon even makes it "elastic." A Ribbon filter and |
360 | | // its small metadata for answering queries can be adapted into another |
361 | | // Ribbon filter filling any smaller multiple of r bits (plus small |
362 | | // metadata), with a correspondingly higher FP rate. None of the data |
363 | | // thrown away during construction needs to be recalled for this reduction. |
364 | | // Similarly a single Ribbon construction can be separated (by solution |
365 | | // column) into two or more structures (or "layers" or "levels") with |
366 | | // independent filtering ability (no FP correlation, just as solution or |
367 | | // result columns in a single structure) despite being constructed as part |
368 | | // of a single linear system. (TODO: implement) |
369 | | // See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards |
370 | | // Efficient Read for LSM-tree-based KV Stores." |
371 | | // |
372 | | |
373 | | // ###################################################################### |
374 | | // ################### CODE: Ribbon core algorithms ##################### |
375 | | // ###################################################################### |
376 | | // |
377 | | // These algorithms are templatized for genericity but near-maximum |
378 | | // performance in a given application. The template parameters |
379 | | // adhere to informal class/struct type concepts outlined below. (This |
380 | | // code is written for C++11 so does not use formal C++ concepts.) |
381 | | |
382 | | // Rough architecture for these algorithms: |
383 | | // |
384 | | // +-----------+ +---+ +-----------------+ |
385 | | // | AddInputs | --> | H | --> | BandingStorage | |
386 | | // +-----------+ | a | +-----------------+ |
387 | | // | s | | |
388 | | // | h | Back substitution |
389 | | // | e | V |
390 | | // +-----------+ | r | +-----------------+ |
391 | | // | Query Key | --> | | >+< | SolutionStorage | |
392 | | // +-----------+ +---+ | +-----------------+ |
393 | | // V |
394 | | // Query result |
395 | | |
396 | | // Common to other concepts |
397 | | // concept RibbonTypes { |
398 | | // // An unsigned integer type for an r-bit subsequence of coefficients. |
399 | | // // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would |
400 | | // // generally only hurt scalability to leave bits of CoeffRow unused. |
401 | | // typename CoeffRow; |
402 | | // // An unsigned integer type big enough to hold a result row (b bits, |
403 | | // // or number of solution/result columns). |
404 | | // // In many applications, especially filters, the number of result |
405 | | // // columns is decided at run time, so ResultRow simply needs to be |
406 | | // // big enough for the largest number of columns allowed. |
407 | | // typename ResultRow; |
408 | | // // An unsigned integer type sufficient for representing the number of |
409 | | // // rows in the solution structure, and at least the arithmetic |
410 | | // // promotion size (usually 32 bits). uint32_t recommended because a |
411 | | // // single Ribbon construction doesn't really scale to billions of |
412 | | // // entries. |
413 | | // typename Index; |
414 | | // }; |
415 | | |
416 | | // ###################################################################### |
417 | | // ######################## Hashers and Banding ######################### |
418 | | |
419 | | // Hasher concepts abstract out hashing details. |
420 | | |
421 | | // concept PhsfQueryHasher extends RibbonTypes { |
422 | | // // Type for a lookup key, which is hashable. |
423 | | // typename Key; |
424 | | // |
425 | | // // Type for hashed summary of a Key. uint64_t is recommended. |
426 | | // typename Hash; |
427 | | // |
428 | | // // Compute a hash value summarizing a Key |
429 | | // Hash GetHash(const Key &) const; |
430 | | // |
431 | | // // Given a hash value and a number of columns that can start an |
432 | | // // r-sequence of coefficients (== m - r + 1), return the start |
433 | | // // column to associate with that hash value. (Starts can be chosen |
434 | | // // uniformly or "smash" extra entries into the beginning and end for |
435 | | // // better utilization at those extremes of the structure. Details in |
436 | | // ribbon_impl.h) |
437 | | // Index GetStart(Hash, Index num_starts) const; |
438 | | // |
439 | | // // Given a hash value, return the r-bit sequence of coefficients to |
440 | | // // associate with it. It's generally OK if |
441 | | // // sizeof(CoeffRow) > sizeof(Hash) |
442 | | // // as long as the hash itself is not too prone to collisions for the |
443 | | // // applications and the CoeffRow is generated uniformly from |
444 | | // // available hash data, but relatively independent of the start. |
445 | | // // |
446 | | // // Must be non-zero, because that's required for a solution to exist |
447 | | // // when mapping to non-zero result row. (Note: BandingAdd could be |
448 | | // // modified to allow 0 coeff row if that only occurs with 0 result |
449 | | // // row, which really only makes sense for filter implementation, |
450 | | // // where both values are hash-derived. Or BandingAdd could reject 0 |
451 | | // // coeff row, forcing next seed, but that has potential problems with |
452 | | // // generality/scalability.) |
453 | | // CoeffRow GetCoeffRow(Hash) const; |
454 | | // }; |
455 | | |
456 | | // concept FilterQueryHasher extends PhsfQueryHasher { |
457 | | // // For building or querying a filter, this returns the expected |
458 | | // // result row associated with a hashed input. For general PHSF, |
459 | | // // this must return 0. |
460 | | // // |
461 | | // // Although not strictly required, there's a slightly better chance of |
462 | | // // solver success if result row is masked down here to only the bits |
463 | | // // actually needed. |
464 | | // ResultRow GetResultRowFromHash(Hash) const; |
465 | | // } |
466 | | |
467 | | // concept BandingHasher extends FilterQueryHasher { |
468 | | // // For a filter, this will generally be the same as Key. |
469 | | // // For a general PHSF, it must either |
470 | | // // (a) include a key and a result it maps to (e.g. in a std::pair), or |
471 | | // // (b) GetResultRowFromInput looks up the result somewhere rather than |
472 | | // // extracting it. |
473 | | // typename AddInput; |
474 | | // |
475 | | // // Instead of requiring a way to extract a Key from an |
476 | | // // AddInput, we require getting the hash of the Key part |
477 | | // // of an AddInput, which is trivial if AddInput == Key. |
478 | | // Hash GetHash(const AddInput &) const; |
479 | | // |
480 | | // // For building a non-filter PHSF, this extracts or looks up the result |
481 | | // // row to associate with an input. For filter PHSF, this must return 0. |
482 | | // ResultRow GetResultRowFromInput(const AddInput &) const; |
483 | | // |
484 | | // // Whether the solver can assume the lowest bit of GetCoeffRow is |
485 | | // // always 1. When true, it should improve solver efficiency slightly. |
486 | | // static bool kFirstCoeffAlwaysOne; |
487 | | // } |
488 | | |
489 | | // Abstract storage for the result of "banding" the inputs (Gaussian |
490 | | // elimination to an upper-triangular boolean band matrix). Because the |
491 | | // banding is an incremental / on-the-fly algorithm, this also represents |
492 | | // all the intermediate state between input entries. |
493 | | // |
494 | | // concept BandingStorage extends RibbonTypes { |
495 | | // // Tells the banding algorithm to prefetch memory associated with |
496 | | // // the next input before processing the current input. Generally |
497 | | // // recommended iff the BandingStorage doesn't easily fit in CPU |
498 | | // // cache. |
499 | | // bool UsePrefetch() const; |
500 | | // |
501 | | // // Prefetches (e.g. __builtin_prefetch) memory associated with a |
502 | | // // slot index i. |
503 | | // void Prefetch(Index i) const; |
504 | | // |
505 | | // // Load or store CoeffRow and ResultRow for slot index i. |
506 | | // // (Gaussian row operations involve both sides of the equation.) |
507 | | // // Bool `for_back_subst` indicates that customizing values for |
508 | | // // unconstrained solution rows (cr == 0) is allowed. |
509 | | // void LoadRow(Index i, CoeffRow *cr, ResultRow *rr, bool for_back_subst) |
510 | | // const; |
511 | | // void StoreRow(Index i, CoeffRow cr, ResultRow rr); |
512 | | // |
513 | | // // Returns the number of columns that can start an r-sequence of |
514 | | // // coefficients, which is the number of slots minus r (kCoeffBits) |
515 | | // // plus one. (m - r + 1) |
516 | | // Index GetNumStarts() const; |
517 | | // }; |
518 | | |
519 | | // Optional storage for backtracking data in banding a set of input |
520 | | // entries. It exposes an array structure which will generally be |
521 | | // used as a stack. It must be able to accommodate as many entries |
522 | | // as are passed in as inputs to `BandingAddRange`. |
523 | | // |
524 | | // concept BacktrackStorage extends RibbonTypes { |
525 | | // // If false, backtracking support will be disabled in the algorithm. |
526 | | // // This should preferably be an inline compile-time constant function. |
527 | | // bool UseBacktrack() const; |
528 | | // |
529 | | // // Records `to_save` as the `i`th backtrack entry |
530 | | // void BacktrackPut(Index i, Index to_save); |
531 | | // |
532 | | // // Recalls the `i`th backtrack entry |
533 | | // Index BacktrackGet(Index i) const; |
534 | | // } |
535 | | |
536 | | // Adds a single entry to BandingStorage (and optionally, BacktrackStorage), |
537 | | // returning true if successful or false if solution is impossible with |
538 | | // current hasher (and presumably its seed) and number of "slots" (solution |
539 | | // or banding rows). (A solution is impossible when there is a linear |
540 | | // dependence among the inputs that doesn't "cancel out".) |
541 | | // |
542 | | // Pre- and post-condition: the BandingStorage represents a band matrix |
543 | | // ready for back substitution (row echelon form except for zero rows), |
544 | | // augmented with result values such that back substitution would give a |
545 | | // solution satisfying all the cr@start -> rr entries added. |
// Attempts to add a single equation (cr@start -> rr) to the band matrix
// using on-the-fly Gaussian elimination. Returns true on success: either
// the (reduced) equation was stored in a previously vacant row, or it
// reduced away entirely with matching result (harmless redundancy, e.g.
// duplicate input). Returns false when the reduced coefficients vanish
// but the result does not: an inconsistency, so no solution exists with
// this hasher/seed and slot count. On storing a row, the row index is
// recorded in the backtrack storage at *backtrack_pos, which is then
// incremented.
template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
          typename BacktrackStorage>
bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
                typename BandingStorage::ResultRow rr,
                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
                typename BandingStorage::Index *backtrack_pos) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using ResultRow = typename BandingStorage::ResultRow;
  using Index = typename BandingStorage::Index;

  Index slot = start;

  if (!kFirstCoeffAlwaysOne) {
    // Normalize so the low coefficient bit is set. Requires/asserts cr != 0.
    int shift = CountTrailingZeroBits(cr);
    slot += static_cast<Index>(shift);
    cr >>= shift;
  }

  while (true) {
    assert((cr & 1) == 1);
    CoeffRow existing_cr;
    ResultRow existing_rr;
    bs->LoadRow(slot, &existing_cr, &existing_rr, /* for_back_subst */ false);
    if (existing_cr == 0) {
      // Vacant pivot row: store the (possibly reduced) equation here and
      // remember the row for potential backtracking.
      bs->StoreRow(slot, cr, rr);
      bts->BacktrackPut(*backtrack_pos, slot);
      ++*backtrack_pos;
      return true;
    }
    assert((existing_cr & 1) == 1);
    // Gaussian row reduction (GF(2): XOR both sides of the equation)
    cr ^= existing_cr;
    rr ^= existing_rr;
    if (cr == 0) {
      // Coefficients cancelled entirely: failed, unless result row == 0
      // because e.g. a duplicate input or a stock hash collision with the
      // same result row. (For filter, stock hash collision implies same
      // result row.) Or a full equation equal to a sum of other equations,
      // which is very possible with a small range of result row values.
      return rr == 0;
    }
    // Advance to the next non-zero coefficient and re-normalize.
    int shift = CountTrailingZeroBits(cr);
    slot += static_cast<Index>(shift);
    cr >>= shift;
  }
}
597 | | |
598 | | // Adds a range of entries to BandingStorage returning true if successful |
599 | | // or false if solution is impossible with current hasher (and presumably |
600 | | // its seed) and number of "slots" (solution or banding rows). (A solution |
601 | | // is impossible when there is a linear dependence among the inputs that |
602 | | // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs. |
603 | | // |
604 | | // If UseBacktrack in the BacktrackStorage, this function call rolls back |
605 | | // to prior state on failure. If !UseBacktrack, some subset of the entries |
606 | | // will have been added to the BandingStorage, so best considered to be in |
607 | | // an indeterminate state. |
608 | | // |
template <typename BandingStorage, typename BacktrackStorage,
          typename BandingHasher, typename InputIterator>
bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
                     const BandingHasher &bh, InputIterator begin,
                     InputIterator end) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;
  using Hash = typename BandingHasher::Hash;

  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");

  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;

  if (begin == end) {
    // trivial: empty range always succeeds
    return true;
  }

  const Index num_starts = bs->GetNumStarts();

  InputIterator cur = begin;
  // Count of rows stored so far; also the write position in the
  // backtrack stack (maintained by BandingAdd).
  Index backtrack_pos = 0;
  if (!bs->UsePrefetch()) {
    // Simple version, no prefetch
    for (;;) {
      Hash h = bh.GetHash(*cur);
      Index start = bh.GetStart(h, num_starts);
      // Result row can come from the input itself (PHSF) and/or from the
      // hash (filter); the two contributions are OR'd together.
      ResultRow rr =
          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);

      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      if ((++cur) == end) {
        return true;
      }
    }
  } else {
    // Pipelined w/prefetch: hash the next entry and issue its prefetch
    // before adding the current one, to hide banding-row memory latency.
    // Prime the pipeline
    Hash h = bh.GetHash(*cur);
    Index start = bh.GetStart(h, num_starts);
    ResultRow rr = bh.GetResultRowFromInput(*cur);
    bs->Prefetch(start);

    // Pipeline
    for (;;) {
      rr |= bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);
      if ((++cur) == end) {
        // Last entry: nothing further to prefetch, just add.
        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
          break;
        }
        return true;
      }
      Hash next_h = bh.GetHash(*cur);
      Index next_start = bh.GetStart(next_h, num_starts);
      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
      bs->Prefetch(next_start);
      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      // Rotate the prefetched entry into the "current" pipeline stage.
      h = next_h;
      start = next_start;
      rr = next_rr;
    }
  }
  // failed; backtrack (if implemented): pop rows stored by this call and
  // re-vacate them, restoring the pre-call state.
  if (bts->UseBacktrack()) {
    while (backtrack_pos > 0) {
      --backtrack_pos;
      Index i = bts->BacktrackGet(backtrack_pos);
      // Clearing the ResultRow is not strictly required, but is required
      // for good FP rate on inputs that might have been backtracked out.
      // (We don't want anything we've backtracked on to leak into final
      // result, as that might not be "harmless".)
      bs->StoreRow(i, 0, 0);
    }
  }
  return false;
}
694 | | |
695 | | // Adds a range of entries to BandingStorage returning true if successful |
696 | | // or false if solution is impossible with current hasher (and presumably |
697 | | // its seed) and number of "slots" (solution or banding rows). (A solution |
698 | | // is impossible when there is a linear dependence among the inputs that |
699 | | // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs. |
700 | | // |
701 | | // On failure, some subset of the entries will have been added to the |
702 | | // BandingStorage, so best considered to be in an indeterminate state. |
703 | | // |
// Convenience overload without backtracking: on failure the
// BandingStorage retains some subset of the entries (indeterminate
// state). Implemented by delegating to the backtracking version with a
// do-nothing BacktrackStorage.
template <typename BandingStorage, typename BandingHasher,
          typename InputIterator>
bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
                     InputIterator begin, InputIterator end) {
  using Index = typename BandingStorage::Index;
  struct DisabledBacktrackStorage {
    bool UseBacktrack() { return false; }
    void BacktrackPut(Index /*i*/, Index /*to_save*/) {}
    Index BacktrackGet(Index /*i*/) {
      // Unreachable: only called when UseBacktrack() returns true.
      assert(false);
      return 0;
    }
  } disabled;
  return BandingAddRange(bs, &disabled, bh, begin, end);
}
719 | | |
720 | | // ###################################################################### |
721 | | // ######################### Solution Storage ########################### |
722 | | |
723 | | // Back-substitution and query algorithms unfortunately depend on some |
724 | | // details of data layout in the final data structure ("solution"). Thus, |
725 | | // there is no common SolutionStorage covering all the reasonable |
726 | | // possibilities. |
727 | | |
728 | | // ###################### SimpleSolutionStorage ######################### |
729 | | |
730 | | // SimpleSolutionStorage is for a row-major storage, typically with no |
731 | | // unused bits in each ResultRow. This is mostly for demonstration |
732 | | // purposes as the simplest solution storage scheme. It is relatively slow |
733 | | // for filter queries. |
734 | | |
735 | | // concept SimpleSolutionStorage extends RibbonTypes { |
736 | | // // This is called at the beginning of back-substitution for the |
737 | | // // solution storage to do any remaining configuration before data |
738 | | // // is stored to it. If configuration is previously finalized, this |
739 | | // // could be a simple assertion or even no-op. Ribbon algorithms |
740 | | // // only call this from back-substitution, and only once per call, |
741 | | // // before other functions here. |
742 | | // void PrepareForNumStarts(Index num_starts) const; |
743 | | // // Must return num_starts passed to PrepareForNumStarts, or the most |
744 | | // // recent call to PrepareForNumStarts if this storage object can be |
745 | | // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because |
746 | | // // there must be a run of kCoeffBits slots starting from each start. |
747 | | // Index GetNumStarts() const; |
748 | | // // Load the solution row (type ResultRow) for a slot |
749 | | // ResultRow Load(Index slot_num) const; |
750 | | // // Store the solution row (type ResultRow) for a slot |
751 | | // void Store(Index slot_num, ResultRow data); |
752 | | // }; |
753 | | |
754 | | // Back-substitution for generating a solution from BandingStorage to |
755 | | // SimpleSolutionStorage. |
// Solves the banded system bottom-up, writing one solution row (ResultRow)
// per slot into `sss`.
template <typename SimpleSolutionStorage, typename BandingStorage>
void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  // The two storages must agree on fundamental type widths.
  static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);

  // A column-major buffer of the solution matrix, containing enough
  // recently-computed solution data to compute the next solution row
  // (based also on banding data). Invariant at the top of each outer
  // iteration: bit k of state[j] is the solution bit at column j,
  // row (i + 1 + k).
  std::array<CoeffRow, kResultBits> state;
  state.fill(0);

  const Index num_starts = bs.GetNumStarts();
  sss->PrepareForNumStarts(num_starts);
  const Index num_slots = num_starts + kCoeffBits - 1;

  // Back substitution requires processing rows from last to first.
  for (Index i = num_slots; i > 0;) {
    --i;
    CoeffRow cr;
    ResultRow rr;
    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
    // solution row
    ResultRow sr = 0;
    for (Index j = 0; j < kResultBits; ++j) {
      // Compute next solution bit at row i, column j (see derivation below)
      CoeffRow tmp = state[j] << 1;
      bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
      tmp |= bit ? CoeffRow{1} : CoeffRow{0};

      // Now tmp is solution at column j from row i for next kCoeffBits
      // more rows. Thus, for valid solution, the dot product of the
      // solution column with the coefficient row has to equal the result
      // at that column,
      // BitParity(tmp & cr) == ((rr >> j) & 1)

      // Update state.
      state[j] = tmp;
      // add to solution row
      sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
    }
    sss->Store(i, sr);
  }
}
811 | | |
812 | | // Common functionality for querying a key (already hashed) in |
813 | | // SimpleSolutionStorage. |
// Core of a SimpleSolutionStorage query: XOR together the solution rows
// of the slots selected by the set bits of `cr`, starting at `start_slot`.
// Used by both PHSF and filter queries below.
template <typename SimpleSolutionStorage>
typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
    typename SimpleSolutionStorage::Index start_slot,
    typename SimpleSolutionStorage::CoeffRow cr,
    const SimpleSolutionStorage &sss) {
  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
  using ResultRow = typename SimpleSolutionStorage::ResultRow;

  constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);

  ResultRow xor_sum = 0;
  for (unsigned bit = 0; bit < kCoeffBits; ++bit) {
    // All-ones mask when coefficient `bit` is set, all-zeros otherwise.
    // Branch-free masking of the whole value is generally faster here
    // than an 'if'.
    const ResultRow mask =
        ResultRow{0} - (static_cast<ResultRow>(cr >> bit) & ResultRow{1});
    xor_sum ^= sss.Load(start_slot + bit) & mask;
  }
  return xor_sum;
}
832 | | |
833 | | // General PHSF query a key from SimpleSolutionStorage. |
// General PHSF query: hashes `key` and returns the ResultRow encoded in
// the solution for it.
template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const SimpleSolutionStorage &sss) {
  // Query hasher and solution storage must agree on type widths.
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
  const auto start = hasher.GetStart(hash, sss.GetNumStarts());
  const auto cr = hasher.GetCoeffRow(hash);
  return SimpleQueryHelper(start, cr, sss);
}
850 | | |
851 | | // Filter query a key from SimpleSolutionStorage. |
// Filter query: true iff the solution's XOR-combination for `key` matches
// the result row expected from the key's hash. Added keys always match
// (no false negatives); others match with probability ~2^-|ResultRow|.
template <typename SimpleSolutionStorage, typename FilterQueryHasher>
bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
                       const FilterQueryHasher &hasher,
                       const SimpleSolutionStorage &sss) {
  // Query hasher and solution storage must agree on type widths.
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename FilterQueryHasher::CoeffRow),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
                    sizeof(typename FilterQueryHasher::ResultRow),
                "must be same");

  const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
  const typename SimpleSolutionStorage::ResultRow expected =
      hasher.GetResultRowFromHash(hash);
  const auto found = SimpleQueryHelper(
      hasher.GetStart(hash, sss.GetNumStarts()), hasher.GetCoeffRow(hash), sss);
  return found == expected;
}
874 | | |
875 | | // #################### InterleavedSolutionStorage ###################### |
876 | | |
877 | | // InterleavedSolutionStorage is row-major at a high level, for good |
878 | | // locality, and column-major at a low level, for CPU efficiency |
879 | | // especially in filter queries or relatively small number of result bits |
880 | | // (== solution columns). The storage is a sequence of "blocks" where a |
881 | | // block has one CoeffRow-sized segment for each solution column. Each |
882 | | // query spans at most two blocks; the starting solution row is typically |
883 | | // in the row-logical middle of a block and spans to the middle of the |
884 | | // next block. (See diagram below.) |
885 | | // |
886 | | // InterleavedSolutionStorage supports choosing b (number of result or |
887 | | // solution columns) at run time, and even supports mixing b and b-1 solution |
888 | | // columns in a single linear system solution, for filters that can |
889 | | // effectively utilize any size space (multiple of CoeffRow) for minimizing |
890 | | // FP rate for any number of added keys. To simplify query implementation |
891 | | // (with lower-index columns first), the b-bit portion comes after the b-1 |
892 | | // portion of the structure. |
893 | | // |
894 | | // Diagram (=== marks logical block boundary; b=4; ### is data used by a |
895 | | // query crossing the b-1 to b boundary, each Segment has type CoeffRow): |
896 | | // ... |
897 | | // +======================+ |
898 | | // | S e g m e n t col=0 | |
899 | | // +----------------------+ |
900 | | // | S e g m e n t col=1 | |
901 | | // +----------------------+ |
902 | | // | S e g m e n t col=2 | |
903 | | // +======================+ |
904 | | // | S e g m e n #########| |
905 | | // +----------------------+ |
906 | | // | S e g m e n #########| |
907 | | // +----------------------+ |
908 | | // | S e g m e n #########| |
909 | | // +======================+ Result/solution columns: above = 3, below = 4 |
910 | | // |#############t col=0 | |
911 | | // +----------------------+ |
912 | | // |#############t col=1 | |
913 | | // +----------------------+ |
914 | | // |#############t col=2 | |
915 | | // +----------------------+ |
916 | | // | S e g m e n t col=3 | |
917 | | // +======================+ |
918 | | // | S e g m e n t col=0 | |
919 | | // +----------------------+ |
920 | | // | S e g m e n t col=1 | |
921 | | // +----------------------+ |
922 | | // | S e g m e n t col=2 | |
923 | | // +----------------------+ |
924 | | // | S e g m e n t col=3 | |
925 | | // +======================+ |
926 | | // ... |
927 | | // |
928 | | // InterleavedSolutionStorage will be adapted by the algorithms from |
929 | | // simple array-like segment storage. That array-like storage is templatized |
930 | | // in part so that an implementation may choose to handle byte ordering |
931 | | // at access time. |
932 | | // |
933 | | // concept InterleavedSolutionStorage extends RibbonTypes { |
934 | | // // This is called at the beginning of back-substitution for the |
935 | | // // solution storage to do any remaining configuration before data |
936 | | // // is stored to it. If configuration is previously finalized, this |
937 | | // // could be a simple assertion or even no-op. Ribbon algorithms |
938 | | // // only call this from back-substitution, and only once per call, |
939 | | // // before other functions here. |
940 | | // void PrepareForNumStarts(Index num_starts) const; |
941 | | // // Must return num_starts passed to PrepareForNumStarts, or the most |
942 | | // // recent call to PrepareForNumStarts if this storage object can be |
943 | | // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because |
944 | | // // there must be a run of kCoeffBits slots starting from each start. |
945 | | // Index GetNumStarts() const; |
946 | | // // The larger number of solution columns used (called "b" above). |
947 | | // Index GetUpperNumColumns() const; |
948 | | // // If returns > 0, then block numbers below that use |
949 | | // // GetUpperNumColumns() - 1 columns per solution row, and the rest |
950 | | // // use GetUpperNumColumns(). A block represents kCoeffBits "slots", |
951 | | // // where all but the last kCoeffBits - 1 slots are also starts. And |
952 | | // // a block contains a segment for each solution column. |
953 | | // // An implementation may only support uniform columns per solution |
954 | | // // row and return constant 0 here. |
955 | | // Index GetUpperStartBlock() const; |
956 | | // |
957 | | // // ### "Array of segments" portion of API ### |
958 | | // // The number of values of type CoeffRow used in this solution |
959 | | // // representation. (This value can be inferred from the previous |
960 | | // // three functions, but is expected at least for sanity / assertion |
961 | | // // checking.) |
962 | | // Index GetNumSegments() const; |
963 | | // // Load an entry from the logical array of segments |
964 | | // CoeffRow LoadSegment(Index segment_num) const; |
965 | | // // Store an entry to the logical array of segments |
966 | | // void StoreSegment(Index segment_num, CoeffRow data); |
967 | | // }; |
968 | | |
969 | | // A helper for InterleavedBackSubst. |
// Processes one block of kCoeffBits banding rows, from the row
// (start_slot + kCoeffBits - 1) down to start_slot. `state` points to
// `num_columns` column-major CoeffRow entries of recently-computed
// solution bits (same invariant as SimpleBackSubst's state array), which
// are updated in place; the caller reads them out as segments.
template <typename BandingStorage>
inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
                           typename BandingStorage::Index num_columns,
                           const BandingStorage &bs,
                           typename BandingStorage::Index start_slot) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  // Back substitution requires processing rows from last to first.
  for (Index i = start_slot + kCoeffBits; i > start_slot;) {
    --i;
    CoeffRow cr;
    ResultRow rr;
    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
    for (Index j = 0; j < num_columns; ++j) {
      // Compute next solution bit at row i, column j (see derivation below)
      CoeffRow tmp = state[j] << 1;
      int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1);
      tmp |= static_cast<CoeffRow>(bit);

      // Now tmp is solution at column j from row i for next kCoeffBits
      // more rows. Thus, for valid solution, the dot product of the
      // solution column with the coefficient row has to equal the result
      // at that column,
      // BitParity(tmp & cr) == ((rr >> j) & 1)

      // Update state.
      state[j] = tmp;
    }
  }
}
1003 | | |
1004 | | // Back-substitution for generating a solution from BandingStorage to |
1005 | | // InterleavedSolutionStorage. |
template <typename InterleavedSolutionStorage, typename BandingStorage>
void InterleavedBackSubst(InterleavedSolutionStorage *iss,
                          const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;

  // Banding and solution storage must agree on fundamental type widths.
  static_assert(
      sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
      "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Index num_starts = bs.GetNumStarts();
  // Although it might be nice to have a filter that returns "always false"
  // when no key is added, we aren't specifically supporting that here
  // because it would require another condition branch in the query.
  assert(num_starts > 0);
  iss->PrepareForNumStarts(num_starts);

  const Index num_slots = num_starts + kCoeffBits - 1;
  assert(num_slots % kCoeffBits == 0);
  const Index num_blocks = num_slots / kCoeffBits;
  const Index num_segments = iss->GetNumSegments();

  // For now upper, then lower (upper-column blocks are stored after the
  // lower-column region; see diagram above).
  Index num_columns = iss->GetUpperNumColumns();
  const Index upper_start_block = iss->GetUpperStartBlock();

  if (num_columns == 0) {
    // Nothing to do, presumably because there's not enough space for even
    // a single segment.
    assert(num_segments == 0);
    // When num_columns == 0, a Ribbon filter query will always return true,
    // or a PHSF query always 0.
    return;
  }

  // We should be utilizing all available segments
  assert(num_segments == (upper_start_block * (num_columns - 1)) +
                             ((num_blocks - upper_start_block) * num_columns));

  // TODO: consider fixed-column specializations with stack-allocated state

  // A column-major buffer of the solution matrix, containing enough
  // recently-computed solution data to compute the next solution row
  // (based also on banding data). Value-initialized (all zero).
  std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};

  // Traverse blocks from last to first (as back substitution requires),
  // writing segments from the end of the segment array backwards.
  Index block = num_blocks;
  Index segment_num = num_segments;
  // First region: blocks using the upper (larger) number of columns.
  while (block > upper_start_block) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment_num -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment_num + i, state[i]);
    }
  }
  // Now (if applicable), region using lower number of columns
  // (This should be optimized away if GetUpperStartBlock() returns
  // constant 0.)
  --num_columns;
  while (block > 0) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment_num -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment_num + i, state[i]);
    }
  }
  // Verify everything processed
  assert(block == 0);
  assert(segment_num == 0);
}
1083 | | |
1084 | | // Prefetch memory for a key in InterleavedSolutionStorage. |
// Prefetches the segments a query for `key` will touch and saves the
// hash / segment index / column count / start bit needed by
// InterleavedPhsfQuery or InterleavedFilterQuery, hiding memory latency
// across multiple prepared queries.
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
inline void InterleavedPrepareQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const InterleavedSolutionStorage &iss,
    typename PhsfQueryHasher::Hash *saved_hash,
    typename InterleavedSolutionStorage::Index *saved_segment_num,
    typename InterleavedSolutionStorage::Index *saved_num_columns,
    typename InterleavedSolutionStorage::Index *saved_start_bit) {
  using Hash = typename PhsfQueryHasher::Hash;
  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;

  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
                "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Hash hash = hasher.GetHash(key);
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());

  // Locate the block containing the start slot and the offset within it.
  const Index block_num = start_slot / kCoeffBits;
  const Index start_bit = start_slot % kCoeffBits;

  // Blocks before upper_start_block each carry one fewer segment.
  const Index upper_start_block = iss.GetUpperStartBlock();
  const Index upper_columns = iss.GetUpperNumColumns();
  const Index segment_num =
      block_num * upper_columns - std::min(block_num, upper_start_block);
  // Adjust to lower num columns if applicable.
  // (This should not compile to a conditional branch.)
  const Index num_columns =
      upper_columns - ((block_num < upper_start_block) ? 1 : 0);

  // One block's worth of segments, or two when the query straddles a
  // block boundary (start_bit != 0).
  const Index segment_count = num_columns + (start_bit == 0 ? 0 : num_columns);

  iss.PrefetchSegmentRange(segment_num, segment_num + segment_count);

  *saved_hash = hash;
  *saved_segment_num = segment_num;
  *saved_num_columns = num_columns;
  *saved_start_bit = start_bit;
}
1125 | | |
1126 | | // General PHSF query from InterleavedSolutionStorage, using data for |
1127 | | // the query key from InterleavedPrepareQuery |
// General PHSF query from InterleavedSolutionStorage, using the data for
// the query key saved by InterleavedPrepareQuery. Reconstructs the
// ResultRow one column (bit) at a time via bit parity of the coefficient
// row against each column's segment(s).
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
    typename PhsfQueryHasher::Hash hash,
    typename InterleavedSolutionStorage::Index segment_num,
    typename InterleavedSolutionStorage::Index num_columns,
    typename InterleavedSolutionStorage::Index start_bit,
    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const CoeffRow cr = hasher.GetCoeffRow(hash);

  // First block: low coefficients, shifted up to align with start_bit.
  const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
  ResultRow result = 0;
  for (Index col = 0; col < num_columns; ++col) {
    result ^= BitParity(iss.LoadSegment(segment_num + col) & cr_left) << col;
  }

  // Second block (only when the query straddles a block boundary):
  // remaining high coefficients, shifted down.
  if (start_bit > 0) {
    const Index next_segment = segment_num + num_columns;
    const CoeffRow cr_right =
        cr >> static_cast<unsigned>(kCoeffBits - start_bit);
    for (Index col = 0; col < num_columns; ++col) {
      result ^= BitParity(iss.LoadSegment(next_segment + col) & cr_right)
                << col;
    }
  }

  return result;
}
1165 | | |
1166 | | // Filter query a key from InterleavedFilterQuery. |
// Filter query from InterleavedSolutionStorage, using the data for the
// query key saved by InterleavedPrepareQuery. Checks each solution
// column's parity against the expected result row bit, short-circuiting
// on the first mismatch.
template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
inline bool InterleavedFilterQuery(
    typename FilterQueryHasher::Hash hash,
    typename InterleavedSolutionStorage::Index segment_num,
    typename InterleavedSolutionStorage::Index num_columns,
    typename InterleavedSolutionStorage::Index start_bit,
    const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const CoeffRow cr = hasher.GetCoeffRow(hash);
  const ResultRow expected = hasher.GetResultRowFromHash(hash);

  // TODO: consider optimizations such as
  // * get rid of start_bit == 0 condition with careful fetching & shifting
  if (start_bit == 0) {
    // Aligned to a block: one segment per column.
    for (Index col = 0; col < num_columns; ++col) {
      const int expected_bit = static_cast<int>(expected >> col) & 1;
      if (BitParity(iss.LoadSegment(segment_num + col) & cr) != expected_bit) {
        return false;
      }
    }
    return true;
  }

  // Unaligned: each column's solution data straddles two blocks'
  // segments, combined with the split coefficient row.
  const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
  const CoeffRow cr_right = cr >> static_cast<unsigned>(kCoeffBits - start_bit);

  for (Index col = 0; col < num_columns; ++col) {
    const CoeffRow soln_data =
        (iss.LoadSegment(segment_num + col) & cr_left) ^
        (iss.LoadSegment(segment_num + num_columns + col) & cr_right);
    const int expected_bit = static_cast<int>(expected >> col) & 1;
    if (BitParity(soln_data) != expected_bit) {
      return false;
    }
  }
  // all columns match
  return true;
}
1218 | | |
1219 | | // TODO: refactor Interleaved*Query so that queries can be "prepared" by |
1220 | | // prefetching memory, to hide memory latency for multiple queries in a |
1221 | | // single thread. |
1222 | | |
1223 | | } // namespace ribbon |
1224 | | |
1225 | | } // namespace ROCKSDB_NAMESPACE |