Coverage Report

Created: 2024-07-27 06:53

/src/rocksdb/util/ribbon_alg.h
Line | Count | Source
1
//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
6
#pragma once
7
8
#include <array>
9
#include <memory>
10
11
#include "rocksdb/rocksdb_namespace.h"
12
#include "util/math128.h"
13
14
namespace ROCKSDB_NAMESPACE {
15
16
namespace ribbon {
17
18
// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
19
//
20
// ribbon_alg.h: generic versions of core algorithms.
21
//
22
// Ribbon is a Perfect Hash Static Function construction useful as a compact
23
// static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear
24
// system construction that approximates a Band Matrix with hashing,
25
// (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
26
// remarkably efficient and adaptable at constructing an upper-triangular
27
// band matrix from a set of band-approximating inputs from (a), and
28
// (c) a storage layout that is fast and adaptable as a filter.
29
//
30
// Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
31
// with One Short Random Block per Row, with Applications" by Stefan
32
// Walzer and Martin Dietzfelbinger ("DW paper")
33
// (b) developed by Peter C. Dillinger, though not the first on-the-fly
34
// GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
35
// Bioglio, Grangetto, Gaeta, and Sereno.
36
// (c) see "interleaved" solution storage below.
37
//
38
// See ribbon_impl.h for high-level behavioral summary. This file focuses
39
// on the core design details.
40
//
41
// ######################################################################
42
// ################# PHSF -> static filter reduction ####################
43
//
44
// A Perfect Hash Static Function is a data structure representing a
45
// map from anything hashable (a "key") to values of some fixed size.
46
// Crucially, it is allowed to return garbage values for anything not in
47
// the original set of map keys, and it is a "static" structure: entries
48
// cannot be added or deleted after construction. PHSFs representing n
49
// mappings to b-bit values (assume uniformly distributed) require at least
50
// n * b bits to represent, or at least b bits per entry. We typically
51
// describe the compactness of a PHSF by typical bits per entry as some
52
// function of b. For example, the MWHC construction (k=3 "peeling")
53
// requires about 1.23*b and a variant called Xor+ requires about
54
// 1.08*b + 0.5 bits per entry.
55
//
56
// With more hashing, a PHSF can over-approximate a set as a Bloom filter
57
// does, with no FN queries and predictable false positive (FP) query
58
// rate. Instead of the user providing a value to map each input key to,
59
// a hash function provides the value. Keys in the original set will
60
// return a positive membership query because the underlying PHSF returns
61
// the same value as hashing the key. When a key is not in the original set,
62
// the PHSF returns a "garbage" value, which is only equal to the key's
63
// hash with (false positive) probability 1 in 2^b.
64
//
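// To make the reduction concrete, here is a minimal sketch of a filter
// built on a PHSF. The names (Phsf, FingerprintOf, etc.) are hypothetical
// and only for illustration; the real interfaces appear under "Hashers
// and Banding" below.
//
//   uint32_t FingerprintOf(const Key& key);  // b hash-derived bits
//
//   void BuildFilter(const std::vector<Key>& keys, Phsf* phsf) {
//     // Map each key to its own fingerprint; the PHSF stores exactly
//     // these n mappings (and returns garbage for everything else).
//     for (const Key& key : keys) {
//       phsf->Add(key, FingerprintOf(key));
//     }
//     phsf->Solve();
//   }
//
//   bool FilterMayContain(const Phsf& phsf, const Key& key) {
//     // Added keys always match (no false negatives); other keys match
//     // a garbage value with probability about 1 in 2^b.
//     return phsf.Lookup(key) == FingerprintOf(key);
//   }
//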
65
// For a matching false positive rate, standard Bloom filters require
66
// 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h)
67
// require a bit more, around 1.5*b bits per entry. Thus, a Bloom
68
// alternative could save up to or nearly 1/3rd of memory and storage
69
// that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter
70
// is dynamic.)
71
//
72
// Recommended reading:
73
// "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
74
// by Graf and Lemire
75
// First three sections of "Fast Scalable Construction of (Minimal
76
// Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna
77
//
78
// ######################################################################
79
// ################## PHSF vs. hash table vs. Bloom #####################
80
//
81
// You can think of traditional hash tables and related filter variants
82
// such as Cuckoo filters as utilizing an "OR" construction: a hash
83
// function associates a key with some slots and the data is returned if
84
// the data is found in any one of those slots. The collision resolution
85
// is visible in the final data structure and requires extra information.
86
// For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and
87
// Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data
88
// structure associates each input key with data in one slot, the
89
// structure implicitly constructs a (near-)minimal (near-)perfect hash
90
// (MPH) of the keys, which requires at least 1.44 bits per key to
91
// represent. This is why approaches with visible collision resolution
92
// have a fixed + 1.5 or more in storage overhead per entry, often in
93
// addition to an overhead multiplier on b.
94
//
95
// By contrast Bloom filters utilize an "AND" construction: a query only
96
// returns true if all bit positions associated with a key are set to 1.
97
// There is no collision resolution, so Bloom filters do not suffer a
98
// fixed bits per entry overhead like the above structures.
99
//
100
// PHSFs typically use a bitwise XOR construction: the data you want is
101
// not in a single slot, but in a linear combination of several slots.
102
// For static data, this gives the best of "AND" and "OR" constructions:
103
// avoids the +1.44 or more fixed overhead by not approximating an MPH and
104
// can do much better than Bloom's 1.44 factor on b with collision
105
// resolution, which here is done ahead of time and invisible at query
106
// time.
107
//
108
// ######################################################################
109
// ######################## PHSF construction ###########################
110
//
111
// For a typical PHSF, construction is solving a linear system of
112
// equations, typically in GF(2), which is to say that values are boolean
113
// and XOR serves both as addition and subtraction. We can use matrices to
114
// represent the problem:
115
//
116
//    C    *    S    =    R
117
// (n x m)   (m x b)   (n x b)
118
// where C = coefficients, S = solution, R = results
119
// and solving for S given C and R.
120
//
121
// Note that C and R each have n rows, one for each input entry for the
122
// PHSF. A row in C is given by a hash function on the PHSF input key,
123
// and the corresponding row in R is the b-bit value to associate with
124
// that input key. (In a filter, rows of R are given by another hash
125
// function on the input key.)
126
//
127
// On solving, the matrix S (solution) is the final PHSF data, as it
128
// maps any row from the original C to its corresponding desired result
129
// in R. We just have to hash our query inputs and compute a linear
130
// combination of rows in S.
131
//
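// As a small illustration of the GF(2) arithmetic, a query with
// hypothetical fixed sizes (m = 64 columns, b = 8 result bits). The real
// code below is templatized and works from a start column rather than a
// full-width coefficient row.
//
//   uint8_t QueryPhsf(uint64_t c_row,  // 64 coefficient bits (a row of C)
//                     const std::array<uint8_t, 64>& s) {  // solution S
//     uint8_t result = 0;
//     for (int i = 0; i < 64; ++i) {
//       if ((c_row >> i) & 1) {
//         result ^= s[i];  // GF(2) addition (XOR) of selected rows of S
//       }
//     }
//     return result;  // equals the corresponding row of R for added keys
//   }
//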
132
// In theory, we could choose m = n and let a hash function associate
133
// each input key with random rows in C. A solution exists with high
134
// probability, and uses essentially minimum space, b bits per entry
135
// (because we set m = n) but this has terrible scaling, something
136
// like O(n^2) space and O(n^3) time during construction (Gaussian
137
// elimination) and O(n) query time. But computational efficiency is
138
// key, and the core of this is avoiding scanning all of S to answer
139
// each query.
140
//
141
// The traditional approach (MWHC, aka Xor filter) starts with setting
142
// only some small fixed number of columns (typically k=3) to 1 for each
143
// row of C, with remaining entries implicitly 0. This is implemented as
144
// three hash functions over [0,m), and S can be implemented as a vector
145
// of b-bit values. Now, a query only involves looking up k rows
146
// (values) in S and computing their bitwise XOR. Additionally, this
147
// construction can use a linear time algorithm called "peeling" for
148
// finding a solution in many of the cases where one exists, but peeling
149
// generally requires a larger space overhead factor in the solution
150
// (m/n) than is required with Gaussian elimination.
151
//
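// For comparison, the shape of an MWHC/Xor query, with hypothetical
// helper names (Hash1/Hash2/Hash3 are the k = 3 hash functions):
//
//   uint8_t QueryXor(uint64_t h, const std::vector<uint8_t>& s) {
//     const size_t m = s.size();
//     return s[Hash1(h) % m] ^ s[Hash2(h) % m] ^ s[Hash3(h) % m];
//   }
//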
152
// Recommended reading:
153
// "Peeling Close to the Orientability Threshold - Spatial Coupling in
154
// Hashing-Based Data Structures" by Stefan Walzer
155
//
156
// ######################################################################
157
// ##################### Ribbon PHSF construction #######################
158
//
159
// Ribbon constructs coefficient rows essentially the same as in the
160
// Walzer/Dietzfelbinger paper cited above: for some chosen fixed width
161
// r (kCoeffBits in code), each key is hashed to a starting column in
162
// [0, m - r] (GetStart() in code) and an r-bit sequence of boolean
163
// coefficients (GetCoeffRow() in code). If you sort the rows by start,
164
// the C matrix would look something like this:
165
//
166
// [####00000000000000000000]
167
// [####00000000000000000000]
168
// [000####00000000000000000]
169
// [0000####0000000000000000]
170
// [0000000####0000000000000]
171
// [000000000####00000000000]
172
// [000000000####00000000000]
173
// [0000000000000####0000000]
174
// [0000000000000000####0000]
175
// [00000000000000000####000]
176
// [00000000000000000000####]
177
//
178
// where each # could be a 0 or 1, chosen uniformly by a hash function.
179
// (Except we typically set the start column value to 1.) This scheme
180
// uses hashing to approximate a band matrix, and it has a solution iff
181
// it reduces to an upper-triangular boolean r-band matrix, like this:
182
//
183
// [1###00000000000000000000]
184
// [01##00000000000000000000]
185
// [000000000000000000000000]
186
// [0001###00000000000000000]
187
// [000000000000000000000000]
188
// [000001##0000000000000000]
189
// [000000000000000000000000]
190
// [00000001###0000000000000]
191
// [000000001###000000000000]
192
// [0000000001##000000000000]
193
// ...
194
// [00000000000000000000001#]
195
// [000000000000000000000001]
196
//
197
// where we have expanded to an m x m matrix by filling with rows of
198
// all zeros as needed. As in Gaussian elimination, this form is ready for
199
// generating a solution through back-substitution.
200
//
201
// The awesome thing about the Ribbon construction (from the DW paper) is
202
// how row reductions keep each row representable as a start column and
203
// r coefficients, because row reductions are only needed when two rows
204
// have the same number of leading zero columns. Thus, the combination
205
// of those rows, the bitwise XOR of the r-bit coefficient rows, cancels
206
// out the leading 1s, so starts (at least) one column later and only
207
// needs (at most) r - 1 coefficients.
208
//
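// A tiny worked example of such a row reduction, with r = 8 and
// coefficients written low-order bit first (the low bit corresponds to
// the start column, as with CoeffRow below):
//
//   row A: start=5, coeffs 10011010
//   row B: start=5, coeffs 11010001
//   A xor B:        01001011  -> the leading 1s cancel, so the combined
//   row becomes start=6 with coeffs 1001011 (at most r - 1 of them), and
//   the band structure is preserved.
//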
209
// ######################################################################
210
// ###################### Ribbon PHSF scalability #######################
211
//
212
// Although more practical detail is in ribbon_impl.h, it's worth
213
// understanding some of the overall benefits and limitations of the
214
// Ribbon PHSFs.
215
//
216
// High-end scalability is a primary issue for Ribbon PHSFs, because in
217
// a single Ribbon linear system with fixed r and fixed m/n ratio, the
218
// solution probability approaches zero as n approaches infinity.
219
// For a given n, solution probability improves with larger r and larger
220
// m/n.
221
//
222
// By contrast, peeling-based PHSFs have somewhat worse storage ratio
223
// or solution probability for small n (less than ~1000). This is
224
// especially true with spatial-coupling, where benefits are only
225
// notable for n on the order of 100k or 1m or more.
226
//
227
// To make best use of current hardware, r=128 seems to be closest to
228
// a "generally good" choice for Ribbon, at least in RocksDB where SST
229
// Bloom filters typically hold around 10-100k keys, and almost always
230
// less than 10m keys. r=128 ribbon has a high chance of encoding success
231
// (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05)
232
// for roughly 10k - 10m keys in a single linear system. r=64 only scales
233
// up to about 10k keys with the same storage overhead. Construction and
234
// access times for r=128 are similar to r=64. r=128 tracks nearly
235
// twice as much data during construction, but in most cases we expect
236
// the scalability benefits of r=128 vs. r=64 to make it preferred.
237
//
238
// A natural approach to scaling Ribbon beyond ~10m keys is splitting
239
// (or "sharding") the inputs into multiple linear systems with their
240
// own hash seeds. This can also help to control peak memory consumption.
241
// TODO: much more to come
242
//
243
// ######################################################################
244
// #################### Ribbon on-the-fly banding #######################
245
//
246
// "Banding" is what we call the process of reducing the inputs to an
247
// upper-triangular r-band matrix ready for finishing a solution with
248
// back-substitution. Although the DW paper presents an algorithm for
249
// this ("SGauss"), the awesome properties of their construction enable
250
// an even simpler, faster, and more backtrackable algorithm. In simplest
251
// terms, the SGauss algorithm requires sorting the inputs by start
252
// columns, but it's possible to make Gaussian elimination resemble hash
253
// table insertion!
254
//
255
// The enhanced algorithm is based on these observations:
256
// - When processing a coefficient row with first 1 in column j,
257
//   - If it's the first at column j to be processed, it can be part of
258
//     the banding at row j. (And that decision is never overwritten, with
259
//     no loss of generality!)
260
//   - Else, it can be combined with existing row j and re-processed,
261
//     which will look for a later "empty" row or reach "no solution".
262
//
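// In pseudocode (the real implementation is BandingAdd below):
//
//   insert(start, coeffs, result):
//     i = start
//     loop:
//       if row i is empty: store (coeffs, result) at row i; return true
//       coeffs ^= coeffs at row i; result ^= result at row i
//       if coeffs == 0: return (result == 0)  // redundant vs. inconsistent
//       advance i past the new leading zero coefficients and shift coeffs
//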
263
// We call our banding algorithm "incremental" and "on-the-fly" because
264
// (like hash table insertion) we are "finished" after each input is
265
// processed, with respect to all inputs processed so far. Although the
266
// band matrix is an intermediate step to the solution structure, we have
267
// eliminated intermediate steps and unnecessary data tracking for
268
// banding.
269
//
270
// Building on "incremental" and "on-the-fly", the banding algorithm is
271
// easily backtrackable because no (non-empty) rows are overwritten in
272
// the banding. Thus, if we want to "try" adding an additional set of
273
// inputs to the banding, we only have to record which rows were written
274
// in order to efficiently backtrack to our state before considering
275
// the additional set. (TODO: how this can mitigate scalability and
276
// reach sub-1% overheads)
277
//
278
// Like in a linear-probed hash table, as the occupancy approaches and
279
// surpasses 90-95%, collision resolution dominates the construction
280
// time. (Ribbon doesn't usually pay at query time; see solution
281
// storage below.) This means that we can speed up construction time
282
// by using a higher m/n ratio, up to negative returns around 1.2.
283
// At m/n ~= 1.2, which still saves memory substantially vs. Bloom
284
// filter's 1.5, construction speed (including back-substitution) is not
285
// far from sorting speed, but still a few times slower than cache-local
286
// Bloom construction speed.
287
//
288
// Back-substitution from an upper-triangular boolean band matrix is
289
// especially fast and easy. All the memory accesses are sequential or at
290
// least local, none random. If the number of result bits (b) is a
291
// compile-time constant, the back-substitution state can even be tracked
292
// in CPU registers. Regardless of the solution representation, we prefer
293
// column-major representation for tracking back-substitution state, as
294
// r (the band width) will typically be much larger than b (result bits
295
// or columns), so better to handle r-bit values b times (per solution
296
// row) than b-bit values r times.
297
//
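// Sketch of the recurrence for one result column j, as implemented in
// SimpleBackSubst and BackSubstBlock below (rows are processed from the
// last slot down to the first, with an r-bit "state" word holding the
// most recently computed solution bits of column j, current row in the
// low bit):
//
//   state = (state << 1) | bit, where bit is chosen so that
//     BitParity(state & cr) == ((rr >> j) & 1)
//
// i.e. the GF(2) dot product of the coefficient row with the solution
// column equals the corresponding result bit.
//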
298
// ######################################################################
299
// ##################### Ribbon solution storage ########################
300
//
301
// Row-major layout is typical for boolean (bit) matrices, including for
302
// MWHC (Xor) filters where a query combines k b-bit values, and k is
303
// typically smaller than b. Even for k=4 and b=2, at least k=4 random
304
// look-ups are required regardless of layout.
305
//
306
// Ribbon PHSFs are quite different, however, because
307
// (a) all of the solution rows relevant to a query are within a single
308
// range of r rows, and
309
// (b) the number of solution rows involved (r/2 on average, or r if
310
// avoiding conditional accesses) is typically much greater than
311
// b, the number of solution columns.
312
//
313
// Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead
314
// by processing (up to) r entries of b bits each, where b is typically
315
// less than 10 for filter applications.
316
//
317
// Column-major layout has poor locality because of accessing up to b
318
// memory locations in different pages (and obviously cache lines). Note
319
// that negative filter queries do not typically need to access all
320
// solution columns, as they can return when a mismatch is found in any
321
// result/solution column. This optimization doesn't always pay off on
322
// recent hardware, where the penalty for unpredictable conditional
323
// branching can exceed the penalty for unnecessary work, but the
324
// optimization is essentially unavailable with row-major layout.
325
//
326
// The best compromise seems to be interleaving column-major on the small
327
// scale with row-major on the large scale. For example, let a solution
328
// "block" be r rows column-major encoded as b r-bit values in sequence.
329
// Each query accesses (up to) 2 adjacent blocks, which will typically
330
// span 1-3 cache lines in adjacent memory. We get very close to the same
331
// locality as row-major, but with much faster reconstruction of each
332
// result column, at least for filter applications where b is relatively
333
// small and negative queries can return early.
334
//
335
// ######################################################################
336
// ###################### Fractional result bits ########################
337
//
338
// Bloom filters have great flexibility that alternatives mostly do not
339
// have. One of those flexibilities is in utilizing any ratio of data
340
// structure bits per key. With a typical memory allocator like jemalloc,
341
// this flexibility can save roughly 10% of the filters' footprint in
342
// DRAM by rounding up and down filter sizes to minimize memory internal
343
// fragmentation (see optimize_filters_for_memory RocksDB option).
344
//
345
// At first glance, PHSFs only offer a whole number of bits per "slot"
346
// (m rather than number of keys n), but coefficient locality in the
347
// Ribbon construction makes fractional bits/key quite possible and
348
// attractive for filter applications. This works by a prefix of the
349
// structure using b-1 solution columns and the rest using b solution
350
// columns. See InterleavedSolutionStorage below for more detail.
351
//
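// A worked example (assuming r = 64 and an upper column count of b = 8):
// with 100 blocks (6400 slots) and a budget of 48000 bits (750 r-bit
// segments), the first 50 blocks can use b - 1 = 7 solution columns
// (350 segments) and the remaining 50 blocks use 8 (400 segments), for
// an average of 7.5 bits per slot. Any intermediate budget is usable in
// increments of one r-bit segment.
//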
352
// Because false positive rates are non-linear in bits/key, this approach
353
// is not quite optimal in terms of information theory. In common cases,
354
// we see additional space overhead up to about 1.5% vs. theoretical
355
// optimal to achieve the same FP rate. We consider this a quite acceptable
356
// overhead for very efficiently utilizing space that might otherwise be
357
// wasted.
358
//
359
// This property of Ribbon even makes it "elastic." A Ribbon filter and
360
// its small metadata for answering queries can be adapted into another
361
// Ribbon filter filling any smaller multiple of r bits (plus small
362
// metadata), with a correspondingly higher FP rate. None of the data
363
// thrown away during construction needs to be recalled for this reduction.
364
// Similarly a single Ribbon construction can be separated (by solution
365
// column) into two or more structures (or "layers" or "levels") with
366
// independent filtering ability (no FP correlation, just as solution or
367
// result columns in a single structure) despite being constructed as part
368
// of a single linear system. (TODO: implement)
369
// See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards
370
// Efficient Read for LSM-tree-based KV Stores."
371
//
372
373
// ######################################################################
374
// ################### CODE: Ribbon core algorithms #####################
375
// ######################################################################
376
//
377
// These algorithms are templatized for genericity while retaining near-maximum
378
// performance in a given application. The template parameters
379
// adhere to informal class/struct type concepts outlined below. (This
380
// code is written for C++11 so does not use formal C++ concepts.)
381
382
// Rough architecture for these algorithms:
383
//
384
// +-----------+     +---+     +-----------------+
385
// | AddInputs | --> | H | --> | BandingStorage  |
386
// +-----------+     | a |     +-----------------+
387
//                   | s |             |
388
//                   | h |      Back substitution
389
//                   | e |             V
390
// +-----------+     | r |     +-----------------+
391
// | Query Key | --> |   | >+< | SolutionStorage |
392
// +-----------+     +---+  |  +-----------------+
393
//                          V
394
//                     Query result
395
396
// Common to other concepts
397
// concept RibbonTypes {
398
//   // An unsigned integer type for an r-bit subsequence of coefficients.
399
//   // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would
400
//   // generally only hurt scalability to leave bits of CoeffRow unused.
401
//   typename CoeffRow;
402
//   // An unsigned integer type big enough to hold a result row (b bits,
403
//   // or number of solution/result columns).
404
//   // In many applications, especially filters, the number of result
405
//   // columns is decided at run time, so ResultRow simply needs to be
406
//   // big enough for the largest number of columns allowed.
407
//   typename ResultRow;
408
//   // An unsigned integer type sufficient for representing the number of
409
//   // rows in the solution structure, and at least the arithmetic
410
//   // promotion size (usually 32 bits). uint32_t recommended because a
411
//   // single Ribbon construction doesn't really scale to billions of
412
//   // entries.
413
//   typename Index;
414
// };
415
416
// ######################################################################
417
// ######################## Hashers and Banding #########################
418
419
// Hasher concepts abstract out hashing details.
420
421
// concept PhsfQueryHasher extends RibbonTypes {
422
//   // Type for a lookup key, which is hashable.
423
//   typename Key;
424
//
425
//   // Type for hashed summary of a Key. uint64_t is recommended.
426
//   typename Hash;
427
//
428
//   // Compute a hash value summarizing a Key
429
//   Hash GetHash(const Key &) const;
430
//
431
//   // Given a hash value and a number of columns that can start an
432
//   // r-sequence of coefficients (== m - r + 1), return the start
433
//   // column to associate with that hash value. (Starts can be chosen
434
//   // uniformly or "smash" extra entries into the beginning and end for
435
//   // better utilization at those extremes of the structure. Details in
436
//   // ribbon_impl.h)
437
//   Index GetStart(Hash, Index num_starts) const;
438
//
439
//   // Given a hash value, return the r-bit sequence of coefficients to
440
//   // associate with it. It's generally OK if
441
//   //   sizeof(CoeffRow) > sizeof(Hash)
442
//   // as long as the hash itself is not too prone to collisions for the
443
//   // applications and the CoeffRow is generated uniformly from
444
//   // available hash data, but relatively independent of the start.
445
//   //
446
//   // Must be non-zero, because that's required for a solution to exist
447
//   // when mapping to non-zero result row. (Note: BandingAdd could be
448
//   // modified to allow 0 coeff row if that only occurs with 0 result
449
//   // row, which really only makes sense for filter implementation,
450
//   // where both values are hash-derived. Or BandingAdd could reject 0
451
//   // coeff row, forcing next seed, but that has potential problems with
452
//   // generality/scalability.)
453
//   CoeffRow GetCoeffRow(Hash) const;
454
// };
455
456
// concept FilterQueryHasher extends PhsfQueryHasher {
457
//   // For building or querying a filter, this returns the expected
458
//   // result row associated with a hashed input. For general PHSF,
459
//   // this must return 0.
460
//   //
461
//   // Although not strictly required, there's a slightly better chance of
462
//   // solver success if result row is masked down here to only the bits
463
//   // actually needed.
464
//   ResultRow GetResultRowFromHash(Hash) const;
465
// }
466
467
// concept BandingHasher extends FilterQueryHasher {
468
//   // For a filter, this will generally be the same as Key.
469
//   // For a general PHSF, it must either
470
//   // (a) include a key and a result it maps to (e.g. in a std::pair), or
471
//   // (b) GetResultRowFromInput looks up the result somewhere rather than
472
//   // extracting it.
473
//   typename AddInput;
474
//
475
//   // Instead of requiring a way to extract a Key from an
476
//   // AddInput, we require getting the hash of the Key part
477
//   // of an AddInput, which is trivial if AddInput == Key.
478
//   Hash GetHash(const AddInput &) const;
479
//
480
//   // For building a non-filter PHSF, this extracts or looks up the result
481
//   // row to associate with an input. For filter PHSF, this must return 0.
482
//   ResultRow GetResultRowFromInput(const AddInput &) const;
483
//
484
//   // Whether the solver can assume the lowest bit of GetCoeffRow is
485
//   // always 1. When true, it should improve solver efficiency slightly.
486
//   static bool kFirstCoeffAlwaysOne;
487
// }
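// For illustration only, a sketch of a hasher satisfying these concepts
// for a filter (so AddInput == Key and GetResultRowFromInput returns 0).
// The mixing constants and the modulo mapping are just examples; real
// implementations (with stronger start mapping and "smash") are in
// ribbon_impl.h.
//
//   struct ExampleFilterHasher {
//     using CoeffRow = uint64_t;   // r = 64
//     using ResultRow = uint8_t;   // up to 8 result columns
//     using Index = uint32_t;
//     using Key = uint64_t;        // pre-hashed keys, for simplicity
//     using Hash = uint64_t;
//     using AddInput = Key;
//     static constexpr bool kFirstCoeffAlwaysOne = true;
//
//     Hash GetHash(const Key& key) const {
//       return key * 0x9E3779B97F4A7C15ULL;  // example remix
//     }
//     Index GetStart(Hash h, Index num_starts) const {
//       // Simple but slower than a multiply-shift ("fastrange") mapping
//       return static_cast<Index>(h % num_starts);
//     }
//     CoeffRow GetCoeffRow(Hash h) const {
//       // Remix and force the low bit to 1 (kFirstCoeffAlwaysOne)
//       return (h * 0xC2B2AE3D27D4EB4FULL) | 1;
//     }
//     ResultRow GetResultRowFromHash(Hash h) const {
//       return static_cast<ResultRow>(h >> 56);  // any b bits of the hash
//     }
//     ResultRow GetResultRowFromInput(const AddInput&) const { return 0; }
//   };
//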
488
489
// Abstract storage for the result of "banding" the inputs (Gaussian
490
// elimination to an upper-triangular boolean band matrix). Because the
491
// banding is an incremental / on-the-fly algorithm, this also represents
492
// all the intermediate state between input entries.
493
//
494
// concept BandingStorage extends RibbonTypes {
495
//   // Tells the banding algorithm to prefetch memory associated with
496
//   // the next input before processing the current input. Generally
497
//   // recommended iff the BandingStorage doesn't easily fit in CPU
498
//   // cache.
499
//   bool UsePrefetch() const;
500
//
501
//   // Prefetches (e.g. __builtin_prefetch) memory associated with a
502
//   // slot index i.
503
//   void Prefetch(Index i) const;
504
//
505
//   // Load or store CoeffRow and ResultRow for slot index i.
506
//   // (Gaussian row operations involve both sides of the equation.)
507
//   // Bool `for_back_subst` indicates that customizing values for
508
//   // unconstrained solution rows (cr == 0) is allowed.
509
//   void LoadRow(Index i, CoeffRow *cr, ResultRow *rr, bool for_back_subst)
510
//        const;
511
//   void StoreRow(Index i, CoeffRow cr, ResultRow rr);
512
//
513
//   // Returns the number of columns that can start an r-sequence of
514
//   // coefficients, which is the number of slots minus r (kCoeffBits)
515
//   // plus one. (m - r + 1)
516
//   Index GetNumStarts() const;
517
// };
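// For illustration, a minimal in-memory BandingStorage sketch (no
// prefetching, no special values for unconstrained rows; requires
// <vector>). The real, tuned implementations are in ribbon_impl.h.
//
//   template <typename TCoeffRow, typename TResultRow, typename TIndex>
//   struct VectorBandingStorage {
//     using CoeffRow = TCoeffRow;
//     using ResultRow = TResultRow;
//     using Index = TIndex;
//
//     explicit VectorBandingStorage(Index num_slots)
//         : coeff_rows(num_slots, 0), result_rows(num_slots, 0) {}
//
//     bool UsePrefetch() const { return false; }
//     void Prefetch(Index) const {}
//     void LoadRow(Index i, CoeffRow* cr, ResultRow* rr,
//                  bool /*for_back_subst*/) const {
//       *cr = coeff_rows[i];
//       *rr = result_rows[i];
//     }
//     void StoreRow(Index i, CoeffRow cr, ResultRow rr) {
//       coeff_rows[i] = cr;
//       result_rows[i] = rr;
//     }
//     Index GetNumStarts() const {
//       return static_cast<Index>(coeff_rows.size() -
//                                 sizeof(CoeffRow) * 8U + 1);
//     }
//
//     std::vector<CoeffRow> coeff_rows;    // all-zero row == empty row
//     std::vector<ResultRow> result_rows;
//   };
//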
518
519
// Optional storage for backtracking data in banding a set of input
520
// entries. It exposes an array structure which will generally be
521
// used as a stack. It must be able to accommodate as many entries
522
// as are passed in as inputs to `BandingAddRange`.
523
//
524
// concept BacktrackStorage extends RibbonTypes {
525
//   // If false, backtracking support will be disabled in the algorithm.
526
//   // This should preferably be an inline compile-time constant function.
527
//   bool UseBacktrack() const;
528
//
529
//   // Records `to_save` as the `i`th backtrack entry
530
//   void BacktrackPut(Index i, Index to_save);
531
//
532
//   // Recalls the `i`th backtrack entry
533
//   Index BacktrackGet(Index i) const;
534
// }
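// And a vector-backed BacktrackStorage sketch (for illustration; a no-op
// version for when backtracking is not needed appears further below):
//
//   struct VectorBacktrackStorage {
//     using Index = uint32_t;
//     explicit VectorBacktrackStorage(size_t max_adds) : saved(max_adds) {}
//     bool UseBacktrack() const { return true; }
//     void BacktrackPut(Index i, Index to_save) { saved[i] = to_save; }
//     Index BacktrackGet(Index i) const { return saved[i]; }
//     std::vector<Index> saved;
//   };
//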
535
536
// Adds a single entry to BandingStorage (and optionally, BacktrackStorage),
537
// returning true if successful or false if solution is impossible with
538
// current hasher (and presumably its seed) and number of "slots" (solution
539
// or banding rows). (A solution is impossible when there is a linear
540
// dependence among the inputs that doesn't "cancel out".)
541
//
542
// Pre- and post-condition: the BandingStorage represents a band matrix
543
// ready for back substitution (row echelon form except for zero rows),
544
// augmented with result values such that back substitution would give a
545
// solution satisfying all the cr@start -> rr entries added.
546
template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
547
          typename BacktrackStorage>
548
bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
549
                typename BandingStorage::ResultRow rr,
550
                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
551
0
                typename BandingStorage::Index *backtrack_pos) {
552
0
  using CoeffRow = typename BandingStorage::CoeffRow;
553
0
  using ResultRow = typename BandingStorage::ResultRow;
554
0
  using Index = typename BandingStorage::Index;
555
556
0
  Index i = start;
557
558
0
  if (!kFirstCoeffAlwaysOne) {
559
    // Requires/asserts that cr != 0
560
0
    int tz = CountTrailingZeroBits(cr);
561
0
    i += static_cast<Index>(tz);
562
0
    cr >>= tz;
563
0
  }
564
565
0
  for (;;) {
566
0
    assert((cr & 1) == 1);
567
0
    CoeffRow cr_at_i;
568
0
    ResultRow rr_at_i;
569
0
    bs->LoadRow(i, &cr_at_i, &rr_at_i, /* for_back_subst */ false);
570
0
    if (cr_at_i == 0) {
571
0
      bs->StoreRow(i, cr, rr);
572
0
      bts->BacktrackPut(*backtrack_pos, i);
573
0
      ++*backtrack_pos;
574
0
      return true;
575
0
    }
576
0
    assert((cr_at_i & 1) == 1);
577
    // Gaussian row reduction
578
0
    cr ^= cr_at_i;
579
0
    rr ^= rr_at_i;
580
0
    if (cr == 0) {
581
      // Inconsistency or (less likely) redundancy
582
0
      break;
583
0
    }
584
    // Find relative offset of next non-zero coefficient.
585
0
    int tz = CountTrailingZeroBits(cr);
586
0
    i += static_cast<Index>(tz);
587
0
    cr >>= tz;
588
0
  }
589
590
  // Failed, unless result row == 0 because e.g. a duplicate input or a
591
  // stock hash collision, with same result row. (For filter, stock hash
592
  // collision implies same result row.) Or we could have a full equation
593
  // equal to sum of other equations, which is very possible with
594
  // small range of values for result row.
595
0
  return rr == 0;
596
0
}
597
598
// Adds a range of entries to BandingStorage returning true if successful
599
// or false if solution is impossible with current hasher (and presumably
600
// its seed) and number of "slots" (solution or banding rows). (A solution
601
// is impossible when there is a linear dependence among the inputs that
602
// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
603
//
604
// If UseBacktrack in the BacktrackStorage, this function call rolls back
605
// to prior state on failure. If !UseBacktrack, some subset of the entries
606
// will have been added to the BandingStorage, so best considered to be in
607
// an indeterminate state.
608
//
609
template <typename BandingStorage, typename BacktrackStorage,
610
          typename BandingHasher, typename InputIterator>
611
bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
612
                     const BandingHasher &bh, InputIterator begin,
613
0
                     InputIterator end) {
614
0
  using CoeffRow = typename BandingStorage::CoeffRow;
615
0
  using Index = typename BandingStorage::Index;
616
0
  using ResultRow = typename BandingStorage::ResultRow;
617
0
  using Hash = typename BandingHasher::Hash;
618
619
0
  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
620
0
  static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
621
0
  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");
622
623
0
  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;
624
625
0
  if (begin == end) {
626
    // trivial
627
0
    return true;
628
0
  }
629
630
0
  const Index num_starts = bs->GetNumStarts();
631
632
0
  InputIterator cur = begin;
633
0
  Index backtrack_pos = 0;
634
0
  if (!bs->UsePrefetch()) {
635
    // Simple version, no prefetch
636
0
    for (;;) {
637
0
      Hash h = bh.GetHash(*cur);
638
0
      Index start = bh.GetStart(h, num_starts);
639
0
      ResultRow rr =
640
0
          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
641
0
      CoeffRow cr = bh.GetCoeffRow(h);
642
643
0
      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
644
0
        break;
645
0
      }
646
0
      if ((++cur) == end) {
647
0
        return true;
648
0
      }
649
0
    }
650
0
  } else {
651
    // Pipelined w/prefetch
652
    // Prime the pipeline
653
0
    Hash h = bh.GetHash(*cur);
654
0
    Index start = bh.GetStart(h, num_starts);
655
0
    ResultRow rr = bh.GetResultRowFromInput(*cur);
656
0
    bs->Prefetch(start);
657
658
    // Pipeline
659
0
    for (;;) {
660
0
      rr |= bh.GetResultRowFromHash(h);
661
0
      CoeffRow cr = bh.GetCoeffRow(h);
662
0
      if ((++cur) == end) {
663
0
        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
664
0
          break;
665
0
        }
666
0
        return true;
667
0
      }
668
0
      Hash next_h = bh.GetHash(*cur);
669
0
      Index next_start = bh.GetStart(next_h, num_starts);
670
0
      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
671
0
      bs->Prefetch(next_start);
672
0
      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
673
0
        break;
674
0
      }
675
0
      h = next_h;
676
0
      start = next_start;
677
0
      rr = next_rr;
678
0
    }
679
0
  }
680
  // failed; backtrack (if implemented)
681
0
  if (bts->UseBacktrack()) {
682
0
    while (backtrack_pos > 0) {
683
0
      --backtrack_pos;
684
0
      Index i = bts->BacktrackGet(backtrack_pos);
685
      // Clearing the ResultRow is not strictly required, but is required
686
      // for good FP rate on inputs that might have been backtracked out.
687
      // (We don't want anything we've backtracked on to leak into final
688
      // result, as that might not be "harmless".)
689
0
      bs->StoreRow(i, 0, 0);
690
0
    }
691
0
  }
692
0
  return false;
693
0
}
694
695
// Adds a range of entries to BandingStorage returning true if successful
696
// or false if solution is impossible with current hasher (and presumably
697
// its seed) and number of "slots" (solution or banding rows). (A solution
698
// is impossible when there is a linear dependence among the inputs that
699
// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
700
//
701
// On failure, some subset of the entries will have been added to the
702
// BandingStorage, so best considered to be in an indeterminate state.
703
//
704
template <typename BandingStorage, typename BandingHasher,
705
          typename InputIterator>
706
bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
707
0
                     InputIterator begin, InputIterator end) {
708
0
  using Index = typename BandingStorage::Index;
709
0
  struct NoopBacktrackStorage {
710
0
    bool UseBacktrack() { return false; }
711
0
    void BacktrackPut(Index, Index) {}
712
0
    Index BacktrackGet(Index) {
713
0
      assert(false);
714
0
      return 0;
715
0
    }
716
0
  } nbts;
717
0
  return BandingAddRange(bs, &nbts, bh, begin, end);
718
0
}
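
// Typical usage is to attempt banding and, on failure, start over with a
// different hash seed (and/or more slots). A rough sketch, assuming the
// hasher exposes a hypothetical SetSeed() and the banding storage a
// hypothetical Reset(); the real retry logic lives in ribbon_impl.h:
//
//   template <typename BandingStorage, typename BandingHasher,
//             typename InputIterator>
//   bool BandWithRetries(BandingStorage* bs, BandingHasher* bh,
//                        InputIterator begin, InputIterator end,
//                        int max_seeds) {
//     for (int seed = 0; seed < max_seeds; ++seed) {
//       bh->SetSeed(seed);  // hypothetical: re-seeds all hash outputs
//       bs->Reset();        // hypothetical: zeroes all banding rows
//       if (BandingAddRange(bs, *bh, begin, end)) {
//         return true;      // ready for back-substitution below
//       }
//     }
//     return false;  // consider a larger m/n (more slots)
//   }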
719
720
// ######################################################################
721
// ######################### Solution Storage ###########################
722
723
// Back-substitution and query algorithms unfortunately depend on some
724
// details of data layout in the final data structure ("solution"). Thus,
725
// there is no common SolutionStorage covering all the reasonable
726
// possibilities.
727
728
// ###################### SimpleSolutionStorage #########################
729
730
// SimpleSolutionStorage is for a row-major storage, typically with no
731
// unused bits in each ResultRow. This is mostly for demonstration
732
// purposes as the simplest solution storage scheme. It is relatively slow
733
// for filter queries.
734
735
// concept SimpleSolutionStorage extends RibbonTypes {
736
//   // This is called at the beginning of back-substitution for the
737
//   // solution storage to do any remaining configuration before data
738
//   // is stored to it. If configuration is previously finalized, this
739
//   // could be a simple assertion or even no-op. Ribbon algorithms
740
//   // only call this from back-substitution, and only once per call,
741
//   // before other functions here.
742
//   void PrepareForNumStarts(Index num_starts) const;
743
//   // Must return num_starts passed to PrepareForNumStarts, or the most
744
//   // recent call to PrepareForNumStarts if this storage object can be
745
//   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
746
//   // there must be a run of kCoeffBits slots starting from each start.
747
//   Index GetNumStarts() const;
748
//   // Load the solution row (type ResultRow) for a slot
749
//   ResultRow Load(Index slot_num) const;
750
//   // Store the solution row (type ResultRow) for a slot
751
//   void Store(Index slot_num, ResultRow data);
752
// };
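// For illustration, a vector-backed SimpleSolutionStorage sketch
// (requires <vector>; real implementations are in ribbon_impl.h):
//
//   template <typename TCoeffRow, typename TResultRow, typename TIndex>
//   struct VectorSolutionStorage {
//     using CoeffRow = TCoeffRow;
//     using ResultRow = TResultRow;
//     using Index = TIndex;
//
//     void PrepareForNumStarts(Index num_starts) {
//       num_starts_ = num_starts;
//       rows_.assign(num_starts + sizeof(CoeffRow) * 8U - 1, 0);
//     }
//     Index GetNumStarts() const { return num_starts_; }
//     ResultRow Load(Index slot_num) const { return rows_[slot_num]; }
//     void Store(Index slot_num, ResultRow data) { rows_[slot_num] = data; }
//
//     Index num_starts_ = 0;
//     std::vector<ResultRow> rows_;
//   };
//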
753
754
// Back-substitution for generating a solution from BandingStorage to
755
// SimpleSolutionStorage.
756
template <typename SimpleSolutionStorage, typename BandingStorage>
757
void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
758
  using CoeffRow = typename BandingStorage::CoeffRow;
759
  using Index = typename BandingStorage::Index;
760
  using ResultRow = typename BandingStorage::ResultRow;
761
762
  static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
763
                "must be same");
764
  static_assert(
765
      sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
766
      "must be same");
767
  static_assert(
768
      sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
769
      "must be same");
770
771
  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
772
  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);
773
774
  // A column-major buffer of the solution matrix, containing enough
775
  // recently-computed solution data to compute the next solution row
776
  // (based also on banding data).
777
  std::array<CoeffRow, kResultBits> state;
778
  state.fill(0);
779
780
  const Index num_starts = bs.GetNumStarts();
781
  sss->PrepareForNumStarts(num_starts);
782
  const Index num_slots = num_starts + kCoeffBits - 1;
783
784
  for (Index i = num_slots; i > 0;) {
785
    --i;
786
    CoeffRow cr;
787
    ResultRow rr;
788
    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
789
    // solution row
790
    ResultRow sr = 0;
791
    for (Index j = 0; j < kResultBits; ++j) {
792
      // Compute next solution bit at row i, column j (see derivation below)
793
      CoeffRow tmp = state[j] << 1;
794
      bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
795
      tmp |= bit ? CoeffRow{1} : CoeffRow{0};
796
797
      // Now tmp is solution at column j from row i for next kCoeffBits
798
      // more rows. Thus, for valid solution, the dot product of the
799
      // solution column with the coefficient row has to equal the result
800
      // at that column,
801
      //   BitParity(tmp & cr) == ((rr >> j) & 1)
802
803
      // Update state.
804
      state[j] = tmp;
805
      // add to solution row
806
      sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
807
    }
808
    sss->Store(i, sr);
809
  }
810
}
811
812
// Common functionality for querying a key (already hashed) in
813
// SimpleSolutionStorage.
814
template <typename SimpleSolutionStorage>
815
typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
816
    typename SimpleSolutionStorage::Index start_slot,
817
    typename SimpleSolutionStorage::CoeffRow cr,
818
    const SimpleSolutionStorage &sss) {
819
  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
820
  using ResultRow = typename SimpleSolutionStorage::ResultRow;
821
822
  constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);
823
824
  ResultRow result = 0;
825
  for (unsigned i = 0; i < kCoeffBits; ++i) {
826
    // Bit masking whole value is generally faster here than 'if'
827
    result ^= sss.Load(start_slot + i) &
828
              (ResultRow{0} - (static_cast<ResultRow>(cr >> i) & ResultRow{1}));
829
  }
830
  return result;
831
}
832
833
// General PHSF query a key from SimpleSolutionStorage.
834
template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
835
typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
836
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
837
    const SimpleSolutionStorage &sss) {
838
  const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
839
840
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
841
                    sizeof(typename PhsfQueryHasher::Index),
842
                "must be same");
843
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
844
                    sizeof(typename PhsfQueryHasher::CoeffRow),
845
                "must be same");
846
847
  return SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
848
                           hasher.GetCoeffRow(hash), sss);
849
}
850
851
// Filter query a key from SimpleSolutionStorage.
852
template <typename SimpleSolutionStorage, typename FilterQueryHasher>
853
bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
854
                       const FilterQueryHasher &hasher,
855
                       const SimpleSolutionStorage &sss) {
856
  const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
857
  const typename SimpleSolutionStorage::ResultRow expected =
858
      hasher.GetResultRowFromHash(hash);
859
860
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
861
                    sizeof(typename FilterQueryHasher::Index),
862
                "must be same");
863
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
864
                    sizeof(typename FilterQueryHasher::CoeffRow),
865
                "must be same");
866
  static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
867
                    sizeof(typename FilterQueryHasher::ResultRow),
868
                "must be same");
869
870
  return expected ==
871
         SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
872
                           hasher.GetCoeffRow(hash), sss);
873
}
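
// Putting the simple-storage pieces together, a rough end-to-end sketch.
// ExampleFilterHasher, VectorBandingStorage and VectorSolutionStorage
// refer to the illustrative sketches in comments above, not to types
// defined in this file.
//
//   VectorBandingStorage<uint64_t, uint8_t, uint32_t> bs(num_slots);
//   ExampleFilterHasher h;
//   if (BandingAddRange(&bs, h, keys.begin(), keys.end())) {
//     VectorSolutionStorage<uint64_t, uint8_t, uint32_t> sss;
//     SimpleBackSubst(&sss, bs);
//     bool may_contain = SimpleFilterQuery(keys.front(), h, sss);  // true
//   }  // else: retry with another seed or more slots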
874
875
// #################### InterleavedSolutionStorage ######################
876
877
// InterleavedSolutionStorage is row-major at a high level, for good
878
// locality, and column-major at a low level, for CPU efficiency
879
// especially for filter queries or a relatively small number of result bits
880
// (== solution columns). The storage is a sequence of "blocks" where a
881
// block has one CoeffRow-sized segment for each solution column. Each
882
// query spans at most two blocks; the starting solution row is typically
883
// in the row-logical middle of a block and spans to the middle of the
884
// next block. (See diagram below.)
885
//
886
// InterleavedSolutionStorage supports choosing b (number of result or
887
// solution columns) at run time, and even supports mixing b and b-1 solution
888
// columns in a single linear system solution, for filters that can
889
// effectively utilize any size space (multiple of CoeffRow) for minimizing
890
// FP rate for any number of added keys. To simplify query implementation
891
// (with lower-index columns first), the b-bit portion comes after the b-1
892
// portion of the structure.
893
//
894
// Diagram (=== marks logical block boundary; b=4; ### is data used by a
895
// query crossing the b-1 to b boundary, each Segment has type CoeffRow):
896
//  ...
897
// +======================+
898
// | S e g m e n t  col=0 |
899
// +----------------------+
900
// | S e g m e n t  col=1 |
901
// +----------------------+
902
// | S e g m e n t  col=2 |
903
// +======================+
904
// | S e g m e n #########|
905
// +----------------------+
906
// | S e g m e n #########|
907
// +----------------------+
908
// | S e g m e n #########|
909
// +======================+ Result/solution columns: above = 3, below = 4
910
// |#############t  col=0 |
911
// +----------------------+
912
// |#############t  col=1 |
913
// +----------------------+
914
// |#############t  col=2 |
915
// +----------------------+
916
// | S e g m e n t  col=3 |
917
// +======================+
918
// | S e g m e n t  col=0 |
919
// +----------------------+
920
// | S e g m e n t  col=1 |
921
// +----------------------+
922
// | S e g m e n t  col=2 |
923
// +----------------------+
924
// | S e g m e n t  col=3 |
925
// +======================+
926
//  ...
927
//
928
// InterleavedSolutionStorage will be adapted by the algorithms from
929
// simple array-like segment storage. That array-like storage is templatized
930
// in part so that an implementation may choose to handle byte ordering
931
// at access time.
932
//
933
// concept InterleavedSolutionStorage extends RibbonTypes {
934
//   // This is called at the beginning of back-substitution for the
935
//   // solution storage to do any remaining configuration before data
936
//   // is stored to it. If configuration is previously finalized, this
937
//   // could be a simple assertion or even no-op. Ribbon algorithms
938
//   // only call this from back-substitution, and only once per call,
939
//   // before other functions here.
940
//   void PrepareForNumStarts(Index num_starts) const;
941
//   // Must return num_starts passed to PrepareForNumStarts, or the most
942
//   // recent call to PrepareForNumStarts if this storage object can be
943
//   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
944
//   // there must be a run of kCoeffBits slots starting from each start.
945
//   Index GetNumStarts() const;
946
//   // The larger number of solution columns used (called "b" above).
947
//   Index GetUpperNumColumns() const;
948
//   // If returns > 0, then block numbers below that use
949
//   // GetUpperNumColumns() - 1 columns per solution row, and the rest
950
//   // use GetUpperNumColumns(). A block represents kCoeffBits "slots",
951
//   // where all but the last kCoeffBits - 1 slots are also starts. And
952
//   // a block contains a segment for each solution column.
953
//   // An implementation may only support uniform columns per solution
954
//   // row and return constant 0 here.
955
//   Index GetUpperStartBlock() const;
956
//
957
//   // ### "Array of segments" portion of API ###
958
//   // The number of values of type CoeffRow used in this solution
959
//   // representation. (This value can be inferred from the previous
960
//   // three functions, but is expected at least for sanity / assertion
961
//   // checking.)
962
//   Index GetNumSegments() const;
963
//   // Load an entry from the logical array of segments
964
//   CoeffRow LoadSegment(Index segment_num) const;
965
//   // Store an entry to the logical array of segments
966
//   void StoreSegment(Index segment_num, CoeffRow data);
967
// };
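// A concrete indexing example: with r = kCoeffBits = 64,
// GetUpperNumColumns() == 6 and GetUpperStartBlock() == 2 (so blocks 0-1
// hold 5 segments each and later blocks hold 6), a query with
// start_slot = 150 falls in block 150 / 64 = 2, whose first segment is
// 2 * 6 - min(2, 2) = 10, with start_bit = 150 % 64 = 22. The query
// combines segments 10-15 (block 2, coefficients shifted left by 22)
// with segments 16-21 (block 3, coefficients shifted right by 64 - 22),
// exactly as computed in InterleavedPrepareQuery and the query functions
// below.
//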
968
969
// A helper for InterleavedBackSubst.
970
template <typename BandingStorage>
971
inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
972
                           typename BandingStorage::Index num_columns,
973
                           const BandingStorage &bs,
974
0
                           typename BandingStorage::Index start_slot) {
975
0
  using CoeffRow = typename BandingStorage::CoeffRow;
976
0
  using Index = typename BandingStorage::Index;
977
0
  using ResultRow = typename BandingStorage::ResultRow;
978
979
0
  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
980
981
0
  for (Index i = start_slot + kCoeffBits; i > start_slot;) {
982
0
    --i;
983
0
    CoeffRow cr;
984
0
    ResultRow rr;
985
0
    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
986
0
    for (Index j = 0; j < num_columns; ++j) {
987
      // Compute next solution bit at row i, column j (see derivation below)
988
0
      CoeffRow tmp = state[j] << 1;
989
0
      int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1);
990
0
      tmp |= static_cast<CoeffRow>(bit);
991
992
      // Now tmp is solution at column j from row i for next kCoeffBits
993
      // more rows. Thus, for valid solution, the dot product of the
994
      // solution column with the coefficient row has to equal the result
995
      // at that column,
996
      //   BitParity(tmp & cr) == ((rr >> j) & 1)
997
998
      // Update state.
999
0
      state[j] = tmp;
1000
0
    }
1001
0
  }
1002
0
}
1003
1004
// Back-substitution for generating a solution from BandingStorage to
1005
// InterleavedSolutionStorage.
1006
template <typename InterleavedSolutionStorage, typename BandingStorage>
1007
void InterleavedBackSubst(InterleavedSolutionStorage *iss,
1008
0
                          const BandingStorage &bs) {
1009
0
  using CoeffRow = typename BandingStorage::CoeffRow;
1010
0
  using Index = typename BandingStorage::Index;
1011
1012
0
  static_assert(
1013
0
      sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
1014
0
      "must be same");
1015
0
  static_assert(
1016
0
      sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
1017
0
      "must be same");
1018
1019
0
  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1020
1021
0
  const Index num_starts = bs.GetNumStarts();
1022
  // Although it might be nice to have a filter that returns "always false"
1023
  // when no key is added, we aren't specifically supporting that here
1024
  // because it would require another condition branch in the query.
1025
0
  assert(num_starts > 0);
1026
0
  iss->PrepareForNumStarts(num_starts);
1027
1028
0
  const Index num_slots = num_starts + kCoeffBits - 1;
1029
0
  assert(num_slots % kCoeffBits == 0);
1030
0
  const Index num_blocks = num_slots / kCoeffBits;
1031
0
  const Index num_segments = iss->GetNumSegments();
1032
1033
  // For now upper, then lower
1034
0
  Index num_columns = iss->GetUpperNumColumns();
1035
0
  const Index upper_start_block = iss->GetUpperStartBlock();
1036
1037
0
  if (num_columns == 0) {
1038
    // Nothing to do, presumably because there's not enough space for even
1039
    // a single segment.
1040
0
    assert(num_segments == 0);
1041
    // When num_columns == 0, a Ribbon filter query will always return true,
1042
    // or a PHSF query always 0.
1043
0
    return;
1044
0
  }
1045
1046
  // We should be utilizing all available segments
1047
0
  assert(num_segments == (upper_start_block * (num_columns - 1)) +
1048
0
                             ((num_blocks - upper_start_block) * num_columns));
1049
1050
  // TODO: consider fixed-column specializations with stack-allocated state
1051
1052
  // A column-major buffer of the solution matrix, containing enough
1053
  // recently-computed solution data to compute the next solution row
1054
  // (based also on banding data).
1055
0
  std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};
1056
1057
0
  Index block = num_blocks;
1058
0
  Index segment_num = num_segments;
1059
0
  while (block > upper_start_block) {
1060
0
    --block;
1061
0
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
1062
0
    segment_num -= num_columns;
1063
0
    for (Index i = 0; i < num_columns; ++i) {
1064
0
      iss->StoreSegment(segment_num + i, state[i]);
1065
0
    }
1066
0
  }
1067
  // Now (if applicable), region using lower number of columns
1068
  // (This should be optimized away if GetUpperStartBlock() returns
1069
  // constant 0.)
1070
0
  --num_columns;
1071
0
  while (block > 0) {
1072
0
    --block;
1073
0
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
1074
0
    segment_num -= num_columns;
1075
0
    for (Index i = 0; i < num_columns; ++i) {
1076
0
      iss->StoreSegment(segment_num + i, state[i]);
1077
0
    }
1078
0
  }
1079
  // Verify everything processed
1080
0
  assert(block == 0);
1081
0
  assert(segment_num == 0);
1082
0
}
1083
1084
// Prefetch memory for a key in InterleavedSolutionStorage.
1085
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
1086
inline void InterleavedPrepareQuery(
1087
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
1088
    const InterleavedSolutionStorage &iss,
1089
    typename PhsfQueryHasher::Hash *saved_hash,
1090
    typename InterleavedSolutionStorage::Index *saved_segment_num,
1091
    typename InterleavedSolutionStorage::Index *saved_num_columns,
1092
0
    typename InterleavedSolutionStorage::Index *saved_start_bit) {
1093
0
  using Hash = typename PhsfQueryHasher::Hash;
1094
0
  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
1095
0
  using Index = typename InterleavedSolutionStorage::Index;
1096
1097
0
  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
1098
0
                "must be same");
1099
1100
0
  const Hash hash = hasher.GetHash(key);
1101
0
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());
1102
1103
0
  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1104
1105
0
  const Index upper_start_block = iss.GetUpperStartBlock();
1106
0
  Index num_columns = iss.GetUpperNumColumns();
1107
0
  Index start_block_num = start_slot / kCoeffBits;
1108
0
  Index segment_num = start_block_num * num_columns -
1109
0
                      std::min(start_block_num, upper_start_block);
1110
  // Change to lower num columns if applicable.
1111
  // (This should not compile to a conditional branch.)
1112
0
  num_columns -= (start_block_num < upper_start_block) ? 1 : 0;
1113
1114
0
  Index start_bit = start_slot % kCoeffBits;
1115
1116
0
  Index segment_count = num_columns + (start_bit == 0 ? 0 : num_columns);
1117
1118
0
  iss.PrefetchSegmentRange(segment_num, segment_num + segment_count);
1119
1120
0
  *saved_hash = hash;
1121
0
  *saved_segment_num = segment_num;
1122
0
  *saved_num_columns = num_columns;
1123
0
  *saved_start_bit = start_bit;
1124
0
}
1125
1126
// General PHSF query from InterleavedSolutionStorage, using data for
1127
// the query key from InterleavedPrepareQuery
1128
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
1129
inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
1130
    typename PhsfQueryHasher::Hash hash,
1131
    typename InterleavedSolutionStorage::Index segment_num,
1132
    typename InterleavedSolutionStorage::Index num_columns,
1133
    typename InterleavedSolutionStorage::Index start_bit,
1134
    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
1135
  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
1136
  using Index = typename InterleavedSolutionStorage::Index;
1137
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;
1138
1139
  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
1140
                "must be same");
1141
  static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
1142
                "must be same");
1143
1144
  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1145
1146
  const CoeffRow cr = hasher.GetCoeffRow(hash);
1147
1148
  ResultRow sr = 0;
1149
  const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
1150
  for (Index i = 0; i < num_columns; ++i) {
1151
    sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_left) << i;
1152
  }
1153
1154
  if (start_bit > 0) {
1155
    segment_num += num_columns;
1156
    const CoeffRow cr_right =
1157
        cr >> static_cast<unsigned>(kCoeffBits - start_bit);
1158
    for (Index i = 0; i < num_columns; ++i) {
1159
      sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_right) << i;
1160
    }
1161
  }
1162
1163
  return sr;
1164
}
1165
1166
// Filter query a key from InterleavedSolutionStorage, using data for
// the query key from InterleavedPrepareQuery
1167
template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
1168
inline bool InterleavedFilterQuery(
1169
    typename FilterQueryHasher::Hash hash,
1170
    typename InterleavedSolutionStorage::Index segment_num,
1171
    typename InterleavedSolutionStorage::Index num_columns,
1172
    typename InterleavedSolutionStorage::Index start_bit,
1173
0
    const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
1174
0
  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
1175
0
  using Index = typename InterleavedSolutionStorage::Index;
1176
0
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;
1177
1178
0
  static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
1179
0
                "must be same");
1180
0
  static_assert(
1181
0
      sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
1182
0
      "must be same");
1183
0
  static_assert(
1184
0
      sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
1185
0
      "must be same");
1186
1187
0
  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1188
1189
0
  const CoeffRow cr = hasher.GetCoeffRow(hash);
1190
0
  const ResultRow expected = hasher.GetResultRowFromHash(hash);
1191
1192
  // TODO: consider optimizations such as
1193
  // * get rid of start_bit == 0 condition with careful fetching & shifting
1194
0
  if (start_bit == 0) {
1195
0
    for (Index i = 0; i < num_columns; ++i) {
1196
0
      if (BitParity(iss.LoadSegment(segment_num + i) & cr) !=
1197
0
          (static_cast<int>(expected >> i) & 1)) {
1198
0
        return false;
1199
0
      }
1200
0
    }
1201
0
  } else {
1202
0
    const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
1203
0
    const CoeffRow cr_right =
1204
0
        cr >> static_cast<unsigned>(kCoeffBits - start_bit);
1205
1206
0
    for (Index i = 0; i < num_columns; ++i) {
1207
0
      CoeffRow soln_data =
1208
0
          (iss.LoadSegment(segment_num + i) & cr_left) ^
1209
0
          (iss.LoadSegment(segment_num + num_columns + i) & cr_right);
1210
0
      if (BitParity(soln_data) != (static_cast<int>(expected >> i) & 1)) {
1211
0
        return false;
1212
0
      }
1213
0
    }
1214
0
  }
1215
  // otherwise, all match
1216
0
  return true;
1217
0
}
1218
1219
// TODO: refactor Interleaved*Query so that queries can be "prepared" by
1220
// prefetching memory, to hide memory latency for multiple queries in a
1221
// single thread.
1222
1223
}  // namespace ribbon
1224
1225
}  // namespace ROCKSDB_NAMESPACE