/src/bind9/lib/dns/qp_p.h

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*
 * For an overview, see doc/design/qp-trie.md
 *
 * This private header defines the internal data structures.
 */

#pragma once

/***********************************************************************
 *
 *  interior node basics
 */

/*
 * A qp-trie node is normally either a branch or a leaf. It consists of
 * three 32-bit words into which the components are packed. They are used
 * as a 64-bit word and a 32-bit word, but they are not declared like that
 * to avoid unwanted padding, keeping the size down to 12 bytes. They are
 * in native endian order so getting the 64-bit part should compile down to
 * an unaligned load.
 *
 * The type of node is identified by the tag in the least significant bits
 * of the 64-bit word.
 *
 * In a branch the 64-bit word is described by the enum below. The 32-bit
 * word is a reference to the packed sparse vector of "twigs", i.e. child
 * nodes. A branch node has at least 2 and less than SHIFT_OFFSET twigs
 * (see the enum below). The qp-trie update functions ensure that branches
 * actually branch, i.e. branches cannot have only 1 child.
 *
 * The contents of each leaf are set by the trie's user. The 64-bit word
 * contains a pointer value (which must be word-aligned, so the tag bits
 * are zero), and the 32-bit word is an arbitrary integer value.
 *
 * There is a third kind of node, reader nodes, which anchor the root of a
 * trie. A pair of reader nodes together contain a packed `dns_qpreader_t`.
 * See the section on "packed reader nodes" below.
 */
typedef struct qp_node {
#if WORDS_BIGENDIAN
  uint32_t bighi, biglo, small;
#else
  uint32_t biglo, bighi, small;
#endif
} qp_node_t;
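
/*
 * Sanity-check sketch (not part of the original header): three packed
 * 32-bit words of the same type cannot attract padding, so the node is
 * exactly the 12 bytes described above.
 */
STATIC_ASSERT(sizeof(qp_node_t) == 3 * sizeof(uint32_t),
        "qp_node_t should pack into 12 bytes");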

/*
 * The possible values of the node type tag. Type tags must fit in two bits
 * for compatibility with 4-byte pointer alignment on 32-bit systems.
 */
enum {
  LEAF_TAG = 0, /* leaf node */
  BRANCH_TAG = 1, /* branch node */
  READER_TAG = 2, /* reader node */
  TAG_MASK = 3, /* mask covering tag bits */
};
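
/*
 * Illustrative round-trip (not from the original header): a word-aligned
 * pointer has zero tag bits, so a tag can be OR'd in and masked back out
 * without losing information:
 *
 *  uint64_t word = READER_TAG | (uintptr_t)ptr;
 *  INSIST((word & TAG_MASK) == READER_TAG);
 *  INSIST((void *)(uintptr_t)(word & ~TAG_MASK) == ptr);
 */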

/*
 * This code does not work on CPUs with large pointers, e.g. CHERI capability
 * architectures. When porting to that kind of machine, a `dns_qpnode` should
 * be just a `uintptr_t`; a leaf node will contain a single pointer, and a
 * branch node will fit in the same space with room to spare.
 */
STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t),
        "pointers must fit in 64 bits");

/*
 * A branch node contains a 64-bit word comprising the type tag, the
 * bitmap, and an offset into the key. It is called an "index word" because
 * it describes how to access the twigs vector (think "database index").
 * The following enum sets up the bit positions of these parts.
 *
 * In a leaf, the same 64-bit word contains a pointer. The pointer
 * must be word-aligned so that the branch/leaf tag bit is zero.
 * This requirement is checked by the newleaf() constructor.
 *
 * The bitmap is just above the type tag. The `bits_for_byte[]` table is
 * used to fill in a key so that bit tests can work directly against the
 * index word without superfluous masking or shifting; we don't need to
 * mask out the bitmap before testing a bit, but we do need to mask the
 * bitmap before calling popcount.
 *
 * The byte offset into the key is at the top of the word, so that it
 * can be extracted with just a shift, with no masking needed.
 *
 * The names are SHIFT_thing because they are qp_shift_t values. (See
 * below for the various `qp_*` type declarations.)
 *
 * These values are relatively fixed in practice: SHIFT_NOBYTE needs
 * to leave space for the type tag, and the implementation of
 * `dns_qpkey_fromname()` depends on the bitmap being large enough.
 * The symbolic names avoid mystery numbers in the code.
 */
enum {
  SHIFT_NOBYTE = 2,  /* label separator has no byte value */
  SHIFT_BITMAP,    /* many bits here */
  SHIFT_OFFSET = 49, /* offset of byte in key */
};
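
/*
 * A hypothetical sketch (the real branch constructor is defined elsewhere,
 * not in this header) of how the parts above compose into an index word:
 * the tag in bits 0-1, the bitmap in bits SHIFT_NOBYTE..SHIFT_OFFSET-1
 * (the `bitmap` argument is assumed to be pre-shifted), and the key
 * offset in the bits from SHIFT_OFFSET up.
 */
static inline uint64_t
example_branch_index(uint64_t bitmap, size_t key_offset) {
  return (BRANCH_TAG | bitmap | ((uint64_t)key_offset << SHIFT_OFFSET));
}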

/***********************************************************************
 *
 *  garbage collector tuning parameters
 */

/*
 * A "cell" is a location that can contain a `qp_node_t`, and a "chunk"
 * is a moderately large array of cells. A big trie can occupy
 * multiple chunks. (Unlike other nodes, a trie's root node lives in
 * its `struct dns_qp` instead of being allocated in a cell.)
 *
 * The qp-trie allocator hands out space for twigs vectors. Allocations are
 * made sequentially from one of the chunks; this kind of "sequential
 * allocator" is also known as a "bump allocator", so in `struct dns_qp`
 * (see below) the allocation chunk is called `bump`.
 */

/*
 * Number of cells in a chunk is a power of 2, which must have space for
 * a full twigs vector (48 wide). When testing, use a much smaller chunk
 * size to make the allocator work harder.
 */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define QP_CHUNK_LOG 7
#else
#define QP_CHUNK_LOG 10
#endif

STATIC_ASSERT(6 <= QP_CHUNK_LOG && QP_CHUNK_LOG <= 20,
        "qp-trie chunk size is unreasonable");

#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG)
#define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(qp_node_t))

/*
 * We need a bitfield this big to count how much of a chunk is in use:
 * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG`.
 */
#define QP_USAGE_BITS (QP_CHUNK_LOG + 1)

/*
 * A chunk needs to be compacted if it is less full than this threshold.
 * (12.5% overhead seems reasonable.)
 */
#define QP_MAX_FREE (QP_CHUNK_SIZE / 8)
#define QP_MIN_USED (QP_CHUNK_SIZE - QP_MAX_FREE)

/*
 * Compact automatically when we pass this threshold: when there is a lot
 * of free space in absolute terms, and when we have freed more than half
 * of the space we allocated.
 *
 * The current compaction algorithm scans the whole trie, so it is important
 * to scale the threshold based on the size of the trie to avoid quadratic
 * behaviour. XXXFANF find an algorithm that scans less of the trie!
 *
 * During a modification transaction, when we copy-on-write some twigs we
 * count the old copies as "free", because they will be when the transaction
 * commits. But they cannot be recovered immediately so they are also
 * counted as on hold, and discounted when we decide whether to compact.
 */
#define QP_GC_HEURISTIC(qp, free) \
  ((free) > QP_CHUNK_SIZE * 4 && (free) > (qp)->used_count / 2)

#define QP_NEEDGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count)
#define QP_AUTOGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count - (qp)->hold_count)
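
/*
 * Worked example (illustrative): with the default QP_CHUNK_LOG of 10, a
 * chunk holds 1024 cells, so automatic compaction triggers only once more
 * than 4096 cells are free (after subtracting held cells) and the free
 * total also exceeds half of used_count.
 */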

/*
 * The chunk base and usage arrays are resized geometrically and start off
 * with two entries.
 */
#define GROWTH_FACTOR(size) ((size) + (size) / 2 + 2)
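
/*
 * For example (illustrative): successive resizes starting from two entries
 * give GROWTH_FACTOR(2) = 5, GROWTH_FACTOR(5) = 9, GROWTH_FACTOR(9) = 15,
 * GROWTH_FACTOR(15) = 24, ... roughly a 1.5x geometric progression.
 */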

/***********************************************************************
 *
 *  helper types
 */

/*
 * C is not strict enough with its integer types for these typedefs to
 * improve type safety, but it helps to have annotations saying what
 * particular kind of number we are dealing with.
 */

/*
 * The number or position of a bit inside a word. (0..63)
 *
 * Note: A dns_qpkey_t is logically an array of qp_shift_t values, but it
 * isn't declared that way because dns_qpkey_t is a public type whereas
 * qp_shift_t is private.
 *
 * A dns_qpkey element key[off] must satisfy
 *
 *  SHIFT_NOBYTE <= key[off] && key[off] < SHIFT_OFFSET
 */
typedef uint8_t qp_shift_t;

/*
 * The number of bits set in a word (as in Hamming weight or popcount)
 * which is used for the position of a node in the packed sparse
 * vector of twigs. (0..47) because our bitmap does not fill the word.
 */
typedef uint8_t qp_weight_t;

/*
 * A chunk number, i.e. an index into the chunk arrays.
 */
typedef uint32_t qp_chunk_t;

/*
 * Cell offset within a chunk, or a count of cells. Each cell in a
 * chunk can contain a node.
 */
typedef uint32_t qp_cell_t;

/*
 * A twig reference is used to refer to a twigs vector, which occupies a
 * contiguous group of cells.
 */
typedef uint32_t qp_ref_t;

/*
 * Constructors and accessors for qp_ref_t values, defined here to show
 * how the qp_ref_t, qp_chunk_t, qp_cell_t types relate to each other
 */

static inline qp_ref_t
make_ref(qp_chunk_t chunk, qp_cell_t cell) {
  return (QP_CHUNK_SIZE * chunk + cell);
}

static inline qp_chunk_t
ref_chunk(qp_ref_t ref) {
  return (ref / QP_CHUNK_SIZE);
}

static inline qp_cell_t
ref_cell(qp_ref_t ref) {
  return (ref % QP_CHUNK_SIZE);
}
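
/*
 * Round-trip sketch (illustrative, not from the original header): because
 * QP_CHUNK_SIZE is a power of two, the division and remainder recover the
 * chunk and cell exactly:
 *
 *  qp_ref_t ref = make_ref(3, 42);
 *  INSIST(ref_chunk(ref) == 3 && ref_cell(ref) == 42);
 */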

/*
 * We should not use the `root_ref` in an empty trie, so we set it
 * to a value that should trigger an obvious bug. See qp_init()
 * and get_root() below.
 */
#define INVALID_REF ((qp_ref_t)~0UL)

/***********************************************************************
 *
 *  chunk arrays
 */

/*
 * A `dns_qp_t` contains two arrays holding information about each chunk.
 *
 * The `base` array holds pointers to the base of each chunk.
 * The `usage` array holds the allocator's state for each chunk.
 *
 * The `base` array is used by the hot qp-trie traversal paths. It can
 * be shared by multiple versions of a trie, which are tracked with a
 * refcount. Old versions of the trie can retain old versions of the
 * `base` array.
 *
 * In multithreaded code, the `usage` array is only used when the
 * `dns_qpmulti_t` mutex is held, and there is only one version of
 * it in active use (maybe with a snapshot for rollback support).
 *
 * The two arrays are separate because they have rather different
 * access patterns, different lifetimes, and different element sizes.
 */

/*
 * For most purposes we don't need to know exactly which cells are
 * in use in a chunk; we only need to know how many of them there are.
 *
 * After we have finished allocating from a chunk, the `used` counter
 * is the size we need to know for shrinking the chunk and for
 * scanning it to detach leaf values before the chunk is free()d. The
 * `free` counter tells us when the chunk needs compacting and when it
 * has become empty.
 *
 * The `exists` flag allows the chunk scanning loops to look at the
 * usage array only.
 *
 * In multithreaded code, we mark chunks as `immutable` when a modify
 * transaction is opened. (We don't mark them immutable on commit,
 * because the old bump chunk must remain mutable between write
 * transactions, but it must become immutable when an update
 * transaction is opened.)
 *
 * There are a few flags used to mark which chunks are still needed by
 * snapshots after the chunks have passed their normal reclamation
 * phase.
 */
typedef struct qp_usage {
  /*% the allocation point, increases monotonically */
  qp_cell_t used : QP_USAGE_BITS;
  /*% count of nodes no longer needed, also monotonic */
  qp_cell_t free : QP_USAGE_BITS;
  /*% qp->base->ptr[chunk] != NULL */
  bool exists : 1;
  /*% is this chunk shared? [MT] */
  bool immutable : 1;
  /*% already subtracted from multi->*_count [MT] */
  bool discounted : 1;
  /*% is a snapshot using this chunk? [MT] */
  bool snapshot : 1;
  /*% tried to free it but a snapshot needs it [MT] */
  bool snapfree : 1;
  /*% for mark/sweep snapshot flag updates [MT] */
  bool snapmark : 1;
} qp_usage_t;

/*
 * The chunks are owned by the current version of the `base` array.
 * When the array is resized, the old version might still be in use by
 * concurrent readers, in which case it is free()d later when its
 * refcount drops to zero.
 *
 * A `dns_qpbase_t` counts references from `dns_qp_t` objects and
 * from packed readers, but not from `dns_qpread_t` nor from
 * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
 * would wreck multicore scalability; instead we rely on RCU.
 *
 * The `usage` array determines when a chunk is no longer needed: old
 * chunk pointers in old `base` arrays are ignored. (They can become
 * dangling pointers to free memory, but they will never be
 * dereferenced.)
 *
 * We ensure that individual chunk base pointers remain immutable
 * after assignment, and they are not cleared until the chunk is
 * free()d, after all readers have departed. Slots can be reused, and
 * we allow transactions to fill or re-fill empty slots adjacent to
 * busy slots that are in use by readers.
 */
struct dns_qpbase {
  unsigned int magic;
  isc_refcount_t refcount;
  qp_node_t *ptr[];
};

/*
 * Chunks that may be in use by readers are reclaimed asynchronously.
 * When a transaction commits, immutable chunks that are now empty are
 * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
 */
typedef struct qp_rcuctx {
  unsigned int magic;
  struct rcu_head rcu_head;
  isc_mem_t *mctx;
  dns_qpmulti_t *multi;
  qp_chunk_t count;
  qp_chunk_t chunk[];
} qp_rcuctx_t;

/*
 * Returns true when the base array can be free()d.
 */
static inline bool
qpbase_unref(dns_qpreadable_t qpr) {
  dns_qpreader_t *qp = dns_qpreader(qpr);
  return (qp->base != NULL &&
    isc_refcount_decrement(&qp->base->refcount) == 1);
}

/*
 * Now that we know about `dns_qpreader_t` and `dns_qpbase_t`,
 * here's how we convert a twig reference into a pointer.
 */
static inline qp_node_t *
ref_ptr(dns_qpreadable_t qpr, qp_ref_t ref) {
  dns_qpreader_t *qp = dns_qpreader(qpr);
  return (qp->base->ptr[ref_chunk(ref)] + ref_cell(ref));
}

/***********************************************************************
 *
 *  main qp-trie structures
 */

#define QP_MAGIC       ISC_MAGIC('t', 'r', 'i', 'e')
#define QPITER_MAGIC   ISC_MAGIC('q', 'p', 'i', 't')
#define QPMULTI_MAGIC  ISC_MAGIC('q', 'p', 'm', 'v')
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
#define QPBASE_MAGIC   ISC_MAGIC('q', 'p', 'b', 'p')
#define QPRCU_MAGIC    ISC_MAGIC('q', 'p', 'c', 'b')

#define QP_VALID(qp)    ISC_MAGIC_VALID(qp, QP_MAGIC)
#define QPITER_VALID(qp)  ISC_MAGIC_VALID(qp, QPITER_MAGIC)
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
#define QPBASE_VALID(qp)  ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
#define QPRCU_VALID(qp)   ISC_MAGIC_VALID(qp, QPRCU_MAGIC)
407
/*
408
 * Polymorphic initialization of the `dns_qpreader_t` prefix.
409
 *
410
 * The location of the root node is actually a qp_ref_t, but is
411
 * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too
412
 * many internal details into the public API.
413
 *
414
 * The `uctx` and `methods` support callbacks into the user's code.
415
 * They are constant after initialization.
416
 */
417
#define QP_INIT(qp, m, x)                 \
418
  (*(qp) = (typeof(*(qp))){         \
419
     .magic = QP_MAGIC,       \
420
     .root_ref = INVALID_REF, \
421
     .uctx = x,               \
422
     .methods = m,            \
423
   })
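
/*
 * Usage sketch (illustrative; `my_methods` and `my_uctx` stand in for a
 * caller's method table and user context):
 *
 *  dns_qp_t qp;
 *  QP_INIT(&qp, &my_methods, my_uctx);
 */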

/*
 * Snapshots have some extra cleanup machinery.
 *
 * Originally, a snapshot was basically just a `dns_qpread_t`
 * allocated on the heap, with the extra behaviour that memory
 * reclamation is suppressed for a particular trie while it has any
 * snapshots. However, that design gets into trouble for a zone with
 * frequent updates and many zone transfers.
 *
 * Instead, each snapshot records which chunks it needs. When a
 * snapshot is created, it makes a copy of the `base` array, except
 * for chunks that are empty and waiting to be reclaimed. When a
 * snapshot is destroyed, we can traverse the list of snapshots to
 * accurately mark which chunks are still needed.
 *
 * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is
 * not muddled up with the wrong `dns_qpmulti_t`.
 *
 * A trie's `base` array might have grown after the snapshot was
 * created, so the snapshot records its own `chunk_max`.
 */
struct dns_qpsnap {
  DNS_QPREADER_FIELDS;
  dns_qpmulti_t *whence;
  uint32_t chunk_max;
  ISC_LINK(struct dns_qpsnap) link;
};

/*
 * Read-write access to a qp-trie requires extra fields to support the
 * allocator and garbage collector.
 *
 * Bare instances of a `struct dns_qp` are used for stand-alone
 * single-threaded tries. For multithreaded access, a `dns_qpmulti_t`
 * wraps a `dns_qp_t` with a mutex and other fields that are only needed
 * at the start or end of a transaction.
 *
 * Allocations are made sequentially in the `bump` chunk. A sequence
 * of lightweight write transactions can use the same `bump` chunk, so
 * its prefix before `fender` is immutable, and the rest is mutable.
 *
 * To decide when to compact and reclaim space, QP_AUTOGC() examines
 * the values of `used_count`, `free_count`, and `hold_count`. The
 * `hold_count` tracks nodes that need to be retained while readers are
 * using them; they are free but cannot be reclaimed until the transaction
 * has committed, so the `hold_count` is discounted from QP_AUTOGC()
 * during a transaction.
 *
 * There are some flags that alter the behaviour of write transactions.
 *
 *  - The `transaction_mode` indicates whether the current transaction is a
 *    light write or a heavy update, or (between transactions) the previous
 *    transaction's mode, because the setup for the next transaction
 *    depends on how the previous one committed. The mode is set at the
 *    start of each transaction. It is QP_NONE in a single-threaded qp-trie
 *    to detect if part of a `dns_qpmulti_t` is passed to dns_qp_destroy().
 *
 *  - The `compact_all` flag is used when every node in the trie should be
 *    copied. (Usually compaction aims to avoid moving nodes out of
 *    unfragmented chunks.) It is used when compaction is explicitly
 *    requested via `dns_qp_compact()`, and as an emergency mechanism if
 *    normal compaction failed to clear the QP_AUTOGC() condition.
 *    (This emergency is a bug even though we have a rescue mechanism.)
 *
 *  - When a qp-trie is destroyed while it has pending cleanup work, its
 *    `destroy` flag is set so that it is destroyed by the reclaim worker.
 *    (Because items cannot be removed from the middle of the cleanup list.)
 *
 *  - When built with fuzzing support, we can use mprotect() and munmap()
 *    to ensure that incorrect memory accesses cause fatal errors. The
 *    `write_protect` flag must be set straight after the `dns_qpmulti_t`
 *    is created, then left unchanged.
 *
 * Some of the dns_qp_t fields are only needed for multithreaded transactions
 * (marked [MT] below) but the same code paths are also used for single-
 * threaded writes.
 */
struct dns_qp {
  DNS_QPREADER_FIELDS;
  /*% memory context (const) */
  isc_mem_t *mctx;
  /*% array of per-chunk allocation counters */
  qp_usage_t *usage;
  /*% number of slots in `chunk` and `usage` arrays */
  qp_chunk_t chunk_max;
  /*% which chunk is used for allocations */
  qp_chunk_t bump;
  /*% nodes in the `bump` chunk below `fender` are read only [MT] */
  qp_cell_t fender;
  /*% number of leaf nodes */
  qp_cell_t leaf_count;
  /*% total of all usage[] counters */
  qp_cell_t used_count, free_count;
  /*% free cells that cannot be recovered right now */
  qp_cell_t hold_count;
  /*% what kind of transaction was most recently started [MT] */
  enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
  /*% compact the entire trie [MT] */
  bool compact_all : 1;
  /*% optionally set when compiled with fuzzing support [MT] */
  bool write_protect : 1;
};

/*
 * Concurrent access to a qp-trie.
 *
 * The `reader` pointer provides wait-free access to the current version
 * of the trie. See the "packed reader nodes" section below for a
 * description of what it points to.
 *
 * The main object under the protection of the mutex is the `writer`
 * containing all the allocator state. There can be a backup copy when
 * we want to be able to roll back an update transaction.
 *
 * There is a `reader_ref` which corresponds to the `reader` pointer
 * (`ref_ptr(&multi->writer, multi->reader_ref) == multi->reader`). The
 * `reader_ref` is necessary when freeing the space used by the reader,
 * because there isn't a good way to recover a qp_ref_t from a qp_node_t
 * pointer.
 *
 * There is a per-trie list of snapshots that is used for reclaiming
 * memory when a snapshot is destroyed.
 *
 * Finally, we maintain a global list of `dns_qpmulti_t` objects that
 * need asynchronous safe memory recovery.
 */
struct dns_qpmulti {
  uint32_t magic;
  /*% RCU-protected pointer to current packed reader */
  qp_node_t *reader;
  /*% the mutex protects the rest of this structure */
  isc_mutex_t mutex;
  /*% ref_ptr(writer, reader_ref) == reader */
  qp_ref_t reader_ref;
  /*% the main working structure */
  dns_qp_t writer;
  /*% saved allocator state to support rollback */
  dns_qp_t *rollback;
  /*% all snapshots of this trie */
  ISC_LIST(dns_qpsnap_t) snapshots;
};

/***********************************************************************
 *
 *  interior node constructors and accessors
 */

/*
 * See the comments under "interior node basics" above, which explain
 * the layout of nodes as implemented by the following functions.
 *
 * These functions are (mostly) constructors and getters. Imagine how
 * much less code there would be if C had sum types with control over
 * the layout...
 */

/*
 * Get the 64-bit word of a node.
 */
static inline uint64_t
node64(qp_node_t *n) {
  uint64_t lo = n->biglo;
  uint64_t hi = n->bighi;
  return (lo | (hi << 32));
}
/*
591
 * Get the 32-bit word of a node.
592
 */
593
static inline uint32_t
594
node32(qp_node_t *n) {
595
  return (n->small);
596
}
597
598
/*
599
 * Create a node from its parts
600
 */
601
static inline qp_node_t
602
make_node(uint64_t big, uint32_t small) {
603
  return ((qp_node_t){
604
    .biglo = (uint32_t)(big),
605
    .bighi = (uint32_t)(big >> 32),
606
    .small = small,
607
  });
608
}
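
/*
 * The constructor and accessors round-trip exactly (illustrative, not
 * from the original header):
 *
 *  qp_node_t n = make_node(0x123456789abcULL | BRANCH_TAG, 7);
 *  INSIST(node64(&n) == (0x123456789abcULL | BRANCH_TAG));
 *  INSIST(node32(&n) == 7);
 */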

/*
 * Extract a pointer from a node's 64 bit word. The double cast is to avoid
 * a warning about mismatched pointer/integer sizes on 32 bit systems.
 */
static inline void *
node_pointer(qp_node_t *n) {
  return ((void *)(uintptr_t)(node64(n) & ~TAG_MASK));
}

/*
 * Examine a node's tag bits
 */
static inline uint32_t
node_tag(qp_node_t *n) {
  return (n->biglo & TAG_MASK);
}

/*
 * simplified for the hot path
 */
static inline bool
is_branch(qp_node_t *n) {
  return (n->biglo & BRANCH_TAG);
}

/* leaf nodes *********************************************************/

/*
 * Get a leaf's pointer value.
 */
static inline void *
leaf_pval(qp_node_t *n) {
  return (node_pointer(n));
}

/*
 * Get a leaf's integer value
 */
static inline uint32_t
leaf_ival(qp_node_t *n) {
  return (node32(n));
}

/*
 * Create a leaf node from its parts
 */
static inline qp_node_t
make_leaf(const void *pval, uint32_t ival) {
  qp_node_t leaf = make_node((uintptr_t)pval, ival);
  REQUIRE(node_tag(&leaf) == LEAF_TAG);
  return (leaf);
}
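
/*
 * Illustrative use (not from the original header; `myobj` stands in for
 * any word-aligned user object): a leaf built from a pointer and an
 * arbitrary integer returns both unchanged:
 *
 *  qp_node_t leaf = make_leaf(myobj, 123);
 *  INSIST(leaf_pval(&leaf) == myobj && leaf_ival(&leaf) == 123);
 */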

/* branch nodes *******************************************************/

/*
 * The following function names use plural `twigs` when they work on a
 * branch's twigs vector as a whole, and singular `twig` when they work on
 * a particular twig.
 */

/*
 * Get a branch node's index word
 */
static inline uint64_t
branch_index(qp_node_t *n) {
  return (node64(n));
}

/*
 * Get a reference to a branch node's child twigs.
 */
static inline qp_ref_t
branch_twigs_ref(qp_node_t *n) {
  return (node32(n));
}

/*
 * Bit positions in the bitmap come directly from the key. DNS names are
 * converted to keys using the tables declared at the end of this file.
 */
static inline qp_shift_t
qpkey_bit(const dns_qpkey_t key, size_t len, size_t offset) {
  if (offset < len) {
    return (key[offset]);
  } else {
    return (SHIFT_NOBYTE);
  }
}

/*
 * Extract a branch node's offset field, used to index the key.
 */
static inline size_t
branch_key_offset(qp_node_t *n) {
  return ((size_t)(branch_index(n) >> SHIFT_OFFSET));
}

/*
 * Which bit identifies the twig of this node for this key?
 */
static inline qp_shift_t
branch_keybit(qp_node_t *n, const dns_qpkey_t key, size_t len) {
  return (qpkey_bit(key, len, branch_key_offset(n)));
}

/*
 * Get a pointer to a branch node's twigs vector.
 */
static inline qp_node_t *
branch_twigs_vector(dns_qpreadable_t qpr, qp_node_t *n) {
  return (ref_ptr(qpr, branch_twigs_ref(n)));
}

/*
 * Warm up the cache while calculating which twig we want.
 */
static inline void
prefetch_twigs(dns_qpreadable_t qpr, qp_node_t *n) {
  __builtin_prefetch(branch_twigs_vector(qpr, n));
}

/* root node **********************************************************/

/*
 * Get a pointer to the root node, checking if the trie is empty.
 */
static inline qp_node_t *
get_root(dns_qpreadable_t qpr) {
  dns_qpreader_t *qp = dns_qpreader(qpr);
  if (qp->root_ref == INVALID_REF) {
    return (NULL);
  } else {
    return (ref_ptr(qp, qp->root_ref));
  }
}

/*
 * When we need to move the root node, we avoid repeating allocation
 * logistics by making a temporary fake branch node that has
 *  `branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`
 * just enough to treat the root node as a vector of one twig.
 */
#define MOVABLE_ROOT(qp)                                   \
  (&(qp_node_t){                                     \
    .biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \
    .small = qp->root_ref,                     \
  })

/***********************************************************************
 *
 *  bitmap popcount shenanigans
 */

/*
 * How many twigs appear in the vector before the one corresponding to the
 * given bit? Calculated using popcount of part of the branch's bitmap.
 *
 * To calculate a mask that covers the lesser bits in the bitmap,
 * we subtract 1 to set all lesser bits, and subtract the tag mask
 * because the type tag is not part of the bitmap.
 */
static inline qp_weight_t
branch_count_bitmap_before(qp_node_t *n, qp_shift_t bit) {
  uint64_t mask = (1ULL << bit) - 1 - TAG_MASK;
  uint64_t bitmap = branch_index(n) & mask;
  return ((qp_weight_t)__builtin_popcountll(bitmap));
}
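
/*
 * Worked example (illustrative): for bit = 5, (1ULL << 5) - 1 = 0x1f sets
 * bits 0-4, and subtracting TAG_MASK clears bits 0-1, leaving mask 0x1c,
 * which covers only the bitmap bits below bit 5; the popcount of the
 * masked index word is then the number of lesser twigs.
 */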

/*
 * How many twigs does this branch have?
 *
 * The offset is directly after the bitmap so the offset's lesser bits
 * cover the whole bitmap, and the bitmap's weight is the number of twigs.
 */
static inline qp_weight_t
branch_twigs_size(qp_node_t *n) {
  return (branch_count_bitmap_before(n, SHIFT_OFFSET));
}

/*
 * Position of a twig within the packed sparse vector.
 */
static inline qp_weight_t
branch_twig_pos(qp_node_t *n, qp_shift_t bit) {
  return (branch_count_bitmap_before(n, bit));
}

/*
 * Get a pointer to a particular twig.
 */
static inline qp_node_t *
branch_twig_ptr(dns_qpreadable_t qpr, qp_node_t *n, qp_shift_t bit) {
  return (branch_twigs_vector(qpr, n) + branch_twig_pos(n, bit));
}

/*
 * Is the twig identified by this bit present?
 */
static inline bool
branch_has_twig(qp_node_t *n, qp_shift_t bit) {
  return (branch_index(n) & (1ULL << bit));
}
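
/*
 * A sketch (illustrative; the library's real lookup lives elsewhere) of
 * how these helpers combine into the classic qp-trie descent; the caller
 * must still check that the resulting leaf's key matches the query:
 *
 *  qp_node_t *n = get_root(qp);
 *  while (n != NULL && is_branch(n)) {
 *    prefetch_twigs(qp, n);
 *    qp_shift_t bit = branch_keybit(n, key, keylen);
 *    if (!branch_has_twig(n, bit)) {
 *      return (NULL);
 *    }
 *    n = branch_twig_ptr(qp, n, bit);
 *  }
 */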

/* twig logistics *****************************************************/

static inline void
move_twigs(qp_node_t *to, qp_node_t *from, qp_weight_t size) {
  memmove(to, from, size * sizeof(qp_node_t));
}

static inline void
zero_twigs(qp_node_t *twigs, qp_weight_t size) {
  memset(twigs, 0, size * sizeof(qp_node_t));
}

/***********************************************************************
 *
 *  packed reader nodes
 */

/*
 * The purpose of these packed reader nodes is to simplify safe memory
 * reclamation for a multithreaded qp-trie.
 *
 * After the `reader` pointer in a qpmulti is replaced, we need to wait
 * for a grace period before we can reclaim the memory that is no longer
 * needed by the trie. So we need some kind of structure to hold
 * pointers to the (logically) detached memory until it is safe to free.
 * This memory includes the chunks and the `base` arrays.
 *
 * Packed reader nodes save us from having to track `dns_qpread_t`
 * objects as distinct allocations: the packed reader nodes get
 * reclaimed when the chunk containing their cells is reclaimed.
 * When a real `dns_qpread_t` object is needed, it is allocated on the
 * stack (it must not live longer than an isc_loop callback) and the
 * packed reader is unpacked into it.
 *
 * Chunks are owned by the current `base` array, so unused chunks are
 * held there until they are free()d. Old `base` arrays are attached
 * to packed reader nodes with a refcount. When a chunk is reclaimed,
 * it is scanned so that `chunk_free()` can call `detach_leaf()` on
 * any remaining references to leaf objects. Similarly, it calls
 * `qpbase_unref()` to reclaim old `base` arrays.
 */

/*
 * Two nodes is just enough space for the information needed by
 * readers and for deferred memory reclamation.
 */
#define READER_SIZE 2

/*
 * Create a packed reader; space for the reader should have been
 * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`.
 */
static inline void
make_reader(qp_node_t *reader, dns_qpmulti_t *multi) {
  dns_qp_t *qp = &multi->writer;
  reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC);
  reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref);
}

static inline bool
reader_valid(qp_node_t *reader) {
  return (reader != NULL && //
    node_tag(&reader[0]) == READER_TAG &&
    node_tag(&reader[1]) == READER_TAG &&
    node32(&reader[0]) == QPREADER_MAGIC);
}

/*
 * Verify and unpack a reader. We return the `multi` pointer to use in
 * consistency checks.
 */
static inline dns_qpmulti_t *
unpack_reader(dns_qpreader_t *qp, qp_node_t *reader) {
  INSIST(reader_valid(reader));
  dns_qpmulti_t *multi = node_pointer(&reader[0]);
  dns_qpbase_t *base = node_pointer(&reader[1]);
  INSIST(QPMULTI_VALID(multi));
  INSIST(QPBASE_VALID(base));
  *qp = (dns_qpreader_t){
    .magic = QP_MAGIC,
    .uctx = multi->writer.uctx,
    .methods = multi->writer.methods,
    .root_ref = node32(&reader[1]),
    .base = base,
  };
  return (multi);
}
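
/*
 * Sketch of the wait-free read path (illustrative; the real entry points
 * are elsewhere in the library): inside an RCU read-side critical section,
 * load the current packed reader and unpack it onto the stack:
 *
 *  rcu_read_lock();
 *  dns_qpreader_t qp;
 *  qp_node_t *reader = rcu_dereference(multi->reader);
 *  INSIST(unpack_reader(&qp, reader) == multi);
 *  ... lookups via &qp ...
 *  rcu_read_unlock();
 */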

/***********************************************************************
 *
 *  method invocation helpers
 */

static inline void
attach_leaf(dns_qpreadable_t qpr, qp_node_t *n) {
  dns_qpreader_t *qp = dns_qpreader(qpr);
  qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

static inline void
detach_leaf(dns_qpreadable_t qpr, qp_node_t *n) {
  dns_qpreader_t *qp = dns_qpreader(qpr);
  qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

static inline size_t
leaf_qpkey(dns_qpreadable_t qpr, qp_node_t *n, dns_qpkey_t key) {
  dns_qpreader_t *qp = dns_qpreader(qpr);
  size_t len = qp->methods->makekey(key, qp->uctx, leaf_pval(n),
            leaf_ival(n));
  INSIST(len < sizeof(dns_qpkey_t));
  return (len);
}

static inline char *
triename(dns_qpreadable_t qpr, char *buf, size_t size) {
  dns_qpreader_t *qp = dns_qpreader(qpr);
  qp->methods->triename(qp->uctx, buf, size);
  return (buf);
}

#define TRIENAME(qp) \
  triename(qp, (char[DNS_QP_TRIENAME_MAX]){}, DNS_QP_TRIENAME_MAX)

/***********************************************************************
 *
 *  converting DNS names to trie keys
 */

/*
 * This is a deliberate simplification of the hostname characters,
 * because it doesn't matter much if we treat a few extra characters
 * favourably: there is plenty of space in the index word for a
 * slightly larger bitmap.
 */
static inline bool
qp_common_character(uint8_t byte) {
  return (('-' <= byte && byte <= '9') || ('_' <= byte && byte <= 'z'));
}
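
/*
 * Worked out (illustrative): in ASCII, '-' <= byte <= '9' covers '-', '.',
 * '/', and the digits, while '_' <= byte <= 'z' covers '_', '`', and the
 * lowercase letters, so a few non-hostname bytes ('.', '/', '`') ride
 * along harmlessly, as the comment above allows.
 */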

/*
 * Lookup table mapping bytes in DNS names to bit positions, used
 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
 */
extern uint16_t dns_qp_bits_for_byte[];

/*
 * And the reverse, mapping bit positions to characters, so the tests
 * can print diagnostics involving qp-trie keys.
 */
extern uint8_t dns_qp_byte_for_bit[];

/**********************************************************************/