/src/bind9/lib/dns/qp_p.h
/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*
 * For an overview, see doc/design/qp-trie.md
 *
 * This private header defines the qp-trie's internal data structures.
 */

#pragma once

/***********************************************************************
 *
 * interior node basics
 */

/*
 * A qp-trie node is normally either a branch or a leaf. It consists of
 * three 32-bit words into which the components are packed. They are used
 * as a 64-bit word and a 32-bit word, but they are not declared like that
 * to avoid unwanted padding, keeping the size down to 12 bytes. They are
 * in native endian order, so getting the 64-bit part should compile down
 * to an unaligned load.
 *
 * The type of node is identified by the tag in the least significant bits
 * of the 64-bit word.
 *
 * In a branch, the 64-bit word is described by the enum below. The 32-bit
 * word is a reference to the packed sparse vector of "twigs", i.e. child
 * nodes. A branch node has at least 2 and fewer than SHIFT_OFFSET twigs
 * (see the enum below). The qp-trie update functions ensure that branches
 * actually branch, i.e. a branch cannot have only 1 child.
 *
 * The contents of each leaf are set by the trie's user. The 64-bit word
 * contains a pointer value (which must be word-aligned, so the tag bits
 * are zero), and the 32-bit word is an arbitrary integer value.
 *
 * There is a third kind of node: reader nodes, which anchor the root of
 * a trie. A pair of reader nodes together contain a packed
 * `dns_qpreader_t`. See the section on "packed reader nodes" below.
 */
typedef struct qp_node {
#if WORDS_BIGENDIAN
        uint32_t bighi, biglo, small;
#else
        uint32_t biglo, bighi, small;
#endif
} qp_node_t;

/*
 * The possible values of the node type tag. Type tags must fit in two bits
 * for compatibility with 4-byte pointer alignment on 32-bit systems.
 */
enum {
        LEAF_TAG = 0,   /* leaf node */
        BRANCH_TAG = 1, /* branch node */
        READER_TAG = 2, /* reader node */
        TAG_MASK = 3,   /* mask covering tag bits */
};

/*
 * This code does not work on CPUs with large pointers, e.g. CHERI capability
 * architectures. When porting to that kind of machine, a `dns_qpnode` should
 * be just a `uintptr_t`; a leaf node will contain a single pointer, and a
 * branch node will fit in the same space with room to spare.
 */
STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t),
              "pointers must fit in 64 bits");

/*
 * A branch node contains a 64-bit word comprising the type tag, the
 * bitmap, and an offset into the key. It is called an "index word" because
 * it describes how to access the twigs vector (think "database index").
 * The following enum sets up the bit positions of these parts.
 *
 * In a leaf, the same 64-bit word contains a pointer. The pointer
 * must be word-aligned so that the branch/leaf tag bit is zero.
 * This requirement is checked by the make_leaf() constructor.
 *
 * The bitmap is just above the type tag. The `bits_for_byte[]` table is
 * used to fill in a key so that bit tests can work directly against the
 * index word without superfluous masking or shifting; we don't need to
 * mask out the bitmap before testing a bit, but we do need to mask the
 * bitmap before calling popcount.
 *
 * The byte offset into the key is at the top of the word, so that it
 * can be extracted with just a shift, with no masking needed.
 *
 * The names are SHIFT_thing because they are qp_shift_t values. (See
 * below for the various `qp_*` type declarations.)
 *
 * These values are relatively fixed in practice: SHIFT_NOBYTE needs
 * to leave space for the type tag, and the implementation of
 * `dns_qpkey_fromname()` depends on the bitmap being large enough.
 * The symbolic names avoid mystery numbers in the code.
 */
enum {
        SHIFT_NOBYTE = 2,  /* label separator has no byte value */
        SHIFT_BITMAP,      /* many bits here */
        SHIFT_OFFSET = 49, /* offset of byte in key */
};
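
/*
 * A sketch of the resulting branch index word layout (bit 0 is the
 * least significant bit):
 *
 *       63           49 48              2 1 0
 *      +---------------+-----------------+---+
 *      |  key offset   |      bitmap     |tag|
 *      +---------------+-----------------+---+
 *
 * The bitmap occupies bits SHIFT_NOBYTE..SHIFT_OFFSET-1, one bit per
 * possible twig, and the offset occupies the top 15 bits, which is
 * why branch_key_offset() below needs only a shift and no mask.
 */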

/***********************************************************************
 *
 * garbage collector tuning parameters
 */

/*
 * A "cell" is a location that can contain a `qp_node_t`, and a "chunk"
 * is a moderately large array of cells. A big trie can occupy
 * multiple chunks. (Unlike other nodes, a trie's root node lives in
 * its `struct dns_qp` instead of being allocated in a cell.)
 *
 * The qp-trie allocator hands out space for twigs vectors. Allocations are
 * made sequentially from one of the chunks; this kind of "sequential
 * allocator" is also known as a "bump allocator", so in `struct dns_qp`
 * (see below) the allocation chunk is called `bump`.
 */

/*
 * The number of cells in a chunk is a power of 2, and a chunk must have
 * space for a full twigs vector (48 cells wide). When testing, use a
 * much smaller chunk size to make the allocator work harder.
 */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define QP_CHUNK_LOG 7
#else
#define QP_CHUNK_LOG 10
#endif

STATIC_ASSERT(6 <= QP_CHUNK_LOG && QP_CHUNK_LOG <= 20,
              "qp-trie chunk size is unreasonable");

#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG)
#define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(qp_node_t))
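
/*
 * For example, with the production setting QP_CHUNK_LOG == 10, a chunk
 * holds 1024 cells, i.e. 1024 * 12 == 12288 bytes; the fuzzing setting
 * shrinks that to 128 cells (1536 bytes).
 */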

/*
 * We need a bitfield this big to count how much of a chunk is in use:
 * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG`.
 */
#define QP_USAGE_BITS (QP_CHUNK_LOG + 1)

/*
 * A chunk needs to be compacted if it is less full than this threshold.
 * (12% overhead seems reasonable.)
 */
#define QP_MAX_FREE (QP_CHUNK_SIZE / 8)
#define QP_MIN_USED (QP_CHUNK_SIZE - QP_MAX_FREE)
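
/*
 * For example, with QP_CHUNK_LOG == 10, QP_MAX_FREE is 1024 / 8 == 128
 * cells, so a chunk becomes a compaction candidate once fewer than
 * QP_MIN_USED == 896 of its 1024 cells are in use, i.e. once more than
 * 12.5% of the chunk is free.
 */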

/*
 * Compact automatically when we pass this threshold: when there is a lot
 * of free space in absolute terms, and when we have freed more than half
 * of the space we allocated.
 *
 * The current compaction algorithm scans the whole trie, so it is important
 * to scale the threshold based on the size of the trie to avoid quadratic
 * behaviour. XXXFANF find an algorithm that scans less of the trie!
 *
 * During a modification transaction, when we copy-on-write some twigs we
 * count the old copies as "free", because they will be free when the
 * transaction commits. But they cannot be recovered immediately, so they
 * are also counted as "on hold", and discounted when we decide whether
 * to compact.
 */
#define QP_GC_HEURISTIC(qp, free) \
        ((free) > QP_CHUNK_SIZE * 4 && (free) > (qp)->used_count / 2)

#define QP_NEEDGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count)
#define QP_AUTOGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count - (qp)->hold_count)
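
/*
 * For example, with QP_CHUNK_LOG == 10 and used_count == 10000,
 * QP_NEEDGC() fires only once free_count exceeds both
 * QP_CHUNK_SIZE * 4 == 4096 and used_count / 2 == 5000 cells;
 * QP_AUTOGC() applies the same test after discounting held cells.
 */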

/*
 * The chunk base and usage arrays are resized geometrically and start off
 * with two entries.
 */
#define GROWTH_FACTOR(size) ((size) + (size) / 2 + 2)

/***********************************************************************
 *
 * helper types
 */

/*
 * C is not strict enough with its integer types for these typedefs to
 * improve type safety, but it helps to have annotations saying what
 * particular kind of number we are dealing with.
 */

/*
 * The number or position of a bit inside a word. (0..63)
 *
 * Note: A dns_qpkey_t is logically an array of qp_shift_t values, but it
 * isn't declared that way because dns_qpkey_t is a public type whereas
 * qp_shift_t is private.
 *
 * A dns_qpkey_t element key[off] must satisfy
 *
 *      SHIFT_NOBYTE <= key[off] && key[off] < SHIFT_OFFSET
 */
typedef uint8_t qp_shift_t;

/*
 * The number of bits set in a word (as in Hamming weight or popcount),
 * which is used for the position of a node in the packed sparse
 * vector of twigs. The range is (0..47) because our bitmap does not
 * fill the word.
 */
typedef uint8_t qp_weight_t;

/*
 * A chunk number, i.e. an index into the chunk arrays.
 */
typedef uint32_t qp_chunk_t;

/*
 * Cell offset within a chunk, or a count of cells. Each cell in a
 * chunk can contain a node.
 */
typedef uint32_t qp_cell_t;

/*
 * A twig reference is used to refer to a twigs vector, which occupies a
 * contiguous group of cells.
 */
typedef uint32_t qp_ref_t;

/*
 * Constructors and accessors for qp_ref_t values, defined here to show
 * how the qp_ref_t, qp_chunk_t, and qp_cell_t types relate to each other.
 */

static inline qp_ref_t
make_ref(qp_chunk_t chunk, qp_cell_t cell) {
        return (QP_CHUNK_SIZE * chunk + cell);
}

static inline qp_chunk_t
ref_chunk(qp_ref_t ref) {
        return (ref / QP_CHUNK_SIZE);
}

static inline qp_cell_t
ref_cell(qp_ref_t ref) {
        return (ref % QP_CHUNK_SIZE);
}
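
/*
 * For example, with QP_CHUNK_LOG == 10, make_ref(3, 17) is
 * 3 * 1024 + 17 == 3089, from which ref_chunk() recovers 3 and
 * ref_cell() recovers 17; the round trip holds whenever the cell
 * number is less than QP_CHUNK_SIZE.
 */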

/*
 * We should not use the `root_ref` in an empty trie, so we set it
 * to a value that should trigger an obvious bug. See QP_INIT()
 * and get_root() below.
 */
#define INVALID_REF ((qp_ref_t)~0UL)

/***********************************************************************
 *
 * chunk arrays
 */

/*
 * A `dns_qp_t` contains two arrays holding information about each chunk.
 *
 * The `base` array holds pointers to the base of each chunk.
 * The `usage` array holds the allocator's state for each chunk.
 *
 * The `base` array is used by the hot qp-trie traversal paths. It can
 * be shared by multiple versions of a trie, which are tracked with a
 * refcount. Old versions of the trie can retain old versions of the
 * `base` array.
 *
 * In multithreaded code, the `usage` array is only used when the
 * `dns_qpmulti_t` mutex is held, and there is only one version of
 * it in active use (maybe with a snapshot for rollback support).
 *
 * The two arrays are separate because they have rather different
 * access patterns, different lifetimes, and different element sizes.
 */

/*
 * For most purposes we don't need to know exactly which cells are
 * in use in a chunk; we only need to know how many of them there are.
 *
 * After we have finished allocating from a chunk, the `used` counter
 * is the size we need to know for shrinking the chunk and for
 * scanning it to detach leaf values before the chunk is free()d. The
 * `free` counter tells us when the chunk needs compacting and when it
 * has become empty.
 *
 * The `exists` flag allows the chunk scanning loops to look at the
 * usage array only.
 *
 * In multithreaded code, we mark chunks as `immutable` when a modify
 * transaction is opened. (We don't mark them immutable on commit,
 * because the old bump chunk must remain mutable between write
 * transactions, but it must become immutable when an update
 * transaction is opened.)
 *
 * There are a few flags used to mark which chunks are still needed by
 * snapshots after the chunks have passed their normal reclamation
 * phase.
 */
typedef struct qp_usage {
        /*% the allocation point, increases monotonically */
        qp_cell_t used : QP_USAGE_BITS;
        /*% count of nodes no longer needed, also monotonic */
        qp_cell_t free : QP_USAGE_BITS;
        /*% qp->base->ptr[chunk] != NULL */
        bool exists : 1;
        /*% is this chunk shared? [MT] */
        bool immutable : 1;
        /*% already subtracted from multi->*_count [MT] */
        bool discounted : 1;
        /*% is a snapshot using this chunk? [MT] */
        bool snapshot : 1;
        /*% tried to free it but a snapshot needs it [MT] */
        bool snapfree : 1;
        /*% for mark/sweep snapshot flag updates [MT] */
        bool snapmark : 1;
} qp_usage_t;

/*
 * The chunks are owned by the current version of the `base` array.
 * When the array is resized, the old version might still be in use by
 * concurrent readers, in which case it is free()d later when its
 * refcount drops to zero.
 *
 * A `dns_qpbase_t` counts references from `dns_qp_t` objects and
 * from packed readers, but not from `dns_qpread_t` nor from
 * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
 * would wreck multicore scalability; instead we rely on RCU.
 *
 * The `usage` array determines when a chunk is no longer needed: old
 * chunk pointers in old `base` arrays are ignored. (They can become
 * dangling pointers to free memory, but they will never be
 * dereferenced.)
 *
 * We ensure that individual chunk base pointers remain immutable
 * after assignment, and they are not cleared until the chunk is
 * free()d, after all readers have departed. Slots can be reused, and
 * we allow transactions to fill or re-fill empty slots adjacent to
 * busy slots that are in use by readers.
 */
struct dns_qpbase {
        unsigned int magic;
        isc_refcount_t refcount;
        qp_node_t *ptr[];
};

/*
 * Chunks that may be in use by readers are reclaimed asynchronously.
 * When a transaction commits, immutable chunks that are now empty are
 * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
 */
typedef struct qp_rcuctx {
        unsigned int magic;
        struct rcu_head rcu_head;
        isc_mem_t *mctx;
        dns_qpmulti_t *multi;
        qp_chunk_t count;
        qp_chunk_t chunk[];
} qp_rcuctx_t;

/*
 * Returns true when the base array can be free()d.
 */
static inline bool
qpbase_unref(dns_qpreadable_t qpr) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        return (qp->base != NULL &&
                isc_refcount_decrement(&qp->base->refcount) == 1);
}

/*
 * Now that we know about `dns_qpreader_t` and `dns_qpbase_t`,
 * here's how we convert a twig reference into a pointer.
 */
static inline qp_node_t *
ref_ptr(dns_qpreadable_t qpr, qp_ref_t ref) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        return (qp->base->ptr[ref_chunk(ref)] + ref_cell(ref));
}
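
/*
 * Note that ref_ptr() composes with the constructors above: for any
 * chunk c and cell i that are in use,
 * ref_ptr(qp, make_ref(c, i)) == qp->base->ptr[c] + i.
 */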

/***********************************************************************
 *
 * main qp-trie structures
 */

#define QP_MAGIC       ISC_MAGIC('t', 'r', 'i', 'e')
#define QPITER_MAGIC   ISC_MAGIC('q', 'p', 'i', 't')
#define QPMULTI_MAGIC  ISC_MAGIC('q', 'p', 'm', 'v')
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
#define QPBASE_MAGIC   ISC_MAGIC('q', 'p', 'b', 'p')
#define QPRCU_MAGIC    ISC_MAGIC('q', 'p', 'c', 'b')

#define QP_VALID(qp)      ISC_MAGIC_VALID(qp, QP_MAGIC)
#define QPITER_VALID(qp)  ISC_MAGIC_VALID(qp, QPITER_MAGIC)
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
#define QPBASE_VALID(qp)  ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
#define QPRCU_VALID(qp)   ISC_MAGIC_VALID(qp, QPRCU_MAGIC)

/*
 * Polymorphic initialization of the `dns_qpreader_t` prefix.
 *
 * The location of the root node is actually a qp_ref_t, but is
 * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too
 * many internal details into the public API.
 *
 * The `uctx` and `methods` support callbacks into the user's code.
 * They are constant after initialization.
 */
#define QP_INIT(qp, m, x)                \
        (*(qp) = (typeof(*(qp))){        \
                .magic = QP_MAGIC,       \
                .root_ref = INVALID_REF, \
                .uctx = x,               \
                .methods = m,            \
        })
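
/*
 * A minimal usage sketch (hypothetical caller code; `my_methods` and
 * `my_uctx` are the user's callback table and context, not names
 * defined by this header):
 *
 *      dns_qp_t qp;
 *      QP_INIT(&qp, &my_methods, my_uctx);
 *
 * After this the reader prefix is valid and the trie is empty
 * (`root_ref == INVALID_REF`); the rest of the allocator state in a
 * `dns_qp_t` is set up elsewhere in the qp-trie implementation.
 */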

/*
 * Snapshots have some extra cleanup machinery.
 *
 * Originally, a snapshot was basically just a `dns_qpread_t`
 * allocated on the heap, with the extra behaviour that memory
 * reclamation is suppressed for a particular trie while it has any
 * snapshots. However, that design gets into trouble for a zone with
 * frequent updates and many zone transfers.
 *
 * Instead, each snapshot records which chunks it needs. When a
 * snapshot is created, it makes a copy of the `base` array, except
 * for chunks that are empty and waiting to be reclaimed. When a
 * snapshot is destroyed, we can traverse the list of snapshots to
 * accurately mark which chunks are still needed.
 *
 * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is
 * not muddled up with the wrong `dns_qpmulti_t`.
 *
 * A trie's `base` array might have grown after the snapshot was
 * created, so a snapshot records its own `chunk_max`.
 */
struct dns_qpsnap {
        DNS_QPREADER_FIELDS;
        dns_qpmulti_t *whence;
        uint32_t chunk_max;
        ISC_LINK(struct dns_qpsnap) link;
};

/*
 * Read-write access to a qp-trie requires extra fields to support the
 * allocator and garbage collector.
 *
 * Bare instances of a `struct dns_qp` are used for stand-alone
 * single-threaded tries. For multithreaded access, a `dns_qpmulti_t`
 * wraps a `dns_qp_t` with a mutex and other fields that are only needed
 * at the start or end of a transaction.
 *
 * Allocations are made sequentially in the `bump` chunk. A sequence
 * of lightweight write transactions can use the same `bump` chunk, so
 * its prefix before `fender` is immutable, and the rest is mutable.
 *
 * To decide when to compact and reclaim space, QP_NEEDGC() and
 * QP_AUTOGC() examine the values of `used_count`, `free_count`, and
 * `hold_count`. The `hold_count` tracks nodes that need to be retained
 * while readers are using them; they are free but cannot be reclaimed
 * until the transaction has committed, so the `hold_count` is
 * discounted by QP_AUTOGC() during a transaction.
 *
 * There are some flags that alter the behaviour of write transactions.
 *
 * - The `transaction_mode` indicates whether the current transaction is a
 *   light write or a heavy update, or (between transactions) the previous
 *   transaction's mode, because the setup for the next transaction
 *   depends on how the previous one committed. The mode is set at the
 *   start of each transaction. It is QP_NONE in a single-threaded qp-trie
 *   to detect if part of a `dns_qpmulti_t` is passed to dns_qp_destroy().
 *
 * - The `compact_all` flag is used when every node in the trie should be
 *   copied. (Usually compaction aims to avoid moving nodes out of
 *   unfragmented chunks.) It is used when compaction is explicitly
 *   requested via `dns_qp_compact()`, and as an emergency mechanism if
 *   normal compaction failed to clear the QP_AUTOGC() condition.
 *   (This emergency is a bug even though we have a rescue mechanism.)
 *
 * - When a qp-trie is destroyed while it has pending cleanup work, its
 *   `destroy` flag is set so that it is destroyed by the reclaim worker.
 *   (Because items cannot be removed from the middle of the cleanup list.)
 *
 * - When built with fuzzing support, we can use mprotect() and munmap()
 *   to ensure that incorrect memory accesses cause fatal errors. The
 *   `write_protect` flag must be set straight after the `dns_qpmulti_t`
 *   is created, then left unchanged.
 *
 * Some of the dns_qp_t fields are only needed for multithreaded transactions
 * (marked [MT] below) but the same code paths are also used for single-
 * threaded writes.
 */
struct dns_qp {
        DNS_QPREADER_FIELDS;
        /*% memory context (const) */
        isc_mem_t *mctx;
        /*% array of per-chunk allocation counters */
        qp_usage_t *usage;
        /*% number of slots in the `base` and `usage` arrays */
        qp_chunk_t chunk_max;
        /*% which chunk is used for allocations */
        qp_chunk_t bump;
        /*% nodes in the `bump` chunk below `fender` are read only [MT] */
        qp_cell_t fender;
        /*% number of leaf nodes */
        qp_cell_t leaf_count;
        /*% total of all usage[] counters */
        qp_cell_t used_count, free_count;
        /*% free cells that cannot be recovered right now */
        qp_cell_t hold_count;
        /*% what kind of transaction was most recently started [MT] */
        enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
        /*% compact the entire trie [MT] */
        bool compact_all : 1;
        /*% optionally set when compiled with fuzzing support [MT] */
        bool write_protect : 1;
};

/*
 * Concurrent access to a qp-trie.
 *
 * The `reader` pointer provides wait-free access to the current version
 * of the trie. See the "packed reader nodes" section below for a
 * description of what it points to.
 *
 * The main object under the protection of the mutex is the `writer`
 * containing all the allocator state. There can be a backup copy when
 * we want to be able to roll back an update transaction.
 *
 * There is a `reader_ref` which corresponds to the `reader` pointer
 * (`ref_ptr(&multi->writer, multi->reader_ref) == multi->reader`). The
 * `reader_ref` is necessary when freeing the space used by the reader,
 * because there isn't a good way to recover a qp_ref_t from a
 * qp_node_t pointer.
 *
 * There is a per-trie list of snapshots that is used for reclaiming
 * memory when a snapshot is destroyed.
 *
 * Finally, we maintain a global list of `dns_qpmulti_t` objects that
 * need asynchronous safe memory recovery.
 */
struct dns_qpmulti {
        uint32_t magic;
        /*% RCU-protected pointer to current packed reader */
        qp_node_t *reader;
        /*% the mutex protects the rest of this structure */
        isc_mutex_t mutex;
        /*% ref_ptr(writer, reader_ref) == reader */
        qp_ref_t reader_ref;
        /*% the main working structure */
        dns_qp_t writer;
        /*% saved allocator state to support rollback */
        dns_qp_t *rollback;
        /*% all snapshots of this trie */
        ISC_LIST(dns_qpsnap_t) snapshots;
};

/***********************************************************************
 *
 * interior node constructors and accessors
 */

/*
 * See the comments under "interior node basics" above, which explain
 * the layout of nodes as implemented by the following functions.
 *
 * These functions are (mostly) constructors and getters. Imagine how
 * much less code there would be if C had sum types with control over
 * the layout...
 */

/*
 * Get the 64-bit word of a node.
 */
static inline uint64_t
node64(qp_node_t *n) {
        uint64_t lo = n->biglo;
        uint64_t hi = n->bighi;
        return (lo | (hi << 32));
}

/*
 * Get the 32-bit word of a node.
 */
static inline uint32_t
node32(qp_node_t *n) {
        return (n->small);
}

/*
 * Create a node from its parts.
 */
static inline qp_node_t
make_node(uint64_t big, uint32_t small) {
        return ((qp_node_t){
                .biglo = (uint32_t)(big),
                .bighi = (uint32_t)(big >> 32),
                .small = small,
        });
}
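
/*
 * make_node() and the getters above are exact inverses: for any 64-bit
 * value b and 32-bit value s, node64() of make_node(b, s) returns b and
 * node32() returns s, on both little- and big-endian builds.
 */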

/*
 * Extract a pointer from a node's 64-bit word. The double cast is to avoid
 * a warning about mismatched pointer/integer sizes on 32-bit systems.
 */
static inline void *
node_pointer(qp_node_t *n) {
        return ((void *)(uintptr_t)(node64(n) & ~TAG_MASK));
}

/*
 * Examine a node's tag bits.
 */
static inline uint32_t
node_tag(qp_node_t *n) {
        return (n->biglo & TAG_MASK);
}

/*
 * Simplified branch test for the hot path.
 */
static inline bool
is_branch(qp_node_t *n) {
        return (n->biglo & BRANCH_TAG);
}

/* leaf nodes *********************************************************/

/*
 * Get a leaf's pointer value.
 */
static inline void *
leaf_pval(qp_node_t *n) {
        return (node_pointer(n));
}

/*
 * Get a leaf's integer value.
 */
static inline uint32_t
leaf_ival(qp_node_t *n) {
        return (node32(n));
}

/*
 * Create a leaf node from its parts.
 */
static inline qp_node_t
make_leaf(const void *pval, uint32_t ival) {
        qp_node_t leaf = make_node((uintptr_t)pval, ival);
        REQUIRE(node_tag(&leaf) == LEAF_TAG);
        return (leaf);
}
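
/*
 * For illustration: if `p` is a word-aligned pointer to a user object,
 * then after
 *
 *      qp_node_t leaf = make_leaf(p, 42);
 *
 * we get leaf_pval(&leaf) == p and leaf_ival(&leaf) == 42; a
 * misaligned pointer would set a tag bit and trip the REQUIRE in
 * make_leaf().
 */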

/* branch nodes *******************************************************/

/*
 * The following function names use plural `twigs` when they work on a
 * branch's twigs vector as a whole, and singular `twig` when they work on
 * a particular twig.
 */

/*
 * Get a branch node's index word.
 */
static inline uint64_t
branch_index(qp_node_t *n) {
        return (node64(n));
}

/*
 * Get a reference to a branch node's child twigs.
 */
static inline qp_ref_t
branch_twigs_ref(qp_node_t *n) {
        return (node32(n));
}

/*
 * Bit positions in the bitmap come directly from the key. DNS names are
 * converted to keys using the tables declared at the end of this file.
 */
static inline qp_shift_t
qpkey_bit(const dns_qpkey_t key, size_t len, size_t offset) {
        if (offset < len) {
                return (key[offset]);
        } else {
                return (SHIFT_NOBYTE);
        }
}

/*
 * Extract a branch node's offset field, used to index the key.
 */
static inline size_t
branch_key_offset(qp_node_t *n) {
        return ((size_t)(branch_index(n) >> SHIFT_OFFSET));
}

/*
 * Which bit identifies the twig of this node for this key?
 */
static inline qp_shift_t
branch_keybit(qp_node_t *n, const dns_qpkey_t key, size_t len) {
        return (qpkey_bit(key, len, branch_key_offset(n)));
}

/*
 * Get a pointer to a branch node's twigs vector.
 */
static inline qp_node_t *
branch_twigs_vector(dns_qpreadable_t qpr, qp_node_t *n) {
        return (ref_ptr(qpr, branch_twigs_ref(n)));
}

/*
 * Warm up the cache while calculating which twig we want.
 */
static inline void
prefetch_twigs(dns_qpreadable_t qpr, qp_node_t *n) {
        __builtin_prefetch(branch_twigs_vector(qpr, n));
}

/* root node **********************************************************/

/*
 * Get a pointer to the root node, checking if the trie is empty.
 */
static inline qp_node_t *
get_root(dns_qpreadable_t qpr) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        if (qp->root_ref == INVALID_REF) {
                return (NULL);
        } else {
                return (ref_ptr(qp, qp->root_ref));
        }
}

/*
 * When we need to move the root node, we avoid repeating allocation
 * logistics by making a temporary fake branch node that has
 * `branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`,
 * just enough to treat the root node as a vector of one twig.
 */
#define MOVABLE_ROOT(qp)                                   \
        (&(qp_node_t){                                     \
                .biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \
                .small = qp->root_ref,                     \
        })

/***********************************************************************
 *
 * bitmap popcount shenanigans
 */

/*
 * How many twigs appear in the vector before the one corresponding to the
 * given bit? Calculated using popcount of part of the branch's bitmap.
 *
 * To calculate a mask that covers the lesser bits in the bitmap,
 * we subtract 1 to set all lesser bits, and subtract the tag mask
 * because the type tag is not part of the bitmap.
 */
static inline qp_weight_t
branch_count_bitmap_before(qp_node_t *n, qp_shift_t bit) {
        uint64_t mask = (1ULL << bit) - 1 - TAG_MASK;
        uint64_t bitmap = branch_index(n) & mask;
        return ((qp_weight_t)__builtin_popcountll(bitmap));
}
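
/*
 * For example, with bit == SHIFT_BITMAP (i.e. 3), the mask is
 * (1 << 3) - 1 - TAG_MASK == 0b100, which covers only the
 * SHIFT_NOBYTE position; with bit == SHIFT_OFFSET the mask covers
 * the entire bitmap, which is what branch_twigs_size() relies on
 * below.
 */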

/*
 * How many twigs does this branch have?
 *
 * The offset field is directly after the bitmap, so the bits below the
 * offset cover the whole bitmap, and the bitmap's weight is the number
 * of twigs.
 */
static inline qp_weight_t
branch_twigs_size(qp_node_t *n) {
        return (branch_count_bitmap_before(n, SHIFT_OFFSET));
}

/*
 * Position of a twig within the packed sparse vector.
 */
static inline qp_weight_t
branch_twig_pos(qp_node_t *n, qp_shift_t bit) {
        return (branch_count_bitmap_before(n, bit));
}

/*
 * Get a pointer to a particular twig.
 */
static inline qp_node_t *
branch_twig_ptr(dns_qpreadable_t qpr, qp_node_t *n, qp_shift_t bit) {
        return (branch_twigs_vector(qpr, n) + branch_twig_pos(n, bit));
}

/*
 * Is the twig identified by this bit present?
 */
static inline bool
branch_has_twig(qp_node_t *n, qp_shift_t bit) {
        return (branch_index(n) & (1ULL << bit));
}
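
/*
 * Taken together, these accessors give the lookup hot path its shape.
 * A simplified sketch of the descent loop (the real lookup code in
 * qp.c also compares the found leaf's key against the search key,
 * since a qp-trie only tests a few bits of the key on the way down):
 *
 *      qp_node_t *n = get_root(qp);
 *      while (n != NULL && is_branch(n)) {
 *              prefetch_twigs(qp, n);
 *              qp_shift_t bit = branch_keybit(n, key, keylen);
 *              if (!branch_has_twig(n, bit)) {
 *                      return (NULL);
 *              }
 *              n = branch_twig_ptr(qp, n, bit);
 *      }
 *
 * where `qp`, `key`, and `keylen` are the caller's.
 */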

/* twig logistics *****************************************************/

static inline void
move_twigs(qp_node_t *to, qp_node_t *from, qp_weight_t size) {
        memmove(to, from, size * sizeof(qp_node_t));
}

static inline void
zero_twigs(qp_node_t *twigs, qp_weight_t size) {
        memset(twigs, 0, size * sizeof(qp_node_t));
}

/***********************************************************************
 *
 * packed reader nodes
 */

/*
 * The purpose of these packed reader nodes is to simplify safe memory
 * reclamation for a multithreaded qp-trie.
 *
 * After the `reader` pointer in a qpmulti is replaced, we need to wait
 * for a grace period before we can reclaim the memory that is no longer
 * needed by the trie. So we need some kind of structure to hold
 * pointers to the (logically) detached memory until it is safe to free.
 * This memory includes the chunks and the `base` arrays.
 *
 * Packed reader nodes save us from having to track `dns_qpread_t`
 * objects as distinct allocations: the packed reader nodes get
 * reclaimed when the chunk containing their cells is reclaimed.
 * When a real `dns_qpread_t` object is needed, it is allocated on the
 * stack (it must not live longer than an isc_loop callback) and the
 * packed reader is unpacked into it.
 *
 * Chunks are owned by the current `base` array, so unused chunks are
 * held there until they are free()d. Old `base` arrays are attached
 * to packed reader nodes with a refcount. When a chunk is reclaimed,
 * it is scanned so that `chunk_free()` can call `detach_leaf()` on
 * any remaining references to leaf objects. Similarly, it calls
 * `qpbase_unref()` to reclaim old `base` arrays.
 */

/*
 * Two nodes is just enough space for the information needed by
 * readers and for deferred memory reclamation.
 */
#define READER_SIZE 2

/*
 * Create a packed reader; space for the reader should have been
 * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`.
 */
static inline void
make_reader(qp_node_t *reader, dns_qpmulti_t *multi) {
        dns_qp_t *qp = &multi->writer;
        reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC);
        reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref);
}

static inline bool
reader_valid(qp_node_t *reader) {
        return (reader != NULL && //
                node_tag(&reader[0]) == READER_TAG &&
                node_tag(&reader[1]) == READER_TAG &&
                node32(&reader[0]) == QPREADER_MAGIC);
}

/*
 * Verify and unpack a reader. We return the `multi` pointer to use in
 * consistency checks.
 */
static inline dns_qpmulti_t *
unpack_reader(dns_qpreader_t *qp, qp_node_t *reader) {
        INSIST(reader_valid(reader));
        dns_qpmulti_t *multi = node_pointer(&reader[0]);
        dns_qpbase_t *base = node_pointer(&reader[1]);
        INSIST(QPMULTI_VALID(multi));
        INSIST(QPBASE_VALID(base));
        *qp = (dns_qpreader_t){
                .magic = QP_MAGIC,
                .uctx = multi->writer.uctx,
                .methods = multi->writer.methods,
                .root_ref = node32(&reader[1]),
                .base = base,
        };
        return (multi);
}

/***********************************************************************
 *
 * method invocation helpers
 */

static inline void
attach_leaf(dns_qpreadable_t qpr, qp_node_t *n) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

static inline void
detach_leaf(dns_qpreadable_t qpr, qp_node_t *n) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

static inline size_t
leaf_qpkey(dns_qpreadable_t qpr, qp_node_t *n, dns_qpkey_t key) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        size_t len = qp->methods->makekey(key, qp->uctx, leaf_pval(n),
                                          leaf_ival(n));
        INSIST(len < sizeof(dns_qpkey_t));
        return (len);
}

static inline char *
triename(dns_qpreadable_t qpr, char *buf, size_t size) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        qp->methods->triename(qp->uctx, buf, size);
        return (buf);
}

#define TRIENAME(qp) \
        triename(qp, (char[DNS_QP_TRIENAME_MAX]){}, DNS_QP_TRIENAME_MAX)

/***********************************************************************
 *
 * converting DNS names to trie keys
 */

/*
 * This is a deliberate simplification of the hostname characters,
 * because it doesn't matter much if we treat a few extra characters
 * favourably: there is plenty of space in the index word for a
 * slightly larger bitmap.
 */
static inline bool
qp_common_character(uint8_t byte) {
        return (('-' <= byte && byte <= '9') || ('_' <= byte && byte <= 'z'));
}
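
/*
 * For reference, the two ranges cover '-' '.' '/' '0'..'9'
 * (0x2D..0x39) and '_' '`' 'a'..'z' (0x5F..0x7A): the digits,
 * lowercase letters, and hyphens found in hostnames, plus a few
 * neighbouring punctuation characters. Uppercase letters are absent
 * because DNS names compare case-insensitively, so keys are assumed
 * to be case-folded before they reach the trie.
 */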

/*
 * Lookup table mapping bytes in DNS names to bit positions, used
 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
 */
extern uint16_t dns_qp_bits_for_byte[];

/*
 * And the reverse, mapping bit positions to characters, so the tests
 * can print diagnostics involving qp-trie keys.
 */
extern uint8_t dns_qp_byte_for_bit[];

/**********************************************************************/