/src/samba/lib/tdb/common/check.c
Line | Count | Source |
1 | | /* |
2 | | Unix SMB/CIFS implementation. |
3 | | |
4 | | trivial database library |
5 | | |
6 | | Copyright (C) Rusty Russell 2009 |
7 | | |
8 | | ** NOTE! The following LGPL license applies to the tdb |
9 | | ** library. This does NOT imply that all of Samba is released |
10 | | ** under the LGPL |
11 | | |
12 | | This library is free software; you can redistribute it and/or |
13 | | modify it under the terms of the GNU Lesser General Public |
14 | | License as published by the Free Software Foundation; either |
15 | | version 3 of the License, or (at your option) any later version. |
16 | | |
17 | | This library is distributed in the hope that it will be useful, |
18 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
19 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
20 | | Lesser General Public License for more details. |
21 | | |
22 | | You should have received a copy of the GNU Lesser General Public |
23 | | License along with this library; if not, see <http://www.gnu.org/licenses/>. |
24 | | */ |
25 | | #include "tdb_private.h" |
26 | | |
27 | | /* Since we opened it, these shouldn't fail unless it's recent corruption. */ |
28 | | static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery) |
29 | 0 | { |
30 | 0 | struct tdb_header hdr; |
31 | 0 | uint32_t h1, h2; |
32 | |
|
33 | 0 | if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1) |
34 | 0 | return false; |
35 | 0 | if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) |
36 | 0 | goto corrupt; |
37 | | |
38 | 0 | CONVERT(hdr); |
39 | 0 | if (hdr.version != TDB_VERSION) |
40 | 0 | goto corrupt; |
41 | | |
42 | 0 | if (hdr.rwlocks != 0 && |
43 | 0 | hdr.rwlocks != TDB_FEATURE_FLAG_MAGIC && |
44 | 0 | hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC) |
45 | 0 | goto corrupt; |
46 | | |
47 | 0 | tdb_header_hash(tdb, &h1, &h2); |
48 | 0 | if (hdr.magic1_hash && hdr.magic2_hash && |
49 | 0 | (hdr.magic1_hash != h1 || hdr.magic2_hash != h2)) |
50 | 0 | goto corrupt; |
51 | | |
52 | 0 | if (hdr.hash_size == 0) |
53 | 0 | goto corrupt; |
54 | | |
55 | 0 | if (hdr.hash_size != tdb->hash_size) |
56 | 0 | goto corrupt; |
57 | | |
58 | 0 | if (hdr.recovery_start != 0 && |
59 | 0 | hdr.recovery_start < TDB_DATA_START(tdb->hash_size)) |
60 | 0 | goto corrupt; |
61 | | |
62 | 0 | *recovery = hdr.recovery_start; |
63 | 0 | return true; |
64 | | |
65 | 0 | corrupt: |
66 | 0 | tdb->ecode = TDB_ERR_CORRUPT; |
67 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n")); |
68 | 0 | return false; |
69 | 0 | } |
70 | | |
71 | | /* Generic record header check. */ |
72 | | static bool tdb_check_record(struct tdb_context *tdb, |
73 | | tdb_off_t off, |
74 | | const struct tdb_record *rec) |
75 | 0 | { |
76 | 0 | tdb_off_t tailer; |
77 | | |
78 | | /* Check rec->next: 0 or points to record offset, aligned. */ |
79 | 0 | if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->hash_size)){ |
80 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
81 | 0 | "Record offset %u too small next %u\n", |
82 | 0 | off, rec->next)); |
83 | 0 | goto corrupt; |
84 | 0 | } |
85 | 0 | if (rec->next + sizeof(*rec) < rec->next) { |
86 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
87 | 0 | "Record offset %u too large next %u\n", |
88 | 0 | off, rec->next)); |
89 | 0 | goto corrupt; |
90 | 0 | } |
91 | 0 | if ((rec->next % TDB_ALIGNMENT) != 0) { |
92 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
93 | 0 | "Record offset %u misaligned next %u\n", |
94 | 0 | off, rec->next)); |
95 | 0 | goto corrupt; |
96 | 0 | } |
97 | 0 | if (tdb_oob(tdb, rec->next, sizeof(*rec), 0)) |
98 | 0 | goto corrupt; |
99 | | |
100 | | /* Check rec_len: similar to rec->next, implies next record. */ |
101 | 0 | if ((rec->rec_len % TDB_ALIGNMENT) != 0) { |
102 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
103 | 0 | "Record offset %u misaligned length %u\n", |
104 | 0 | off, rec->rec_len)); |
105 | 0 | goto corrupt; |
106 | 0 | } |
107 | | /* Must fit tailer. */ |
108 | 0 | if (rec->rec_len < sizeof(tailer)) { |
109 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
110 | 0 | "Record offset %u too short length %u\n", |
111 | 0 | off, rec->rec_len)); |
112 | 0 | goto corrupt; |
113 | 0 | } |
114 | | /* OOB allows "right at the end" access, so this works for last rec. */ |
115 | 0 | if (tdb_oob(tdb, off, sizeof(*rec)+rec->rec_len, 0)) |
116 | 0 | goto corrupt; |
117 | | |
118 | | /* Check tailer. */ |
119 | 0 | if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer), |
120 | 0 | &tailer) == -1) |
121 | 0 | goto corrupt; |
122 | 0 | if (tailer != sizeof(*rec) + rec->rec_len) { |
123 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
124 | 0 | "Record offset %u invalid tailer\n", off)); |
125 | 0 | goto corrupt; |
126 | 0 | } |
127 | | |
128 | 0 | return true; |
129 | | |
130 | 0 | corrupt: |
131 | 0 | tdb->ecode = TDB_ERR_CORRUPT; |
132 | 0 | return false; |
133 | 0 | } |
134 | | |
135 | | /* Grab some bytes: may copy if can't use mmap. |
136 | | Caller has already done bounds check. */ |
137 | | static TDB_DATA get_bytes(struct tdb_context *tdb, |
138 | | tdb_off_t off, tdb_len_t len) |
139 | 0 | { |
140 | 0 | TDB_DATA d; |
141 | |
|
142 | 0 | d.dsize = len; |
143 | |
|
144 | 0 | if (tdb->transaction == NULL && tdb->map_ptr != NULL) |
145 | 0 | d.dptr = (unsigned char *)tdb->map_ptr + off; |
146 | 0 | else |
147 | 0 | d.dptr = tdb_alloc_read(tdb, off, d.dsize); |
148 | 0 | return d; |
149 | 0 | } |
150 | | |
151 | | /* Frees data if we're not able to simply use mmap. */ |
152 | | static void put_bytes(struct tdb_context *tdb, TDB_DATA d) |
153 | 0 | { |
154 | 0 | if (tdb->transaction == NULL && tdb->map_ptr != NULL) |
155 | 0 | return; |
156 | 0 | free(d.dptr); |
157 | 0 | } |
158 | | |
/* Jenkins lookup3 hash, based on hash_word2; this is the final() mixing
 * step applied to one key word.  See: http://burtleburtle.net/bob/c/lookup3.c
 */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
{
	uint32_t va, vb, vc;

	/* Seed the internal state from the two running hash values. */
	va = 0xdeadbeef + *pc;
	vb = va;
	vc = va + *pb;
	va += key;

	/* lookup3 final mix: order and rotation counts are fixed. */
	vc ^= vb;
	vc -= rot(vb, 14);
	va ^= vc;
	va -= rot(vc, 11);
	vb ^= va;
	vb -= rot(va, 25);
	vc ^= vb;
	vc -= rot(vb, 16);
	va ^= vc;
	va -= rot(vc, 4);
	vb ^= va;
	vb -= rot(va, 14);
	vc ^= vb;
	vc -= rot(vb, 24);

	*pc = vc;
	*pb = vb;
}
180 | | |
181 | | /* |
182 | | We want to check that all free records are in the free list |
183 | | (only once), and all free list entries are free records. Similarly |
184 | | for each hash chain of used records. |
185 | | |
186 | | Doing that naively (without walking hash chains, since we want to be |
187 | | linear) means keeping a list of records which have been seen in each |
188 | | hash chain, and another of records pointed to (ie. next pointers |
189 | | from records and the initial hash chain heads). These two lists |
190 | | should be equal. This will take 8 bytes per record, and require |
191 | | sorting at the end. |
192 | | |
193 | | So instead, we record each offset in a bitmap such a way that |
194 | | recording it twice will cancel out. Since each offset should appear |
195 | | exactly twice, the bitmap should be zero at the end. |
196 | | |
197 | | The approach was inspired by Bloom Filters (see Wikipedia). For |
198 | | each value, we flip K bits in a bitmap of size N. The number of |
199 | | distinct arrangements is: |
200 | | |
201 | | N! / (K! * (N-K)!) |
202 | | |
203 | | Of course, not all arrangements are actually distinct, but testing |
204 | | shows this formula to be close enough. |
205 | | |
206 | | So, if K == 8 and N == 256, the probability of two things flipping the same |
207 | | bits is 1 in 409,663,695,276,000. |
208 | | |
209 | | Given that ldb uses a hash size of 10000, using 32 bytes per hash chain |
210 | | (320k) seems reasonable. |
211 | | */ |
212 | 0 | #define NUM_HASHES 8 |
213 | 0 | #define BITMAP_BITS 256 |
214 | | |
/* Toggle bit IDX in the bitmap (flipping twice cancels out). */
static void bit_flip(unsigned char bits[], unsigned int idx)
{
	const unsigned int byte = idx / CHAR_BIT;
	const unsigned char mask =
		(unsigned char)(1u << (idx % CHAR_BIT));

	bits[byte] ^= mask;
}
219 | | |
220 | | /* We record offsets in a bitmap for the particular chain it should be in. */ |
221 | | static void record_offset(unsigned char bits[], tdb_off_t off) |
222 | 0 | { |
223 | 0 | uint32_t h1 = off, h2 = 0; |
224 | 0 | unsigned int i; |
225 | | |
226 | | /* We get two good hash values out of jhash2, so we use both. Then |
227 | | * we keep going to produce further hash values. */ |
228 | 0 | for (i = 0; i < NUM_HASHES / 2; i++) { |
229 | 0 | hash(off, &h1, &h2); |
230 | 0 | bit_flip(bits, h1 % BITMAP_BITS); |
231 | 0 | bit_flip(bits, h2 % BITMAP_BITS); |
232 | 0 | h2++; |
233 | 0 | } |
234 | 0 | } |
235 | | |
236 | | /* Check that an in-use record is valid. */ |
237 | | static bool tdb_check_used_record(struct tdb_context *tdb, |
238 | | tdb_off_t off, |
239 | | const struct tdb_record *rec, |
240 | | unsigned char **hashes, |
241 | | int (*check)(TDB_DATA, TDB_DATA, void *), |
242 | | void *private_data) |
243 | 0 | { |
244 | 0 | TDB_DATA key, data; |
245 | 0 | tdb_len_t len; |
246 | |
|
247 | 0 | if (!tdb_check_record(tdb, off, rec)) |
248 | 0 | return false; |
249 | | |
250 | | /* key + data + tailer must fit in record */ |
251 | 0 | len = rec->key_len; |
252 | 0 | len += rec->data_len; |
253 | 0 | if (len < rec->data_len) { |
254 | | /* overflow */ |
255 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n")); |
256 | 0 | return false; |
257 | 0 | } |
258 | 0 | len += sizeof(tdb_off_t); |
259 | 0 | if (len < sizeof(tdb_off_t)) { |
260 | | /* overflow */ |
261 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n")); |
262 | 0 | return false; |
263 | 0 | } |
264 | | |
265 | 0 | if (len > rec->rec_len) { |
266 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
267 | 0 | "Record offset %u too short for contents\n", off)); |
268 | 0 | return false; |
269 | 0 | } |
270 | | |
271 | 0 | key = get_bytes(tdb, off + sizeof(*rec), rec->key_len); |
272 | 0 | if (!key.dptr) |
273 | 0 | return false; |
274 | | |
275 | 0 | if (tdb->hash_fn(&key) != rec->full_hash) { |
276 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
277 | 0 | "Record offset %u has incorrect hash\n", off)); |
278 | 0 | goto fail_put_key; |
279 | 0 | } |
280 | | |
281 | | /* Mark this offset as a known value for this hash bucket. */ |
282 | 0 | record_offset(hashes[BUCKET(rec->full_hash)+1], off); |
283 | | /* And similarly if the next pointer is valid. */ |
284 | 0 | if (rec->next) |
285 | 0 | record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next); |
286 | | |
287 | | /* If they supply a check function and this record isn't dead, |
288 | | get data and feed it. */ |
289 | 0 | if (check && rec->magic != TDB_DEAD_MAGIC) { |
290 | 0 | data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len, |
291 | 0 | rec->data_len); |
292 | 0 | if (!data.dptr) |
293 | 0 | goto fail_put_key; |
294 | | |
295 | 0 | if (check(key, data, private_data) == -1) |
296 | 0 | goto fail_put_data; |
297 | 0 | put_bytes(tdb, data); |
298 | 0 | } |
299 | | |
300 | 0 | put_bytes(tdb, key); |
301 | 0 | return true; |
302 | | |
303 | 0 | fail_put_data: |
304 | 0 | put_bytes(tdb, data); |
305 | 0 | fail_put_key: |
306 | 0 | put_bytes(tdb, key); |
307 | 0 | return false; |
308 | 0 | } |
309 | | |
310 | | /* Check that an unused record is valid. */ |
311 | | static bool tdb_check_free_record(struct tdb_context *tdb, |
312 | | tdb_off_t off, |
313 | | const struct tdb_record *rec, |
314 | | unsigned char **hashes) |
315 | 0 | { |
316 | 0 | if (!tdb_check_record(tdb, off, rec)) |
317 | 0 | return false; |
318 | | |
319 | | /* Mark this offset as a known value for the free list. */ |
320 | 0 | record_offset(hashes[0], off); |
321 | | /* And similarly if the next pointer is valid. */ |
322 | 0 | if (rec->next) |
323 | 0 | record_offset(hashes[0], rec->next); |
324 | 0 | return true; |
325 | 0 | } |
326 | | |
327 | | /* Slow, but should be very rare. */ |
328 | | size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off) |
329 | 0 | { |
330 | 0 | size_t len; |
331 | |
|
332 | 0 | for (len = 0; off + len < tdb->map_size; len++) { |
333 | 0 | char c; |
334 | 0 | if (tdb->methods->tdb_read(tdb, off, &c, 1, 0)) |
335 | 0 | return 0; |
336 | 0 | if (c != 0 && c != 0x42) |
337 | 0 | break; |
338 | 0 | } |
339 | 0 | return len; |
340 | 0 | } |
341 | | |
/* Check the consistency of the whole database: validate the header,
 * walk every record linearly, verify hash-chain and free-list
 * membership via the cancelling bitmaps described above, and account
 * for the recovery area.  CHECK, if non-NULL, is invoked on each live
 * key/data pair and may veto by returning -1.  Returns 0 on success,
 * -1 with tdb->ecode set on error or corruption. */
_PUBLIC_ int tdb_check(struct tdb_context *tdb,
	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
	      void *private_data)
{
	unsigned int h;
	unsigned char **hashes;
	tdb_off_t off, recovery_start;
	struct tdb_record rec;
	bool found_recovery = false;
	tdb_len_t dead;
	bool locked;

	/* Read-only databases use no locking at all: it's best-effort.
	 * We may have a write lock already, so skip that case too. */
	if (tdb->read_only || tdb->allrecord_lock.count != 0) {
		locked = false;
	} else {
		if (tdb_lockall_read(tdb) == -1)
			return -1;
		locked = true;
	}

	/* Make sure we know true size of the underlying file.
	 * (Return value deliberately ignored: called for the side effect
	 * of refreshing map_size.) */
	tdb_oob(tdb, tdb->map_size, 1, 1);

	/* Header must be OK: also gets us the recovery ptr, if any. */
	if (!tdb_check_header(tdb, &recovery_start))
		goto unlock;

	/* We should have the whole header, too. */
	if (tdb->map_size < TDB_DATA_START(tdb->hash_size)) {
		tdb->ecode = TDB_ERR_CORRUPT;
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
		goto unlock;
	}

	/* One big malloc: pointers then bit arrays.
	 * hashes[0] is the free-list bitmap; hashes[1..hash_size] are the
	 * per-chain bitmaps — hence the 1+hash_size sizing throughout. */
	hashes = (unsigned char **)calloc(
		1, sizeof(hashes[0]) * (1+tdb->hash_size)
		+ BITMAP_BITS / CHAR_BIT * (1+tdb->hash_size));
	if (!hashes) {
		tdb->ecode = TDB_ERR_OOM;
		goto unlock;
	}

	/* Initialize pointers: bitmaps live immediately after the pointer
	 * array, one BITMAP_BITS-wide bitmap per chain. */
	hashes[0] = (unsigned char *)(&hashes[1+tdb->hash_size]);
	for (h = 1; h < 1+tdb->hash_size; h++)
		hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;

	/* Freelist and hash headers are all in a row: read them.
	 * Each non-zero chain head counts as one reference. */
	for (h = 0; h < 1+tdb->hash_size; h++) {
		if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
				 &off) == -1)
			goto free;
		if (off)
			record_offset(hashes[h], off);
	}

	/* For each record, read it in and check it's ok.
	 * The loop advances by header + payload length each time. */
	for (off = TDB_DATA_START(tdb->hash_size);
	     off < tdb->map_size;
	     off += sizeof(rec) + rec.rec_len) {
		if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
					   DOCONV()) == -1)
			goto free;
		switch (rec.magic) {
		case TDB_MAGIC:
		case TDB_DEAD_MAGIC:
			if (!tdb_check_used_record(tdb, off, &rec, hashes,
						   check, private_data))
				goto free;
			break;
		case TDB_FREE_MAGIC:
			if (!tdb_check_free_record(tdb, off, &rec, hashes))
				goto free;
			break;
		/* If we crash after ftruncate, we can get zeroes or fill. */
		case TDB_RECOVERY_INVALID_MAGIC:
		case 0x42424242:
			if (recovery_start == off) {
				found_recovery = true;
				break;
			}
			dead = tdb_dead_space(tdb, off);
			if (dead < sizeof(rec))
				goto corrupt;

			TDB_LOG((tdb, TDB_DEBUG_ERROR,
				 "Dead space at %u-%u (of %u)\n",
				 off, off + dead, tdb->map_size));
			/* Synthesize a record length so the loop header
			 * skips over the dead region. */
			rec.rec_len = dead - sizeof(rec);
			break;
		case TDB_RECOVERY_MAGIC:
			if (recovery_start != off) {
				TDB_LOG((tdb, TDB_DEBUG_ERROR,
					 "Unexpected recovery record at offset %u\n",
					 off));
				goto free;
			}
			found_recovery = true;
			break;
		/* corrupt: is also a goto target from the dead-space
		 * path above, not only the default case. */
		default: ;
		corrupt:
			tdb->ecode = TDB_ERR_CORRUPT;
			TDB_LOG((tdb, TDB_DEBUG_ERROR,
				 "Bad magic 0x%x at offset %u\n",
				 rec.magic, off));
			goto free;
		}
	}

	/* Now, hashes should all be empty: each record exists and is referred
	 * to by one other.  (Every offset was flipped in twice, so the
	 * bitmaps cancel to zero when the lists are consistent.) */
	for (h = 0; h < 1+tdb->hash_size; h++) {
		unsigned int i;
		for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
			if (hashes[h][i] != 0) {
				tdb->ecode = TDB_ERR_CORRUPT;
				TDB_LOG((tdb, TDB_DEBUG_ERROR,
					 "Hashes do not match records\n"));
				goto free;
			}
		}
	}

	/* We must have found recovery area if there was one. */
	if (recovery_start != 0 && !found_recovery) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Expected a recovery area at %u\n",
			 recovery_start));
		goto free;
	}

	free(hashes);
	if (locked) {
		tdb_unlockall_read(tdb);
	}
	return 0;

 free:
	free(hashes);
 unlock:
	if (locked) {
		tdb_unlockall_read(tdb);
	}
	return -1;
}