/src/samba/third_party/heimdal/lib/base/bsearch.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2011, Secure Endpoints Inc. |
3 | | * All rights reserved. |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions |
7 | | * are met: |
8 | | * |
9 | | * - Redistributions of source code must retain the above copyright |
10 | | * notice, this list of conditions and the following disclaimer. |
11 | | * |
12 | | * - Redistributions in binary form must reproduce the above copyright |
13 | | * notice, this list of conditions and the following disclaimer in |
14 | | * the documentation and/or other materials provided with the |
15 | | * distribution. |
16 | | * |
17 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
18 | | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
19 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
20 | | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
21 | | * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, |
22 | | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
23 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
24 | | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
25 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
26 | | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
27 | | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
28 | | * OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | * |
30 | | */ |
31 | | |
32 | | #include "baselocl.h" |
33 | | |
34 | | #include <sys/types.h> |
35 | | #include <sys/stat.h> |
36 | | #ifdef HAVE_IO_H |
37 | | #include <io.h> |
38 | | #endif |
39 | | #ifdef HAVE_UNISTD_H |
40 | | #include <unistd.h> |
41 | | #endif |
42 | | #include <fcntl.h> |
43 | | #include <ctype.h> |
44 | | #include <stdio.h> |
45 | | #include <stdlib.h> |
46 | | #include <string.h> |
47 | | #ifdef HAVE_STRINGS_H |
48 | | #include <strings.h> |
49 | | #endif |
50 | | #include <errno.h> |
51 | | #include <assert.h> |
52 | | |
53 | | /* |
54 | | * This file contains functions for binary searching flat text in memory |
55 | | * and in text files where each line is a [variable length] record. |
56 | | * Each record has a key and an optional value separated from the key by |
57 | | * unquoted whitespace. Whitespace in the key, and leading whitespace |
58 | | * for the value, can be quoted with backslashes (but CR and LF must be |
59 | | * quoted in such a way that they don't appear in the quoted result). |
60 | | * |
61 | | * Binary searching a tree are normally a dead simple algorithm. It |
62 | | * turns out that binary searching flat text with *variable* length |
63 | | * records is... tricky. There's no indexes to record beginning bytes, |
64 | | * thus any index selected during the search is likely to fall in the |
65 | | * middle of a record. When deciding to search a left sub-tree one |
66 | | * might fail to find the last record in that sub-tree on account of the |
67 | | * right boundary falling in the middle of it -- the chosen solution to |
68 | | * this makes left sub-tree searches slightly less efficient than right |
69 | | * sub-tree searches. |
70 | | * |
71 | | * If binary searching flat text in memory is tricky, using block-wise |
72 | | * I/O instead is trickier! But it's necessary in order to support |
73 | | * large files (which we either can't or wouldn't want to read or map |
74 | | * into memory). Each block we read has to be large enough that the |
75 | | * largest record can fit in it. And each block might start and/or end |
76 | | * in the middle of a record. Here it is the right sub-tree searches |
77 | | * that are less efficient than left sub-tree searches. |
78 | | * |
79 | | * bsearch_common() contains the common text block binary search code. |
80 | | * |
81 | | * _bsearch_text() is the interface for searching in-core text. |
82 | | * _bsearch_file() is the interface for block-wise searching files. |
83 | | */ |
84 | | |
85 | | struct bsearch_file_handle { |
86 | | int fd; /* file descriptor */ |
87 | | char *cache; /* cache bytes */ |
88 | | char *page; /* one double-size page worth of bytes */ |
89 | | size_t file_sz; /* file size */ |
90 | | size_t cache_sz; /* cache size */ |
91 | | size_t page_sz; /* page size */ |
92 | | }; |
93 | | |
94 | | /* Find a new-line */ |
95 | | static const char * |
96 | | find_line(const char *buf, size_t i, size_t right) |
97 | 0 | { |
98 | 0 | if (i == 0) |
99 | 0 | return &buf[i]; |
100 | 0 | for (; i < right; i++) { |
101 | 0 | if (buf[i] == '\n') { |
102 | 0 | if ((i + 1) < right) |
103 | 0 | return &buf[i + 1]; |
104 | 0 | return NULL; |
105 | 0 | } |
106 | 0 | } |
107 | 0 | return NULL; |
108 | 0 | } |
109 | | |
110 | | /* |
111 | | * Common routine for binary searching text in core. |
112 | | * |
113 | | * Perform a binary search of a char array containing a block from a |
114 | | * text file where each line is a record (LF and CRLF supported). Each |
115 | | * record consists of a key followed by an optional value separated from |
116 | | * the key by whitespace. Whitespace can be quoted with backslashes. |
117 | | * It's the caller's responsibility to encode/decode keys/values if |
118 | | * quoting is desired; newlines should be encoded such that a newline |
119 | | * does not appear in the result. |
120 | | * |
121 | | * All output arguments are optional. |
122 | | * |
123 | | * Returns 0 if key is found, -1 if not found, or an error code such as |
124 | | * ENOMEM in case of error. |
125 | | * |
126 | | * Inputs: |
127 | | * |
128 | | * @buf String to search |
129 | | * @sz Size of string to search |
130 | | * @key Key string to search for |
131 | | * @buf_is_start True if the buffer starts with a record, false if it |
132 | | * starts in the middle of a record or if the caller |
133 | | * doesn't know. |
134 | | * |
135 | | * Outputs: |
136 | | * |
137 | | * @value Location to store a copy of the value (caller must free) |
138 | | * @location Record location if found else the location where the |
139 | | * record should be inserted (index into @buf) |
140 | | * @cmp Set to less than or greater than 0 to indicate that a |
141 | | * key not found would have fit in an earlier or later |
142 | | * part of a file. Callers should use this to decide |
143 | | * whether to read a block to the left or to the right and |
144 | | * search that. |
145 | | * @loops Location to store a count of bisections required for |
146 | | * search (useful for confirming logarithmic performance) |
147 | | */ |
148 | | static int |
149 | | bsearch_common(const char *buf, size_t sz, const char *key, |
150 | | int buf_is_start, char **value, size_t *location, |
151 | | int *cmp, size_t *loops) |
152 | 0 | { |
153 | 0 | const char *linep; |
154 | 0 | size_t key_start, key_len; /* key string in buf */ |
155 | 0 | size_t val_start, val_len; /* value string in buf */ |
156 | 0 | int key_cmp = -1; |
157 | 0 | size_t k; |
158 | 0 | size_t l; /* left side of buffer for binary search */ |
159 | 0 | size_t r; /* right side of buffer for binary search */ |
160 | 0 | size_t rmax; /* right side of buffer for binary search */ |
161 | 0 | size_t i; /* index into buffer, typically in the middle of l and r */ |
162 | 0 | size_t loop_count = 0; |
163 | 0 | int ret = -1; |
164 | |
|
165 | 0 | if (value) |
166 | 0 | *value = NULL; |
167 | 0 | if (cmp) |
168 | 0 | *cmp = 0; |
169 | 0 | if (loops) |
170 | 0 | *loops = 0; |
171 | | |
172 | | /* Binary search; file should be sorted */ |
173 | 0 | for (l = 0, r = rmax = sz, i = sz >> 1; i >= l && i < rmax; loop_count++) { |
174 | 0 | heim_assert(i < sz, "invalid aname2lname db index"); |
175 | | |
176 | | /* buf[i] is likely in the middle of a line; find the next line */ |
177 | 0 | linep = find_line(buf, i, rmax); |
178 | 0 | k = linep ? linep - buf : i; |
179 | 0 | if (linep == NULL || k >= rmax) { |
180 | | /* |
181 | | * No new line found to the right; search to the left then |
182 | | * but don't change rmax (this isn't optimal, but it's |
183 | | * simple). |
184 | | */ |
185 | 0 | if (i == l) |
186 | 0 | break; |
187 | 0 | r = i; |
188 | 0 | i = l + ((r - l) >> 1); |
189 | 0 | continue; |
190 | 0 | } |
191 | 0 | i = k; |
192 | 0 | heim_assert(i >= l && i < rmax, "invalid aname2lname db index"); |
193 | | |
194 | | /* Got a line; check it */ |
195 | | |
196 | | /* Search for and split on unquoted whitespace */ |
197 | 0 | val_start = 0; |
198 | 0 | for (key_start = i, key_len = 0, val_len = 0, k = i; k < rmax; k++) { |
199 | 0 | if (buf[k] == '\\') { |
200 | 0 | k++; |
201 | 0 | continue; |
202 | 0 | } |
203 | 0 | if (buf[k] == '\r' || buf[k] == '\n') { |
204 | | /* We now know where the key ends, and there's no value */ |
205 | 0 | key_len = k - i; |
206 | 0 | break; |
207 | 0 | } |
208 | 0 | if (!isspace((unsigned char)buf[k])) |
209 | 0 | continue; |
210 | | |
211 | 0 | while (k < rmax && isspace((unsigned char)buf[k])) { |
212 | 0 | key_len = k - i; |
213 | 0 | k++; |
214 | 0 | } |
215 | 0 | if (k < rmax) |
216 | 0 | val_start = k; |
217 | | /* Find end of value */ |
218 | 0 | for (; k < rmax && buf[k] != '\0'; k++) { |
219 | 0 | if (buf[k] == '\r' || buf[k] == '\n') { |
220 | 0 | val_len = k - val_start; |
221 | 0 | break; |
222 | 0 | } |
223 | 0 | } |
224 | 0 | break; |
225 | 0 | } |
226 | | |
227 | | /* |
228 | | * The following logic is for dealing with partial buffers, |
229 | | * which we use for block-wise binary searches of large files |
230 | | */ |
231 | 0 | if (key_start == 0 && !buf_is_start) { |
232 | | /* |
233 | | * We're at the beginning of a block that might have started |
234 | | * in the middle of a record whose "key" might well compare |
235 | | * as greater than the key we're looking for, so we don't |
236 | | * bother comparing -- we know key_cmp must be -1 here. |
237 | | */ |
238 | 0 | key_cmp = -1; |
239 | 0 | break; |
240 | 0 | } |
241 | 0 | if ((val_len && buf[val_start + val_len] != '\n') || |
242 | 0 | (!val_len && buf[key_start + key_len] != '\n')) { |
243 | | /* |
244 | | * We're at the end of a block that ends in the middle of a |
245 | | * record whose "key" might well compare as less than the |
246 | | * key we're looking for, so we don't bother comparing -- we |
247 | | * know key_cmp must be >= 0 but we can't tell. Our caller |
248 | | * will end up reading a double-size block to handle this. |
249 | | */ |
250 | 0 | key_cmp = 1; |
251 | 0 | break; |
252 | 0 | } |
253 | | |
254 | 0 | key_cmp = strncmp(key, &buf[key_start], key_len); |
255 | 0 | if (key_cmp == 0 && strlen(key) != key_len) |
256 | 0 | key_cmp = 1; |
257 | 0 | if (key_cmp < 0) { |
258 | | /* search left */ |
259 | 0 | r = rmax = (linep - buf); |
260 | 0 | i = l + ((r - l) >> 1); |
261 | 0 | if (location) |
262 | 0 | *location = key_start; |
263 | 0 | } else if (key_cmp > 0) { |
264 | | /* search right */ |
265 | 0 | if (l == i) |
266 | 0 | break; /* not found */ |
267 | 0 | l = i; |
268 | 0 | i = l + ((r - l) >> 1); |
269 | 0 | if (location) |
270 | 0 | *location = val_start + val_len; |
271 | 0 | } else { |
272 | | /* match! */ |
273 | 0 | if (location) |
274 | 0 | *location = key_start; |
275 | 0 | ret = 0; |
276 | 0 | if (val_len && value) { |
277 | | /* Avoid strndup() so we don't need libroken here yet */ |
278 | 0 | if ((*value = malloc(val_len + 1))) { |
279 | 0 | (void) memcpy(*value, &buf[val_start], val_len); |
280 | 0 | (*value)[val_len] = '\0'; |
281 | 0 | } else { |
282 | 0 | ret = errno; |
283 | 0 | } |
284 | 0 | } |
285 | 0 | break; |
286 | 0 | } |
287 | 0 | } |
288 | | |
289 | 0 | if (cmp) |
290 | 0 | *cmp = key_cmp; |
291 | 0 | if (loops) |
292 | 0 | *loops = loop_count; |
293 | |
|
294 | 0 | return ret; |
295 | 0 | } |
296 | | |
297 | | /* |
298 | | * Binary search a char array containing sorted text records separated |
299 | | * by new-lines (or CRLF). Each record consists of a key and an |
300 | | * optional value following the key, separated from the key by unquoted |
301 | | * whitespace. |
302 | | * |
303 | | * All output arguments are optional. |
304 | | * |
305 | | * Returns 0 if key is found, -1 if not found, or an error code such as |
306 | | * ENOMEM in case of error. |
307 | | * |
308 | | * Inputs: |
309 | | * |
310 | | * @buf Char array pointer |
311 | | * @buf_sz Size of buf |
312 | | * @key Key to search for |
313 | | * |
314 | | * Outputs: |
315 | | * |
316 | | * @value Location where to put the value, if any (caller must free) |
317 | | * @location Record location if found else the location where the record |
318 | | * should be inserted (index into @buf) |
319 | | * @loops Location where to put a number of loops (or comparisons) |
320 | | * needed for the search (useful for benchmarking) |
321 | | */ |
322 | | int |
323 | | _bsearch_text(const char *buf, size_t buf_sz, const char *key, |
324 | | char **value, size_t *location, size_t *loops) |
325 | 0 | { |
326 | 0 | return bsearch_common(buf, buf_sz, key, 1, value, location, NULL, loops); |
327 | 0 | } |
328 | | |
329 | 0 | #define MAX_BLOCK_SIZE (1024 * 1024) |
330 | 0 | #define DEFAULT_MAX_FILE_SIZE (1024 * 1024) |
331 | | /* |
332 | | * Open a file for binary searching. The file will be read in entirely |
333 | | * if it is smaller than @max_sz, else a cache of @max_sz bytes will be |
334 | | * allocated. |
335 | | * |
336 | | * Returns 0 on success, else an error number or -1 if the file is empty. |
337 | | * |
338 | | * Inputs: |
339 | | * |
340 | | * @fname Name of file to open |
341 | | * @max_sz Maximum size of cache to allocate, in bytes (if zero, default) |
342 | | * @page_sz Page size (must be a power of two, larger than 256, smaller |
343 | | * than 1MB; if zero use default) |
344 | | * |
345 | | * Outputs: |
346 | | * |
347 | | * @bfh Handle for use with _bsearch_file() and _bsearch_file_close() |
348 | | * @reads Number of reads performed |
349 | | */ |
350 | | int |
351 | | _bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz, |
352 | | bsearch_file_handle *bfh, size_t *reads) |
353 | 0 | { |
354 | 0 | bsearch_file_handle new_bfh = NULL; |
355 | 0 | struct stat st; |
356 | 0 | size_t i; |
357 | 0 | int fd; |
358 | 0 | int ret; |
359 | |
|
360 | 0 | *bfh = NULL; |
361 | |
|
362 | 0 | if (reads) |
363 | 0 | *reads = 0; |
364 | |
|
365 | 0 | fd = open(fname, O_RDONLY); |
366 | 0 | if (fd == -1) |
367 | 0 | return errno; |
368 | | |
369 | 0 | if (fstat(fd, &st) == -1) { |
370 | 0 | ret = errno; |
371 | 0 | goto err; |
372 | 0 | } |
373 | | |
374 | 0 | if (st.st_size == 0) { |
375 | 0 | ret = -1; /* no data -> no binary search */ |
376 | 0 | goto err; |
377 | 0 | } |
378 | | |
379 | | /* Validate / default arguments */ |
380 | 0 | if (max_sz == 0) |
381 | 0 | max_sz = DEFAULT_MAX_FILE_SIZE; |
382 | 0 | for (i = page_sz; i; i >>= 1) { |
383 | | /* Make sure page_sz is a power of two */ |
384 | 0 | if ((i % 2) && (i >> 1)) { |
385 | 0 | page_sz = 0; |
386 | 0 | break; |
387 | 0 | } |
388 | 0 | } |
389 | 0 | if (page_sz == 0) |
390 | | #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE |
391 | | page_sz = st.st_blksize; |
392 | | #else |
393 | 0 | page_sz = 4096; |
394 | 0 | #endif |
395 | 0 | for (i = page_sz; i; i >>= 1) { |
396 | | /* Make sure page_sz is a power of two */ |
397 | 0 | if ((i % 2) && (i >> 1)) { |
398 | | /* Can't happen! Filesystems always use powers of two! */ |
399 | 0 | page_sz = 4096; |
400 | 0 | break; |
401 | 0 | } |
402 | 0 | } |
403 | 0 | if (page_sz > MAX_BLOCK_SIZE) |
404 | 0 | page_sz = MAX_BLOCK_SIZE; |
405 | |
|
406 | 0 | new_bfh = calloc(1, sizeof (*new_bfh)); |
407 | 0 | if (new_bfh == NULL) { |
408 | 0 | ret = ENOMEM; |
409 | 0 | goto err; |
410 | 0 | } |
411 | | |
412 | 0 | new_bfh->fd = fd; |
413 | 0 | new_bfh->page_sz = page_sz; |
414 | 0 | new_bfh->file_sz = st.st_size; |
415 | |
|
416 | 0 | if (max_sz >= st.st_size) { |
417 | | /* Whole-file method */ |
418 | 0 | new_bfh->cache = malloc(st.st_size + 1); |
419 | 0 | if (new_bfh->cache) { |
420 | 0 | new_bfh->cache[st.st_size] = '\0'; |
421 | 0 | new_bfh->cache_sz = st.st_size; |
422 | 0 | ret = read(fd, new_bfh->cache, st.st_size); |
423 | 0 | if (ret < 0) { |
424 | 0 | ret = errno; |
425 | 0 | goto err; |
426 | 0 | } |
427 | 0 | if (ret != st.st_size) { |
428 | 0 | ret = EIO; /* XXX ??? */ |
429 | 0 | goto err; |
430 | 0 | } |
431 | 0 | if (reads) |
432 | 0 | *reads = 1; |
433 | 0 | (void) close(fd); |
434 | 0 | new_bfh->fd = -1; |
435 | 0 | *bfh = new_bfh; |
436 | 0 | return 0; |
437 | 0 | } |
438 | 0 | } |
439 | | |
440 | | /* Block-size method, or above malloc() failed */ |
441 | 0 | new_bfh->page = malloc(new_bfh->page_sz << 1); |
442 | 0 | if (new_bfh->page == NULL) { |
443 | | /* Can't even allocate a single double-size page! */ |
444 | 0 | ret = ENOMEM; |
445 | 0 | goto err; |
446 | 0 | } |
447 | | |
448 | 0 | new_bfh->cache_sz = max_sz < st.st_size ? max_sz : st.st_size; |
449 | 0 | new_bfh->cache = malloc(new_bfh->cache_sz); |
450 | 0 | *bfh = new_bfh; |
451 | | |
452 | | /* |
453 | | * malloc() may have failed because we were asking for a lot of |
454 | | * memory, but we may still be able to operate without a cache, |
455 | | * so let's not fail. |
456 | | */ |
457 | 0 | if (new_bfh->cache == NULL) { |
458 | 0 | new_bfh->cache_sz = 0; |
459 | 0 | return 0; |
460 | 0 | } |
461 | | |
462 | | /* Initialize cache */ |
463 | 0 | for (i = 0; i < new_bfh->cache_sz; i += new_bfh->page_sz) |
464 | 0 | new_bfh->cache[i] = '\0'; |
465 | 0 | return 0; |
466 | | |
467 | 0 | err: |
468 | 0 | (void) close(fd); |
469 | 0 | if (new_bfh) { |
470 | 0 | free(new_bfh->page); |
471 | 0 | free(new_bfh->cache); |
472 | 0 | free(new_bfh); |
473 | 0 | } |
474 | 0 | return ret; |
475 | 0 | } |
476 | | |
477 | | /* |
478 | | * Indicate whether the given binary search file handle will be searched |
479 | | * with block-wise method. |
480 | | */ |
481 | | void |
482 | | _bsearch_file_info(bsearch_file_handle bfh, |
483 | | size_t *page_sz, size_t *max_sz, int *blockwise) |
484 | 0 | { |
485 | 0 | if (page_sz) |
486 | 0 | *page_sz = bfh->page_sz; |
487 | 0 | if (max_sz) |
488 | 0 | *max_sz = bfh->cache_sz; |
489 | 0 | if (blockwise) |
490 | 0 | *blockwise = (bfh->file_sz != bfh->cache_sz); |
491 | 0 | } |
492 | | |
493 | | /* |
494 | | * Close the given binary file search handle. |
495 | | * |
496 | | * Inputs: |
497 | | * |
498 | | * @bfh Pointer to variable containing handle to close. |
499 | | */ |
500 | | void |
501 | | _bsearch_file_close(bsearch_file_handle *bfh) |
502 | 0 | { |
503 | 0 | if (!*bfh) |
504 | 0 | return; |
505 | 0 | if ((*bfh)->fd >= 0) |
506 | 0 | (void) close((*bfh)->fd); |
507 | 0 | if ((*bfh)->page) |
508 | 0 | free((*bfh)->page); |
509 | 0 | if ((*bfh)->cache) |
510 | 0 | free((*bfh)->cache); |
511 | 0 | free(*bfh); |
512 | 0 | *bfh = NULL; |
513 | 0 | } |
514 | | |
515 | | /* |
516 | | * Private function to get a page from a cache. The cache is a char |
517 | | * array of 2^n - 1 double-size page worth of bytes, where n is the |
518 | | * number of tree levels that the cache stores. The cache can be |
519 | | * smaller than n implies. |
520 | | * |
521 | | * The page may or may not be valid. If the first byte of it is NUL |
522 | | * then it's not valid, else it is. |
523 | | * |
524 | | * Returns 1 if page is in cache and valid, 0 if the cache is too small |
525 | | * or the page is invalid. The page address is output in @buf if the |
526 | | * cache is large enough to contain it regardless of whether the page is |
527 | | * valid. |
528 | | * |
529 | | * Inputs: |
530 | | * |
531 | | * @bfh Binary search file handle |
532 | | * @level Level in the tree that we want a page for |
533 | | * @page_idx Page number in the given level (0..2^level - 1) |
534 | | * |
535 | | * Outputs: |
536 | | * |
537 | | * @buf Set to address of page if the cache is large enough |
538 | | */ |
539 | | static int |
540 | | get_page_from_cache(bsearch_file_handle bfh, size_t level, size_t page_idx, |
541 | | char **buf) |
542 | 0 | { |
543 | 0 | size_t idx = 0; |
544 | 0 | size_t page_sz; |
545 | |
|
546 | 0 | page_sz = bfh->page_sz << 1; /* we use double-size pages in the cache */ |
547 | |
|
548 | 0 | *buf = NULL; |
549 | | |
550 | | /* |
551 | | * Compute index into cache. The cache is basically an array of |
552 | | * double-size pages. The first (zeroth) double-size page in the |
553 | | * cache will be the middle page of the file -- the root of the |
554 | | * tree. The next two double-size pages will be the left and right |
555 | | * pages of the second level in the tree. The next four double-size |
556 | | * pages will be the four pages at the next level. And so on for as |
557 | | * many pages as fit in the cache. |
558 | | * |
559 | | * The page index is the number of the page at the given level. We |
560 | | * then compute (2^level - 1 + page index) * 2page size, check that |
561 | | * we have that in the cache, check that the page has been read (it |
562 | | * doesn't start with NUL). |
563 | | */ |
564 | 0 | if (level) |
565 | 0 | idx = (1 << level) - 1 + page_idx; |
566 | 0 | if (((idx + 1) * page_sz * 2) > bfh->cache_sz) |
567 | 0 | return 0; |
568 | | |
569 | 0 | *buf = &bfh->cache[idx * page_sz * 2]; |
570 | 0 | if (bfh->cache[idx * page_sz * 2] == '\0') |
571 | 0 | return 0; /* cache[idx] == NUL -> page not loaded in cache */ |
572 | 0 | return 1; |
573 | 0 | } |
574 | | |
575 | | /* |
576 | | * Private function to read a page of @page_sz from @fd at offset @off |
577 | | * into @buf, outputing the number of bytes read, which will be the same |
578 | | * as @page_sz unless the page being read is the last page, in which |
579 | | * case the number of remaining bytes in the file will be output. |
580 | | * |
581 | | * Returns 0 on success or an errno value otherwise (EIO if reads are |
582 | | * short). |
583 | | * |
584 | | * Inputs: |
585 | | * |
586 | | * @bfh Binary search file handle |
587 | | * @level Level in the binary search tree that we're at |
588 | | * @page_idx Page "index" at the @level of the tree that we want |
589 | | * @page Actual page number that we want |
590 | | * want_double Whether we need a page or double page read |
591 | | * |
592 | | * Outputs: |
593 | | * |
594 | | * @buf Page read or cached |
595 | | * @bytes Bytes read (may be less than page or double page size in |
596 | | * the case of the last page, of course) |
597 | | */ |
598 | | static int |
599 | | read_page(bsearch_file_handle bfh, size_t level, size_t page_idx, size_t page, |
600 | | int want_double, const char **buf, size_t *bytes) |
601 | 0 | { |
602 | 0 | int ret; |
603 | 0 | off_t off; |
604 | 0 | size_t expected; |
605 | 0 | size_t wanted; |
606 | 0 | char *page_buf; |
607 | | |
608 | | /* Figure out where we're reading and how much */ |
609 | 0 | off = page * bfh->page_sz; |
610 | 0 | if (off < 0) |
611 | 0 | return EOVERFLOW; |
612 | | |
613 | 0 | wanted = bfh->page_sz << want_double; |
614 | 0 | expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off; |
615 | |
|
616 | 0 | if (get_page_from_cache(bfh, level, page_idx, &page_buf)) { |
617 | 0 | *buf = page_buf; |
618 | 0 | *bytes = expected; |
619 | 0 | return 0; /* found in cache */ |
620 | 0 | } |
621 | | |
622 | | |
623 | 0 | *bytes = 0; |
624 | 0 | *buf = NULL; |
625 | | |
626 | | /* OK, we have to read a page or double-size page */ |
627 | |
|
628 | 0 | if (page_buf) |
629 | 0 | want_double = 1; /* we'll be caching; we cache double-size pages */ |
630 | 0 | else |
631 | 0 | page_buf = bfh->page; /* we won't cache this page */ |
632 | |
|
633 | 0 | wanted = bfh->page_sz << want_double; |
634 | 0 | expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off; |
635 | |
|
636 | 0 | #ifdef HAVE_PREAD |
637 | 0 | ret = pread(bfh->fd, page_buf, expected, off); |
638 | | #else |
639 | | if (lseek(bfh->fd, off, SEEK_SET) == (off_t)-1) |
640 | | return errno; |
641 | | ret = read(bfh->fd, page_buf, expected); |
642 | | #endif |
643 | 0 | if (ret < 0) |
644 | 0 | return errno; |
645 | | |
646 | 0 | if (ret != expected) |
647 | 0 | return EIO; /* XXX ??? */ |
648 | | |
649 | 0 | *buf = page_buf; |
650 | 0 | *bytes = expected; |
651 | 0 | return 0; |
652 | 0 | } |
653 | | |
654 | | /* |
655 | | * Perform a binary search of a file where each line is a record (LF and |
656 | | * CRLF supported). Each record consists of a key followed by an |
657 | | * optional value separated from the key by whitespace. Whitespace can |
658 | | * be quoted with backslashes. It's the caller's responsibility to |
659 | | * encode/decode keys/values if quoting is desired; newlines should be |
660 | | * encoded such that a newline does not appear in the result. |
661 | | * |
662 | | * The search is done with block-wise I/O (i.e., the whole file is not |
663 | | * read into memory). |
664 | | * |
665 | | * All output arguments are optional. |
666 | | * |
667 | | * Returns 0 if key is found, -1 if not found, or an error code such as |
668 | | * ENOMEM in case of error. |
669 | | * |
670 | | * NOTE: We could improve this by not freeing the buffer, instead |
671 | | * requiring that the caller provide it. Further, we could cache |
672 | | * the top N levels of [double-size] pages (2^N - 1 pages), which |
673 | | * should speed up most searches by reducing the number of reads |
674 | | * by N. |
675 | | * |
676 | | * Inputs: |
677 | | * |
678 | | * @fd File descriptor (file to search) |
679 | | * @page_sz Page size (if zero then the file's st_blksize will be used) |
680 | | * @key Key string to search for |
681 | | * |
682 | | * Outputs: |
683 | | * |
684 | | * @value Location to store a copy of the value (caller must free) |
685 | | * @location Record location if found else the location where the |
686 | | * record should be inserted (index into @buf) |
687 | | * @loops Location to store a count of bisections required for |
688 | | * search (useful for confirming logarithmic performance) |
689 | | * @reads Location to store a count of pages read during search |
690 | | * (useful for confirming logarithmic performance) |
691 | | */ |
692 | | int |
693 | | _bsearch_file(bsearch_file_handle bfh, const char *key, |
694 | | char **value, size_t *location, size_t *loops, size_t *reads) |
695 | 0 | { |
696 | 0 | int ret; |
697 | 0 | const char *buf; |
698 | 0 | size_t buf_sz; |
699 | 0 | size_t page, l, r; |
700 | 0 | size_t my_reads = 0; |
701 | 0 | size_t my_loops_total = 0; |
702 | 0 | size_t my_loops; |
703 | 0 | size_t level; /* level in the tree */ |
704 | 0 | size_t page_idx = 0; /* page number in the tree level */ |
705 | 0 | size_t buf_location; |
706 | 0 | int cmp; |
707 | 0 | int buf_ends_in_eol = 0; |
708 | 0 | int buf_is_start = 0; |
709 | |
|
710 | 0 | if (reads) |
711 | 0 | *reads = 0; |
712 | 0 | if (value) |
713 | 0 | *value = NULL; |
714 | 0 | if (loops) |
715 | 0 | *loops = 0; |
716 | | |
717 | | /* If whole file is in memory then search that and we're done */ |
718 | 0 | if (bfh->file_sz == bfh->cache_sz) |
719 | 0 | return _bsearch_text(bfh->cache, bfh->cache_sz, key, value, location, loops); |
720 | | |
721 | | /* Else block-wise binary search */ |
722 | | |
723 | 0 | l = 0; |
724 | 0 | r = (bfh->file_sz / bfh->page_sz) + 1; |
725 | 0 | for (level = 0, page = r >> 1; page >= l && page < r ; level++) { |
726 | 0 | ret = read_page(bfh, level, page_idx, page, 0, &buf, &buf_sz); |
727 | 0 | if (ret != 0) |
728 | 0 | return ret; |
729 | 0 | my_reads++; |
730 | 0 | if (buf[buf_sz - 1] == '\r' || buf[buf_sz - 1] == '\n') |
731 | 0 | buf_ends_in_eol = 1; |
732 | 0 | else |
733 | 0 | buf_ends_in_eol = 0; |
734 | |
|
735 | 0 | buf_is_start = page == 0 ? 1 : 0; |
736 | 0 | ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start, |
737 | 0 | value, &buf_location, &cmp, &my_loops); |
738 | 0 | if (ret > 0) |
739 | 0 | return ret; |
740 | | /* Found or no we update stats */ |
741 | 0 | my_loops_total += my_loops; |
742 | 0 | if (loops) |
743 | 0 | *loops = my_loops_total; |
744 | 0 | if (reads) |
745 | 0 | *reads = my_reads; |
746 | 0 | if (location) |
747 | 0 | *location = page * bfh->page_sz + buf_location; |
748 | 0 | if (ret == 0) |
749 | 0 | return 0; /* found! */ |
750 | | /* Not found */ |
751 | 0 | if (cmp < 0) { |
752 | | /* Search left */ |
753 | 0 | page_idx <<= 1; |
754 | 0 | r = page; |
755 | 0 | page = l + ((r - l) >> 1); |
756 | 0 | continue; |
757 | 0 | } else { |
758 | | /* |
759 | | * Search right, but first search the current and next |
760 | | * blocks in case that the record we're looking for either |
761 | | * straddles the boundary between this and the next record, |
762 | | * or in case the record starts exactly at the next page. |
763 | | */ |
764 | 0 | heim_assert(cmp > 0, "cmp > 0"); |
765 | | |
766 | 0 | if (!buf_ends_in_eol || page == l || page == (r - 1)) { |
767 | 0 | ret = read_page(bfh, level, page_idx, page, 1, &buf, &buf_sz); |
768 | 0 | if (ret != 0) |
769 | 0 | return ret; |
770 | 0 | my_reads++; |
771 | |
|
772 | 0 | buf_is_start = page == l ? 1 : 0; |
773 | |
|
774 | 0 | ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start, |
775 | 0 | value, &buf_location, &cmp, &my_loops); |
776 | 0 | if (ret > 0) |
777 | 0 | return ret; |
778 | 0 | my_loops_total += my_loops; |
779 | 0 | if (loops) |
780 | 0 | *loops = my_loops_total; |
781 | 0 | if (reads) |
782 | 0 | *reads = my_reads; |
783 | 0 | if (location) |
784 | 0 | *location = page * bfh->page_sz + buf_location; |
785 | 0 | if (ret == 0) |
786 | 0 | return 0; |
787 | 0 | } |
788 | | |
789 | | /* Oh well, search right */ |
790 | 0 | if (l == page && r == (l + 1)) |
791 | 0 | break; |
792 | 0 | page_idx = (page_idx << 1) + 1; |
793 | 0 | l = page; |
794 | 0 | page = l + ((r - l) >> 1); |
795 | 0 | continue; |
796 | 0 | } |
797 | 0 | } |
798 | 0 | return -1; |
799 | 0 | } |
800 | | |
801 | | |
802 | | static int |
803 | | stdb_open(void *plug, const char *dbtype, const char *dbname, |
804 | | heim_dict_t options, void **db, heim_error_t *error) |
805 | 0 | { |
806 | 0 | bsearch_file_handle bfh; |
807 | 0 | char *p; |
808 | 0 | int ret; |
809 | |
|
810 | 0 | if (error) |
811 | 0 | *error = NULL; |
812 | 0 | if (dbname == NULL || *dbname == '\0') { |
813 | 0 | if (error) |
814 | 0 | *error = heim_error_create(EINVAL, |
815 | 0 | N_("DB name required for sorted-text DB " |
816 | 0 | "plugin", "")); |
817 | 0 | return EINVAL; |
818 | 0 | } |
819 | 0 | p = strrchr(dbname, '.'); |
820 | 0 | if (p == NULL || strcmp(p, ".txt") != 0) { |
821 | 0 | if (error) |
822 | 0 | *error = heim_error_create(ENOTSUP, |
823 | 0 | N_("Text file (name ending in .txt) " |
824 | 0 | "required for sorted-text DB plugin", |
825 | 0 | "")); |
826 | 0 | return ENOTSUP; |
827 | 0 | } |
828 | | |
829 | 0 | ret = _bsearch_file_open(dbname, 0, 0, &bfh, NULL); |
830 | 0 | if (ret) |
831 | 0 | return ret; |
832 | | |
833 | 0 | *db = bfh; |
834 | 0 | return 0; |
835 | 0 | } |
836 | | |
837 | | static int |
838 | | stdb_close(void *db, heim_error_t *error) |
839 | 0 | { |
840 | 0 | bsearch_file_handle bfh = db; |
841 | |
|
842 | 0 | if (error) |
843 | 0 | *error = NULL; |
844 | 0 | _bsearch_file_close(&bfh); |
845 | 0 | return 0; |
846 | 0 | } |
847 | | |
848 | | static heim_data_t |
849 | | stdb_copy_value(void *db, heim_string_t table, heim_data_t key, |
850 | | heim_error_t *error) |
851 | 0 | { |
852 | 0 | bsearch_file_handle bfh = db; |
853 | 0 | const char *k; |
854 | 0 | char *v = NULL; |
855 | 0 | heim_data_t value; |
856 | 0 | int ret; |
857 | |
|
858 | 0 | if (error) |
859 | 0 | *error = NULL; |
860 | |
|
861 | 0 | if (table == NULL) |
862 | 0 | table = HSTR(""); |
863 | |
|
864 | 0 | if (table != HSTR("")) |
865 | 0 | return NULL; |
866 | | |
867 | 0 | if (heim_get_tid(key) == HEIM_TID_STRING) |
868 | 0 | k = heim_string_get_utf8((heim_string_t)key); |
869 | 0 | else |
870 | 0 | k = (const char *)heim_data_get_ptr(key); |
871 | 0 | ret = _bsearch_file(bfh, k, &v, NULL, NULL, NULL); |
872 | 0 | if (ret == 0 && v == NULL) |
873 | 0 | ret = -1; /* Quiet lint */ |
874 | 0 | if (ret != 0) { |
875 | 0 | if (ret > 0 && error) |
876 | 0 | *error = heim_error_create(ret, "%s", strerror(ret)); |
877 | 0 | return NULL; |
878 | 0 | } |
879 | 0 | value = heim_data_create(v, strlen(v)); |
880 | 0 | free(v); |
881 | | /* XXX Handle ENOMEM */ |
882 | 0 | return value; |
883 | 0 | } |
884 | | |
885 | | struct heim_db_type heim_sorted_text_file_dbtype = { |
886 | | 1, stdb_open, NULL, stdb_close, NULL, NULL, NULL, NULL, NULL, NULL, |
887 | | stdb_copy_value, NULL, NULL, NULL |
888 | | }; |