Coverage Report

Created: 2026-02-14 07:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/samba/third_party/heimdal/lib/base/bsearch.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2011, Secure Endpoints Inc.
3
 * All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions
7
 * are met:
8
 *
9
 * - Redistributions of source code must retain the above copyright
10
 *   notice, this list of conditions and the following disclaimer.
11
 *
12
 * - Redistributions in binary form must reproduce the above copyright
13
 *   notice, this list of conditions and the following disclaimer in
14
 *   the documentation and/or other materials provided with the
15
 *   distribution.
16
 *
17
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
22
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
28
 * OF THE POSSIBILITY OF SUCH DAMAGE.
29
 *
30
 */
31
32
#include "baselocl.h"
33
34
#include <sys/types.h>
35
#include <sys/stat.h>
36
#ifdef HAVE_IO_H
37
#include <io.h>
38
#endif
39
#ifdef HAVE_UNISTD_H
40
#include <unistd.h>
41
#endif
42
#include <fcntl.h>
43
#include <ctype.h>
44
#include <stdio.h>
45
#include <stdlib.h>
46
#include <string.h>
47
#ifdef HAVE_STRINGS_H
48
#include <strings.h>
49
#endif
50
#include <errno.h>
51
#include <assert.h>
52
53
/*
54
 * This file contains functions for binary searching flat text in memory
55
 * and in text files where each line is a [variable length] record.
56
 * Each record has a key and an optional value separated from the key by
57
 * unquoted whitespace.  Whitespace in the key, and leading whitespace
58
 * for the value, can be quoted with backslashes (but CR and LF must be
59
 * quoted in such a way that they don't appear in the quoted result).
60
 *
61
 * Binary searching a tree are normally a dead simple algorithm.  It
62
 * turns out that binary searching flat text with *variable* length
63
 * records is... tricky.  There's no indexes to record beginning bytes,
64
 * thus any index selected during the search is likely to fall in the
65
 * middle of a record.  When deciding to search a left sub-tree one
66
 * might fail to find the last record in that sub-tree on account of the
67
 * right boundary falling in the middle of it -- the chosen solution to
68
 * this makes left sub-tree searches slightly less efficient than right
69
 * sub-tree searches.
70
 *
71
 * If binary searching flat text in memory is tricky, using block-wise
72
 * I/O instead is trickier!  But it's necessary in order to support
73
 * large files (which we either can't or wouldn't want to read or map
74
 * into memory).  Each block we read has to be large enough that the
75
 * largest record can fit in it.  And each block might start and/or end
76
 * in the middle of a record.  Here it is the right sub-tree searches
77
 * that are less efficient than left sub-tree searches.
78
 *
79
 * bsearch_common() contains the common text block binary search code.
80
 *
81
 * _bsearch_text() is the interface for searching in-core text.
82
 * _bsearch_file() is the interface for block-wise searching files.
83
 */
84
85
struct bsearch_file_handle {
86
    int fd;          /* file descriptor */
87
    char *cache;     /* cache bytes */
88
    char *page;      /* one double-size page worth of bytes */
89
    size_t file_sz;  /* file size */
90
    size_t cache_sz; /* cache size */
91
    size_t page_sz;  /* page size */
92
};
93
94
/* Find a new-line */
95
static const char *
96
find_line(const char *buf, size_t i, size_t right)
97
0
{
98
0
    if (i == 0)
99
0
  return &buf[i];
100
0
    for (; i < right; i++) {
101
0
  if (buf[i] == '\n') {
102
0
      if ((i + 1) < right)
103
0
    return &buf[i + 1];
104
0
      return NULL;
105
0
  }
106
0
    }
107
0
    return NULL;
108
0
}
109
110
/*
111
 * Common routine for binary searching text in core.
112
 *
113
 * Perform a binary search of a char array containing a block from a
114
 * text file where each line is a record (LF and CRLF supported).  Each
115
 * record consists of a key followed by an optional value separated from
116
 * the key by whitespace.  Whitespace can be quoted with backslashes.
117
 * It's the caller's responsibility to encode/decode keys/values if
118
 * quoting is desired; newlines should be encoded such that a newline
119
 * does not appear in the result.
120
 *
121
 * All output arguments are optional.
122
 *
123
 * Returns 0 if key is found, -1 if not found, or an error code such as
124
 * ENOMEM in case of error.
125
 *
126
 * Inputs:
127
 *
128
 * @buf          String to search
129
 * @sz           Size of string to search
130
 * @key          Key string to search for
131
 * @buf_is_start True if the buffer starts with a record, false if it
132
 *               starts in the middle of a record or if the caller
133
 *               doesn't know.
134
 *
135
 * Outputs:
136
 *
137
 * @value        Location to store a copy of the value (caller must free)
138
 * @location     Record location if found else the location where the
139
 *               record should be inserted (index into @buf)
140
 * @cmp          Set to less than or greater than 0 to indicate that a
141
 *               key not found would have fit in an earlier or later
142
 *               part of a file.  Callers should use this to decide
143
 *               whether to read a block to the left or to the right and
144
 *               search that.
145
 * @loops        Location to store a count of bisections required for
146
 *               search (useful for confirming logarithmic performance)
147
 */
148
static int
149
bsearch_common(const char *buf, size_t sz, const char *key,
150
         int buf_is_start, char **value, size_t *location,
151
         int *cmp, size_t *loops)
152
0
{
153
0
    const char *linep;
154
0
    size_t key_start, key_len; /* key string in buf */
155
0
    size_t val_start, val_len; /* value string in buf */
156
0
    int key_cmp = -1;
157
0
    size_t k;
158
0
    size_t l;    /* left side of buffer for binary search */
159
0
    size_t r;    /* right side of buffer for binary search */
160
0
    size_t rmax; /* right side of buffer for binary search */
161
0
    size_t i;    /* index into buffer, typically in the middle of l and r */
162
0
    size_t loop_count = 0;
163
0
    int ret = -1;
164
165
0
    if (value)
166
0
  *value = NULL;
167
0
    if (cmp)
168
0
  *cmp = 0;
169
0
    if (loops)
170
0
  *loops = 0;
171
172
    /* Binary search; file should be sorted */
173
0
    for (l = 0, r = rmax = sz, i = sz >> 1; i >= l && i < rmax; loop_count++) {
174
0
  heim_assert(i < sz, "invalid aname2lname db index");
175
176
  /* buf[i] is likely in the middle of a line; find the next line */
177
0
  linep = find_line(buf, i, rmax);
178
0
  k = linep ? linep - buf : i;
179
0
  if (linep == NULL || k >= rmax) {
180
      /*
181
       * No new line found to the right; search to the left then
182
       * but don't change rmax (this isn't optimal, but it's
183
       * simple).
184
       */
185
0
      if (i == l)
186
0
    break;
187
0
      r = i;
188
0
      i = l + ((r - l) >> 1);
189
0
      continue;
190
0
  }
191
0
  i = k;
192
0
  heim_assert(i >= l && i < rmax, "invalid aname2lname db index");
193
194
  /* Got a line; check it */
195
196
  /* Search for and split on unquoted whitespace */
197
0
  val_start = 0;
198
0
  for (key_start = i, key_len = 0, val_len = 0, k = i; k < rmax; k++) {
199
0
      if (buf[k] == '\\') {
200
0
    k++;
201
0
    continue;
202
0
      }
203
0
      if (buf[k] == '\r' || buf[k] == '\n') {
204
    /* We now know where the key ends, and there's no value */
205
0
    key_len = k - i;
206
0
    break;
207
0
      }
208
0
      if (!isspace((unsigned char)buf[k]))
209
0
    continue;
210
211
0
      while (k < rmax && isspace((unsigned char)buf[k])) {
212
0
    key_len = k - i;
213
0
    k++;
214
0
      }
215
0
      if (k < rmax)
216
0
    val_start = k;
217
      /* Find end of value */
218
0
      for (; k < rmax && buf[k] != '\0'; k++) {
219
0
    if (buf[k] == '\r' || buf[k] == '\n') {
220
0
        val_len = k - val_start;
221
0
        break;
222
0
    }
223
0
      }
224
0
      break;
225
0
  }
226
227
  /*
228
   * The following logic is for dealing with partial buffers,
229
   * which we use for block-wise binary searches of large files
230
   */
231
0
  if (key_start == 0 && !buf_is_start) {
232
      /*
233
       * We're at the beginning of a block that might have started
234
       * in the middle of a record whose "key" might well compare
235
       * as greater than the key we're looking for, so we don't
236
       * bother comparing -- we know key_cmp must be -1 here.
237
       */
238
0
      key_cmp = -1;
239
0
      break;
240
0
  }
241
0
  if ((val_len && buf[val_start + val_len] != '\n') ||
242
0
      (!val_len && buf[key_start + key_len] != '\n')) {
243
      /*
244
       * We're at the end of a block that ends in the middle of a
245
       * record whose "key" might well compare as less than the
246
       * key we're looking for, so we don't bother comparing -- we
247
       * know key_cmp must be >= 0 but we can't tell.  Our caller
248
       * will end up reading a double-size block to handle this.
249
       */
250
0
      key_cmp = 1;
251
0
      break;
252
0
  }
253
254
0
  key_cmp = strncmp(key, &buf[key_start], key_len);
255
0
  if (key_cmp == 0 && strlen(key) != key_len)
256
0
      key_cmp = 1;
257
0
  if (key_cmp < 0) {
258
      /* search left */
259
0
      r = rmax = (linep - buf);
260
0
      i = l + ((r - l) >> 1);
261
0
      if (location)
262
0
    *location = key_start;
263
0
  } else if (key_cmp > 0) {
264
      /* search right */
265
0
      if (l == i)
266
0
    break; /* not found */
267
0
      l = i;
268
0
      i = l + ((r - l) >> 1);
269
0
      if (location)
270
0
    *location = val_start + val_len;
271
0
  } else {
272
      /* match! */
273
0
      if (location)
274
0
    *location = key_start;
275
0
      ret = 0;
276
0
      if (val_len && value) {
277
    /* Avoid strndup() so we don't need libroken here yet */
278
0
    if ((*value = malloc(val_len + 1))) {
279
0
                    (void) memcpy(*value, &buf[val_start], val_len);
280
0
                    (*value)[val_len] = '\0';
281
0
                } else {
282
0
                    ret = errno;
283
0
                }
284
0
      }
285
0
      break;
286
0
  }
287
0
    }
288
289
0
    if (cmp)
290
0
  *cmp = key_cmp;
291
0
    if (loops)
292
0
  *loops = loop_count;
293
294
0
    return ret;
295
0
}
296
297
/*
298
 * Binary search a char array containing sorted text records separated
299
 * by new-lines (or CRLF).  Each record consists of a key and an
300
 * optional value following the key, separated from the key by unquoted
301
 * whitespace.
302
 *
303
 * All output arguments are optional.
304
 *
305
 * Returns 0 if key is found, -1 if not found, or an error code such as
306
 * ENOMEM in case of error.
307
 *
308
 * Inputs:
309
 *
310
 * @buf      Char array pointer
311
 * @buf_sz   Size of buf
312
 * @key      Key to search for
313
 *
314
 * Outputs:
315
 *
316
 * @value    Location where to put the value, if any (caller must free)
317
 * @location Record location if found else the location where the record
318
 *           should be inserted (index into @buf)
319
 * @loops    Location where to put a number of loops (or comparisons)
320
 *           needed for the search (useful for benchmarking)
321
 */
322
int
323
_bsearch_text(const char *buf, size_t buf_sz, const char *key,
324
         char **value, size_t *location, size_t *loops)
325
0
{
326
0
    return bsearch_common(buf, buf_sz, key, 1, value, location, NULL, loops);
327
0
}
328
329
0
#define MAX_BLOCK_SIZE (1024 * 1024)
330
0
#define DEFAULT_MAX_FILE_SIZE (1024 * 1024)
331
/*
332
 * Open a file for binary searching.  The file will be read in entirely
333
 * if it is smaller than @max_sz, else a cache of @max_sz bytes will be
334
 * allocated.
335
 *
336
 * Returns 0 on success, else an error number or -1 if the file is empty.
337
 *
338
 * Inputs:
339
 *
340
 * @fname   Name of file to open
341
 * @max_sz  Maximum size of cache to allocate, in bytes (if zero, default)
342
 * @page_sz Page size (must be a power of two, larger than 256, smaller
343
 *          than 1MB; if zero use default)
344
 * 
345
 * Outputs:
346
 *
347
 * @bfh     Handle for use with _bsearch_file() and _bsearch_file_close()
348
 * @reads   Number of reads performed
349
 */
350
int
351
_bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz,
352
        bsearch_file_handle *bfh, size_t *reads)
353
0
{
354
0
    bsearch_file_handle new_bfh = NULL;
355
0
    struct stat st;
356
0
    size_t i;
357
0
    int fd;
358
0
    int ret;
359
360
0
    *bfh = NULL;
361
362
0
    if (reads)
363
0
  *reads = 0;
364
365
0
    fd = open(fname, O_RDONLY);
366
0
    if (fd == -1)
367
0
  return errno;
368
369
0
    if (fstat(fd, &st) == -1) {
370
0
  ret = errno;
371
0
  goto err;
372
0
    }
373
374
0
    if (st.st_size == 0) {
375
0
  ret = -1; /* no data -> no binary search */
376
0
  goto err;
377
0
    }
378
379
    /* Validate / default arguments */
380
0
    if (max_sz == 0)
381
0
  max_sz = DEFAULT_MAX_FILE_SIZE;
382
0
    for (i = page_sz; i; i >>= 1) {
383
  /* Make sure page_sz is a power of two */
384
0
  if ((i % 2) && (i >> 1)) {
385
0
      page_sz = 0;
386
0
      break;
387
0
  }
388
0
    }
389
0
    if (page_sz == 0)
390
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
391
  page_sz = st.st_blksize;
392
#else
393
0
  page_sz = 4096;
394
0
#endif
395
0
    for (i = page_sz; i; i >>= 1) {
396
  /* Make sure page_sz is a power of two */
397
0
  if ((i % 2) && (i >> 1)) {
398
      /* Can't happen! Filesystems always use powers of two! */
399
0
      page_sz = 4096;
400
0
      break;
401
0
  }
402
0
    }
403
0
    if (page_sz > MAX_BLOCK_SIZE)
404
0
  page_sz = MAX_BLOCK_SIZE;
405
406
0
    new_bfh = calloc(1, sizeof (*new_bfh));
407
0
    if (new_bfh == NULL) {
408
0
  ret = ENOMEM;
409
0
  goto err;
410
0
    }
411
412
0
    new_bfh->fd = fd;
413
0
    new_bfh->page_sz = page_sz;
414
0
    new_bfh->file_sz = st.st_size;
415
416
0
    if (max_sz >= st.st_size) {
417
  /* Whole-file method */
418
0
  new_bfh->cache = malloc(st.st_size + 1);
419
0
  if (new_bfh->cache) {
420
0
      new_bfh->cache[st.st_size] = '\0';
421
0
      new_bfh->cache_sz = st.st_size;
422
0
      ret = read(fd, new_bfh->cache, st.st_size);
423
0
      if (ret < 0) {
424
0
    ret = errno;
425
0
    goto err;
426
0
      }
427
0
      if (ret != st.st_size) {
428
0
    ret = EIO; /* XXX ??? */
429
0
    goto err;
430
0
      }
431
0
      if (reads)
432
0
    *reads = 1;
433
0
      (void) close(fd);
434
0
      new_bfh->fd = -1;
435
0
      *bfh = new_bfh;
436
0
      return 0;
437
0
  }
438
0
    }
439
440
    /* Block-size method, or above malloc() failed */
441
0
    new_bfh->page = malloc(new_bfh->page_sz << 1);
442
0
    if (new_bfh->page == NULL) {
443
  /* Can't even allocate a single double-size page! */
444
0
  ret = ENOMEM;
445
0
  goto err;
446
0
    }
447
448
0
    new_bfh->cache_sz = max_sz < st.st_size ? max_sz : st.st_size;
449
0
    new_bfh->cache = malloc(new_bfh->cache_sz);
450
0
    *bfh = new_bfh;
451
452
    /*
453
     * malloc() may have failed because we were asking for a lot of
454
     * memory, but we may still be able to operate without a cache,
455
     * so let's not fail.
456
     */
457
0
    if (new_bfh->cache == NULL) {
458
0
  new_bfh->cache_sz = 0;
459
0
  return 0;
460
0
    }
461
462
    /* Initialize cache */
463
0
    for (i = 0; i < new_bfh->cache_sz; i += new_bfh->page_sz)
464
0
  new_bfh->cache[i] = '\0';
465
0
    return 0;
466
467
0
err:
468
0
    (void) close(fd);
469
0
    if (new_bfh) {
470
0
  free(new_bfh->page);
471
0
  free(new_bfh->cache);
472
0
  free(new_bfh);
473
0
    }
474
0
    return ret;
475
0
}
476
477
/*
478
 * Indicate whether the given binary search file handle will be searched
479
 * with block-wise method.
480
 */
481
void
482
_bsearch_file_info(bsearch_file_handle bfh,
483
        size_t *page_sz, size_t *max_sz, int *blockwise)
484
0
{
485
0
    if (page_sz)
486
0
  *page_sz = bfh->page_sz;
487
0
    if (max_sz)
488
0
  *max_sz = bfh->cache_sz;
489
0
    if (blockwise)
490
0
  *blockwise = (bfh->file_sz != bfh->cache_sz);
491
0
}
492
493
/*
494
 * Close the given binary file search handle.
495
 *
496
 * Inputs:
497
 *
498
 * @bfh Pointer to variable containing handle to close.
499
 */
500
void
501
_bsearch_file_close(bsearch_file_handle *bfh)
502
0
{
503
0
    if (!*bfh)
504
0
  return;
505
0
    if ((*bfh)->fd >= 0)
506
0
  (void) close((*bfh)->fd);
507
0
    if ((*bfh)->page)
508
0
  free((*bfh)->page);
509
0
    if ((*bfh)->cache)
510
0
  free((*bfh)->cache);
511
0
    free(*bfh);
512
0
    *bfh = NULL;
513
0
}
514
515
/*
516
 * Private function to get a page from a cache.  The cache is a char
517
 * array of 2^n - 1 double-size page worth of bytes, where n is the
518
 * number of tree levels that the cache stores.  The cache can be
519
 * smaller than n implies.
520
 *
521
 * The page may or may not be valid.  If the first byte of it is NUL
522
 * then it's not valid, else it is.
523
 *
524
 * Returns 1 if page is in cache and valid, 0 if the cache is too small
525
 * or the page is invalid.  The page address is output in @buf if the
526
 * cache is large enough to contain it regardless of whether the page is
527
 * valid.
528
 *
529
 * Inputs:
530
 *
531
 * @bfh      Binary search file handle
532
 * @level    Level in the tree that we want a page for
533
 * @page_idx Page number in the given level (0..2^level - 1)
534
 *
535
 * Outputs:
536
 *
537
 * @buf      Set to address of page if the cache is large enough
538
 */
539
static int
540
get_page_from_cache(bsearch_file_handle bfh, size_t level, size_t page_idx,
541
        char **buf)
542
0
{
543
0
    size_t idx = 0;
544
0
    size_t page_sz;
545
546
0
    page_sz = bfh->page_sz << 1; /* we use double-size pages in the cache */
547
548
0
    *buf = NULL;
549
550
    /*
551
     * Compute index into cache.  The cache is basically an array of
552
     * double-size pages.  The first (zeroth) double-size page in the
553
     * cache will be the middle page of the file -- the root of the
554
     * tree.  The next two double-size pages will be the left and right
555
     * pages of the second level in the tree.  The next four double-size
556
     * pages will be the four pages at the next level.  And so on for as
557
     * many pages as fit in the cache.
558
     *
559
     * The page index is the number of the page at the given level.  We
560
     * then compute (2^level - 1 + page index) * 2page size, check that
561
     * we have that in the cache, check that the page has been read (it
562
     * doesn't start with NUL).
563
     */
564
0
    if (level)
565
0
  idx = (1 << level) - 1 + page_idx;
566
0
    if (((idx + 1) * page_sz * 2) > bfh->cache_sz)
567
0
  return 0;
568
569
0
    *buf = &bfh->cache[idx * page_sz * 2];
570
0
    if (bfh->cache[idx * page_sz * 2] == '\0')
571
0
  return 0; /* cache[idx] == NUL -> page not loaded in cache */
572
0
    return 1;
573
0
}
574
575
/*
576
 * Private function to read a page of @page_sz from @fd at offset @off
577
 * into @buf, outputing the number of bytes read, which will be the same
578
 * as @page_sz unless the page being read is the last page, in which
579
 * case the number of remaining bytes in the file will be output.
580
 *
581
 * Returns 0 on success or an errno value otherwise (EIO if reads are
582
 * short).
583
 *
584
 * Inputs:
585
 *
586
 * @bfh        Binary search file handle
587
 * @level      Level in the binary search tree that we're at
588
 * @page_idx   Page "index" at the @level of the tree that we want
589
 * @page       Actual page number that we want
590
 * want_double Whether we need a page or double page read
591
 *
592
 * Outputs:
593
 *
594
 * @buf        Page read or cached
595
 * @bytes      Bytes read (may be less than page or double page size in
596
 *             the case of the last page, of course)
597
 */
598
static int
599
read_page(bsearch_file_handle bfh, size_t level, size_t page_idx, size_t page,
600
    int want_double, const char **buf, size_t *bytes)
601
0
{
602
0
    int ret;
603
0
    off_t off;
604
0
    size_t expected;
605
0
    size_t wanted;
606
0
    char *page_buf;
607
608
    /* Figure out where we're reading and how much */
609
0
    off = page * bfh->page_sz;
610
0
    if (off < 0)
611
0
  return EOVERFLOW;
612
613
0
    wanted = bfh->page_sz << want_double;
614
0
    expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
615
616
0
    if (get_page_from_cache(bfh, level, page_idx, &page_buf)) {
617
0
  *buf = page_buf;
618
0
  *bytes = expected;
619
0
  return 0; /* found in cache */
620
0
    }
621
622
623
0
    *bytes = 0;
624
0
    *buf = NULL;
625
626
    /* OK, we have to read a page or double-size page */
627
628
0
    if (page_buf)
629
0
  want_double = 1; /* we'll be caching; we cache double-size pages */
630
0
    else
631
0
  page_buf = bfh->page; /* we won't cache this page */
632
633
0
    wanted = bfh->page_sz << want_double;
634
0
    expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
635
636
0
#ifdef HAVE_PREAD
637
0
    ret = pread(bfh->fd, page_buf, expected, off);
638
#else
639
    if (lseek(bfh->fd, off, SEEK_SET) == (off_t)-1)
640
  return errno;
641
    ret = read(bfh->fd, page_buf, expected);
642
#endif
643
0
    if (ret < 0)
644
0
  return errno;
645
    
646
0
    if (ret != expected)
647
0
  return EIO; /* XXX ??? */
648
649
0
    *buf = page_buf;
650
0
    *bytes = expected;
651
0
    return 0;
652
0
}
653
654
/*
655
 * Perform a binary search of a file where each line is a record (LF and
656
 * CRLF supported).  Each record consists of a key followed by an
657
 * optional value separated from the key by whitespace.  Whitespace can
658
 * be quoted with backslashes.  It's the caller's responsibility to
659
 * encode/decode keys/values if quoting is desired; newlines should be
660
 * encoded such that a newline does not appear in the result.
661
 *
662
 * The search is done with block-wise I/O (i.e., the whole file is not
663
 * read into memory).
664
 *
665
 * All output arguments are optional.
666
 *
667
 * Returns 0 if key is found, -1 if not found, or an error code such as
668
 * ENOMEM in case of error.
669
 *
670
 * NOTE: We could improve this by not freeing the buffer, instead
671
 *       requiring that the caller provide it.  Further, we could cache
672
 *       the top N levels of [double-size] pages (2^N - 1 pages), which
673
 *       should speed up most searches by reducing the number of reads
674
 *       by N.
675
 *
676
 * Inputs:
677
 *
678
 * @fd           File descriptor (file to search)
679
 * @page_sz      Page size (if zero then the file's st_blksize will be used)
680
 * @key          Key string to search for
681
 *
682
 * Outputs:
683
 *
684
 * @value        Location to store a copy of the value (caller must free)
685
 * @location     Record location if found else the location where the
686
 *               record should be inserted (index into @buf)
687
 * @loops        Location to store a count of bisections required for
688
 *               search (useful for confirming logarithmic performance)
689
 * @reads        Location to store a count of pages read during search
690
 *               (useful for confirming logarithmic performance)
691
 */
692
int
693
_bsearch_file(bsearch_file_handle bfh, const char *key,
694
         char **value, size_t *location, size_t *loops, size_t *reads)
695
0
{
696
0
    int ret;
697
0
    const char *buf;
698
0
    size_t buf_sz;
699
0
    size_t page, l, r;
700
0
    size_t my_reads = 0;
701
0
    size_t my_loops_total = 0;
702
0
    size_t my_loops;
703
0
    size_t level;        /* level in the tree */
704
0
    size_t page_idx = 0; /* page number in the tree level */
705
0
    size_t buf_location;
706
0
    int cmp;
707
0
    int buf_ends_in_eol = 0;
708
0
    int buf_is_start = 0;
709
710
0
    if (reads)
711
0
  *reads = 0;
712
0
    if (value)
713
0
  *value = NULL;
714
0
    if (loops)
715
0
  *loops = 0;
716
717
    /* If whole file is in memory then search that and we're done */
718
0
    if (bfh->file_sz == bfh->cache_sz)
719
0
  return _bsearch_text(bfh->cache, bfh->cache_sz, key, value, location, loops);
720
721
    /* Else block-wise binary search */
722
723
0
    l = 0;
724
0
    r = (bfh->file_sz / bfh->page_sz) + 1;
725
0
    for (level = 0, page = r >> 1; page >= l && page < r ; level++) {
726
0
  ret = read_page(bfh, level, page_idx, page, 0, &buf, &buf_sz);
727
0
  if (ret != 0)
728
0
      return ret;
729
0
  my_reads++;
730
0
  if (buf[buf_sz - 1] == '\r' || buf[buf_sz - 1] == '\n')
731
0
      buf_ends_in_eol = 1;
732
0
  else
733
0
      buf_ends_in_eol = 0;
734
735
0
  buf_is_start = page == 0 ? 1 : 0;
736
0
  ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start,
737
0
           value, &buf_location, &cmp, &my_loops);
738
0
  if (ret > 0)
739
0
      return ret;
740
  /* Found or no we update stats */
741
0
  my_loops_total += my_loops;
742
0
  if (loops)
743
0
      *loops = my_loops_total;
744
0
  if (reads)
745
0
      *reads = my_reads;
746
0
  if (location)
747
0
      *location = page * bfh->page_sz + buf_location;
748
0
  if (ret == 0)
749
0
      return 0; /* found! */
750
  /* Not found */
751
0
  if (cmp < 0) {
752
      /* Search left */
753
0
      page_idx <<= 1;
754
0
      r = page;
755
0
      page = l + ((r - l) >> 1);
756
0
      continue;
757
0
  } else {
758
      /*
759
       * Search right, but first search the current and next
760
       * blocks in case that the record we're looking for either
761
       * straddles the boundary between this and the next record,
762
       * or in case the record starts exactly at the next page.
763
       */
764
0
      heim_assert(cmp > 0, "cmp > 0");
765
766
0
      if (!buf_ends_in_eol || page == l || page == (r - 1)) {
767
0
    ret = read_page(bfh, level, page_idx, page, 1, &buf, &buf_sz);
768
0
    if (ret != 0)
769
0
        return ret;
770
0
    my_reads++;
771
772
0
    buf_is_start = page == l ? 1 : 0;
773
774
0
    ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start,
775
0
             value, &buf_location, &cmp, &my_loops);
776
0
    if (ret > 0)
777
0
        return ret;
778
0
    my_loops_total += my_loops;
779
0
    if (loops)
780
0
        *loops = my_loops_total;
781
0
    if (reads)
782
0
        *reads = my_reads;
783
0
    if (location)
784
0
        *location = page * bfh->page_sz + buf_location;
785
0
    if (ret == 0)
786
0
        return 0;
787
0
      }
788
789
      /* Oh well, search right */
790
0
      if (l == page && r == (l + 1))
791
0
    break;
792
0
      page_idx = (page_idx << 1) + 1;
793
0
      l = page;
794
0
      page = l + ((r - l) >> 1);
795
0
      continue;
796
0
  }
797
0
    }
798
0
    return -1;
799
0
}
800
801
802
static int
803
stdb_open(void *plug, const char *dbtype, const char *dbname,
804
       heim_dict_t options, void **db, heim_error_t *error)
805
0
{
806
0
    bsearch_file_handle bfh;
807
0
    char *p;
808
0
    int ret;
809
810
0
    if (error)
811
0
  *error = NULL;
812
0
    if (dbname == NULL || *dbname == '\0') {
813
0
  if (error)
814
0
      *error = heim_error_create(EINVAL,
815
0
               N_("DB name required for sorted-text DB "
816
0
            "plugin", ""));
817
0
  return EINVAL;
818
0
    }
819
0
    p = strrchr(dbname, '.');
820
0
    if (p == NULL || strcmp(p, ".txt") != 0) {
821
0
  if (error)
822
0
      *error = heim_error_create(ENOTSUP,
823
0
               N_("Text file (name ending in .txt) "
824
0
               "required for sorted-text DB plugin",
825
0
               ""));
826
0
  return ENOTSUP;
827
0
    }
828
829
0
    ret = _bsearch_file_open(dbname, 0, 0, &bfh, NULL);
830
0
    if (ret)
831
0
  return ret;
832
833
0
    *db = bfh;
834
0
    return 0;
835
0
}
836
837
static int
838
stdb_close(void *db, heim_error_t *error)
839
0
{
840
0
    bsearch_file_handle bfh = db;
841
842
0
    if (error)
843
0
  *error = NULL;
844
0
    _bsearch_file_close(&bfh);
845
0
    return 0;
846
0
}
847
848
static heim_data_t
849
stdb_copy_value(void *db, heim_string_t table, heim_data_t key,
850
         heim_error_t *error)
851
0
{
852
0
    bsearch_file_handle bfh = db;
853
0
    const char *k;
854
0
    char *v = NULL;
855
0
    heim_data_t value;
856
0
    int ret;
857
858
0
    if (error)
859
0
  *error = NULL;
860
861
0
    if (table == NULL)
862
0
  table = HSTR("");
863
864
0
    if (table != HSTR(""))
865
0
  return NULL;
866
867
0
    if (heim_get_tid(key) == HEIM_TID_STRING)
868
0
  k = heim_string_get_utf8((heim_string_t)key);
869
0
    else
870
0
  k = (const char *)heim_data_get_ptr(key);
871
0
    ret = _bsearch_file(bfh, k, &v, NULL, NULL, NULL);
872
0
    if (ret == 0 && v == NULL)
873
0
        ret = -1; /* Quiet lint */
874
0
    if (ret != 0) {
875
0
  if (ret > 0 && error)
876
0
      *error = heim_error_create(ret, "%s", strerror(ret));
877
0
  return NULL;
878
0
    }
879
0
    value = heim_data_create(v, strlen(v));
880
0
    free(v);
881
    /* XXX Handle ENOMEM */
882
0
    return value;
883
0
}
884
885
struct heim_db_type heim_sorted_text_file_dbtype = {
886
    1, stdb_open, NULL, stdb_close, NULL, NULL, NULL, NULL, NULL, NULL,
887
    stdb_copy_value, NULL, NULL, NULL
888
};