Coverage Report

Created: 2026-05-16 07:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/kstring.c
Line
Count
Source
1
/* The MIT License
2
3
   Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
4
   Copyright (C) 2013-2018, 2020-2021, 2023, 2025 Genome Research Ltd.
5
6
   Permission is hereby granted, free of charge, to any person obtaining
7
   a copy of this software and associated documentation files (the
8
   "Software"), to deal in the Software without restriction, including
9
   without limitation the rights to use, copy, modify, merge, publish,
10
   distribute, sublicense, and/or sell copies of the Software, and to
11
   permit persons to whom the Software is furnished to do so, subject to
12
   the following conditions:
13
14
   The above copyright notice and this permission notice shall be
15
   included in all copies or substantial portions of the Software.
16
17
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
   SOFTWARE.
25
*/
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdarg.h>
31
#include <stdio.h>
32
#include <ctype.h>
33
#include <string.h>
34
#include <stdint.h>
35
#include <math.h>
36
#include "htslib/kstring.h"
37
38
577k
int kputd(double d, kstring_t *s) {
39
577k
  int len = 0;
40
577k
  char buf[21], *cp = buf+20, *ep;
41
577k
  if (d == 0) {
42
40.6k
    if (signbit(d)) {
43
4.85k
      kputsn("-0",2,s);
44
4.85k
      return 2;
45
35.7k
    } else {
46
35.7k
      kputsn("0",1,s);
47
35.7k
      return 1;
48
35.7k
    }
49
40.6k
  }
50
51
537k
  if (d < 0) {
52
154k
    kputc('-',s);
53
154k
    len = 1;
54
154k
    d=-d;
55
154k
  }
56
537k
  if (!(d >= 0.0001 && d <= 999999)) {
57
140k
    if (ks_resize(s, s->l + 50) < 0)
58
0
      return EOF;
59
    // We let stdio handle the exponent cases
60
140k
    int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d);
61
140k
    len += s2;
62
140k
    s->l += s2;
63
140k
    return len;
64
140k
  }
65
66
  // Correction for rounding - rather ugly
67
  // Optimised for small numbers.
68
69
396k
  uint32_t i;
70
396k
  if (d<0.001)         i = rint(d*1000000000), cp -= 1;
71
396k
  else if (d < 0.01)   i = rint(d*100000000),  cp -= 2;
72
388k
  else if (d < 0.1)    i = rint(d*10000000),   cp -= 3;
73
373k
  else if (d < 1)      i = rint(d*1000000),    cp -= 4;
74
361k
  else if (d < 10)     i = rint(d*100000),     cp -= 5;
75
262k
  else if (d < 100)    i = rint(d*10000),      cp -= 6;
76
214k
  else if (d < 1000)   i = rint(d*1000),       cp -= 7;
77
79.0k
  else if (d < 10000)  i = rint(d*100),        cp -= 8;
78
69.4k
  else if (d < 100000) i = rint(d*10),         cp -= 9;
79
51.3k
  else                 i = rint(d),            cp -= 10;
80
81
  // integer i is always 6 digits, so print it 2 at a time.
82
396k
  static const char kputuw_dig2r[] =
83
396k
    "00010203040506070809"
84
396k
    "10111213141516171819"
85
396k
    "20212223242526272829"
86
396k
    "30313233343536373839"
87
396k
    "40414243444546474849"
88
396k
    "50515253545556575859"
89
396k
    "60616263646566676869"
90
396k
    "70717273747576777879"
91
396k
    "80818283848586878889"
92
396k
    "90919293949596979899";
93
94
396k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
95
396k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
96
396k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2);
97
98
  // Except when it rounds up (d=0.009999999 is i=1000000)
99
396k
  if (i >= 100)
100
8.29k
    *--cp = '0' + (i/100);
101
102
103
396k
  int p = buf+20-cp;
104
396k
  if (p <= 10) { /* d < 1 */
105
    // 0.00123 is 123, so add leading zeros and 0.
106
35.8k
    ep = cp+5; // 6 precision
107
59.7k
    while (p < 10) { // aka d < 1
108
23.9k
      *--cp = '0';
109
23.9k
      p++;
110
23.9k
    }
111
35.8k
    *--cp = '.';
112
35.8k
    *--cp = '0';
113
361k
  } else {
114
    // 123.001 is 123001 with p==13, so move 123 down and add "."
115
    // Equiv to memmove(cp-1, cp, p-10); cp--;
116
361k
    char *xp = --cp;
117
361k
    ep = cp+6;
118
1.39M
    while (p > 10) {
119
1.03M
      xp[0] = xp[1];
120
1.03M
      xp++;
121
1.03M
      p--;
122
1.03M
    }
123
361k
    xp[0] = '.';
124
361k
  }
125
126
  // Cull trailing zeros
127
1.68M
  while (*ep == '0' && ep > cp)
128
1.29M
    ep--;
129
130
  // End can be 1 out due to the mostly-6 but occasionally 7 (i==1) case.
131
  // Also code with "123." which should be "123"
132
396k
  if (*ep && *ep != '.')
133
39.3k
    ep++;
134
396k
  *ep = 0;
135
136
396k
  int sl = ep-cp;
137
396k
  len += sl;
138
396k
  kputsn(cp, sl, s);
139
396k
  return len;
140
537k
}
141
142
int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
143
197k
{
144
197k
  va_list args;
145
197k
  int l;
146
197k
  va_copy(args, ap);
147
148
197k
  if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) {
149
0
    double d = va_arg(args, double);
150
0
    l = kputd(d, s);
151
0
    va_end(args);
152
0
    return l;
153
0
  }
154
155
197k
  if (!s->s) {
156
121k
    const size_t sz = 64;
157
121k
    s->s = malloc(sz);
158
121k
    if (!s->s)
159
0
      return -1;
160
121k
    s->m = sz;
161
121k
    s->l = 0;
162
121k
  }
163
164
197k
  l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
165
197k
  va_end(args);
166
197k
  if (l + 1 > s->m - s->l) {
167
63.8k
    if (ks_resize(s, s->l + l + 2) < 0)
168
0
      return -1;
169
63.8k
    va_copy(args, ap);
170
63.8k
    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
171
63.8k
    va_end(args);
172
63.8k
  }
173
197k
  s->l += l;
174
197k
  return l;
175
197k
}
176
177
int ksprintf(kstring_t *s, const char *fmt, ...)
178
197k
{
179
197k
  va_list ap;
180
197k
  int l;
181
197k
  va_start(ap, fmt);
182
197k
  l = kvsprintf(s, fmt, ap);
183
197k
  va_end(ap);
184
197k
  return l;
185
197k
}
186
187
char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
188
2.05M
{
189
2.05M
  const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
190
2.05M
  if (sep) { // set up the table
191
75.5k
    if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
192
75.5k
    aux->finished = 0;
193
75.5k
    if (sep[0] && sep[1]) {
194
0
      aux->sep = -1;
195
0
      aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
196
0
      for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
197
75.5k
    } else aux->sep = sep[0];
198
75.5k
  }
199
2.05M
  if (aux->finished) return 0;
200
2.01M
  else if (str) start = (unsigned char *) str, aux->finished = 0;
201
1.93M
  else start = (unsigned char *) aux->p + 1;
202
2.01M
  if (aux->sep < 0) {
203
0
    for (p = start; *p; ++p)
204
0
      if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
205
2.01M
  } else {
206
    // Using strchr is fast for next token, but slower for
207
    // last token due to extra pass from strlen.  Overall
208
    // on a VCF parse this func was 146% faster with // strchr.
209
    // Equiv to:
210
    // for (p = start; *p; ++p) if (*p == aux->sep) break;
211
212
    // NB: We could use strchrnul() here from glibc if detected,
213
    // which is ~40% faster again, but it's not so portable.
214
    // i.e.   p = (uint8_t *)strchrnul((char *)start, aux->sep);
215
2.01M
    uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep);
216
2.01M
    p = p2 ? p2 : start + strlen((char *)start);
217
2.01M
  }
218
2.01M
  aux->p = (const char *) p; // end of token
219
2.01M
  if (*p == 0) aux->finished = 1; // no more tokens
220
2.01M
  return (char*)start;
221
2.05M
}
222
223
// s MUST BE a null terminated string; l = strlen(s)
224
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
225
0
{
226
0
  int i, n, max, last_char, last_start, *offsets, l;
227
0
  n = 0; max = *_max; offsets = *_offsets;
228
0
  l = strlen(s);
229
230
0
#define __ksplit_aux do {           \
231
0
    if (_offsets) {           \
232
0
      s[i] = 0;         \
233
0
      if (n == max) {         \
234
0
        int *tmp;       \
235
0
        max = max? max<<1 : 2;     \
236
0
        if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {  \
237
0
          offsets = tmp;      \
238
0
        } else {       \
239
0
          free(offsets);      \
240
0
          *_offsets = NULL;   \
241
0
          return 0;     \
242
0
        }          \
243
0
      }           \
244
0
      offsets[n++] = last_start;      \
245
0
    } else ++n;           \
246
0
  } while (0)
247
248
0
  for (i = 0, last_char = last_start = 0; i <= l; ++i) {
249
0
    if (delimiter == 0) {
250
0
      if (isspace((int)((unsigned char) s[i])) || s[i] == 0) {
251
0
        if (isgraph(last_char))
252
0
                    __ksplit_aux; // the end of a field
253
0
      } else {
254
0
        if (isspace(last_char) || last_char == 0)
255
0
                    last_start = i;
256
0
      }
257
0
    } else {
258
0
      if (s[i] == delimiter || s[i] == 0) {
259
0
        if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
260
0
      } else {
261
0
        if (last_char == delimiter || last_char == 0) last_start = i;
262
0
      }
263
0
    }
264
0
    last_char = (int)((unsigned char)s[i]);
265
0
  }
266
0
  *_max = max; *_offsets = offsets;
267
0
  return n;
268
0
}
269
270
int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
271
0
{
272
0
  size_t l0 = s->l;
273
274
0
  while (s->l == l0 || s->s[s->l-1] != '\n') {
275
0
    if (s->m - s->l < 200) {
276
0
      if (ks_resize(s, s->m + 200) < 0)
277
0
        return EOF;
278
0
    }
279
0
    if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
280
0
    s->l += strlen(s->s + s->l);
281
0
  }
282
283
0
  if (s->l == l0) return EOF;
284
285
0
  if (s->l > l0 && s->s[s->l-1] == '\n') {
286
0
    s->l--;
287
0
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
288
0
  }
289
0
  s->s[s->l] = '\0';
290
0
  return 0;
291
0
}
292
293
// Wrap around fgets to get the right signature for kgets_func
294
static char * fgets_wrapper(char *buffer, int size, void *stream)
295
0
{
296
0
    return fgets(buffer, size, (FILE *) stream);
297
0
}
298
299
int kfgetline(kstring_t *s, FILE *fp)
300
0
{
301
0
    if (!s || !fp)
302
0
        return EOF;
303
0
    return kgetline(s, fgets_wrapper, fp);
304
0
}
305
306
int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp)
307
1.23M
{
308
1.23M
  size_t l0 = s->l;
309
310
2.47M
  while (s->l == l0 || s->s[s->l-1] != '\n') {
311
1.26M
    if (s->m - s->l < 200) {
312
      // We return EOF for both EOF and error and the caller
313
      // needs to check for errors in fp, and we haven't
314
      // even got there yet.
315
      //
316
      // The only way of propagating memory errors is to
317
      // deliberately call something that we know triggers
318
      // and error so fp is also set.  This works for
319
      // hgets, but not for gets where reading <= 0 bytes
320
      // isn't an error.
321
542k
      if (ks_resize(s, s->m + 200) < 0) {
322
0
        fgets_fn(s->s + s->l, 0, fp);
323
0
        return EOF;
324
0
      }
325
542k
    }
326
1.26M
    ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp);
327
1.26M
    if (len <= 0) break;
328
1.24M
    s->l += len;
329
1.24M
  }
330
331
1.23M
  if (s->l == l0) return EOF;
332
333
1.22M
  if (s->l > l0 && s->s[s->l-1] == '\n') {
334
1.20M
    s->l--;
335
1.20M
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
336
1.20M
  }
337
1.22M
  s->s[s->l] = '\0';
338
1.22M
  return 0;
339
1.23M
}
340
341
typedef unsigned char ubyte_t;
342
343
/**********************
344
 * Karp-Rabin search  *
345
 **********************/
346
347
// Backup solution for when we can't use Boyer-Moore, e.g. due to memory
348
// required.  Karp-Rabin only needs to store a couple of hash values
349
// so will always complete.
350
351
static uint64_t fast_exp(uint64_t x, uint64_t n)
352
0
{
353
0
  uint64_t y = 1;
354
0
  if (n == 0)
355
0
    return 1;
356
0
  while (n > 1) {
357
0
    if (n & 1)
358
0
      y *= x;
359
0
    x *= x;
360
0
    n >>= 1;
361
0
  }
362
0
  return y * x;
363
0
}
364
365
static void * karp_rabin(const void *str_, size_t n, const void *pat_, size_t m)
366
0
{
367
0
  const ubyte_t *str = (const ubyte_t *) str_;
368
0
  const ubyte_t *pat = (const ubyte_t *) pat_;
369
0
  const uint64_t b = 31;
370
0
  uint64_t hash_pat = 0;
371
0
  uint64_t hash_str = 0;
372
0
  uint64_t b_to_m = fast_exp(b, m);
373
0
  size_t i;
374
0
  ubyte_t mismatch = 0;
375
376
0
  if (m > n)
377
0
    return NULL;
378
379
  // Calculate hash of pat and initial part of str.
380
0
  for (i = 0; i < m; i++) {
381
0
    mismatch |= str[i] ^ pat[i];
382
0
    hash_pat = hash_pat * b + pat[i] + 1U;
383
0
    hash_str = hash_str * b + str[i] + 1U;
384
0
  }
385
386
0
  if (!mismatch) // Match found at start (or m == 0)
387
0
    return (void *) str_;
388
389
0
  for (; i < n; i++) {
390
0
    hash_str = hash_str * b + str[i] + 1U - b_to_m * (str[i - m] + 1U);
391
0
    if (hash_str == hash_pat) {
392
0
      if (memcmp(pat, str + i + 1 - m, m) == 0)
393
0
        return (void *) (str + i + 1 - m);
394
0
    }
395
0
  }
396
0
  return NULL;
397
0
}
398
399
/**********************
400
 * Boyer-Moore search *
401
 **********************/
402
403
404
// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
405
static int *ksBM_prep(const ubyte_t *pat, int m)
406
898
{
407
898
  int i, *suff, *prep, *bmGs, *bmBc;
408
898
  if (m < 1)
409
0
    return NULL;
410
898
  prep = (int*)calloc((size_t) m + 256, sizeof(int));
411
898
  if (!prep) return NULL;
412
898
  bmGs = prep; bmBc = prep + m;
413
898
  { // preBmBc()
414
230k
    for (i = 0; i < 256; ++i) bmBc[i] = m;
415
17.7k
    for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
416
898
  }
417
898
  suff = (int*)calloc(m, sizeof(int));
418
898
  if (!suff) { free(prep); return NULL; }
419
898
  { // suffixes()
420
898
    int f = 0, g;
421
898
    suff[m - 1] = m;
422
898
    g = m - 1;
423
17.7k
    for (i = m - 2; i >= 0; --i) {
424
16.8k
      if (i > g && suff[i + m - 1 - f] < i - g)
425
0
        suff[i] = suff[i + m - 1 - f];
426
16.8k
      else {
427
16.8k
        if (i < g) g = i;
428
16.8k
        f = i;
429
17.1k
        while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
430
16.8k
        suff[i] = f - g;
431
16.8k
      }
432
16.8k
    }
433
898
  }
434
898
  { // preBmGs()
435
898
    int j = 0;
436
18.6k
    for (i = 0; i < m; ++i) bmGs[i] = m;
437
18.6k
    for (i = m - 1; i >= 0; --i)
438
17.7k
      if (suff[i] == i + 1)
439
898
        for (; j < m - 1 - i; ++j)
440
0
          if (bmGs[j] == m)
441
0
            bmGs[j] = m - 1 - i;
442
17.7k
    for (i = 0; i <= m - 2; ++i)
443
16.8k
      bmGs[m - 1 - suff[i]] = m - 1 - i;
444
898
  }
445
898
  free(suff);
446
898
  return prep;
447
898
}
448
449
static void *boyer_moore(const void *str_, size_t n, const void *pat_, int m,
450
             int **stored_prep_ptr)
451
1.32k
{
452
1.32k
  size_t j;
453
1.32k
  int i, *prep = 0, *bmGs, *bmBc;
454
1.32k
  const ubyte_t *str, *pat;
455
456
1.32k
  if (!str_ || !pat_)
457
0
    return (void *) str_;
458
459
1.32k
  str = (const ubyte_t*) str_;
460
1.32k
  pat = (const ubyte_t*) pat_;
461
462
1.32k
  if (m <= 0) // Empty string always matches at the start
463
0
    return (void *) str_;
464
1.32k
  if (n < m)  // Input shorter than pattern can't match
465
422
    return NULL;
466
898
  if (m == 1) // Switch to memchr for 1 byte patterns (likely faster)
467
0
    return memchr(str, pat[0], n);
468
469
898
  if (stored_prep_ptr && *stored_prep_ptr) {
470
0
    prep = *stored_prep_ptr;
471
898
  } else {
472
898
    prep = ksBM_prep(pat, m);
473
898
    if (!prep) return karp_rabin(str_, n, pat_, m);
474
898
    if (stored_prep_ptr)
475
0
      *stored_prep_ptr = prep;
476
898
  }
477
898
  bmGs = prep;      // Good-suffix shift array
478
898
  bmBc = prep + m;  // Bad character shift array
479
898
  j = 0;
480
1.58M
  while (j <= n - m) {
481
1.58M
    for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
482
1.58M
    if (i >= 0) {
483
1.58M
      int max = bmBc[str[i+j]] - m + 1 + i;
484
1.58M
      if (max < bmGs[i]) max = bmGs[i];
485
1.58M
      j += max;
486
1.58M
    } else { // Match found
487
15
      if (!stored_prep_ptr) free(prep);
488
15
      return (void*)(str + j);
489
15
    }
490
1.58M
  }
491
  // No match
492
883
  if (!stored_prep_ptr) free(prep);
493
883
  return NULL;
494
898
}
495
496
void *kmemmem(const void *str, int n, const void *pat, int m, int **prep)
497
1.32k
{
498
1.32k
  return boyer_moore(str, n >= 0 ? n : 0, pat, m, prep);
499
1.32k
}
500
501
char *kstrstr(const char *str, const char *pat, int **prep)
502
0
{
503
0
  size_t patlen = strlen(pat);
504
0
  if (patlen <= INT_MAX)
505
0
    return (char*)boyer_moore(str, strlen(str), pat, patlen, prep);
506
0
  else
507
0
    return karp_rabin(str, strlen(str), pat, patlen);
508
0
}
509
510
char *kstrnstr(const char *str, const char *pat, int n, int **prep)
511
0
{
512
0
  if (!pat || *pat == '\0')
513
0
    return (char *) str;
514
0
  if (n <= 0)
515
0
    return NULL;
516
0
  const char *endp = memchr(str, '\0', n);
517
0
  if (endp != NULL && endp - str < n)
518
0
    n = endp - str;
519
0
  size_t patlen = strlen(pat);
520
0
  if (patlen > n)
521
0
    return NULL;
522
0
  return (char*)boyer_moore(str, n, pat, patlen, prep);
523
0
}
524
525
/***********************
526
 * The main() function *
527
 ***********************/
528
529
#ifdef KSTRING_MAIN
530
#include <stdio.h>
531
int main()
532
{
533
  kstring_t *s;
534
  int *fields, n, i;
535
  ks_tokaux_t aux;
536
  char *p;
537
  s = (kstring_t*)calloc(1, sizeof(kstring_t));
538
  // test ksprintf()
539
  ksprintf(s, " abcdefg:    %d ", 100);
540
  printf("'%s'\n", s->s);
541
  // test ksplit()
542
  fields = ksplit(s, 0, &n);
543
  for (i = 0; i < n; ++i)
544
    printf("field[%d] = '%s'\n", i, s->s + fields[i]);
545
  // test kstrtok()
546
  s->l = 0;
547
  for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
548
    kputsn(p, aux.p - p, s);
549
    kputc('\n', s);
550
  }
551
  printf("%s", s->s);
552
  // free
553
  free(s->s); free(s); free(fields);
554
555
  {
556
    static char *str = "abcdefgcdgcagtcakcdcd";
557
    static char *pat = "cd";
558
    char *ret, *s = str;
559
    int *prep = 0;
560
    while ((ret = kstrstr(s, pat, &prep)) != 0) {
561
      printf("match: %s\n", ret);
562
      s = ret + prep[0];
563
    }
564
    free(prep);
565
  }
566
  return 0;
567
}
568
#endif