Coverage Report

Created: 2026-06-08 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/kstring.c
Line
Count
Source
1
/* The MIT License
2
3
   Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
4
   Copyright (C) 2013-2018, 2020-2021, 2023, 2025 Genome Research Ltd.
5
6
   Permission is hereby granted, free of charge, to any person obtaining
7
   a copy of this software and associated documentation files (the
8
   "Software"), to deal in the Software without restriction, including
9
   without limitation the rights to use, copy, modify, merge, publish,
10
   distribute, sublicense, and/or sell copies of the Software, and to
11
   permit persons to whom the Software is furnished to do so, subject to
12
   the following conditions:
13
14
   The above copyright notice and this permission notice shall be
15
   included in all copies or substantial portions of the Software.
16
17
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
   SOFTWARE.
25
*/
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdarg.h>
31
#include <stdio.h>
32
#include <ctype.h>
33
#include <string.h>
34
#include <stdint.h>
35
#include <math.h>
36
#include "htslib/kstring.h"
37
#include "htslib/hts_alloc.h"
38
39
577k
int kputd(double d, kstring_t *s) {
40
577k
  int len = 0;
41
577k
  char buf[21], *cp = buf+20, *ep;
42
577k
  if (d == 0) {
43
40.6k
    if (signbit(d)) {
44
4.85k
      kputsn("-0",2,s);
45
4.85k
      return 2;
46
35.7k
    } else {
47
35.7k
      kputsn("0",1,s);
48
35.7k
      return 1;
49
35.7k
    }
50
40.6k
  }
51
52
537k
  if (d < 0) {
53
154k
    kputc('-',s);
54
154k
    len = 1;
55
154k
    d=-d;
56
154k
  }
57
537k
  if (!(d >= 0.0001 && d <= 999999)) {
58
140k
    if (ks_resize(s, s->l + 50) < 0)
59
0
      return EOF;
60
    // We let stdio handle the exponent cases
61
140k
    int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d);
62
140k
    len += s2;
63
140k
    s->l += s2;
64
140k
    return len;
65
140k
  }
66
67
  // Correction for rounding - rather ugly
68
  // Optimised for small numbers.
69
70
396k
  uint32_t i;
71
396k
  if (d<0.001)         i = rint(d*1000000000), cp -= 1;
72
396k
  else if (d < 0.01)   i = rint(d*100000000),  cp -= 2;
73
388k
  else if (d < 0.1)    i = rint(d*10000000),   cp -= 3;
74
373k
  else if (d < 1)      i = rint(d*1000000),    cp -= 4;
75
361k
  else if (d < 10)     i = rint(d*100000),     cp -= 5;
76
262k
  else if (d < 100)    i = rint(d*10000),      cp -= 6;
77
214k
  else if (d < 1000)   i = rint(d*1000),       cp -= 7;
78
79.0k
  else if (d < 10000)  i = rint(d*100),        cp -= 8;
79
69.4k
  else if (d < 100000) i = rint(d*10),         cp -= 9;
80
51.3k
  else                 i = rint(d),            cp -= 10;
81
82
  // integer i is always 6 digits, so print it 2 at a time.
83
396k
  static const char kputuw_dig2r[] =
84
396k
    "00010203040506070809"
85
396k
    "10111213141516171819"
86
396k
    "20212223242526272829"
87
396k
    "30313233343536373839"
88
396k
    "40414243444546474849"
89
396k
    "50515253545556575859"
90
396k
    "60616263646566676869"
91
396k
    "70717273747576777879"
92
396k
    "80818283848586878889"
93
396k
    "90919293949596979899";
94
95
396k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
96
396k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
97
396k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2);
98
99
  // Except when it rounds up (d=0.009999999 is i=1000000)
100
396k
  if (i >= 100)
101
8.29k
    *--cp = '0' + (i/100);
102
103
104
396k
  int p = buf+20-cp;
105
396k
  if (p <= 10) { /* d < 1 */
106
    // 0.00123 is 123, so add leading zeros and 0.
107
35.8k
    ep = cp+5; // 6 precision
108
59.7k
    while (p < 10) { // aka d < 1
109
23.9k
      *--cp = '0';
110
23.9k
      p++;
111
23.9k
    }
112
35.8k
    *--cp = '.';
113
35.8k
    *--cp = '0';
114
361k
  } else {
115
    // 123.001 is 123001 with p==13, so move 123 down and add "."
116
    // Equiv to memmove(cp-1, cp, p-10); cp--;
117
361k
    char *xp = --cp;
118
361k
    ep = cp+6;
119
1.39M
    while (p > 10) {
120
1.03M
      xp[0] = xp[1];
121
1.03M
      xp++;
122
1.03M
      p--;
123
1.03M
    }
124
361k
    xp[0] = '.';
125
361k
  }
126
127
  // Cull trailing zeros
128
1.68M
  while (*ep == '0' && ep > cp)
129
1.29M
    ep--;
130
131
  // End can be 1 out due to the mostly-6 but occasionally 7 (i==1) case.
132
  // Also code with "123." which should be "123"
133
396k
  if (*ep && *ep != '.')
134
39.3k
    ep++;
135
396k
  *ep = 0;
136
137
396k
  int sl = ep-cp;
138
396k
  len += sl;
139
396k
  kputsn(cp, sl, s);
140
396k
  return len;
141
537k
}
142
143
int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
144
197k
{
145
197k
  va_list args;
146
197k
  int l;
147
197k
  va_copy(args, ap);
148
149
197k
  if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) {
150
0
    double d = va_arg(args, double);
151
0
    l = kputd(d, s);
152
0
    va_end(args);
153
0
    return l;
154
0
  }
155
156
197k
  if (!s->s) {
157
121k
    const size_t sz = 64;
158
121k
    s->s = malloc(sz);
159
121k
    if (!s->s)
160
0
      return -1;
161
121k
    s->m = sz;
162
121k
    s->l = 0;
163
121k
  }
164
165
197k
  l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
166
197k
  va_end(args);
167
197k
  if (l + 1 > s->m - s->l) {
168
63.8k
    if (ks_resize(s, s->l + l + 2) < 0)
169
0
      return -1;
170
63.8k
    va_copy(args, ap);
171
63.8k
    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
172
63.8k
    va_end(args);
173
63.8k
  }
174
197k
  s->l += l;
175
197k
  return l;
176
197k
}
177
178
int ksprintf(kstring_t *s, const char *fmt, ...)
179
197k
{
180
197k
  va_list ap;
181
197k
  int l;
182
197k
  va_start(ap, fmt);
183
197k
  l = kvsprintf(s, fmt, ap);
184
197k
  va_end(ap);
185
197k
  return l;
186
197k
}
187
188
char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
189
2.05M
{
190
2.05M
  const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
191
2.05M
  if (sep) { // set up the table
192
75.5k
    if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
193
75.5k
    aux->finished = 0;
194
75.5k
    if (sep[0] && sep[1]) {
195
0
      aux->sep = -1;
196
0
      aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
197
0
      for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
198
75.5k
    } else aux->sep = sep[0];
199
75.5k
  }
200
2.05M
  if (aux->finished) return 0;
201
2.01M
  else if (str) start = (unsigned char *) str, aux->finished = 0;
202
1.93M
  else start = (unsigned char *) aux->p + 1;
203
2.01M
  if (aux->sep < 0) {
204
0
    for (p = start; *p; ++p)
205
0
      if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
206
2.01M
  } else {
207
    // Using strchr is fast for next token, but slower for
208
    // last token due to extra pass from strlen.  Overall
209
    // on a VCF parse this func was 146% faster with // strchr.
210
    // Equiv to:
211
    // for (p = start; *p; ++p) if (*p == aux->sep) break;
212
213
    // NB: We could use strchrnul() here from glibc if detected,
214
    // which is ~40% faster again, but it's not so portable.
215
    // i.e.   p = (uint8_t *)strchrnul((char *)start, aux->sep);
216
2.01M
    uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep);
217
2.01M
    p = p2 ? p2 : start + strlen((char *)start);
218
2.01M
  }
219
2.01M
  aux->p = (const char *) p; // end of token
220
2.01M
  if (*p == 0) aux->finished = 1; // no more tokens
221
2.01M
  return (char*)start;
222
2.05M
}
223
224
// s MUST BE a null terminated string; l = strlen(s)
225
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
226
0
{
227
0
  int i, n, max, last_char, last_start, *offsets, l;
228
0
  n = 0; max = *_max; offsets = *_offsets;
229
0
  l = strlen(s);
230
231
0
#define __ksplit_aux do {           \
232
0
    if (_offsets) {           \
233
0
      s[i] = 0;         \
234
0
      if (n == max) {         \
235
0
        int *tmp;       \
236
0
        max = max? max<<1 : 2;     \
237
0
        if ((tmp = hts_realloc_p(offsets, sizeof(int), max))) {  \
238
0
          offsets = tmp;      \
239
0
        } else {       \
240
0
          free(offsets);      \
241
0
          *_offsets = NULL;   \
242
0
          return 0;     \
243
0
        }          \
244
0
      }           \
245
0
      offsets[n++] = last_start;      \
246
0
    } else ++n;           \
247
0
  } while (0)
248
249
0
  for (i = 0, last_char = last_start = 0; i <= l; ++i) {
250
0
    if (delimiter == 0) {
251
0
      if (isspace((int)((unsigned char) s[i])) || s[i] == 0) {
252
0
        if (isgraph(last_char))
253
0
                    __ksplit_aux; // the end of a field
254
0
      } else {
255
0
        if (isspace(last_char) || last_char == 0)
256
0
                    last_start = i;
257
0
      }
258
0
    } else {
259
0
      if (s[i] == delimiter || s[i] == 0) {
260
0
        if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
261
0
      } else {
262
0
        if (last_char == delimiter || last_char == 0) last_start = i;
263
0
      }
264
0
    }
265
0
    last_char = (int)((unsigned char)s[i]);
266
0
  }
267
0
  *_max = max; *_offsets = offsets;
268
0
  return n;
269
0
}
270
271
int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
272
0
{
273
0
  size_t l0 = s->l;
274
275
0
  while (s->l == l0 || s->s[s->l-1] != '\n') {
276
0
    if (s->m - s->l < 200) {
277
0
      if (ks_resize(s, s->m + 200) < 0)
278
0
        return EOF;
279
0
    }
280
0
    if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
281
0
    s->l += strlen(s->s + s->l);
282
0
  }
283
284
0
  if (s->l == l0) return EOF;
285
286
0
  if (s->l > l0 && s->s[s->l-1] == '\n') {
287
0
    s->l--;
288
0
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
289
0
  }
290
0
  s->s[s->l] = '\0';
291
0
  return 0;
292
0
}
293
294
// Wrap around fgets to get the right signature for kgets_func
295
static char * fgets_wrapper(char *buffer, int size, void *stream)
296
0
{
297
0
    return fgets(buffer, size, (FILE *) stream);
298
0
}
299
300
int kfgetline(kstring_t *s, FILE *fp)
301
0
{
302
0
    if (!s || !fp)
303
0
        return EOF;
304
0
    return kgetline(s, fgets_wrapper, fp);
305
0
}
306
307
int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp)
308
1.23M
{
309
1.23M
  size_t l0 = s->l;
310
311
2.47M
  while (s->l == l0 || s->s[s->l-1] != '\n') {
312
1.26M
    if (s->m - s->l < 200) {
313
      // We return EOF for both EOF and error and the caller
314
      // needs to check for errors in fp, and we haven't
315
      // even got there yet.
316
      //
317
      // The only way of propagating memory errors is to
318
      // deliberately call something that we know triggers
319
      // and error so fp is also set.  This works for
320
      // hgets, but not for gets where reading <= 0 bytes
321
      // isn't an error.
322
542k
      if (ks_resize(s, s->m + 200) < 0) {
323
0
        fgets_fn(s->s + s->l, 0, fp);
324
0
        return EOF;
325
0
      }
326
542k
    }
327
1.26M
    ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp);
328
1.26M
    if (len <= 0) break;
329
1.24M
    s->l += len;
330
1.24M
  }
331
332
1.23M
  if (s->l == l0) return EOF;
333
334
1.22M
  if (s->l > l0 && s->s[s->l-1] == '\n') {
335
1.20M
    s->l--;
336
1.20M
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
337
1.20M
  }
338
1.22M
  s->s[s->l] = '\0';
339
1.22M
  return 0;
340
1.23M
}
341
342
typedef unsigned char ubyte_t;
343
344
/**********************
345
 * Karp-Rabin search  *
346
 **********************/
347
348
// Backup solution for when we can't use Boyer-Moore, e.g. due to memory
349
// required.  Karp-Rabin only needs to store a couple of hash values
350
// so will always complete.
351
352
static uint64_t fast_exp(uint64_t x, uint64_t n)
353
0
{
354
0
  uint64_t y = 1;
355
0
  if (n == 0)
356
0
    return 1;
357
0
  while (n > 1) {
358
0
    if (n & 1)
359
0
      y *= x;
360
0
    x *= x;
361
0
    n >>= 1;
362
0
  }
363
0
  return y * x;
364
0
}
365
366
static void * karp_rabin(const void *str_, size_t n, const void *pat_, size_t m)
367
0
{
368
0
  const ubyte_t *str = (const ubyte_t *) str_;
369
0
  const ubyte_t *pat = (const ubyte_t *) pat_;
370
0
  const uint64_t b = 31;
371
0
  uint64_t hash_pat = 0;
372
0
  uint64_t hash_str = 0;
373
0
  uint64_t b_to_m = fast_exp(b, m);
374
0
  size_t i;
375
0
  ubyte_t mismatch = 0;
376
377
0
  if (m > n)
378
0
    return NULL;
379
380
  // Calculate hash of pat and initial part of str.
381
0
  for (i = 0; i < m; i++) {
382
0
    mismatch |= str[i] ^ pat[i];
383
0
    hash_pat = hash_pat * b + pat[i] + 1U;
384
0
    hash_str = hash_str * b + str[i] + 1U;
385
0
  }
386
387
0
  if (!mismatch) // Match found at start (or m == 0)
388
0
    return (void *) str_;
389
390
0
  for (; i < n; i++) {
391
0
    hash_str = hash_str * b + str[i] + 1U - b_to_m * (str[i - m] + 1U);
392
0
    if (hash_str == hash_pat) {
393
0
      if (memcmp(pat, str + i + 1 - m, m) == 0)
394
0
        return (void *) (str + i + 1 - m);
395
0
    }
396
0
  }
397
0
  return NULL;
398
0
}
399
400
/**********************
401
 * Boyer-Moore search *
402
 **********************/
403
404
405
// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
406
static int *ksBM_prep(const ubyte_t *pat, int m)
407
898
{
408
898
  int i, *suff, *prep, *bmGs, *bmBc;
409
898
  if (m < 1)
410
0
    return NULL;
411
898
  prep = hts_calloc_ps(sizeof(int), m, 256);
412
898
  if (!prep) return NULL;
413
898
  bmGs = prep; bmBc = prep + m;
414
898
  { // preBmBc()
415
230k
    for (i = 0; i < 256; ++i) bmBc[i] = m;
416
17.7k
    for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
417
898
  }
418
898
  suff = (int*)calloc(m, sizeof(int));
419
898
  if (!suff) { free(prep); return NULL; }
420
898
  { // suffixes()
421
898
    int f = 0, g;
422
898
    suff[m - 1] = m;
423
898
    g = m - 1;
424
17.7k
    for (i = m - 2; i >= 0; --i) {
425
16.8k
      if (i > g && suff[i + m - 1 - f] < i - g)
426
0
        suff[i] = suff[i + m - 1 - f];
427
16.8k
      else {
428
16.8k
        if (i < g) g = i;
429
16.8k
        f = i;
430
17.1k
        while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
431
16.8k
        suff[i] = f - g;
432
16.8k
      }
433
16.8k
    }
434
898
  }
435
898
  { // preBmGs()
436
898
    int j = 0;
437
18.6k
    for (i = 0; i < m; ++i) bmGs[i] = m;
438
18.6k
    for (i = m - 1; i >= 0; --i)
439
17.7k
      if (suff[i] == i + 1)
440
898
        for (; j < m - 1 - i; ++j)
441
0
          if (bmGs[j] == m)
442
0
            bmGs[j] = m - 1 - i;
443
17.7k
    for (i = 0; i <= m - 2; ++i)
444
16.8k
      bmGs[m - 1 - suff[i]] = m - 1 - i;
445
898
  }
446
898
  free(suff);
447
898
  return prep;
448
898
}
449
450
static void *boyer_moore(const void *str_, size_t n, const void *pat_, int m,
451
             int **stored_prep_ptr)
452
1.32k
{
453
1.32k
  size_t j;
454
1.32k
  int i, *prep = 0, *bmGs, *bmBc;
455
1.32k
  const ubyte_t *str, *pat;
456
457
1.32k
  if (!str_ || !pat_)
458
0
    return (void *) str_;
459
460
1.32k
  str = (const ubyte_t*) str_;
461
1.32k
  pat = (const ubyte_t*) pat_;
462
463
1.32k
  if (m <= 0) // Empty string always matches at the start
464
0
    return (void *) str_;
465
1.32k
  if (n < m)  // Input shorter than pattern can't match
466
422
    return NULL;
467
898
  if (m == 1) // Switch to memchr for 1 byte patterns (likely faster)
468
0
    return memchr(str, pat[0], n);
469
470
898
  if (stored_prep_ptr && *stored_prep_ptr) {
471
0
    prep = *stored_prep_ptr;
472
898
  } else {
473
898
    prep = ksBM_prep(pat, m);
474
898
    if (!prep) return karp_rabin(str_, n, pat_, m);
475
898
    if (stored_prep_ptr)
476
0
      *stored_prep_ptr = prep;
477
898
  }
478
898
  bmGs = prep;      // Good-suffix shift array
479
898
  bmBc = prep + m;  // Bad character shift array
480
898
  j = 0;
481
1.58M
  while (j <= n - m) {
482
1.58M
    for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
483
1.58M
    if (i >= 0) {
484
1.58M
      int max = bmBc[str[i+j]] - m + 1 + i;
485
1.58M
      if (max < bmGs[i]) max = bmGs[i];
486
1.58M
      j += max;
487
1.58M
    } else { // Match found
488
15
      if (!stored_prep_ptr) free(prep);
489
15
      return (void*)(str + j);
490
15
    }
491
1.58M
  }
492
  // No match
493
883
  if (!stored_prep_ptr) free(prep);
494
883
  return NULL;
495
898
}
496
497
void *kmemmem(const void *str, int n, const void *pat, int m, int **prep)
498
1.32k
{
499
1.32k
  return boyer_moore(str, n >= 0 ? n : 0, pat, m, prep);
500
1.32k
}
501
502
char *kstrstr(const char *str, const char *pat, int **prep)
503
0
{
504
0
  size_t patlen = strlen(pat);
505
0
  if (patlen <= INT_MAX)
506
0
    return (char*)boyer_moore(str, strlen(str), pat, patlen, prep);
507
0
  else
508
0
    return karp_rabin(str, strlen(str), pat, patlen);
509
0
}
510
511
char *kstrnstr(const char *str, const char *pat, int n, int **prep)
512
0
{
513
0
  if (!pat || *pat == '\0')
514
0
    return (char *) str;
515
0
  if (n <= 0)
516
0
    return NULL;
517
0
  const char *endp = memchr(str, '\0', n);
518
0
  if (endp != NULL && endp - str < n)
519
0
    n = endp - str;
520
0
  size_t patlen = strlen(pat);
521
0
  if (patlen > n)
522
0
    return NULL;
523
0
  return (char*)boyer_moore(str, n, pat, patlen, prep);
524
0
}
525
526
/***********************
527
 * The main() function *
528
 ***********************/
529
530
#ifdef KSTRING_MAIN
531
#include <stdio.h>
532
int main()
533
{
534
  kstring_t *s;
535
  int *fields, n, i;
536
  ks_tokaux_t aux;
537
  char *p;
538
  s = (kstring_t*)calloc(1, sizeof(kstring_t));
539
  // test ksprintf()
540
  ksprintf(s, " abcdefg:    %d ", 100);
541
  printf("'%s'\n", s->s);
542
  // test ksplit()
543
  fields = ksplit(s, 0, &n);
544
  for (i = 0; i < n; ++i)
545
    printf("field[%d] = '%s'\n", i, s->s + fields[i]);
546
  // test kstrtok()
547
  s->l = 0;
548
  for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
549
    kputsn(p, aux.p - p, s);
550
    kputc('\n', s);
551
  }
552
  printf("%s", s->s);
553
  // free
554
  free(s->s); free(s); free(fields);
555
556
  {
557
    static char *str = "abcdefgcdgcagtcakcdcd";
558
    static char *pat = "cd";
559
    char *ret, *s = str;
560
    int *prep = 0;
561
    while ((ret = kstrstr(s, pat, &prep)) != 0) {
562
      printf("match: %s\n", ret);
563
      s = ret + prep[0];
564
    }
565
    free(prep);
566
  }
567
  return 0;
568
}
569
#endif