Coverage Report

Created: 2026-02-14 06:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/kstring.c
Line
Count
Source
1
/* The MIT License
2
3
   Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
4
   Copyright (C) 2013-2018, 2020-2021, 2023, 2025 Genome Research Ltd.
5
6
   Permission is hereby granted, free of charge, to any person obtaining
7
   a copy of this software and associated documentation files (the
8
   "Software"), to deal in the Software without restriction, including
9
   without limitation the rights to use, copy, modify, merge, publish,
10
   distribute, sublicense, and/or sell copies of the Software, and to
11
   permit persons to whom the Software is furnished to do so, subject to
12
   the following conditions:
13
14
   The above copyright notice and this permission notice shall be
15
   included in all copies or substantial portions of the Software.
16
17
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
   SOFTWARE.
25
*/
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdarg.h>
31
#include <stdio.h>
32
#include <ctype.h>
33
#include <string.h>
34
#include <stdint.h>
35
#include <math.h>
36
#include "htslib/kstring.h"
37
38
722k
int kputd(double d, kstring_t *s) {
39
722k
  int len = 0;
40
722k
  char buf[21], *cp = buf+20, *ep;
41
722k
  if (d == 0) {
42
96.5k
    if (signbit(d)) {
43
3.83k
      kputsn("-0",2,s);
44
3.83k
      return 2;
45
92.6k
    } else {
46
92.6k
      kputsn("0",1,s);
47
92.6k
      return 1;
48
92.6k
    }
49
96.5k
  }
50
51
625k
  if (d < 0) {
52
173k
    kputc('-',s);
53
173k
    len = 1;
54
173k
    d=-d;
55
173k
  }
56
625k
  if (!(d >= 0.0001 && d <= 999999)) {
57
168k
    if (ks_resize(s, s->l + 50) < 0)
58
0
      return EOF;
59
    // We let stdio handle the exponent cases
60
168k
    int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d);
61
168k
    len += s2;
62
168k
    s->l += s2;
63
168k
    return len;
64
168k
  }
65
66
  // Correction for rounding - rather ugly
67
  // Optimised for small numbers.
68
69
457k
  uint32_t i;
70
457k
  if (d<0.001)         i = rint(d*1000000000), cp -= 1;
71
457k
  else if (d < 0.01)   i = rint(d*100000000),  cp -= 2;
72
450k
  else if (d < 0.1)    i = rint(d*10000000),   cp -= 3;
73
439k
  else if (d < 1)      i = rint(d*1000000),    cp -= 4;
74
428k
  else if (d < 10)     i = rint(d*100000),     cp -= 5;
75
320k
  else if (d < 100)    i = rint(d*10000),      cp -= 6;
76
266k
  else if (d < 1000)   i = rint(d*1000),       cp -= 7;
77
99.2k
  else if (d < 10000)  i = rint(d*100),        cp -= 8;
78
86.0k
  else if (d < 100000) i = rint(d*10),         cp -= 9;
79
61.5k
  else                 i = rint(d),            cp -= 10;
80
81
  // integer i is always 6 digits, so print it 2 at a time.
82
457k
  static const char kputuw_dig2r[] =
83
457k
    "00010203040506070809"
84
457k
    "10111213141516171819"
85
457k
    "20212223242526272829"
86
457k
    "30313233343536373839"
87
457k
    "40414243444546474849"
88
457k
    "50515253545556575859"
89
457k
    "60616263646566676869"
90
457k
    "70717273747576777879"
91
457k
    "80818283848586878889"
92
457k
    "90919293949596979899";
93
94
457k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
95
457k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
96
457k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2);
97
98
  // Except when it rounds up (d=0.009999999 is i=1000000)
99
457k
  if (i >= 100)
100
6.67k
    *--cp = '0' + (i/100);
101
102
103
457k
  int p = buf+20-cp;
104
457k
  if (p <= 10) { /* d < 1 */
105
    // 0.00123 is 123, so add leading zeros and 0.
106
28.8k
    ep = cp+5; // 6 precision
107
48.2k
    while (p < 10) { // aka d < 1
108
19.4k
      *--cp = '0';
109
19.4k
      p++;
110
19.4k
    }
111
28.8k
    *--cp = '.';
112
28.8k
    *--cp = '0';
113
428k
  } else {
114
    // 123.001 is 123001 with p==13, so move 123 down and add "."
115
    // Equiv to memmove(cp-1, cp, p-10); cp--;
116
428k
    char *xp = --cp;
117
428k
    ep = cp+6;
118
1.69M
    while (p > 10) {
119
1.26M
      xp[0] = xp[1];
120
1.26M
      xp++;
121
1.26M
      p--;
122
1.26M
    }
123
428k
    xp[0] = '.';
124
428k
  }
125
126
  // Cull trailing zeros
127
1.89M
  while (*ep == '0' && ep > cp)
128
1.44M
    ep--;
129
130
  // End can be 1 out due to the mostly-6 but occasionally 7 (i==1) case.
131
  // Also code with "123." which should be "123"
132
457k
  if (*ep && *ep != '.')
133
31.7k
    ep++;
134
457k
  *ep = 0;
135
136
457k
  int sl = ep-cp;
137
457k
  len += sl;
138
457k
  kputsn(cp, sl, s);
139
457k
  return len;
140
625k
}
141
142
int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
143
159k
{
144
159k
  va_list args;
145
159k
  int l;
146
159k
  va_copy(args, ap);
147
148
159k
  if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) {
149
0
    double d = va_arg(args, double);
150
0
    l = kputd(d, s);
151
0
    va_end(args);
152
0
    return l;
153
0
  }
154
155
159k
  if (!s->s) {
156
97.4k
    const size_t sz = 64;
157
97.4k
    s->s = malloc(sz);
158
97.4k
    if (!s->s)
159
0
      return -1;
160
97.4k
    s->m = sz;
161
97.4k
    s->l = 0;
162
97.4k
  }
163
164
159k
  l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
165
159k
  va_end(args);
166
159k
  if (l + 1 > s->m - s->l) {
167
56.7k
    if (ks_resize(s, s->l + l + 2) < 0)
168
0
      return -1;
169
56.7k
    va_copy(args, ap);
170
56.7k
    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
171
56.7k
    va_end(args);
172
56.7k
  }
173
159k
  s->l += l;
174
159k
  return l;
175
159k
}
176
177
int ksprintf(kstring_t *s, const char *fmt, ...)
178
159k
{
179
159k
  va_list ap;
180
159k
  int l;
181
159k
  va_start(ap, fmt);
182
159k
  l = kvsprintf(s, fmt, ap);
183
159k
  va_end(ap);
184
159k
  return l;
185
159k
}
186
187
char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
188
2.15M
{
189
2.15M
  const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
190
2.15M
  if (sep) { // set up the table
191
67.5k
    if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
192
67.5k
    aux->finished = 0;
193
67.5k
    if (sep[0] && sep[1]) {
194
0
      aux->sep = -1;
195
0
      aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
196
0
      for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
197
67.5k
    } else aux->sep = sep[0];
198
67.5k
  }
199
2.15M
  if (aux->finished) return 0;
200
2.11M
  else if (str) start = (unsigned char *) str, aux->finished = 0;
201
2.04M
  else start = (unsigned char *) aux->p + 1;
202
2.11M
  if (aux->sep < 0) {
203
0
    for (p = start; *p; ++p)
204
0
      if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
205
2.11M
  } else {
206
    // Using strchr is fast for next token, but slower for
207
    // last token due to extra pass from strlen.  Overall
208
    // on a VCF parse this func was 146% faster with // strchr.
209
    // Equiv to:
210
    // for (p = start; *p; ++p) if (*p == aux->sep) break;
211
212
    // NB: We could use strchrnul() here from glibc if detected,
213
    // which is ~40% faster again, but it's not so portable.
214
    // i.e.   p = (uint8_t *)strchrnul((char *)start, aux->sep);
215
2.11M
    uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep);
216
2.11M
    p = p2 ? p2 : start + strlen((char *)start);
217
2.11M
  }
218
2.11M
  aux->p = (const char *) p; // end of token
219
2.11M
  if (*p == 0) aux->finished = 1; // no more tokens
220
2.11M
  return (char*)start;
221
2.15M
}
222
223
// s MUST BE a null terminated string; l = strlen(s)
224
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
225
0
{
226
0
  int i, n, max, last_char, last_start, *offsets, l;
227
0
  n = 0; max = *_max; offsets = *_offsets;
228
0
  l = strlen(s);
229
230
0
#define __ksplit_aux do {           \
231
0
    if (_offsets) {           \
232
0
      s[i] = 0;         \
233
0
      if (n == max) {         \
234
0
        int *tmp;       \
235
0
        max = max? max<<1 : 2;     \
236
0
        if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {  \
237
0
          offsets = tmp;      \
238
0
        } else {       \
239
0
          free(offsets);      \
240
0
          *_offsets = NULL;   \
241
0
          return 0;     \
242
0
        }          \
243
0
      }           \
244
0
      offsets[n++] = last_start;      \
245
0
    } else ++n;           \
246
0
  } while (0)
247
248
0
  for (i = 0, last_char = last_start = 0; i <= l; ++i) {
249
0
    if (delimiter == 0) {
250
0
      if (isspace((int)((unsigned char) s[i])) || s[i] == 0) {
251
0
        if (isgraph(last_char))
252
0
                    __ksplit_aux; // the end of a field
253
0
      } else {
254
0
        if (isspace(last_char) || last_char == 0)
255
0
                    last_start = i;
256
0
      }
257
0
    } else {
258
0
      if (s[i] == delimiter || s[i] == 0) {
259
0
        if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
260
0
      } else {
261
0
        if (last_char == delimiter || last_char == 0) last_start = i;
262
0
      }
263
0
    }
264
0
    last_char = (int)((unsigned char)s[i]);
265
0
  }
266
0
  *_max = max; *_offsets = offsets;
267
0
  return n;
268
0
}
269
270
int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
271
0
{
272
0
  size_t l0 = s->l;
273
274
0
  while (s->l == l0 || s->s[s->l-1] != '\n') {
275
0
    if (s->m - s->l < 200) {
276
0
      if (ks_resize(s, s->m + 200) < 0)
277
0
        return EOF;
278
0
    }
279
0
    if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
280
0
    s->l += strlen(s->s + s->l);
281
0
  }
282
283
0
  if (s->l == l0) return EOF;
284
285
0
  if (s->l > l0 && s->s[s->l-1] == '\n') {
286
0
    s->l--;
287
0
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
288
0
  }
289
0
  s->s[s->l] = '\0';
290
0
  return 0;
291
0
}
292
293
int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp)
294
1.35M
{
295
1.35M
  size_t l0 = s->l;
296
297
2.73M
  while (s->l == l0 || s->s[s->l-1] != '\n') {
298
1.38M
    if (s->m - s->l < 200) {
299
      // We return EOF for both EOF and error and the caller
300
      // needs to check for errors in fp, and we haven't
301
      // even got there yet.
302
      //
303
      // The only way of propagating memory errors is to
304
      // deliberately call something that we know triggers
305
      // and error so fp is also set.  This works for
306
      // hgets, but not for gets where reading <= 0 bytes
307
      // isn't an error.
308
624k
      if (ks_resize(s, s->m + 200) < 0) {
309
0
        fgets_fn(s->s + s->l, 0, fp);
310
0
        return EOF;
311
0
      }
312
624k
    }
313
1.38M
    ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp);
314
1.38M
    if (len <= 0) break;
315
1.37M
    s->l += len;
316
1.37M
  }
317
318
1.35M
  if (s->l == l0) return EOF;
319
320
1.35M
  if (s->l > l0 && s->s[s->l-1] == '\n') {
321
1.34M
    s->l--;
322
1.34M
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
323
1.34M
  }
324
1.35M
  s->s[s->l] = '\0';
325
1.35M
  return 0;
326
1.35M
}
327
328
typedef unsigned char ubyte_t;
329
330
/**********************
331
 * Karp-Rabin search  *
332
 **********************/
333
334
// Backup solution for when we can't use Boyer-Moore, e.g. due to memory
335
// required.  Karp-Rabin only needs to store a couple of hash values
336
// so will always complete.
337
338
static uint64_t fast_exp(uint64_t x, uint64_t n)
339
0
{
340
0
  uint64_t y = 1;
341
0
  if (n == 0)
342
0
    return 1;
343
0
  while (n > 1) {
344
0
    if (n & 1)
345
0
      y *= x;
346
0
    x *= x;
347
0
    n >>= 1;
348
0
  }
349
0
  return y * x;
350
0
}
351
352
static void * karp_rabin(const void *str_, size_t n, const void *pat_, size_t m)
353
0
{
354
0
  const ubyte_t *str = (const ubyte_t *) str_;
355
0
  const ubyte_t *pat = (const ubyte_t *) pat_;
356
0
  const uint64_t b = 31;
357
0
  uint64_t hash_pat = 0;
358
0
  uint64_t hash_str = 0;
359
0
  uint64_t b_to_m = fast_exp(b, m);
360
0
  size_t i;
361
0
  ubyte_t mismatch = 0;
362
363
0
  if (m > n)
364
0
    return NULL;
365
366
  // Calculate hash of pat and initial part of str.
367
0
  for (i = 0; i < m; i++) {
368
0
    mismatch |= str[i] ^ pat[i];
369
0
    hash_pat = hash_pat * b + pat[i] + 1U;
370
0
    hash_str = hash_str * b + str[i] + 1U;
371
0
  }
372
373
0
  if (!mismatch) // Match found at start (or m == 0)
374
0
    return (void *) str_;
375
376
0
  for (; i < n; i++) {
377
0
    hash_str = hash_str * b + str[i] + 1U - b_to_m * (str[i - m] + 1U);
378
0
    if (hash_str == hash_pat) {
379
0
      if (memcmp(pat, str + i + 1 - m, m) == 0)
380
0
        return (void *) (str + i + 1 - m);
381
0
    }
382
0
  }
383
0
  return NULL;
384
0
}
385
386
/**********************
387
 * Boyer-Moore search *
388
 **********************/
389
390
391
// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
392
static int *ksBM_prep(const ubyte_t *pat, int m)
393
394
{
394
394
  int i, *suff, *prep, *bmGs, *bmBc;
395
394
  if (m < 1)
396
0
    return NULL;
397
394
  prep = (int*)calloc((size_t) m + 256, sizeof(int));
398
394
  if (!prep) return NULL;
399
394
  bmGs = prep; bmBc = prep + m;
400
394
  { // preBmBc()
401
101k
    for (i = 0; i < 256; ++i) bmBc[i] = m;
402
7.87k
    for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
403
394
  }
404
394
  suff = (int*)calloc(m, sizeof(int));
405
394
  if (!suff) { free(prep); return NULL; }
406
394
  { // suffixes()
407
394
    int f = 0, g;
408
394
    suff[m - 1] = m;
409
394
    g = m - 1;
410
7.87k
    for (i = m - 2; i >= 0; --i) {
411
7.48k
      if (i > g && suff[i + m - 1 - f] < i - g)
412
0
        suff[i] = suff[i + m - 1 - f];
413
7.48k
      else {
414
7.48k
        if (i < g) g = i;
415
7.48k
        f = i;
416
7.61k
        while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
417
7.48k
        suff[i] = f - g;
418
7.48k
      }
419
7.48k
    }
420
394
  }
421
394
  { // preBmGs()
422
394
    int j = 0;
423
8.27k
    for (i = 0; i < m; ++i) bmGs[i] = m;
424
8.27k
    for (i = m - 1; i >= 0; --i)
425
7.87k
      if (suff[i] == i + 1)
426
394
        for (; j < m - 1 - i; ++j)
427
0
          if (bmGs[j] == m)
428
0
            bmGs[j] = m - 1 - i;
429
7.87k
    for (i = 0; i <= m - 2; ++i)
430
7.48k
      bmGs[m - 1 - suff[i]] = m - 1 - i;
431
394
  }
432
394
  free(suff);
433
394
  return prep;
434
394
}
435
436
static void *boyer_moore(const void *str_, size_t n, const void *pat_, int m,
437
             int **stored_prep_ptr)
438
466
{
439
466
  size_t j;
440
466
  int i, *prep = 0, *bmGs, *bmBc;
441
466
  const ubyte_t *str, *pat;
442
443
466
  if (!str_ || !pat_)
444
0
    return (void *) str_;
445
446
466
  str = (const ubyte_t*) str_;
447
466
  pat = (const ubyte_t*) pat_;
448
449
466
  if (m <= 0) // Empty string always matches at the start
450
0
    return (void *) str_;
451
466
  if (n < m)  // Input shorter than pattern can't match
452
72
    return NULL;
453
394
  if (m == 1) // Switch to memchr for 1 byte patterns (likely faster)
454
0
    return memchr(str, pat[0], n);
455
456
394
  if (stored_prep_ptr && *stored_prep_ptr) {
457
0
    prep = *stored_prep_ptr;
458
394
  } else {
459
394
    prep = ksBM_prep(pat, m);
460
394
    if (!prep) return karp_rabin(str_, n, pat_, m);
461
394
    if (stored_prep_ptr)
462
0
      *stored_prep_ptr = prep;
463
394
  }
464
394
  bmGs = prep;      // Good-suffix shift array
465
394
  bmBc = prep + m;  // Bad character shift array
466
394
  j = 0;
467
15.8M
  while (j <= n - m) {
468
15.8M
    for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
469
15.8M
    if (i >= 0) {
470
15.8M
      int max = bmBc[str[i+j]] - m + 1 + i;
471
15.8M
      if (max < bmGs[i]) max = bmGs[i];
472
15.8M
      j += max;
473
15.8M
    } else { // Match found
474
20
      if (!stored_prep_ptr) free(prep);
475
20
      return (void*)(str + j);
476
20
    }
477
15.8M
  }
478
  // No match
479
374
  if (!stored_prep_ptr) free(prep);
480
374
  return NULL;
481
394
}
482
483
void *kmemmem(const void *str, int n, const void *pat, int m, int **prep)
484
466
{
485
466
  return boyer_moore(str, n >= 0 ? n : 0, pat, m, prep);
486
466
}
487
488
char *kstrstr(const char *str, const char *pat, int **prep)
489
0
{
490
0
  size_t patlen = strlen(pat);
491
0
  if (patlen <= INT_MAX)
492
0
    return (char*)boyer_moore(str, strlen(str), pat, patlen, prep);
493
0
  else
494
0
    return karp_rabin(str, strlen(str), pat, patlen);
495
0
}
496
497
char *kstrnstr(const char *str, const char *pat, int n, int **prep)
498
0
{
499
0
  if (!pat || *pat == '\0')
500
0
    return (char *) str;
501
0
  if (n <= 0)
502
0
    return NULL;
503
0
  const char *endp = memchr(str, '\0', n);
504
0
  if (endp != NULL && endp - str < n)
505
0
    n = endp - str;
506
0
  size_t patlen = strlen(pat);
507
0
  if (patlen > n)
508
0
    return NULL;
509
0
  return (char*)boyer_moore(str, n, pat, patlen, prep);
510
0
}
511
512
/***********************
513
 * The main() function *
514
 ***********************/
515
516
#ifdef KSTRING_MAIN
517
#include <stdio.h>
518
int main()
519
{
520
  kstring_t *s;
521
  int *fields, n, i;
522
  ks_tokaux_t aux;
523
  char *p;
524
  s = (kstring_t*)calloc(1, sizeof(kstring_t));
525
  // test ksprintf()
526
  ksprintf(s, " abcdefg:    %d ", 100);
527
  printf("'%s'\n", s->s);
528
  // test ksplit()
529
  fields = ksplit(s, 0, &n);
530
  for (i = 0; i < n; ++i)
531
    printf("field[%d] = '%s'\n", i, s->s + fields[i]);
532
  // test kstrtok()
533
  s->l = 0;
534
  for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
535
    kputsn(p, aux.p - p, s);
536
    kputc('\n', s);
537
  }
538
  printf("%s", s->s);
539
  // free
540
  free(s->s); free(s); free(fields);
541
542
  {
543
    static char *str = "abcdefgcdgcagtcakcdcd";
544
    static char *pat = "cd";
545
    char *ret, *s = str;
546
    int *prep = 0;
547
    while ((ret = kstrstr(s, pat, &prep)) != 0) {
548
      printf("match: %s\n", ret);
549
      s = ret + prep[0];
550
    }
551
    free(prep);
552
  }
553
  return 0;
554
}
555
#endif