Coverage Report

Created: 2025-11-11 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/kstring.c
Line
Count
Source
1
/* The MIT License
2
3
   Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
4
   Copyright (C) 2013-2018, 2020-2021, 2023 Genome Research Ltd.
5
6
   Permission is hereby granted, free of charge, to any person obtaining
7
   a copy of this software and associated documentation files (the
8
   "Software"), to deal in the Software without restriction, including
9
   without limitation the rights to use, copy, modify, merge, publish,
10
   distribute, sublicense, and/or sell copies of the Software, and to
11
   permit persons to whom the Software is furnished to do so, subject to
12
   the following conditions:
13
14
   The above copyright notice and this permission notice shall be
15
   included in all copies or substantial portions of the Software.
16
17
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
   SOFTWARE.
25
*/
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdarg.h>
31
#include <stdio.h>
32
#include <ctype.h>
33
#include <string.h>
34
#include <stdint.h>
35
#include <math.h>
36
#include "htslib/kstring.h"
37
38
611k
int kputd(double d, kstring_t *s) {
39
611k
  int len = 0;
40
611k
  char buf[21], *cp = buf+20, *ep;
41
611k
  if (d == 0) {
42
52.5k
    if (signbit(d)) {
43
8.00k
      kputsn("-0",2,s);
44
8.00k
      return 2;
45
44.5k
    } else {
46
44.5k
      kputsn("0",1,s);
47
44.5k
      return 1;
48
44.5k
    }
49
52.5k
  }
50
51
558k
  if (d < 0) {
52
152k
    kputc('-',s);
53
152k
    len = 1;
54
152k
    d=-d;
55
152k
  }
56
558k
  if (!(d >= 0.0001 && d <= 999999)) {
57
123k
    if (ks_resize(s, s->l + 50) < 0)
58
0
      return EOF;
59
    // We let stdio handle the exponent cases
60
123k
    int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d);
61
123k
    len += s2;
62
123k
    s->l += s2;
63
123k
    return len;
64
123k
  }
65
66
  // Correction for rounding - rather ugly
67
  // Optimised for small numbers.
68
69
434k
  uint32_t i;
70
434k
  if (d<0.001)         i = rint(d*1000000000), cp -= 1;
71
434k
  else if (d < 0.01)   i = rint(d*100000000),  cp -= 2;
72
417k
  else if (d < 0.1)    i = rint(d*10000000),   cp -= 3;
73
387k
  else if (d < 1)      i = rint(d*1000000),    cp -= 4;
74
360k
  else if (d < 10)     i = rint(d*100000),     cp -= 5;
75
232k
  else if (d < 100)    i = rint(d*10000),      cp -= 6;
76
179k
  else if (d < 1000)   i = rint(d*1000),       cp -= 7;
77
68.8k
  else if (d < 10000)  i = rint(d*100),        cp -= 8;
78
63.0k
  else if (d < 100000) i = rint(d*10),         cp -= 9;
79
44.2k
  else                 i = rint(d),            cp -= 10;
80
81
  // integer i is always 6 digits, so print it 2 at a time.
82
434k
  static const char kputuw_dig2r[] =
83
434k
    "00010203040506070809"
84
434k
    "10111213141516171819"
85
434k
    "20212223242526272829"
86
434k
    "30313233343536373839"
87
434k
    "40414243444546474849"
88
434k
    "50515253545556575859"
89
434k
    "60616263646566676869"
90
434k
    "70717273747576777879"
91
434k
    "80818283848586878889"
92
434k
    "90919293949596979899";
93
94
434k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
95
434k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
96
434k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2);
97
98
  // Except when it rounds up (d=0.009999999 is i=1000000)
99
434k
  if (i >= 100)
100
17.3k
    *--cp = '0' + (i/100);
101
102
103
434k
  int p = buf+20-cp;
104
434k
  if (p <= 10) { /* d < 1 */
105
    // 0.00123 is 123, so add leading zeros and 0.
106
74.3k
    ep = cp+5; // 6 precision
107
121k
    while (p < 10) { // aka d < 1
108
47.1k
      *--cp = '0';
109
47.1k
      p++;
110
47.1k
    }
111
74.3k
    *--cp = '.';
112
74.3k
    *--cp = '0';
113
360k
  } else {
114
    // 123.001 is 123001 with p==13, so move 123 down and add "."
115
    // Equiv to memmove(cp-1, cp, p-10); cp--;
116
360k
    char *xp = --cp;
117
360k
    ep = cp+6;
118
1.30M
    while (p > 10) {
119
948k
      xp[0] = xp[1];
120
948k
      xp++;
121
948k
      p--;
122
948k
    }
123
360k
    xp[0] = '.';
124
360k
  }
125
126
  // Cull trailing zeros
127
1.99M
  while (*ep == '0' && ep > cp)
128
1.55M
    ep--;
129
130
  // End can be 1 out due to the mostly-6 but occasionally 7 (i==1) case.
131
  // Also code with "123." which should be "123"
132
434k
  if (*ep && *ep != '.')
133
81.2k
    ep++;
134
434k
  *ep = 0;
135
136
434k
  int sl = ep-cp;
137
434k
  len += sl;
138
434k
  kputsn(cp, sl, s);
139
434k
  return len;
140
558k
}
141
142
int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
143
253k
{
144
253k
  va_list args;
145
253k
  int l;
146
253k
  va_copy(args, ap);
147
148
253k
  if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) {
149
0
    double d = va_arg(args, double);
150
0
    l = kputd(d, s);
151
0
    va_end(args);
152
0
    return l;
153
0
  }
154
155
253k
  if (!s->s) {
156
142k
    const size_t sz = 64;
157
142k
    s->s = malloc(sz);
158
142k
    if (!s->s)
159
0
      return -1;
160
142k
    s->m = sz;
161
142k
    s->l = 0;
162
142k
  }
163
164
253k
  l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
165
253k
  va_end(args);
166
253k
  if (l + 1 > s->m - s->l) {
167
64.6k
    if (ks_resize(s, s->l + l + 2) < 0)
168
0
      return -1;
169
64.6k
    va_copy(args, ap);
170
64.6k
    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
171
64.6k
    va_end(args);
172
64.6k
  }
173
253k
  s->l += l;
174
253k
  return l;
175
253k
}
176
177
int ksprintf(kstring_t *s, const char *fmt, ...)
178
253k
{
179
253k
  va_list ap;
180
253k
  int l;
181
253k
  va_start(ap, fmt);
182
253k
  l = kvsprintf(s, fmt, ap);
183
253k
  va_end(ap);
184
253k
  return l;
185
253k
}
186
187
char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
188
2.37M
{
189
2.37M
  const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
190
2.37M
  if (sep) { // set up the table
191
89.4k
    if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
192
89.4k
    aux->finished = 0;
193
89.4k
    if (sep[0] && sep[1]) {
194
0
      aux->sep = -1;
195
0
      aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
196
0
      for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
197
89.4k
    } else aux->sep = sep[0];
198
89.4k
  }
199
2.37M
  if (aux->finished) return 0;
200
2.31M
  else if (str) start = (unsigned char *) str, aux->finished = 0;
201
2.22M
  else start = (unsigned char *) aux->p + 1;
202
2.31M
  if (aux->sep < 0) {
203
0
    for (p = start; *p; ++p)
204
0
      if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
205
2.31M
  } else {
206
    // Using strchr is fast for next token, but slower for
207
    // last token due to extra pass from strlen.  Overall
208
    // on a VCF parse this func was 146% faster with // strchr.
209
    // Equiv to:
210
    // for (p = start; *p; ++p) if (*p == aux->sep) break;
211
212
    // NB: We could use strchrnul() here from glibc if detected,
213
    // which is ~40% faster again, but it's not so portable.
214
    // i.e.   p = (uint8_t *)strchrnul((char *)start, aux->sep);
215
2.31M
    uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep);
216
2.31M
    p = p2 ? p2 : start + strlen((char *)start);
217
2.31M
  }
218
2.31M
  aux->p = (const char *) p; // end of token
219
2.31M
  if (*p == 0) aux->finished = 1; // no more tokens
220
2.31M
  return (char*)start;
221
2.37M
}
222
223
// s MUST BE a null terminated string; l = strlen(s)
224
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
225
248
{
226
248
  int i, n, max, last_char, last_start, *offsets, l;
227
248
  n = 0; max = *_max; offsets = *_offsets;
228
248
  l = strlen(s);
229
230
73.1k
#define __ksplit_aux do {           \
231
73.1k
    if (_offsets) {           \
232
73.1k
      s[i] = 0;         \
233
73.1k
      if (n == max) {         \
234
984
        int *tmp;       \
235
984
        max = max? max<<1 : 2;     \
236
984
        if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {  \
237
984
          offsets = tmp;      \
238
984
        } else {       \
239
0
          free(offsets);      \
240
0
          *_offsets = NULL;   \
241
0
          return 0;     \
242
0
        }          \
243
984
      }            \
244
73.1k
      offsets[n++] = last_start;      \
245
73.1k
    } else ++n;           \
246
73.1k
  } while (0)
247
248
4.11M
  for (i = 0, last_char = last_start = 0; i <= l; ++i) {
249
4.11M
    if (delimiter == 0) {
250
0
      if (isspace((int)((unsigned char) s[i])) || s[i] == 0) {
251
0
        if (isgraph(last_char))
252
0
                    __ksplit_aux; // the end of a field
253
0
      } else {
254
0
        if (isspace(last_char) || last_char == 0)
255
0
                    last_start = i;
256
0
      }
257
4.11M
    } else {
258
4.11M
      if (s[i] == delimiter || s[i] == 0) {
259
200k
        if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
260
3.91M
      } else {
261
3.91M
        if (last_char == delimiter || last_char == 0) last_start = i;
262
3.91M
      }
263
4.11M
    }
264
4.11M
    last_char = (int)((unsigned char)s[i]);
265
4.11M
  }
266
248
  *_max = max; *_offsets = offsets;
267
248
  return n;
268
248
}
269
270
int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
271
0
{
272
0
  size_t l0 = s->l;
273
274
0
  while (s->l == l0 || s->s[s->l-1] != '\n') {
275
0
    if (s->m - s->l < 200) {
276
0
      if (ks_resize(s, s->m + 200) < 0)
277
0
        return EOF;
278
0
    }
279
0
    if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
280
0
    s->l += strlen(s->s + s->l);
281
0
  }
282
283
0
  if (s->l == l0) return EOF;
284
285
0
  if (s->l > l0 && s->s[s->l-1] == '\n') {
286
0
    s->l--;
287
0
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
288
0
  }
289
0
  s->s[s->l] = '\0';
290
0
  return 0;
291
0
}
292
293
int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp)
294
1.89M
{
295
1.89M
  size_t l0 = s->l;
296
297
3.80M
  while (s->l == l0 || s->s[s->l-1] != '\n') {
298
1.94M
    if (s->m - s->l < 200) {
299
      // We return EOF for both EOF and error and the caller
300
      // needs to check for errors in fp, and we haven't
301
      // even got there yet.
302
      //
303
      // The only way of propagating memory errors is to
304
      // deliberately call something that we know triggers
305
      // and error so fp is also set.  This works for
306
      // hgets, but not for gets where reading <= 0 bytes
307
      // isn't an error.
308
871k
      if (ks_resize(s, s->m + 200) < 0) {
309
0
        fgets_fn(s->s + s->l, 0, fp);
310
0
        return EOF;
311
0
      }
312
871k
    }
313
1.94M
    ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp);
314
1.94M
    if (len <= 0) break;
315
1.91M
    s->l += len;
316
1.91M
  }
317
318
1.89M
  if (s->l == l0) return EOF;
319
320
1.88M
  if (s->l > l0 && s->s[s->l-1] == '\n') {
321
1.86M
    s->l--;
322
1.86M
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
323
1.86M
  }
324
1.88M
  s->s[s->l] = '\0';
325
1.88M
  return 0;
326
1.89M
}
327
328
/**********************
329
 * Boyer-Moore search *
330
 **********************/
331
332
typedef unsigned char ubyte_t;
333
334
// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
335
static int *ksBM_prep(const ubyte_t *pat, int m)
336
1.55k
{
337
1.55k
  int i, *suff, *prep, *bmGs, *bmBc;
338
1.55k
  prep = (int*)calloc(m + 256, sizeof(int));
339
1.55k
    if (!prep) return NULL;
340
1.55k
  bmGs = prep; bmBc = prep + m;
341
1.55k
  { // preBmBc()
342
398k
    for (i = 0; i < 256; ++i) bmBc[i] = m;
343
31.0k
    for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
344
1.55k
  }
345
1.55k
  suff = (int*)calloc(m, sizeof(int));
346
1.55k
    if (!suff) { free(prep); return NULL; }
347
1.55k
  { // suffixes()
348
1.55k
    int f = 0, g;
349
1.55k
    suff[m - 1] = m;
350
1.55k
    g = m - 1;
351
31.0k
    for (i = m - 2; i >= 0; --i) {
352
29.4k
      if (i > g && suff[i + m - 1 - f] < i - g)
353
0
        suff[i] = suff[i + m - 1 - f];
354
29.4k
      else {
355
29.4k
        if (i < g) g = i;
356
29.4k
        f = i;
357
30.0k
        while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
358
29.4k
        suff[i] = f - g;
359
29.4k
      }
360
29.4k
    }
361
1.55k
  }
362
1.55k
  { // preBmGs()
363
1.55k
    int j = 0;
364
32.6k
    for (i = 0; i < m; ++i) bmGs[i] = m;
365
32.6k
    for (i = m - 1; i >= 0; --i)
366
31.0k
      if (suff[i] == i + 1)
367
1.55k
        for (; j < m - 1 - i; ++j)
368
0
          if (bmGs[j] == m)
369
0
            bmGs[j] = m - 1 - i;
370
31.0k
    for (i = 0; i <= m - 2; ++i)
371
29.4k
      bmGs[m - 1 - suff[i]] = m - 1 - i;
372
1.55k
  }
373
1.55k
  free(suff);
374
1.55k
  return prep;
375
1.55k
}
376
377
void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
378
1.55k
{
379
1.55k
  int i, j, *prep = 0, *bmGs, *bmBc;
380
1.55k
  const ubyte_t *str, *pat;
381
1.55k
  str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
382
1.55k
  if (_prep && *_prep) {
383
0
    prep = *_prep;
384
1.55k
  } else {
385
1.55k
    prep = ksBM_prep(pat, m);
386
1.55k
    if (!prep) return NULL;
387
1.55k
    if (_prep)
388
0
      *_prep = prep;
389
1.55k
  }
390
1.55k
  bmGs = prep; bmBc = prep + m;
391
1.55k
  j = 0;
392
48.7M
  while (j <= n - m) {
393
48.7M
    for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
394
48.7M
    if (i >= 0) {
395
48.7M
      int max = bmBc[str[i+j]] - m + 1 + i;
396
48.7M
      if (max < bmGs[i]) max = bmGs[i];
397
48.7M
      j += max;
398
48.7M
    } else {
399
24
      if (_prep == 0) free(prep);
400
24
      return (void*)(str + j);
401
24
    }
402
48.7M
  }
403
1.52k
  if (_prep == 0) free(prep);
404
1.52k
  return 0;
405
1.55k
}
406
407
char *kstrstr(const char *str, const char *pat, int **_prep)
408
0
{
409
0
  return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
410
0
}
411
412
char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
413
0
{
414
0
  return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
415
0
}
416
417
/***********************
418
 * The main() function *
419
 ***********************/
420
421
#ifdef KSTRING_MAIN
422
#include <stdio.h>
423
int main()
424
{
425
  kstring_t *s;
426
  int *fields, n, i;
427
  ks_tokaux_t aux;
428
  char *p;
429
  s = (kstring_t*)calloc(1, sizeof(kstring_t));
430
  // test ksprintf()
431
  ksprintf(s, " abcdefg:    %d ", 100);
432
  printf("'%s'\n", s->s);
433
  // test ksplit()
434
  fields = ksplit(s, 0, &n);
435
  for (i = 0; i < n; ++i)
436
    printf("field[%d] = '%s'\n", i, s->s + fields[i]);
437
  // test kstrtok()
438
  s->l = 0;
439
  for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
440
    kputsn(p, aux.p - p, s);
441
    kputc('\n', s);
442
  }
443
  printf("%s", s->s);
444
  // free
445
  free(s->s); free(s); free(fields);
446
447
  {
448
    static char *str = "abcdefgcdgcagtcakcdcd";
449
    static char *pat = "cd";
450
    char *ret, *s = str;
451
    int *prep = 0;
452
    while ((ret = kstrstr(s, pat, &prep)) != 0) {
453
      printf("match: %s\n", ret);
454
      s = ret + prep[0];
455
    }
456
    free(prep);
457
  }
458
  return 0;
459
}
460
#endif