Coverage Report

Created: 2025-07-11 06:53

/src/htslib/kstring.c
Line
Count
Source (jump to first uncovered line)
1
/* The MIT License
2
3
   Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
4
   Copyright (C) 2013-2018, 2020-2021, 2023 Genome Research Ltd.
5
6
   Permission is hereby granted, free of charge, to any person obtaining
7
   a copy of this software and associated documentation files (the
8
   "Software"), to deal in the Software without restriction, including
9
   without limitation the rights to use, copy, modify, merge, publish,
10
   distribute, sublicense, and/or sell copies of the Software, and to
11
   permit persons to whom the Software is furnished to do so, subject to
12
   the following conditions:
13
14
   The above copyright notice and this permission notice shall be
15
   included in all copies or substantial portions of the Software.
16
17
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
   SOFTWARE.
25
*/
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdarg.h>
31
#include <stdio.h>
32
#include <ctype.h>
33
#include <string.h>
34
#include <stdint.h>
35
#include <math.h>
36
#include "htslib/kstring.h"
37
38
543k
int kputd(double d, kstring_t *s) {
39
543k
  int len = 0;
40
543k
  char buf[21], *cp = buf+20, *ep;
41
543k
  if (d == 0) {
42
36.1k
    if (signbit(d)) {
43
11.0k
      kputsn("-0",2,s);
44
11.0k
      return 2;
45
25.1k
    } else {
46
25.1k
      kputsn("0",1,s);
47
25.1k
      return 1;
48
25.1k
    }
49
36.1k
  }
50
51
507k
  if (d < 0) {
52
119k
    kputc('-',s);
53
119k
    len = 1;
54
119k
    d=-d;
55
119k
  }
56
507k
  if (!(d >= 0.0001 && d <= 999999)) {
57
118k
    if (ks_resize(s, s->l + 50) < 0)
58
0
      return EOF;
59
    // We let stdio handle the exponent cases
60
118k
    int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d);
61
118k
    len += s2;
62
118k
    s->l += s2;
63
118k
    return len;
64
118k
  }
65
66
  // Correction for rounding - rather ugly
67
  // Optimised for small numbers.
68
69
388k
  uint32_t i;
70
388k
  if (d<0.001)         i = rint(d*1000000000), cp -= 1;
71
388k
  else if (d < 0.01)   i = rint(d*100000000),  cp -= 2;
72
363k
  else if (d < 0.1)    i = rint(d*10000000),   cp -= 3;
73
319k
  else if (d < 1)      i = rint(d*1000000),    cp -= 4;
74
279k
  else if (d < 10)     i = rint(d*100000),     cp -= 5;
75
157k
  else if (d < 100)    i = rint(d*10000),      cp -= 6;
76
121k
  else if (d < 1000)   i = rint(d*1000),       cp -= 7;
77
47.8k
  else if (d < 10000)  i = rint(d*100),        cp -= 8;
78
44.9k
  else if (d < 100000) i = rint(d*10),         cp -= 9;
79
28.8k
  else                 i = rint(d),            cp -= 10;
80
81
  // integer i is always 6 digits, so print it 2 at a time.
82
388k
  static const char kputuw_dig2r[] =
83
388k
    "00010203040506070809"
84
388k
    "10111213141516171819"
85
388k
    "20212223242526272829"
86
388k
    "30313233343536373839"
87
388k
    "40414243444546474849"
88
388k
    "50515253545556575859"
89
388k
    "60616263646566676869"
90
388k
    "70717273747576777879"
91
388k
    "80818283848586878889"
92
388k
    "90919293949596979899";
93
94
388k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
95
388k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
96
388k
  memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2);
97
98
  // Except when it rounds up (d=0.009999999 is i=1000000)
99
388k
  if (i >= 100)
100
25.2k
    *--cp = '0' + (i/100);
101
102
103
388k
  int p = buf+20-cp;
104
388k
  if (p <= 10) { /* d < 1 */
105
    // 0.00123 is 123, so add leading zeros and 0.
106
108k
    ep = cp+5; // 6 precision
107
178k
    while (p < 10) { // aka d < 1
108
69.2k
      *--cp = '0';
109
69.2k
      p++;
110
69.2k
    }
111
108k
    *--cp = '.';
112
108k
    *--cp = '0';
113
279k
  } else {
114
    // 123.001 is 123001 with p==13, so move 123 down and add "."
115
    // Equiv to memmove(cp-1, cp, p-10); cp--;
116
279k
    char *xp = --cp;
117
279k
    ep = cp+6;
118
959k
    while (p > 10) {
119
680k
      xp[0] = xp[1];
120
680k
      xp++;
121
680k
      p--;
122
680k
    }
123
279k
    xp[0] = '.';
124
279k
  }
125
126
  // Cull trailing zeros
127
1.89M
  while (*ep == '0' && ep > cp)
128
1.50M
    ep--;
129
130
  // End can be 1 out due to the mostly-6 but occasionally 7 (i==1) case.
131
  // Also code with "123." which should be "123"
132
388k
  if (*ep && *ep != '.')
133
117k
    ep++;
134
388k
  *ep = 0;
135
136
388k
  int sl = ep-cp;
137
388k
  len += sl;
138
388k
  kputsn(cp, sl, s);
139
388k
  return len;
140
507k
}
141
142
int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
143
149k
{
144
149k
  va_list args;
145
149k
  int l;
146
149k
  va_copy(args, ap);
147
148
149k
  if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) {
149
0
    double d = va_arg(args, double);
150
0
    l = kputd(d, s);
151
0
    va_end(args);
152
0
    return l;
153
0
  }
154
155
149k
  if (!s->s) {
156
90.4k
    const size_t sz = 64;
157
90.4k
    s->s = malloc(sz);
158
90.4k
    if (!s->s)
159
0
      return -1;
160
90.4k
    s->m = sz;
161
90.4k
    s->l = 0;
162
90.4k
  }
163
164
149k
  l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
165
149k
  va_end(args);
166
149k
  if (l + 1 > s->m - s->l) {
167
45.8k
    if (ks_resize(s, s->l + l + 2) < 0)
168
0
      return -1;
169
45.8k
    va_copy(args, ap);
170
45.8k
    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
171
45.8k
    va_end(args);
172
45.8k
  }
173
149k
  s->l += l;
174
149k
  return l;
175
149k
}
176
177
int ksprintf(kstring_t *s, const char *fmt, ...)
178
149k
{
179
149k
  va_list ap;
180
149k
  int l;
181
149k
  va_start(ap, fmt);
182
149k
  l = kvsprintf(s, fmt, ap);
183
149k
  va_end(ap);
184
149k
  return l;
185
149k
}
186
187
char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
188
1.15M
{
189
1.15M
  const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
190
1.15M
  if (sep) { // set up the table
191
45.1k
    if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
192
45.1k
    aux->finished = 0;
193
45.1k
    if (sep[0] && sep[1]) {
194
0
      aux->sep = -1;
195
0
      aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
196
0
      for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
197
45.1k
    } else aux->sep = sep[0];
198
45.1k
  }
199
1.15M
  if (aux->finished) return 0;
200
1.12M
  else if (str) start = (unsigned char *) str, aux->finished = 0;
201
1.07M
  else start = (unsigned char *) aux->p + 1;
202
1.12M
  if (aux->sep < 0) {
203
0
    for (p = start; *p; ++p)
204
0
      if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
205
1.12M
  } else {
206
    // Using strchr is fast for next token, but slower for
207
    // last token due to extra pass from strlen.  Overall
208
    // on a VCF parse this func was 146% faster with // strchr.
209
    // Equiv to:
210
    // for (p = start; *p; ++p) if (*p == aux->sep) break;
211
212
    // NB: We could use strchrnul() here from glibc if detected,
213
    // which is ~40% faster again, but it's not so portable.
214
    // i.e.   p = (uint8_t *)strchrnul((char *)start, aux->sep);
215
1.12M
    uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep);
216
1.12M
    p = p2 ? p2 : start + strlen((char *)start);
217
1.12M
  }
218
1.12M
  aux->p = (const char *) p; // end of token
219
1.12M
  if (*p == 0) aux->finished = 1; // no more tokens
220
1.12M
  return (char*)start;
221
1.15M
}
222
223
// s MUST BE a null terminated string; l = strlen(s)
224
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
225
131
{
226
131
  int i, n, max, last_char, last_start, *offsets, l;
227
131
  n = 0; max = *_max; offsets = *_offsets;
228
131
  l = strlen(s);
229
230
23.1k
#define __ksplit_aux do {           \
231
23.1k
    if (_offsets) {           \
232
23.1k
      s[i] = 0;         \
233
23.1k
      if (n == max) {         \
234
389
        int *tmp;       \
235
389
        max = max? max<<1 : 2;     \
236
389
        if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {  \
237
389
          offsets = tmp;      \
238
389
        } else {       \
239
0
          free(offsets);      \
240
0
          *_offsets = NULL;   \
241
0
          return 0;     \
242
0
        }          \
243
389
      }            \
244
23.1k
      offsets[n++] = last_start;      \
245
23.1k
    } else ++n;           \
246
23.1k
  } while (0)
247
248
2.83M
  for (i = 0, last_char = last_start = 0; i <= l; ++i) {
249
2.83M
    if (delimiter == 0) {
250
0
      if (isspace((int)((unsigned char) s[i])) || s[i] == 0) {
251
0
        if (isgraph(last_char))
252
0
                    __ksplit_aux; // the end of a field
253
0
      } else {
254
0
        if (isspace(last_char) || last_char == 0)
255
0
                    last_start = i;
256
0
      }
257
2.83M
    } else {
258
2.83M
      if (s[i] == delimiter || s[i] == 0) {
259
41.6k
        if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
260
2.79M
      } else {
261
2.79M
        if (last_char == delimiter || last_char == 0) last_start = i;
262
2.79M
      }
263
2.83M
    }
264
2.83M
    last_char = (int)((unsigned char)s[i]);
265
2.83M
  }
266
131
  *_max = max; *_offsets = offsets;
267
131
  return n;
268
131
}
269
270
int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
271
0
{
272
0
  size_t l0 = s->l;
273
274
0
  while (s->l == l0 || s->s[s->l-1] != '\n') {
275
0
    if (s->m - s->l < 200) {
276
0
      if (ks_resize(s, s->m + 200) < 0)
277
0
        return EOF;
278
0
    }
279
0
    if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
280
0
    s->l += strlen(s->s + s->l);
281
0
  }
282
283
0
  if (s->l == l0) return EOF;
284
285
0
  if (s->l > l0 && s->s[s->l-1] == '\n') {
286
0
    s->l--;
287
0
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
288
0
  }
289
0
  s->s[s->l] = '\0';
290
0
  return 0;
291
0
}
292
293
int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp)
294
4.11M
{
295
4.11M
  size_t l0 = s->l;
296
297
8.23M
  while (s->l == l0 || s->s[s->l-1] != '\n') {
298
4.15M
    if (s->m - s->l < 200) {
299
      // We return EOF for both EOF and error and the caller
300
      // needs to check for errors in fp, and we haven't
301
      // even got there yet.
302
      //
303
      // The only way of propagating memory errors is to
304
      // deliberately call something that we know triggers
305
      // and error so fp is also set.  This works for
306
      // hgets, but not for gets where reading <= 0 bytes
307
      // isn't an error.
308
2.74M
      if (ks_resize(s, s->m + 200) < 0) {
309
0
        fgets_fn(s->s + s->l, 0, fp);
310
0
        return EOF;
311
0
      }
312
2.74M
    }
313
4.15M
    ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp);
314
4.15M
    if (len <= 0) break;
315
4.12M
    s->l += len;
316
4.12M
  }
317
318
4.11M
  if (s->l == l0) return EOF;
319
320
4.09M
  if (s->l > l0 && s->s[s->l-1] == '\n') {
321
4.07M
    s->l--;
322
4.07M
    if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
323
4.07M
  }
324
4.09M
  s->s[s->l] = '\0';
325
4.09M
  return 0;
326
4.11M
}
327
328
/**********************
329
 * Boyer-Moore search *
330
 **********************/
331
332
typedef unsigned char ubyte_t;
333
334
// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
335
static int *ksBM_prep(const ubyte_t *pat, int m)
336
0
{
337
0
  int i, *suff, *prep, *bmGs, *bmBc;
338
0
  prep = (int*)calloc(m + 256, sizeof(int));
339
0
    if (!prep) return NULL;
340
0
  bmGs = prep; bmBc = prep + m;
341
0
  { // preBmBc()
342
0
    for (i = 0; i < 256; ++i) bmBc[i] = m;
343
0
    for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
344
0
  }
345
0
  suff = (int*)calloc(m, sizeof(int));
346
0
    if (!suff) { free(prep); return NULL; }
347
0
  { // suffixes()
348
0
    int f = 0, g;
349
0
    suff[m - 1] = m;
350
0
    g = m - 1;
351
0
    for (i = m - 2; i >= 0; --i) {
352
0
      if (i > g && suff[i + m - 1 - f] < i - g)
353
0
        suff[i] = suff[i + m - 1 - f];
354
0
      else {
355
0
        if (i < g) g = i;
356
0
        f = i;
357
0
        while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
358
0
        suff[i] = f - g;
359
0
      }
360
0
    }
361
0
  }
362
0
  { // preBmGs()
363
0
    int j = 0;
364
0
    for (i = 0; i < m; ++i) bmGs[i] = m;
365
0
    for (i = m - 1; i >= 0; --i)
366
0
      if (suff[i] == i + 1)
367
0
        for (; j < m - 1 - i; ++j)
368
0
          if (bmGs[j] == m)
369
0
            bmGs[j] = m - 1 - i;
370
0
    for (i = 0; i <= m - 2; ++i)
371
0
      bmGs[m - 1 - suff[i]] = m - 1 - i;
372
0
  }
373
0
  free(suff);
374
0
  return prep;
375
0
}
376
377
void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
378
0
{
379
0
  int i, j, *prep = 0, *bmGs, *bmBc;
380
0
  const ubyte_t *str, *pat;
381
0
  str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
382
0
  prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
383
0
    if (!prep) return NULL;
384
0
  if (_prep && *_prep == 0) *_prep = prep;
385
0
  bmGs = prep; bmBc = prep + m;
386
0
  j = 0;
387
0
  while (j <= n - m) {
388
0
    for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
389
0
    if (i >= 0) {
390
0
      int max = bmBc[str[i+j]] - m + 1 + i;
391
0
      if (max < bmGs[i]) max = bmGs[i];
392
0
      j += max;
393
0
    } else return (void*)(str + j);
394
0
  }
395
0
  if (_prep == 0) free(prep);
396
0
  return 0;
397
0
}
398
399
char *kstrstr(const char *str, const char *pat, int **_prep)
400
0
{
401
0
  return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
402
0
}
403
404
char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
405
0
{
406
0
  return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
407
0
}
408
409
/***********************
410
 * The main() function *
411
 ***********************/
412
413
#ifdef KSTRING_MAIN
414
#include <stdio.h>
415
int main()
416
{
417
  kstring_t *s;
418
  int *fields, n, i;
419
  ks_tokaux_t aux;
420
  char *p;
421
  s = (kstring_t*)calloc(1, sizeof(kstring_t));
422
  // test ksprintf()
423
  ksprintf(s, " abcdefg:    %d ", 100);
424
  printf("'%s'\n", s->s);
425
  // test ksplit()
426
  fields = ksplit(s, 0, &n);
427
  for (i = 0; i < n; ++i)
428
    printf("field[%d] = '%s'\n", i, s->s + fields[i]);
429
  // test kstrtok()
430
  s->l = 0;
431
  for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
432
    kputsn(p, aux.p - p, s);
433
    kputc('\n', s);
434
  }
435
  printf("%s", s->s);
436
  // free
437
  free(s->s); free(s); free(fields);
438
439
  {
440
    static char *str = "abcdefgcdgcagtcakcdcd";
441
    static char *pat = "cd";
442
    char *ret, *s = str;
443
    int *prep = 0;
444
    while ((ret = kstrstr(s, pat, &prep)) != 0) {
445
      printf("match: %s\n", ret);
446
      s = ret + prep[0];
447
    }
448
    free(prep);
449
  }
450
  return 0;
451
}
452
#endif