Line | Count | Source (jump to first uncovered line) |
1 | | /* The MIT License |
2 | | |
3 | | Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk> |
4 | | Copyright (C) 2013-2018, 2020-2021, 2023 Genome Research Ltd. |
5 | | |
6 | | Permission is hereby granted, free of charge, to any person obtaining |
7 | | a copy of this software and associated documentation files (the |
8 | | "Software"), to deal in the Software without restriction, including |
9 | | without limitation the rights to use, copy, modify, merge, publish, |
10 | | distribute, sublicense, and/or sell copies of the Software, and to |
11 | | permit persons to whom the Software is furnished to do so, subject to |
12 | | the following conditions: |
13 | | |
14 | | The above copyright notice and this permission notice shall be |
15 | | included in all copies or substantial portions of the Software. |
16 | | |
17 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
18 | | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
19 | | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
20 | | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
21 | | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
22 | | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
23 | | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
24 | | SOFTWARE. |
25 | | */ |
26 | | |
27 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
28 | | #include <config.h> |
29 | | |
30 | | #include <stdarg.h> |
31 | | #include <stdio.h> |
32 | | #include <ctype.h> |
33 | | #include <string.h> |
34 | | #include <stdint.h> |
35 | | #include <math.h> |
36 | | #include "htslib/kstring.h" |
37 | | |
38 | 543k | int kputd(double d, kstring_t *s) { |
39 | 543k | int len = 0; |
40 | 543k | char buf[21], *cp = buf+20, *ep; |
41 | 543k | if (d == 0) { |
42 | 36.1k | if (signbit(d)) { |
43 | 11.0k | kputsn("-0",2,s); |
44 | 11.0k | return 2; |
45 | 25.1k | } else { |
46 | 25.1k | kputsn("0",1,s); |
47 | 25.1k | return 1; |
48 | 25.1k | } |
49 | 36.1k | } |
50 | | |
51 | 507k | if (d < 0) { |
52 | 119k | kputc('-',s); |
53 | 119k | len = 1; |
54 | 119k | d=-d; |
55 | 119k | } |
56 | 507k | if (!(d >= 0.0001 && d <= 999999)) { |
57 | 118k | if (ks_resize(s, s->l + 50) < 0) |
58 | 0 | return EOF; |
59 | | // We let stdio handle the exponent cases |
60 | 118k | int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); |
61 | 118k | len += s2; |
62 | 118k | s->l += s2; |
63 | 118k | return len; |
64 | 118k | } |
65 | | |
66 | | // Correction for rounding - rather ugly |
67 | | // Optimised for small numbers. |
68 | | |
69 | 388k | uint32_t i; |
70 | 388k | if (d<0.001) i = rint(d*1000000000), cp -= 1; |
71 | 388k | else if (d < 0.01) i = rint(d*100000000), cp -= 2; |
72 | 363k | else if (d < 0.1) i = rint(d*10000000), cp -= 3; |
73 | 319k | else if (d < 1) i = rint(d*1000000), cp -= 4; |
74 | 279k | else if (d < 10) i = rint(d*100000), cp -= 5; |
75 | 157k | else if (d < 100) i = rint(d*10000), cp -= 6; |
76 | 121k | else if (d < 1000) i = rint(d*1000), cp -= 7; |
77 | 47.8k | else if (d < 10000) i = rint(d*100), cp -= 8; |
78 | 44.9k | else if (d < 100000) i = rint(d*10), cp -= 9; |
79 | 28.8k | else i = rint(d), cp -= 10; |
80 | | |
81 | | // integer i is always 6 digits, so print it 2 at a time. |
82 | 388k | static const char kputuw_dig2r[] = |
83 | 388k | "00010203040506070809" |
84 | 388k | "10111213141516171819" |
85 | 388k | "20212223242526272829" |
86 | 388k | "30313233343536373839" |
87 | 388k | "40414243444546474849" |
88 | 388k | "50515253545556575859" |
89 | 388k | "60616263646566676869" |
90 | 388k | "70717273747576777879" |
91 | 388k | "80818283848586878889" |
92 | 388k | "90919293949596979899"; |
93 | | |
94 | 388k | memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100; |
95 | 388k | memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100; |
96 | 388k | memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); |
97 | | |
98 | | // Except when it rounds up (d=0.009999999 is i=1000000) |
99 | 388k | if (i >= 100) |
100 | 25.2k | *--cp = '0' + (i/100); |
101 | | |
102 | | |
103 | 388k | int p = buf+20-cp; |
104 | 388k | if (p <= 10) { /* d < 1 */ |
105 | | // 0.00123 is 123, so add leading zeros and 0. |
106 | 108k | ep = cp+5; // 6 precision |
107 | 178k | while (p < 10) { // aka d < 1 |
108 | 69.2k | *--cp = '0'; |
109 | 69.2k | p++; |
110 | 69.2k | } |
111 | 108k | *--cp = '.'; |
112 | 108k | *--cp = '0'; |
113 | 279k | } else { |
114 | | // 123.001 is 123001 with p==13, so move 123 down and add "." |
115 | | // Equiv to memmove(cp-1, cp, p-10); cp--; |
116 | 279k | char *xp = --cp; |
117 | 279k | ep = cp+6; |
118 | 959k | while (p > 10) { |
119 | 680k | xp[0] = xp[1]; |
120 | 680k | xp++; |
121 | 680k | p--; |
122 | 680k | } |
123 | 279k | xp[0] = '.'; |
124 | 279k | } |
125 | | |
126 | | // Cull trailing zeros |
127 | 1.89M | while (*ep == '0' && ep > cp) |
128 | 1.50M | ep--; |
129 | | |
130 | | // End can be 1 out due to the mostly-6 but occasionally 7 (i==1) case. |
131 | | // Also code with "123." which should be "123" |
132 | 388k | if (*ep && *ep != '.') |
133 | 117k | ep++; |
134 | 388k | *ep = 0; |
135 | | |
136 | 388k | int sl = ep-cp; |
137 | 388k | len += sl; |
138 | 388k | kputsn(cp, sl, s); |
139 | 388k | return len; |
140 | 507k | } |
141 | | |
142 | | int kvsprintf(kstring_t *s, const char *fmt, va_list ap) |
143 | 149k | { |
144 | 149k | va_list args; |
145 | 149k | int l; |
146 | 149k | va_copy(args, ap); |
147 | | |
148 | 149k | if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) { |
149 | 0 | double d = va_arg(args, double); |
150 | 0 | l = kputd(d, s); |
151 | 0 | va_end(args); |
152 | 0 | return l; |
153 | 0 | } |
154 | | |
155 | 149k | if (!s->s) { |
156 | 90.4k | const size_t sz = 64; |
157 | 90.4k | s->s = malloc(sz); |
158 | 90.4k | if (!s->s) |
159 | 0 | return -1; |
160 | 90.4k | s->m = sz; |
161 | 90.4k | s->l = 0; |
162 | 90.4k | } |
163 | | |
164 | 149k | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. |
165 | 149k | va_end(args); |
166 | 149k | if (l + 1 > s->m - s->l) { |
167 | 45.8k | if (ks_resize(s, s->l + l + 2) < 0) |
168 | 0 | return -1; |
169 | 45.8k | va_copy(args, ap); |
170 | 45.8k | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); |
171 | 45.8k | va_end(args); |
172 | 45.8k | } |
173 | 149k | s->l += l; |
174 | 149k | return l; |
175 | 149k | } |
176 | | |
177 | | int ksprintf(kstring_t *s, const char *fmt, ...) |
178 | 149k | { |
179 | 149k | va_list ap; |
180 | 149k | int l; |
181 | 149k | va_start(ap, fmt); |
182 | 149k | l = kvsprintf(s, fmt, ap); |
183 | 149k | va_end(ap); |
184 | 149k | return l; |
185 | 149k | } |
186 | | |
187 | | char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux) |
188 | 1.15M | { |
189 | 1.15M | const unsigned char *p, *start, *sep = (unsigned char *) sep_in; |
190 | 1.15M | if (sep) { // set up the table |
191 | 45.1k | if (str == 0 && aux->finished) return 0; // no need to set up if we have finished |
192 | 45.1k | aux->finished = 0; |
193 | 45.1k | if (sep[0] && sep[1]) { |
194 | 0 | aux->sep = -1; |
195 | 0 | aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; |
196 | 0 | for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); |
197 | 45.1k | } else aux->sep = sep[0]; |
198 | 45.1k | } |
199 | 1.15M | if (aux->finished) return 0; |
200 | 1.12M | else if (str) start = (unsigned char *) str, aux->finished = 0; |
201 | 1.07M | else start = (unsigned char *) aux->p + 1; |
202 | 1.12M | if (aux->sep < 0) { |
203 | 0 | for (p = start; *p; ++p) |
204 | 0 | if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; |
205 | 1.12M | } else { |
206 | | // Using strchr is fast for next token, but slower for |
207 | | // last token due to extra pass from strlen. Overall |
208 | | // on a VCF parse this func was 146% faster with // strchr. |
209 | | // Equiv to: |
210 | | // for (p = start; *p; ++p) if (*p == aux->sep) break; |
211 | | |
212 | | // NB: We could use strchrnul() here from glibc if detected, |
213 | | // which is ~40% faster again, but it's not so portable. |
214 | | // i.e. p = (uint8_t *)strchrnul((char *)start, aux->sep); |
215 | 1.12M | uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep); |
216 | 1.12M | p = p2 ? p2 : start + strlen((char *)start); |
217 | 1.12M | } |
218 | 1.12M | aux->p = (const char *) p; // end of token |
219 | 1.12M | if (*p == 0) aux->finished = 1; // no more tokens |
220 | 1.12M | return (char*)start; |
221 | 1.15M | } |
222 | | |
223 | | // s MUST BE a null terminated string; l = strlen(s) |
224 | | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) |
225 | 131 | { |
226 | 131 | int i, n, max, last_char, last_start, *offsets, l; |
227 | 131 | n = 0; max = *_max; offsets = *_offsets; |
228 | 131 | l = strlen(s); |
229 | | |
230 | 23.1k | #define __ksplit_aux do { \ |
231 | 23.1k | if (_offsets) { \ |
232 | 23.1k | s[i] = 0; \ |
233 | 23.1k | if (n == max) { \ |
234 | 389 | int *tmp; \ |
235 | 389 | max = max? max<<1 : 2; \ |
236 | 389 | if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \ |
237 | 389 | offsets = tmp; \ |
238 | 389 | } else { \ |
239 | 0 | free(offsets); \ |
240 | 0 | *_offsets = NULL; \ |
241 | 0 | return 0; \ |
242 | 0 | } \ |
243 | 389 | } \ |
244 | 23.1k | offsets[n++] = last_start; \ |
245 | 23.1k | } else ++n; \ |
246 | 23.1k | } while (0) |
247 | | |
248 | 2.83M | for (i = 0, last_char = last_start = 0; i <= l; ++i) { |
249 | 2.83M | if (delimiter == 0) { |
250 | 0 | if (isspace((int)((unsigned char) s[i])) || s[i] == 0) { |
251 | 0 | if (isgraph(last_char)) |
252 | 0 | __ksplit_aux; // the end of a field |
253 | 0 | } else { |
254 | 0 | if (isspace(last_char) || last_char == 0) |
255 | 0 | last_start = i; |
256 | 0 | } |
257 | 2.83M | } else { |
258 | 2.83M | if (s[i] == delimiter || s[i] == 0) { |
259 | 41.6k | if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field |
260 | 2.79M | } else { |
261 | 2.79M | if (last_char == delimiter || last_char == 0) last_start = i; |
262 | 2.79M | } |
263 | 2.83M | } |
264 | 2.83M | last_char = (int)((unsigned char)s[i]); |
265 | 2.83M | } |
266 | 131 | *_max = max; *_offsets = offsets; |
267 | 131 | return n; |
268 | 131 | } |
269 | | |
270 | | int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp) |
271 | 0 | { |
272 | 0 | size_t l0 = s->l; |
273 | |
|
274 | 0 | while (s->l == l0 || s->s[s->l-1] != '\n') { |
275 | 0 | if (s->m - s->l < 200) { |
276 | 0 | if (ks_resize(s, s->m + 200) < 0) |
277 | 0 | return EOF; |
278 | 0 | } |
279 | 0 | if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break; |
280 | 0 | s->l += strlen(s->s + s->l); |
281 | 0 | } |
282 | | |
283 | 0 | if (s->l == l0) return EOF; |
284 | | |
285 | 0 | if (s->l > l0 && s->s[s->l-1] == '\n') { |
286 | 0 | s->l--; |
287 | 0 | if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; |
288 | 0 | } |
289 | 0 | s->s[s->l] = '\0'; |
290 | 0 | return 0; |
291 | 0 | } |
292 | | |
293 | | int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp) |
294 | 4.11M | { |
295 | 4.11M | size_t l0 = s->l; |
296 | | |
297 | 8.23M | while (s->l == l0 || s->s[s->l-1] != '\n') { |
298 | 4.15M | if (s->m - s->l < 200) { |
299 | | // We return EOF for both EOF and error and the caller |
300 | | // needs to check for errors in fp, and we haven't |
301 | | // even got there yet. |
302 | | // |
303 | | // The only way of propagating memory errors is to |
304 | | // deliberately call something that we know triggers |
305 | | // and error so fp is also set. This works for |
306 | | // hgets, but not for gets where reading <= 0 bytes |
307 | | // isn't an error. |
308 | 2.74M | if (ks_resize(s, s->m + 200) < 0) { |
309 | 0 | fgets_fn(s->s + s->l, 0, fp); |
310 | 0 | return EOF; |
311 | 0 | } |
312 | 2.74M | } |
313 | 4.15M | ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp); |
314 | 4.15M | if (len <= 0) break; |
315 | 4.12M | s->l += len; |
316 | 4.12M | } |
317 | | |
318 | 4.11M | if (s->l == l0) return EOF; |
319 | | |
320 | 4.09M | if (s->l > l0 && s->s[s->l-1] == '\n') { |
321 | 4.07M | s->l--; |
322 | 4.07M | if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; |
323 | 4.07M | } |
324 | 4.09M | s->s[s->l] = '\0'; |
325 | 4.09M | return 0; |
326 | 4.11M | } |
327 | | |
328 | | /********************** |
329 | | * Boyer-Moore search * |
330 | | **********************/ |
331 | | |
332 | | typedef unsigned char ubyte_t; |
333 | | |
334 | | // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html |
335 | | static int *ksBM_prep(const ubyte_t *pat, int m) |
336 | 0 | { |
337 | 0 | int i, *suff, *prep, *bmGs, *bmBc; |
338 | 0 | prep = (int*)calloc(m + 256, sizeof(int)); |
339 | 0 | if (!prep) return NULL; |
340 | 0 | bmGs = prep; bmBc = prep + m; |
341 | 0 | { // preBmBc() |
342 | 0 | for (i = 0; i < 256; ++i) bmBc[i] = m; |
343 | 0 | for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; |
344 | 0 | } |
345 | 0 | suff = (int*)calloc(m, sizeof(int)); |
346 | 0 | if (!suff) { free(prep); return NULL; } |
347 | 0 | { // suffixes() |
348 | 0 | int f = 0, g; |
349 | 0 | suff[m - 1] = m; |
350 | 0 | g = m - 1; |
351 | 0 | for (i = m - 2; i >= 0; --i) { |
352 | 0 | if (i > g && suff[i + m - 1 - f] < i - g) |
353 | 0 | suff[i] = suff[i + m - 1 - f]; |
354 | 0 | else { |
355 | 0 | if (i < g) g = i; |
356 | 0 | f = i; |
357 | 0 | while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; |
358 | 0 | suff[i] = f - g; |
359 | 0 | } |
360 | 0 | } |
361 | 0 | } |
362 | 0 | { // preBmGs() |
363 | 0 | int j = 0; |
364 | 0 | for (i = 0; i < m; ++i) bmGs[i] = m; |
365 | 0 | for (i = m - 1; i >= 0; --i) |
366 | 0 | if (suff[i] == i + 1) |
367 | 0 | for (; j < m - 1 - i; ++j) |
368 | 0 | if (bmGs[j] == m) |
369 | 0 | bmGs[j] = m - 1 - i; |
370 | 0 | for (i = 0; i <= m - 2; ++i) |
371 | 0 | bmGs[m - 1 - suff[i]] = m - 1 - i; |
372 | 0 | } |
373 | 0 | free(suff); |
374 | 0 | return prep; |
375 | 0 | } |
376 | | |
377 | | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) |
378 | 0 | { |
379 | 0 | int i, j, *prep = 0, *bmGs, *bmBc; |
380 | 0 | const ubyte_t *str, *pat; |
381 | 0 | str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; |
382 | 0 | prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; |
383 | 0 | if (!prep) return NULL; |
384 | 0 | if (_prep && *_prep == 0) *_prep = prep; |
385 | 0 | bmGs = prep; bmBc = prep + m; |
386 | 0 | j = 0; |
387 | 0 | while (j <= n - m) { |
388 | 0 | for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); |
389 | 0 | if (i >= 0) { |
390 | 0 | int max = bmBc[str[i+j]] - m + 1 + i; |
391 | 0 | if (max < bmGs[i]) max = bmGs[i]; |
392 | 0 | j += max; |
393 | 0 | } else return (void*)(str + j); |
394 | 0 | } |
395 | 0 | if (_prep == 0) free(prep); |
396 | 0 | return 0; |
397 | 0 | } |
398 | | |
399 | | char *kstrstr(const char *str, const char *pat, int **_prep) |
400 | 0 | { |
401 | 0 | return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); |
402 | 0 | } |
403 | | |
404 | | char *kstrnstr(const char *str, const char *pat, int n, int **_prep) |
405 | 0 | { |
406 | 0 | return (char*)kmemmem(str, n, pat, strlen(pat), _prep); |
407 | 0 | } |
408 | | |
409 | | /*********************** |
410 | | * The main() function * |
411 | | ***********************/ |
412 | | |
413 | | #ifdef KSTRING_MAIN |
414 | | #include <stdio.h> |
415 | | int main() |
416 | | { |
417 | | kstring_t *s; |
418 | | int *fields, n, i; |
419 | | ks_tokaux_t aux; |
420 | | char *p; |
421 | | s = (kstring_t*)calloc(1, sizeof(kstring_t)); |
422 | | // test ksprintf() |
423 | | ksprintf(s, " abcdefg: %d ", 100); |
424 | | printf("'%s'\n", s->s); |
425 | | // test ksplit() |
426 | | fields = ksplit(s, 0, &n); |
427 | | for (i = 0; i < n; ++i) |
428 | | printf("field[%d] = '%s'\n", i, s->s + fields[i]); |
429 | | // test kstrtok() |
430 | | s->l = 0; |
431 | | for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { |
432 | | kputsn(p, aux.p - p, s); |
433 | | kputc('\n', s); |
434 | | } |
435 | | printf("%s", s->s); |
436 | | // free |
437 | | free(s->s); free(s); free(fields); |
438 | | |
439 | | { |
440 | | static char *str = "abcdefgcdgcagtcakcdcd"; |
441 | | static char *pat = "cd"; |
442 | | char *ret, *s = str; |
443 | | int *prep = 0; |
444 | | while ((ret = kstrstr(s, pat, &prep)) != 0) { |
445 | | printf("match: %s\n", ret); |
446 | | s = ret + prep[0]; |
447 | | } |
448 | | free(prep); |
449 | | } |
450 | | return 0; |
451 | | } |
452 | | #endif |