Line | Count | Source (jump to first uncovered line) |
1 | | /* The MIT License |
2 | | |
3 | | Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk> |
4 | | Copyright (C) 2013-2018, 2020-2021 Genome Research Ltd. |
5 | | |
6 | | Permission is hereby granted, free of charge, to any person obtaining |
7 | | a copy of this software and associated documentation files (the |
8 | | "Software"), to deal in the Software without restriction, including |
9 | | without limitation the rights to use, copy, modify, merge, publish, |
10 | | distribute, sublicense, and/or sell copies of the Software, and to |
11 | | permit persons to whom the Software is furnished to do so, subject to |
12 | | the following conditions: |
13 | | |
14 | | The above copyright notice and this permission notice shall be |
15 | | included in all copies or substantial portions of the Software. |
16 | | |
17 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
18 | | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
19 | | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
20 | | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
21 | | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
22 | | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
23 | | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
24 | | SOFTWARE. |
25 | | */ |
26 | | |
27 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
28 | | #include <config.h> |
29 | | |
30 | | #include <stdarg.h> |
31 | | #include <stdio.h> |
32 | | #include <ctype.h> |
33 | | #include <string.h> |
34 | | #include <stdint.h> |
35 | | #include <math.h> |
36 | | #include "htslib/kstring.h" |
37 | | |
38 | 829k | int kputd(double d, kstring_t *s) { |
39 | 829k | int len = 0; |
40 | 829k | char buf[21], *cp = buf+20, *ep; |
41 | 829k | if (d == 0) { |
42 | 388k | if (signbit(d)) { |
43 | 14.6k | kputsn("-0",2,s); |
44 | 14.6k | return 2; |
45 | 374k | } else { |
46 | 374k | kputsn("0",1,s); |
47 | 374k | return 1; |
48 | 374k | } |
49 | 388k | } |
50 | | |
51 | 440k | if (d < 0) { |
52 | 105k | kputc('-',s); |
53 | 105k | len = 1; |
54 | 105k | d=-d; |
55 | 105k | } |
56 | 440k | if (!(d >= 0.0001 && d <= 999999)) { |
57 | 226k | if (ks_resize(s, s->l + 50) < 0) |
58 | 0 | return EOF; |
59 | | // We let stdio handle the exponent cases |
60 | 226k | int s2 = sprintf(s->s + s->l, "%g", d); |
61 | 226k | len += s2; |
62 | 226k | s->l += s2; |
63 | 226k | return len; |
64 | 226k | } |
65 | | |
66 | 213k | uint64_t i = d*10000000000LL; |
67 | | // Correction for rounding - rather ugly |
68 | | |
69 | | // Optimised for small numbers. |
70 | | // Better still would be __builtin_clz on hi/lo 32 and get the |
71 | | // starting point very rapidly. |
72 | 213k | if (d<.0001) |
73 | 0 | i+=0; |
74 | 213k | else if (d<0.001) |
75 | 131 | i+=5; |
76 | 213k | else if (d < 0.01) |
77 | 154 | i+=50; |
78 | 213k | else if (d < 0.1) |
79 | 350 | i+=500; |
80 | 212k | else if (d < 1) |
81 | 99 | i+=5000; |
82 | 212k | else if (d < 10) |
83 | 40.2k | i+=50000; |
84 | 172k | else if (d < 100) |
85 | 21.6k | i+=500000; |
86 | 150k | else if (d < 1000) |
87 | 72.4k | i+=5000000; |
88 | 78.3k | else if (d < 10000) |
89 | 730 | i+=50000000; |
90 | 77.5k | else if (d < 100000) |
91 | 21.4k | i+=500000000; |
92 | 56.1k | else |
93 | 56.1k | i+=5000000000LL; |
94 | | |
95 | 2.88M | do { |
96 | 2.88M | *--cp = '0' + i%10; |
97 | 2.88M | i /= 10; |
98 | 2.88M | } while (i >= 1); |
99 | 213k | buf[20] = 0; |
100 | 213k | int p = buf+20-cp; |
101 | 213k | if (p <= 10) { // d < 1 |
102 | | //assert(d/1); |
103 | 734 | cp[6] = 0; ep = cp+5;// 6 precision |
104 | 1.64k | while (p < 10) { |
105 | 910 | *--cp = '0'; |
106 | 910 | p++; |
107 | 910 | } |
108 | 734 | *--cp = '.'; |
109 | 734 | *--cp = '0'; |
110 | 212k | } else { |
111 | 212k | char *xp = --cp; |
112 | 960k | while (p > 10) { |
113 | 747k | xp[0] = xp[1]; |
114 | 747k | p--; |
115 | 747k | xp++; |
116 | 747k | } |
117 | 212k | xp[0] = '.'; |
118 | 212k | cp[7] = 0; ep=cp+6; |
119 | 212k | if (cp[6] == '.') cp[6] = 0; |
120 | 212k | } |
121 | | |
122 | | // Cull trailing zeros |
123 | 745k | while (*ep == '0' && ep > cp) |
124 | 531k | ep--; |
125 | 213k | char *z = ep+1; |
126 | 551k | while (ep > cp) { |
127 | 495k | if (*ep == '.') { |
128 | 157k | if (z[-1] == '.') |
129 | 156k | z[-1] = 0; |
130 | 778 | else |
131 | 778 | z[0] = 0; |
132 | 157k | break; |
133 | 157k | } |
134 | 338k | ep--; |
135 | 338k | } |
136 | | |
137 | 213k | int sl = strlen(cp); |
138 | 213k | len += sl; |
139 | 213k | kputsn(cp, sl, s); |
140 | 213k | return len; |
141 | 440k | } |
142 | | |
143 | | int kvsprintf(kstring_t *s, const char *fmt, va_list ap) |
144 | 47.2k | { |
145 | 47.2k | va_list args; |
146 | 47.2k | int l; |
147 | 47.2k | va_copy(args, ap); |
148 | | |
149 | 47.2k | if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) { |
150 | 0 | double d = va_arg(args, double); |
151 | 0 | l = kputd(d, s); |
152 | 0 | va_end(args); |
153 | 0 | return l; |
154 | 0 | } |
155 | | |
156 | 47.2k | if (!s->s) { |
157 | 11.5k | const size_t sz = 64; |
158 | 11.5k | s->s = malloc(sz); |
159 | 11.5k | if (!s->s) |
160 | 0 | return -1; |
161 | 11.5k | s->m = sz; |
162 | 11.5k | s->l = 0; |
163 | 11.5k | } |
164 | | |
165 | 47.2k | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. |
166 | 47.2k | va_end(args); |
167 | 47.2k | if (l + 1 > s->m - s->l) { |
168 | 4.77k | if (ks_resize(s, s->l + l + 2) < 0) |
169 | 0 | return -1; |
170 | 4.77k | va_copy(args, ap); |
171 | 4.77k | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); |
172 | 4.77k | va_end(args); |
173 | 4.77k | } |
174 | 47.2k | s->l += l; |
175 | 47.2k | return l; |
176 | 47.2k | } |
177 | | |
178 | | int ksprintf(kstring_t *s, const char *fmt, ...) |
179 | 47.2k | { |
180 | 47.2k | va_list ap; |
181 | 47.2k | int l; |
182 | 47.2k | va_start(ap, fmt); |
183 | 47.2k | l = kvsprintf(s, fmt, ap); |
184 | 47.2k | va_end(ap); |
185 | 47.2k | return l; |
186 | 47.2k | } |
187 | | |
188 | | char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux) |
189 | 2.42M | { |
190 | 2.42M | const unsigned char *p, *start, *sep = (unsigned char *) sep_in; |
191 | 2.42M | if (sep) { // set up the table |
192 | 47.3k | if (str == 0 && aux->finished) return 0; // no need to set up if we have finished |
193 | 47.3k | aux->finished = 0; |
194 | 47.3k | if (sep[0] && sep[1]) { |
195 | 0 | aux->sep = -1; |
196 | 0 | aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; |
197 | 0 | for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); |
198 | 47.3k | } else aux->sep = sep[0]; |
199 | 47.3k | } |
200 | 2.42M | if (aux->finished) return 0; |
201 | 2.38M | else if (str) start = (unsigned char *) str, aux->finished = 0; |
202 | 2.33M | else start = (unsigned char *) aux->p + 1; |
203 | 2.38M | if (aux->sep < 0) { |
204 | 0 | for (p = start; *p; ++p) |
205 | 0 | if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; |
206 | 2.38M | } else { |
207 | 1.03G | for (p = start; *p; ++p) |
208 | 1.03G | if (*p == aux->sep) break; |
209 | 2.38M | } |
210 | 2.38M | aux->p = (const char *) p; // end of token |
211 | 2.38M | if (*p == 0) aux->finished = 1; // no more tokens |
212 | 2.38M | return (char*)start; |
213 | 2.42M | } |
214 | | |
215 | | // s MUST BE a null terminated string; l = strlen(s) |
216 | | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) |
217 | 0 | { |
218 | 0 | int i, n, max, last_char, last_start, *offsets, l; |
219 | 0 | n = 0; max = *_max; offsets = *_offsets; |
220 | 0 | l = strlen(s); |
221 | |
|
222 | 0 | #define __ksplit_aux do { \ |
223 | 0 | if (_offsets) { \ |
224 | 0 | s[i] = 0; \ |
225 | 0 | if (n == max) { \ |
226 | 0 | int *tmp; \ |
227 | 0 | max = max? max<<1 : 2; \ |
228 | 0 | if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \ |
229 | 0 | offsets = tmp; \ |
230 | 0 | } else { \ |
231 | 0 | free(offsets); \ |
232 | 0 | *_offsets = NULL; \ |
233 | 0 | return 0; \ |
234 | 0 | } \ |
235 | 0 | } \ |
236 | 0 | offsets[n++] = last_start; \ |
237 | 0 | } else ++n; \ |
238 | 0 | } while (0) |
239 | |
|
240 | 0 | for (i = 0, last_char = last_start = 0; i <= l; ++i) { |
241 | 0 | if (delimiter == 0) { |
242 | 0 | if (isspace((int)((unsigned char) s[i])) || s[i] == 0) { |
243 | 0 | if (isgraph(last_char)) |
244 | 0 | __ksplit_aux; // the end of a field |
245 | 0 | } else { |
246 | 0 | if (isspace(last_char) || last_char == 0) |
247 | 0 | last_start = i; |
248 | 0 | } |
249 | 0 | } else { |
250 | 0 | if (s[i] == delimiter || s[i] == 0) { |
251 | 0 | if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field |
252 | 0 | } else { |
253 | 0 | if (last_char == delimiter || last_char == 0) last_start = i; |
254 | 0 | } |
255 | 0 | } |
256 | 0 | last_char = (int)((unsigned char)s[i]); |
257 | 0 | } |
258 | 0 | *_max = max; *_offsets = offsets; |
259 | 0 | return n; |
260 | 0 | } |
261 | | |
262 | | int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp) |
263 | 0 | { |
264 | 0 | size_t l0 = s->l; |
265 | |
|
266 | 0 | while (s->l == l0 || s->s[s->l-1] != '\n') { |
267 | 0 | if (s->m - s->l < 200) { |
268 | 0 | if (ks_resize(s, s->m + 200) < 0) |
269 | 0 | return EOF; |
270 | 0 | } |
271 | 0 | if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break; |
272 | 0 | s->l += strlen(s->s + s->l); |
273 | 0 | } |
274 | | |
275 | 0 | if (s->l == l0) return EOF; |
276 | | |
277 | 0 | if (s->l > l0 && s->s[s->l-1] == '\n') { |
278 | 0 | s->l--; |
279 | 0 | if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; |
280 | 0 | } |
281 | 0 | s->s[s->l] = '\0'; |
282 | 0 | return 0; |
283 | 0 | } |
284 | | |
285 | | int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp) |
286 | 76.4k | { |
287 | 76.4k | size_t l0 = s->l; |
288 | | |
289 | 156k | while (s->l == l0 || s->s[s->l-1] != '\n') { |
290 | 81.4k | if (s->m - s->l < 200) { |
291 | | // We return EOF for both EOF and error and the caller |
292 | | // needs to check for errors in fp, and we haven't |
293 | | // even got there yet. |
294 | | // |
295 | | // The only way of propagating memory errors is to |
296 | | // deliberately call something that we know triggers |
297 | | // and error so fp is also set. This works for |
298 | | // hgets, but not for gets where reading <= 0 bytes |
299 | | // isn't an error. |
300 | 14.6k | if (ks_resize(s, s->m + 200) < 0) { |
301 | 0 | fgets_fn(s->s + s->l, 0, fp); |
302 | 0 | return EOF; |
303 | 0 | } |
304 | 14.6k | } |
305 | 81.4k | ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp); |
306 | 81.4k | if (len <= 0) break; |
307 | 79.7k | s->l += len; |
308 | 79.7k | } |
309 | | |
310 | 76.4k | if (s->l == l0) return EOF; |
311 | | |
312 | 75.7k | if (s->l > l0 && s->s[s->l-1] == '\n') { |
313 | 74.7k | s->l--; |
314 | 74.7k | if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; |
315 | 74.7k | } |
316 | 75.7k | s->s[s->l] = '\0'; |
317 | 75.7k | return 0; |
318 | 76.4k | } |
319 | | |
320 | | /********************** |
321 | | * Boyer-Moore search * |
322 | | **********************/ |
323 | | |
324 | | typedef unsigned char ubyte_t; |
325 | | |
326 | | // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html |
327 | | static int *ksBM_prep(const ubyte_t *pat, int m) |
328 | 0 | { |
329 | 0 | int i, *suff, *prep, *bmGs, *bmBc; |
330 | 0 | prep = (int*)calloc(m + 256, sizeof(int)); |
331 | 0 | if (!prep) return NULL; |
332 | 0 | bmGs = prep; bmBc = prep + m; |
333 | 0 | { // preBmBc() |
334 | 0 | for (i = 0; i < 256; ++i) bmBc[i] = m; |
335 | 0 | for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; |
336 | 0 | } |
337 | 0 | suff = (int*)calloc(m, sizeof(int)); |
338 | 0 | if (!suff) { free(prep); return NULL; } |
339 | 0 | { // suffixes() |
340 | 0 | int f = 0, g; |
341 | 0 | suff[m - 1] = m; |
342 | 0 | g = m - 1; |
343 | 0 | for (i = m - 2; i >= 0; --i) { |
344 | 0 | if (i > g && suff[i + m - 1 - f] < i - g) |
345 | 0 | suff[i] = suff[i + m - 1 - f]; |
346 | 0 | else { |
347 | 0 | if (i < g) g = i; |
348 | 0 | f = i; |
349 | 0 | while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; |
350 | 0 | suff[i] = f - g; |
351 | 0 | } |
352 | 0 | } |
353 | 0 | } |
354 | 0 | { // preBmGs() |
355 | 0 | int j = 0; |
356 | 0 | for (i = 0; i < m; ++i) bmGs[i] = m; |
357 | 0 | for (i = m - 1; i >= 0; --i) |
358 | 0 | if (suff[i] == i + 1) |
359 | 0 | for (; j < m - 1 - i; ++j) |
360 | 0 | if (bmGs[j] == m) |
361 | 0 | bmGs[j] = m - 1 - i; |
362 | 0 | for (i = 0; i <= m - 2; ++i) |
363 | 0 | bmGs[m - 1 - suff[i]] = m - 1 - i; |
364 | 0 | } |
365 | 0 | free(suff); |
366 | 0 | return prep; |
367 | 0 | } |
368 | | |
369 | | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) |
370 | 0 | { |
371 | 0 | int i, j, *prep = 0, *bmGs, *bmBc; |
372 | 0 | const ubyte_t *str, *pat; |
373 | 0 | str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; |
374 | 0 | prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; |
375 | 0 | if (!prep) return NULL; |
376 | 0 | if (_prep && *_prep == 0) *_prep = prep; |
377 | 0 | bmGs = prep; bmBc = prep + m; |
378 | 0 | j = 0; |
379 | 0 | while (j <= n - m) { |
380 | 0 | for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); |
381 | 0 | if (i >= 0) { |
382 | 0 | int max = bmBc[str[i+j]] - m + 1 + i; |
383 | 0 | if (max < bmGs[i]) max = bmGs[i]; |
384 | 0 | j += max; |
385 | 0 | } else return (void*)(str + j); |
386 | 0 | } |
387 | 0 | if (_prep == 0) free(prep); |
388 | 0 | return 0; |
389 | 0 | } |
390 | | |
391 | | char *kstrstr(const char *str, const char *pat, int **_prep) |
392 | 0 | { |
393 | 0 | return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); |
394 | 0 | } |
395 | | |
396 | | char *kstrnstr(const char *str, const char *pat, int n, int **_prep) |
397 | 0 | { |
398 | 0 | return (char*)kmemmem(str, n, pat, strlen(pat), _prep); |
399 | 0 | } |
400 | | |
401 | | /*********************** |
402 | | * The main() function * |
403 | | ***********************/ |
404 | | |
405 | | #ifdef KSTRING_MAIN |
406 | | #include <stdio.h> |
407 | | int main() |
408 | | { |
409 | | kstring_t *s; |
410 | | int *fields, n, i; |
411 | | ks_tokaux_t aux; |
412 | | char *p; |
413 | | s = (kstring_t*)calloc(1, sizeof(kstring_t)); |
414 | | // test ksprintf() |
415 | | ksprintf(s, " abcdefg: %d ", 100); |
416 | | printf("'%s'\n", s->s); |
417 | | // test ksplit() |
418 | | fields = ksplit(s, 0, &n); |
419 | | for (i = 0; i < n; ++i) |
420 | | printf("field[%d] = '%s'\n", i, s->s + fields[i]); |
421 | | // test kstrtok() |
422 | | s->l = 0; |
423 | | for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { |
424 | | kputsn(p, aux.p - p, s); |
425 | | kputc('\n', s); |
426 | | } |
427 | | printf("%s", s->s); |
428 | | // free |
429 | | free(s->s); free(s); free(fields); |
430 | | |
431 | | { |
432 | | static char *str = "abcdefgcdgcagtcakcdcd"; |
433 | | static char *pat = "cd"; |
434 | | char *ret, *s = str; |
435 | | int *prep = 0; |
436 | | while ((ret = kstrstr(s, pat, &prep)) != 0) { |
437 | | printf("match: %s\n", ret); |
438 | | s = ret + prep[0]; |
439 | | } |
440 | | free(prep); |
441 | | } |
442 | | return 0; |
443 | | } |
444 | | #endif |