Coverage Report

Created: 2026-01-09 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/sam.c
Line
Count
Source
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
#include <regex.h>
40
41
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
42
#include "fuzz_settings.h"
43
#endif
44
45
// Suppress deprecation message for cigar_tab, which we initialise
46
#include "htslib/hts_defs.h"
47
#undef HTS_DEPRECATED
48
#define HTS_DEPRECATED(message)
49
50
#include "htslib/sam.h"
51
#include "htslib/bgzf.h"
52
#include "cram/cram.h"
53
#include "hts_internal.h"
54
#include "sam_internal.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "htslib/hts_expr.h"
58
#include "header.h"
59
60
#include "htslib/khash.h"
61
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
62
KHASH_SET_INIT_INT(tag)
63
64
#ifndef EFTYPE
65
0
#define EFTYPE ENOEXEC
66
#endif
67
#ifndef EOVERFLOW
68
#define EOVERFLOW ERANGE
69
#endif
70
71
/**********************
72
 *** BAM header I/O ***
73
 **********************/
74
75
HTSLIB_EXPORT
76
const int8_t bam_cigar_table[256] = {
77
    // 0 .. 47
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
81
82
    // 48 .. 63  (including =)
83
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
84
85
    // 64 .. 79  (including MIDNHB)
86
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
87
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
88
89
    // 80 .. 95  (including SPX)
90
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
91
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
92
93
    // 96 .. 127
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96
97
    // 128 .. 255
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
105
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
106
};
107
108
sam_hdr_t *sam_hdr_init(void)
109
20.5k
{
110
20.5k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
111
20.5k
    if (bh == NULL) return NULL;
112
113
20.5k
    bh->cigar_tab = bam_cigar_table;
114
20.5k
    return bh;
115
20.5k
}
116
117
void sam_hdr_destroy(sam_hdr_t *bh)
118
47.5k
{
119
47.5k
    int32_t i;
120
121
47.5k
    if (bh == NULL) return;
122
123
26.5k
    if (bh->ref_count > 0) {
124
6.01k
        --bh->ref_count;
125
6.01k
        return;
126
6.01k
    }
127
128
20.5k
    if (bh->target_name) {
129
27.5k
        for (i = 0; i < bh->n_targets; ++i)
130
16.0k
            free(bh->target_name[i]);
131
11.4k
        free(bh->target_name);
132
11.4k
        free(bh->target_len);
133
11.4k
    }
134
20.5k
    free(bh->text);
135
20.5k
    if (bh->hrecs)
136
12.5k
        sam_hrecs_free(bh->hrecs);
137
20.5k
    if (bh->sdict)
138
877
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
139
20.5k
    free(bh);
140
20.5k
}
141
142
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
143
// references before sam_hdr_t::hrecs is populated
144
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
145
0
{
146
0
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
147
0
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
148
0
    int i;
149
0
    if (!dest_long_refs) return -1;
150
151
0
    for (i = 0; i < h->n_targets; i++) {
152
0
        int ret;
153
0
        khiter_t ksrc, kdest;
154
0
        if (h->target_len[i] < UINT32_MAX) continue;
155
0
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
156
0
        if (ksrc == kh_end(src_long_refs)) continue;
157
0
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
158
0
        if (ret < 0) {
159
0
            kh_destroy(s2i, dest_long_refs);
160
0
            return -1;
161
0
        }
162
0
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
163
0
    }
164
165
0
    h->sdict = dest_long_refs;
166
0
    return 0;
167
0
}
168
169
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
170
11.7k
{
171
11.7k
    if (h0 == NULL) return NULL;
172
11.7k
    sam_hdr_t *h;
173
11.7k
    if ((h = sam_hdr_init()) == NULL) return NULL;
174
    // copy the simple data
175
11.7k
    h->n_targets = 0;
176
11.7k
    h->ignore_sam_err = h0->ignore_sam_err;
177
11.7k
    h->l_text = 0;
178
179
    // Then the pointery stuff
180
181
11.7k
    if (!h0->hrecs) {
182
0
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
183
0
        if (!h->target_len) goto fail;
184
0
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
185
0
        if (!h->target_name) goto fail;
186
187
0
        int i;
188
0
        for (i = 0; i < h0->n_targets; ++i) {
189
0
            h->target_len[i] = h0->target_len[i];
190
0
            h->target_name[i] = strdup(h0->target_name[i]);
191
0
            if (!h->target_name[i]) break;
192
0
        }
193
0
        h->n_targets = i;
194
0
        if (i < h0->n_targets) goto fail;
195
196
0
        if (h0->sdict) {
197
0
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
198
0
        }
199
0
    }
200
201
11.7k
    if (h0->hrecs) {
202
11.7k
        kstring_t tmp = { 0, 0, NULL };
203
11.7k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
204
0
            free(ks_release(&tmp));
205
0
            goto fail;
206
0
        }
207
208
11.7k
        h->l_text = tmp.l;
209
11.7k
        h->text   = ks_release(&tmp);
210
211
11.7k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
212
0
            goto fail;
213
11.7k
    } else {
214
0
        h->l_text = h0->text ? h0->l_text : 0;
215
0
        h->text = malloc(h->l_text + 1);
216
0
        if (!h->text) goto fail;
217
0
        if (h0->text)
218
0
            memcpy(h->text, h0->text, h->l_text);
219
0
        h->text[h->l_text] = '\0';
220
0
    }
221
222
11.7k
    return h;
223
224
0
 fail:
225
0
    sam_hdr_destroy(h);
226
0
    return NULL;
227
11.7k
}
228
229
sam_hdr_t *bam_hdr_read(BGZF *fp)
230
153
{
231
153
    sam_hdr_t *h;
232
153
    uint8_t buf[4];
233
153
    int magic_len, has_EOF;
234
153
    int32_t i, name_len, num_names = 0;
235
153
    size_t bufsize;
236
153
    ssize_t bytes;
237
    // check EOF
238
153
    has_EOF = bgzf_check_EOF(fp);
239
153
    if (has_EOF < 0) {
240
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
241
153
    } else if (has_EOF == 0) {
242
153
        hts_log_warning("EOF marker is absent. The input is probably truncated");
243
153
    }
244
    // read "BAM1"
245
153
    magic_len = bgzf_read(fp, buf, 4);
246
153
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
247
0
        hts_log_error("Invalid BAM binary header");
248
0
        return 0;
249
0
    }
250
153
    h = sam_hdr_init();
251
153
    if (!h) goto nomem;
252
253
    // read plain text and the number of reference sequences
254
153
    bytes = bgzf_read(fp, buf, 4);
255
153
    if (bytes != 4) goto read_err;
256
153
    h->l_text = le_to_u32(buf);
257
258
153
    bufsize = h->l_text + 1;
259
153
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
260
153
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
261
153
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
262
153
#endif
263
153
    h->text = (char*)malloc(bufsize);
264
153
    if (!h->text) goto nomem;
265
153
    h->text[h->l_text] = 0; // make sure it is NULL terminated
266
153
    bytes = bgzf_read(fp, h->text, h->l_text);
267
153
    if (bytes != h->l_text) goto read_err;
268
269
147
    bytes = bgzf_read(fp, &h->n_targets, 4);
270
147
    if (bytes != 4) goto read_err;
271
147
    if (fp->is_be) ed_swap_4p(&h->n_targets);
272
273
147
    if (h->n_targets < 0) goto invalid;
274
275
    // read reference sequence names and lengths
276
144
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
277
144
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
278
0
        goto nomem;
279
144
#endif
280
144
    if (h->n_targets > 0) {
281
99
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
282
99
        if (!h->target_name) goto nomem;
283
99
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
284
99
        if (!h->target_len) goto nomem;
285
99
    }
286
45
    else {
287
45
        h->target_name = NULL;
288
45
        h->target_len = NULL;
289
45
    }
290
291
651
    for (i = 0; i != h->n_targets; ++i) {
292
549
        bytes = bgzf_read(fp, &name_len, 4);
293
549
        if (bytes != 4) goto read_err;
294
549
        if (fp->is_be) ed_swap_4p(&name_len);
295
549
        if (name_len <= 0) goto invalid;
296
297
543
        h->target_name[i] = (char*)malloc(name_len);
298
543
        if (!h->target_name[i]) goto nomem;
299
543
        num_names++;
300
301
543
        bytes = bgzf_read(fp, h->target_name[i], name_len);
302
543
        if (bytes != name_len) goto read_err;
303
304
507
        if (h->target_name[i][name_len - 1] != '\0') {
305
            /* Fix missing NUL-termination.  Is this being too nice?
306
               We could alternatively bail out with an error. */
307
252
            char *new_name;
308
252
            if (name_len == INT32_MAX) goto invalid;
309
252
            new_name = realloc(h->target_name[i], name_len + 1);
310
252
            if (new_name == NULL) goto nomem;
311
252
            h->target_name[i] = new_name;
312
252
            h->target_name[i][name_len] = '\0';
313
252
        }
314
315
507
        bytes = bgzf_read(fp, &h->target_len[i], 4);
316
507
        if (bytes != 4) goto read_err;
317
507
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
318
507
    }
319
102
    return h;
320
321
0
 nomem:
322
0
    hts_log_error("Out of memory");
323
0
    goto clean;
324
325
42
 read_err:
326
42
    if (bytes < 0) {
327
18
        hts_log_error("Error reading BGZF stream");
328
24
    } else {
329
24
        hts_log_error("Truncated BAM header");
330
24
    }
331
42
    goto clean;
332
333
9
 invalid:
334
9
    hts_log_error("Invalid BAM binary header");
335
336
51
 clean:
337
51
    if (h != NULL) {
338
51
        h->n_targets = num_names; // ensure we free only allocated target_names
339
51
        sam_hdr_destroy(h);
340
51
    }
341
51
    return NULL;
342
9
}
343
344
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
345
2.63k
{
346
2.63k
    int32_t i, name_len, x;
347
2.63k
    kstring_t hdr_ks = { 0, 0, NULL };
348
2.63k
    char *text;
349
2.63k
    uint32_t l_text;
350
351
2.63k
    if (!h) return -1;
352
353
2.63k
    if (h->hrecs) {
354
2.63k
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
355
2.63k
        if (hdr_ks.l > UINT32_MAX) {
356
0
            hts_log_error("Header too long for BAM format");
357
0
            free(hdr_ks.s);
358
0
            return -1;
359
2.63k
        } else if (hdr_ks.l > INT32_MAX) {
360
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
361
0
            hts_log_warning("Output file may not be portable");
362
0
        }
363
2.63k
        text = hdr_ks.s;
364
2.63k
        l_text = hdr_ks.l;
365
2.63k
    } else {
366
0
        if (h->l_text > UINT32_MAX) {
367
0
            hts_log_error("Header too long for BAM format");
368
0
            return -1;
369
0
        } else if (h->l_text > INT32_MAX) {
370
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
371
0
            hts_log_warning("Output file may not be portable");
372
0
        }
373
0
        text = h->text;
374
0
        l_text = h->l_text;
375
0
    }
376
    // write "BAM1"
377
2.63k
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
378
    // write plain text and the number of reference sequences
379
2.63k
    if (fp->is_be) {
380
0
        x = ed_swap_4(l_text);
381
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
382
0
        if (l_text) {
383
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
384
0
        }
385
0
        x = ed_swap_4(h->n_targets);
386
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
387
2.63k
    } else {
388
2.63k
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
389
2.63k
        if (l_text) {
390
1.70k
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
391
1.70k
        }
392
2.63k
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
393
2.63k
    }
394
2.63k
    free(hdr_ks.s);
395
    // write sequence names and lengths
396
4.73k
    for (i = 0; i != h->n_targets; ++i) {
397
2.09k
        char *p = h->target_name[i];
398
2.09k
        name_len = strlen(p) + 1;
399
2.09k
        if (fp->is_be) {
400
0
            x = ed_swap_4(name_len);
401
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
402
2.09k
        } else {
403
2.09k
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
404
2.09k
        }
405
2.09k
        if (bgzf_write(fp, p, name_len) < 0) return -1;
406
2.09k
        if (fp->is_be) {
407
0
            x = ed_swap_4(h->target_len[i]);
408
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
409
2.09k
        } else {
410
2.09k
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
411
2.09k
        }
412
2.09k
    }
413
2.63k
    if (bgzf_flush(fp) < 0) return -1;
414
2.63k
    return 0;
415
2.63k
}
416
417
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
418
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
419
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
420
0
}
421
422
/*************************
423
 *** BAM alignment I/O ***
424
 *************************/
425
426
bam1_t *bam_init1(void)
427
1.15M
{
428
1.15M
    return (bam1_t*)calloc(1, sizeof(bam1_t));
429
1.15M
}
430
431
int sam_realloc_bam_data(bam1_t *b, size_t desired)
432
1.18M
{
433
1.18M
    uint32_t new_m_data;
434
1.18M
    uint8_t *new_data;
435
1.18M
    new_m_data = desired;
436
1.18M
    kroundup32(new_m_data); // next power of 2
437
1.18M
    new_m_data += 32; // reduces malloc arena migrations?
438
1.18M
    if (new_m_data < desired) {
439
0
        errno = ENOMEM; // Not strictly true but we can't store the size
440
0
        return -1;
441
0
    }
442
1.18M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
443
1.18M
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
444
4
        errno = ENOMEM;
445
4
        return -1;
446
4
    }
447
1.18M
#endif
448
1.18M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
449
1.18M
        new_data = realloc(b->data, new_m_data);
450
1.18M
    } else {
451
0
        if ((new_data = malloc(new_m_data)) != NULL) {
452
0
            if (b->l_data > 0)
453
0
                memcpy(new_data, b->data,
454
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
455
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
456
0
        }
457
0
    }
458
1.18M
    if (!new_data) return -1;
459
1.18M
    b->data = new_data;
460
1.18M
    b->m_data = new_m_data;
461
1.18M
    return 0;
462
1.18M
}
463
464
void bam_destroy1(bam1_t *b)
465
17.0M
{
466
17.0M
    if (b == 0) return;
467
1.15M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
468
1.15M
        free(b->data);
469
1.15M
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
470
            // In case of reuse
471
0
            b->data = NULL;
472
0
            b->m_data = 0;
473
0
            b->l_data = 0;
474
0
        }
475
1.15M
    }
476
477
1.15M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
478
1.15M
        free(b);
479
1.15M
}
480
481
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
482
5.87M
{
483
5.87M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
484
5.87M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
485
5.87M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
486
5.87M
    bdst->l_data = bsrc->l_data;
487
5.87M
    bdst->id = bsrc->id;
488
5.87M
    return bdst;
489
5.87M
}
490
491
bam1_t *bam_dup1(const bam1_t *bsrc)
492
1.14M
{
493
1.14M
    if (bsrc == NULL) return NULL;
494
1.14M
    bam1_t *bdst = bam_init1();
495
1.14M
    if (bdst == NULL) return NULL;
496
1.14M
    if (bam_copy1(bdst, bsrc) == NULL) {
497
0
        bam_destroy1(bdst);
498
0
        return NULL;
499
0
    }
500
1.14M
    return bdst;
501
1.14M
}
502
503
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
504
                             hts_pos_t *rlen, hts_pos_t *qlen)
505
367
{
506
367
    int k;
507
367
    *rlen = *qlen = 0;
508
15.2k
    for (k = 0; k < n_cigar; ++k) {
509
14.9k
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
510
14.9k
        int len = bam_cigar_oplen(cigar[k]);
511
14.9k
        if (type & 1) *qlen += len;
512
14.9k
        if (type & 2) *rlen += len;
513
14.9k
    }
514
367
}
515
516
static int subtract_check_underflow(size_t length, size_t *limit)
517
86.4M
{
518
86.4M
    if (length <= *limit) {
519
86.4M
        *limit -= length;
520
86.4M
        return 0;
521
86.4M
    }
522
523
0
    return -1;
524
86.4M
}
525
526
int bam_set1(bam1_t *bam,
527
             size_t l_qname, const char *qname,
528
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
529
             size_t n_cigar, const uint32_t *cigar,
530
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
531
             size_t l_seq, const char *seq, const char *qual,
532
             size_t l_aux)
533
17.2M
{
534
    // use a default qname "*" if none is provided
535
17.2M
    if (l_qname == 0) {
536
15.7M
        l_qname = 1;
537
15.7M
        qname = "*";
538
15.7M
    }
539
540
    // note: the qname is stored nul terminated and padded as described in the
541
    // documentation for the bam1_t struct.
542
17.2M
    size_t qname_nuls = 4 - l_qname % 4;
543
544
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
545
    // can't use bam_endpos() directly as some fields not yet set up.
546
17.2M
    hts_pos_t rlen = 0, qlen = 0;
547
17.2M
    if (!(flag & BAM_FUNMAP)) {
548
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
549
0
    }
550
17.2M
    if (rlen == 0) {
551
17.2M
        rlen = 1;
552
17.2M
    }
553
554
    // validate parameters
555
17.2M
    if (l_qname > 254) {
556
111
        hts_log_error("Query name too long");
557
111
        errno = EINVAL;
558
111
        return -1;
559
111
    }
560
17.2M
    if (HTS_POS_MAX - rlen <= pos) {
561
0
        hts_log_error("Read ends beyond highest supported position");
562
0
        errno = EINVAL;
563
0
        return -1;
564
0
    }
565
17.2M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
566
0
        hts_log_error("Mapped query must have a CIGAR");
567
0
        errno = EINVAL;
568
0
        return -1;
569
0
    }
570
17.2M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
571
0
        hts_log_error("CIGAR and query sequence are of different length");
572
0
        errno = EINVAL;
573
0
        return -1;
574
0
    }
575
576
17.2M
    size_t limit = INT32_MAX;
577
17.2M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
578
17.2M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
579
17.2M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
580
17.2M
    u    += subtract_check_underflow(l_seq, &limit);
581
17.2M
    u    += subtract_check_underflow(l_aux, &limit);
582
17.2M
    if (u != 0) {
583
0
        hts_log_error("Size overflow");
584
0
        errno = EINVAL;
585
0
        return -1;
586
0
    }
587
588
    // re-allocate the data buffer as needed.
589
17.2M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
590
17.2M
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
591
0
        return -1;
592
0
    }
593
594
17.2M
    bam->l_data = (int)data_len;
595
17.2M
    bam->core.pos = pos;
596
17.2M
    bam->core.tid = tid;
597
17.2M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
598
17.2M
    bam->core.qual = mapq;
599
17.2M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
600
17.2M
    bam->core.flag = flag;
601
17.2M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
602
17.2M
    bam->core.n_cigar = (uint32_t)n_cigar;
603
17.2M
    bam->core.l_qseq = (int32_t)l_seq;
604
17.2M
    bam->core.mtid = mtid;
605
17.2M
    bam->core.mpos = mpos;
606
17.2M
    bam->core.isize = isize;
607
608
17.2M
    uint8_t *cp = bam->data;
609
17.2M
    strncpy((char *)cp, qname, l_qname);
610
17.2M
    int i;
611
68.5M
    for (i = 0; i < qname_nuls; i++) {
612
51.2M
        cp[l_qname + i] = '\0';
613
51.2M
    }
614
17.2M
    cp += l_qname + qname_nuls;
615
616
17.2M
    if (n_cigar > 0) {
617
0
        memcpy(cp, cigar, n_cigar * 4);
618
0
    }
619
17.2M
    cp += n_cigar * 4;
620
621
1.29G
#define NN 16
622
17.2M
    const uint8_t *useq = (uint8_t *)seq;
623
123M
    for (i = 0; i + NN < l_seq; i += NN) {
624
106M
        int j;
625
106M
        const uint8_t *u2 = useq+i;
626
956M
        for (j = 0; j < NN/2; j++)
627
850M
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
628
106M
        cp += NN/2;
629
106M
    }
630
18.8M
    for (; i + 1 < l_seq; i += 2) {
631
1.52M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
632
1.52M
    }
633
634
17.4M
    for (; i < l_seq; i++) {
635
206k
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
636
206k
    }
637
638
17.2M
    if (qual) {
639
6
        memcpy(cp, qual, l_seq);
640
6
    }
641
17.2M
    else {
642
17.2M
        memset(cp, '\xff', l_seq);
643
17.2M
    }
644
645
17.2M
    return (int)data_len;
646
17.2M
}
647
648
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
649
5.96M
{
650
5.96M
    int k;
651
5.96M
    hts_pos_t l;
652
10.5M
    for (k = l = 0; k < n_cigar; ++k)
653
4.59M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
654
4.15M
            l += bam_cigar_oplen(cigar[k]);
655
5.96M
    return l;
656
5.96M
}
657
658
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
659
237k
{
660
237k
    int k;
661
237k
    hts_pos_t l;
662
16.1M
    for (k = l = 0; k < n_cigar; ++k)
663
15.8M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
664
14.7M
            l += bam_cigar_oplen(cigar[k]);
665
237k
    return l;
666
237k
}
667
668
hts_pos_t bam_endpos(const bam1_t *b)
669
1.41k
{
670
1.41k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
671
1.41k
    if (rlen == 0) rlen = 1;
672
1.41k
    return b->core.pos + rlen;
673
1.41k
}
674
675
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
676
343k
{
677
343k
    bam1_core_t *c = &b->core;
678
679
    // Bail out as fast as possible for the easy case
680
343k
    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
681
343k
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
682
240k
        return 0;
683
684
    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
685
    // but this is much less likely so do as a secondary check.
686
103k
    if (c->tid < 0 || c->pos < 0)
687
56.0k
        return 0;
688
689
    // Do we have a CG tag?
690
47.1k
    uint8_t *CG = bam_aux_get(b, "CG");
691
47.1k
    int saved_errno = errno;
692
47.1k
    if (!CG) {
693
45.2k
        if (errno != ENOENT) return -1;  // Bad aux data
694
45.2k
        errno = saved_errno; // restore errno on expected no-CG-tag case
695
45.2k
        return 0;
696
45.2k
    }
697
698
    // Now we start with the serious work migrating CG to CIGAR
699
1.88k
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
700
1.88k
        *cigar0, CG_len, fake_bytes;
701
1.88k
    cigar0 = bam_get_cigar(b);
702
1.88k
    fake_bytes = c->n_cigar * 4;
703
1.88k
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
704
468
        return 0; // not of type B,I
705
1.41k
    CG_len = le_to_u32(CG + 2);
706
    // don't move if the real CIGAR length is shorter than the fake cigar length
707
1.41k
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
708
709
    // move from the CG tag to the right position
710
1.41k
    cigar_st = (uint8_t*)cigar0 - b->data;
711
1.41k
    c->n_cigar = CG_len;
712
1.41k
    n_cigar4 = c->n_cigar * 4;
713
1.41k
    CG_st = CG - b->data - 2;
714
1.41k
    CG_en = CG_st + 8 + n_cigar4;
715
1.41k
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
716
    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
717
1.41k
    b->l_data = b->l_data - fake_bytes + n_cigar4;
718
    // insert c->n_cigar-fake_bytes empty space to make room
719
1.41k
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
720
    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
721
1.41k
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
722
1.41k
    if (ori_len > CG_en) // move data after the CG tag
723
290
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
724
1.41k
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
725
1.41k
    if (recal_bin)
726
1.41k
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
727
1.41k
    if (give_warning)
728
1.41k
        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
729
1.41k
    return 1;
730
1.41k
}
731
732
static inline int aux_type2size(uint8_t type)
733
3.21M
{
734
3.21M
    switch (type) {
735
1.62M
    case 'A': case 'c': case 'C':
736
1.62M
        return 1;
737
198k
    case 's': case 'S':
738
198k
        return 2;
739
666k
    case 'i': case 'I': case 'f':
740
666k
        return 4;
741
13.5k
    case 'd':
742
13.5k
        return 8;
743
707k
    case 'Z': case 'H': case 'B':
744
707k
        return type;
745
84
    default:
746
84
        return 0;
747
3.21M
    }
748
3.21M
}
749
750
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
751
0
{
752
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
753
0
    uint32_t i;
754
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
755
0
}
756
757
// Fix bad records where qname is not terminated correctly.
758
197
static int fixup_missing_qname_nul(bam1_t *b) {
759
197
    bam1_core_t *c = &b->core;
760
761
    // Note this is called before c->l_extranul is added to c->l_qname
762
197
    if (c->l_extranul > 0) {
763
188
        b->data[c->l_qname++] = '\0';
764
188
        c->l_extranul--;
765
188
    } else {
766
9
        if (b->l_data > INT_MAX - 4) return -1;
767
9
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
768
9
        b->l_data += 4;
769
9
        b->data[c->l_qname++] = '\0';
770
9
        c->l_extranul = 3;
771
9
    }
772
197
    return 0;
773
197
}
774
775
/*
776
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
777
 * in multi-threaded handling.  This may be worth considering for htslib2.
778
 */
779
int bam_read1(BGZF *fp, bam1_t *b)
780
431
{
781
431
    bam1_core_t *c = &b->core;
782
431
    int32_t block_len, ret, i;
783
431
    uint32_t new_l_data;
784
431
    uint8_t tmp[32], *x;
785
786
431
    b->l_data = 0;
787
788
431
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
789
0
        if (ret == 0) return -1; // normal end-of-file
790
0
        else return -2; // truncated
791
0
    }
792
431
    if (fp->is_be)
793
0
        ed_swap_4p(&block_len);
794
431
    if (block_len < 32) return -4;  // block_len includes core data
795
427
    if (fp->block_length - fp->block_offset > 32) {
796
        // Avoid bgzf_read and a temporary copy to a local buffer
797
427
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
798
427
        fp->block_offset += 32;
799
427
    } else {
800
0
        x = tmp;
801
0
        if (bgzf_read(fp, x, 32) != 32) return -3;
802
0
    }
803
804
427
    c->tid        = le_to_u32(x);
805
427
    c->pos        = le_to_i32(x+4);
806
427
    uint32_t x2   = le_to_u32(x+8);
807
427
    c->bin        = x2>>16;
808
427
    c->qual       = x2>>8&0xff;
809
427
    c->l_qname    = x2&0xff;
810
427
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
811
427
    uint32_t x3   = le_to_u32(x+12);
812
427
    c->flag       = x3>>16;
813
427
    c->n_cigar    = x3&0xffff;
814
427
    c->l_qseq     = le_to_u32(x+16);
815
427
    c->mtid       = le_to_u32(x+20);
816
427
    c->mpos       = le_to_i32(x+24);
817
427
    c->isize      = le_to_i32(x+28);
818
819
427
    new_l_data = block_len - 32 + c->l_extranul;
820
427
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
821
427
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
822
427
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
823
25
        return -4;
824
402
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
825
398
    b->l_data = new_l_data;
826
827
398
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
828
396
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
829
197
        if (fixup_missing_qname_nul(b) < 0) return -4;
830
197
    }
831
747
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
832
396
    c->l_qname += c->l_extranul;
833
396
    if (b->l_data < c->l_qname ||
834
396
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
835
19
        return -4;
836
377
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
837
377
    if (bam_tag2cigar(b, 0, 0) < 0)
838
0
        return -4;
839
840
    // TODO: consider making this conditional
841
377
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
842
367
        hts_pos_t rlen, qlen;
843
367
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
844
367
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
845
367
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
846
        // Sanity check for broken CIGAR alignments
847
367
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
848
10
            hts_log_error("CIGAR and query sequence lengths differ for %s",
849
10
                    bam_get_qname(b));
850
10
            return -4;
851
10
        }
852
367
    }
853
854
367
    return 4 + block_len;
855
377
}
856
857
int bam_write1(BGZF *fp, const bam1_t *b)
858
5.87M
{
859
5.87M
    const bam1_core_t *c = &b->core;
860
5.87M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
861
5.87M
    int i, ok;
862
5.87M
    if (c->l_qname - c->l_extranul > 255) {
863
0
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
864
0
        errno = EOVERFLOW;
865
0
        return -1;
866
0
    }
867
5.87M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
868
5.87M
    if (c->pos > INT_MAX ||
869
5.87M
        c->mpos > INT_MAX ||
870
5.87M
        c->isize < INT_MIN || c->isize > INT_MAX) {
871
67
        hts_log_error("Positional data is too large for BAM format");
872
67
        return -1;
873
67
    }
874
5.87M
    x[0] = c->tid;
875
5.87M
    x[1] = c->pos;
876
5.87M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
877
5.87M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
878
5.87M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
879
5.87M
    x[4] = c->l_qseq;
880
5.87M
    x[5] = c->mtid;
881
5.87M
    x[6] = c->mpos;
882
5.87M
    x[7] = c->isize;
883
5.87M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
884
5.87M
    if (fp->is_be) {
885
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
886
0
        y = block_len;
887
0
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
888
0
        swap_data(c, b->l_data, b->data, 1);
889
5.87M
    } else {
890
5.87M
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
891
5.87M
    }
892
5.87M
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
893
5.87M
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
894
5.87M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
895
5.87M
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
896
5.87M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
897
28
        uint8_t buf[8];
898
28
        uint32_t cigar_st, cigar_en, cigar[2];
899
28
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
900
28
        if (cigreflen >= (1<<28)) {
901
            // Length of reference covered is greater than the biggest
902
            // CIGAR operation currently allowed.
903
11
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
904
11
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
905
11
                          bam_get_qname(b), c->n_cigar, cigreflen);
906
11
            return -1;
907
11
        }
908
17
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
909
17
        cigar_en = cigar_st + c->n_cigar * 4;
910
17
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
911
17
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
912
17
        u32_to_le(cigar[0], buf);
913
17
        u32_to_le(cigar[1], buf + 4);
914
17
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
915
17
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
916
17
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
917
17
        u32_to_le(c->n_cigar, buf);
918
17
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
919
17
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
920
17
    }
921
5.87M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
922
5.87M
    return ok? 4 + block_len : -1;
923
5.87M
}
924
925
/*
926
 * Write a BAM file and append to the in-memory index simultaneously.
927
 */
928
5.87M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
929
5.87M
    BGZF *bfp = fp->fp.bgzf;
930
931
5.87M
    if (!fp->idx)
932
5.87M
        return bam_write1(bfp, b);
933
934
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
935
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
936
0
        return -1;
937
0
    if (!bfp->mt)
938
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
939
940
0
    int ret = bam_write1(bfp, b);
941
0
    if (ret < 0)
942
0
        return -1;
943
944
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
945
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
946
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
947
0
        ret = -1;
948
0
    }
949
950
0
    return ret;
951
0
}
952
953
/*
954
 * Set the qname in a BAM record
955
 */
956
int bam_set_qname(bam1_t *rec, const char *qname)
957
0
{
958
0
    if (!rec) return -1;
959
0
    if (!qname || !*qname) return -1;
960
961
0
    size_t old_len = rec->core.l_qname;
962
0
    size_t new_len = strlen(qname) + 1;
963
0
    if (new_len < 1 || new_len > 255) return -1;
964
965
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
966
967
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
968
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
969
970
    // Make room
971
0
    if (new_len + extranul != rec->core.l_qname)
972
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
973
    // Copy in new name and pad if needed
974
0
    memcpy(rec->data, qname, new_len);
975
0
    int n;
976
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
977
978
0
    rec->l_data = new_data_len;
979
0
    rec->core.l_qname = new_len + extranul;
980
0
    rec->core.l_extranul = extranul;
981
982
0
    return 0;
983
0
}
984
985
/********************
986
 *** BAM indexing ***
987
 ********************/
988
989
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
990
0
{
991
0
    int n_lvls, i, fmt, ret;
992
0
    bam1_t *b;
993
0
    hts_idx_t *idx;
994
0
    sam_hdr_t *h;
995
0
    h = sam_hdr_read(fp);
996
0
    if (h == NULL) return NULL;
997
0
    if (min_shift > 0) {
998
0
        hts_pos_t max_len = 0;
999
0
        for (i = 0; i < h->n_targets; ++i) {
1000
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
1001
0
            if (max_len < len) max_len = len;
1002
0
        }
1003
0
        n_lvls = 0;
1004
0
        hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1005
0
        fmt = HTS_FMT_CSI;
1006
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1007
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1008
0
    b = bam_init1();
1009
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
1010
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
1011
0
        if (ret < 0) { // unsorted or doesn't fit
1012
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
1013
0
            goto err;
1014
0
        }
1015
0
    }
1016
0
    if (ret < -1) goto err; // corrupted BAM file
1017
1018
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
1019
0
    sam_hdr_destroy(h);
1020
0
    bam_destroy1(b);
1021
0
    return idx;
1022
1023
0
err:
1024
0
    bam_destroy1(b);
1025
0
    hts_idx_destroy(idx);
1026
0
    return NULL;
1027
0
}
1028
1029
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1030
0
{
1031
0
    hts_idx_t *idx;
1032
0
    htsFile *fp;
1033
0
    int ret = 0;
1034
1035
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1036
0
    if (nthreads)
1037
0
        hts_set_threads(fp, nthreads);
1038
1039
0
    switch (fp->format.format) {
1040
0
    case cram:
1041
1042
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1043
0
        break;
1044
1045
0
    case bam:
1046
0
    case sam:
1047
0
        if (fp->format.compression != bgzf) {
1048
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1049
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1050
0
            ret = -1;
1051
0
            break;
1052
0
        }
1053
0
        idx = sam_index(fp, min_shift);
1054
0
        if (idx) {
1055
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1056
0
            if (ret < 0) ret = -4;
1057
0
            hts_idx_destroy(idx);
1058
0
        }
1059
0
        else ret = -1;
1060
0
        break;
1061
1062
0
    default:
1063
0
        ret = -3;
1064
0
        break;
1065
0
    }
1066
0
    hts_close(fp);
1067
1068
0
    return ret;
1069
0
}
1070
1071
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1072
0
{
1073
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1074
0
}
1075
1076
int sam_index_build(const char *fn, int min_shift)
1077
0
{
1078
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1079
0
}
1080
1081
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1082
#undef bam_index_build
1083
int bam_index_build(const char *fn, int min_shift)
1084
0
{
1085
0
    return sam_index_build2(fn, NULL, min_shift);
1086
0
}
1087
1088
// Initialise fp->idx for the current format type.
1089
// This must be called after the header has been written but no other data.
1090
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1091
0
    fp->fnidx = fnidx;
1092
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1093
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1094
0
        int n_lvls, fmt = HTS_FMT_CSI;
1095
0
        if (min_shift > 0) {
1096
0
            int64_t max_len = 0;
1097
0
            int i;
1098
0
            for (i = 0; i < h->n_targets; ++i)
1099
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1100
0
            n_lvls = 0;
1101
0
            hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1102
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1103
1104
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1105
0
        return fp->idx ? 0 : -1;
1106
0
    }
1107
1108
0
    if (fp->format.format == cram) {
1109
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1110
0
        return fp->fp.cram->idxfp ? 0 : -1;
1111
0
    }
1112
1113
0
    return -1;
1114
0
}
1115
1116
// Finishes an index. Call after the last record has been written.
1117
// Returns 0 on success, <0 on failure.
1118
0
int sam_idx_save(htsFile *fp) {
1119
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1120
0
        fp->format.format == vcf || fp->format.format == sam) {
1121
0
        int ret;
1122
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1123
0
            errno = -ret;
1124
0
            return -1;
1125
0
        }
1126
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1127
0
            return -1;
1128
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1129
1130
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1131
0
            return -1;
1132
1133
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1134
1135
0
    } else if (fp->format.format == cram) {
1136
        // flushed and closed by cram_close
1137
0
    }
1138
1139
0
    return 0;
1140
0
}
1141
1142
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1143
0
{
1144
0
    htsFile *fp = (htsFile *)fpv;
1145
0
    bam1_t *b = bv;
1146
0
    fp->line.l = 0;
1147
0
    int ret = sam_read1(fp, fp->bam_header, b);
1148
0
    if (ret >= 0) {
1149
0
        *tid = b->core.tid;
1150
0
        *beg = b->core.pos;
1151
0
        *end = bam_endpos(b);
1152
0
    }
1153
0
    return ret;
1154
0
}
1155
1156
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1157
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1158
0
{
1159
0
    htsFile *fp = (htsFile *)fpv;
1160
0
    bam1_t *b = bv;
1161
0
    fp->line.l = 0;
1162
0
    int ret = sam_read1(fp, fp->bam_header, b);
1163
0
    return ret;
1164
0
}
1165
1166
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1167
// samtools/bam.c.
1168
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1169
0
{
1170
0
    const char *rg;
1171
0
    kstring_t lib = { 0, 0, NULL };
1172
0
    rg = (char *)bam_aux_get(b, "RG");
1173
1174
0
    if (!rg)
1175
0
        return NULL;
1176
0
    else
1177
0
        rg++;
1178
1179
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1180
0
        return NULL;
1181
1182
0
    static char LB_text[1024];
1183
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1184
1185
0
    memcpy(LB_text, lib.s, len);
1186
0
    LB_text[len] = 0;
1187
1188
0
    free(lib.s);
1189
1190
0
    return LB_text;
1191
0
}
1192
1193
1194
// Bam record pointer and SAM header combined
1195
typedef struct {
1196
    const sam_hdr_t *h;
1197
    const bam1_t *b;
1198
} hb_pair;
1199
1200
// Looks up variable names in str and replaces them with their value.
1201
// Also supports aux tags.
1202
//
1203
// Note the expression parser deliberately overallocates str size so it
1204
// is safe to use memcmp over strcmp.
1205
static int bam_sym_lookup(void *data, char *str, char **end,
1206
0
                          hts_expr_val_t *res) {
1207
0
    hb_pair *hb = (hb_pair *)data;
1208
0
    const bam1_t *b = hb->b;
1209
1210
0
    res->is_str = 0;
1211
0
    switch(*str) {
1212
0
    case 'c':
1213
0
        if (memcmp(str, "cigar", 5) == 0) {
1214
0
            *end = str+5;
1215
0
            res->is_str = 1;
1216
0
            ks_clear(&res->s);
1217
0
            uint32_t *cigar = bam_get_cigar(b);
1218
0
            int i, n = b->core.n_cigar, r = 0;
1219
0
            if (n) {
1220
0
                for (i = 0; i < n; i++) {
1221
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1222
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1223
0
                }
1224
0
                r |= kputs("", &res->s) < 0;
1225
0
            } else {
1226
0
                r |= kputs("*", &res->s) < 0;
1227
0
            }
1228
0
            return r ? -1 : 0;
1229
0
        }
1230
0
        break;
1231
1232
0
    case 'e':
1233
0
        if (memcmp(str, "endpos", 6) == 0) {
1234
0
            *end = str+6;
1235
0
            res->d = bam_endpos(b);
1236
0
            return 0;
1237
0
        }
1238
0
        break;
1239
1240
0
    case 'f':
1241
0
        if (memcmp(str, "flag", 4) == 0) {
1242
0
            str = *end = str+4;
1243
0
            if (*str != '.') {
1244
0
                res->d = b->core.flag;
1245
0
                return 0;
1246
0
            } else {
1247
0
                str++;
1248
0
                if (!memcmp(str, "paired", 6)) {
1249
0
                    *end = str+6;
1250
0
                    res->d = b->core.flag & BAM_FPAIRED;
1251
0
                    return 0;
1252
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1253
0
                    *end = str+11;
1254
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1255
0
                    return 0;
1256
0
                } else if (!memcmp(str, "unmap", 5)) {
1257
0
                    *end = str+5;
1258
0
                    res->d = b->core.flag & BAM_FUNMAP;
1259
0
                    return 0;
1260
0
                } else if (!memcmp(str, "munmap", 6)) {
1261
0
                    *end = str+6;
1262
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1263
0
                    return 0;
1264
0
                } else if (!memcmp(str, "reverse", 7)) {
1265
0
                    *end = str+7;
1266
0
                    res->d = b->core.flag & BAM_FREVERSE;
1267
0
                    return 0;
1268
0
                } else if (!memcmp(str, "mreverse", 8)) {
1269
0
                    *end = str+8;
1270
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1271
0
                    return 0;
1272
0
                } else if (!memcmp(str, "read1", 5)) {
1273
0
                    *end = str+5;
1274
0
                    res->d = b->core.flag & BAM_FREAD1;
1275
0
                    return 0;
1276
0
                } else if (!memcmp(str, "read2", 5)) {
1277
0
                    *end = str+5;
1278
0
                    res->d = b->core.flag & BAM_FREAD2;
1279
0
                    return 0;
1280
0
                } else if (!memcmp(str, "secondary", 9)) {
1281
0
                    *end = str+9;
1282
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1283
0
                    return 0;
1284
0
                } else if (!memcmp(str, "qcfail", 6)) {
1285
0
                    *end = str+6;
1286
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1287
0
                    return 0;
1288
0
                } else if (!memcmp(str, "dup", 3)) {
1289
0
                    *end = str+3;
1290
0
                    res->d = b->core.flag & BAM_FDUP;
1291
0
                    return 0;
1292
0
                } else if (!memcmp(str, "supplementary", 13)) {
1293
0
                    *end = str+13;
1294
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1295
0
                    return 0;
1296
0
                } else {
1297
0
                    hts_log_error("Unrecognised flag string");
1298
0
                    return -1;
1299
0
                }
1300
0
            }
1301
0
        }
1302
0
        break;
1303
1304
0
    case 'h':
1305
0
        if (memcmp(str, "hclen", 5) == 0) {
1306
0
            int hclen = 0;
1307
0
            uint32_t *cigar = bam_get_cigar(b);
1308
0
            uint32_t ncigar = b->core.n_cigar;
1309
1310
            // left
1311
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1312
0
                hclen = bam_cigar_oplen(cigar[0]);
1313
1314
            // right
1315
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1316
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1317
1318
0
            *end = str+5;
1319
0
            res->d = hclen;
1320
0
            return 0;
1321
0
        }
1322
0
        break;
1323
1324
0
    case 'l':
1325
0
        if (memcmp(str, "library", 7) == 0) {
1326
0
            *end = str+7;
1327
0
            res->is_str = 1;
1328
0
            const char *lib = bam_get_library(hb->h, b);
1329
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1330
0
            return 0;
1331
0
        }
1332
0
        break;
1333
1334
0
    case 'm':
1335
0
        if (memcmp(str, "mapq", 4) == 0) {
1336
0
            *end = str+4;
1337
0
            res->d = b->core.qual;
1338
0
            return 0;
1339
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1340
0
            *end = str+4;
1341
0
            res->d = b->core.mpos+1;
1342
0
            return 0;
1343
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1344
0
            *end = str+6;
1345
0
            res->is_str = 1;
1346
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1347
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1348
0
            return 0;
1349
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1350
0
            *end = str+6;
1351
0
            res->d = b->core.mtid;
1352
0
            return 0;
1353
0
        }
1354
0
        break;
1355
1356
0
    case 'n':
1357
0
        if (memcmp(str, "ncigar", 6) == 0) {
1358
0
            *end = str+6;
1359
0
            res->d = b->core.n_cigar;
1360
0
            return 0;
1361
0
        }
1362
0
        break;
1363
1364
0
    case 'p':
1365
0
        if (memcmp(str, "pos", 3) == 0) {
1366
0
            *end = str+3;
1367
0
            res->d = b->core.pos+1;
1368
0
            return 0;
1369
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1370
0
            *end = str+5;
1371
0
            res->d = b->core.mpos+1;
1372
0
            return 0;
1373
0
        }
1374
0
        break;
1375
1376
0
    case 'q':
1377
0
        if (memcmp(str, "qlen", 4) == 0) {
1378
0
            *end = str+4;
1379
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1380
0
            return 0;
1381
0
        } else if (memcmp(str, "qname", 5) == 0) {
1382
0
            *end = str+5;
1383
0
            res->is_str = 1;
1384
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1385
0
            return 0;
1386
0
        } else if (memcmp(str, "qual", 4) == 0) {
1387
0
            *end = str+4;
1388
0
            ks_clear(&res->s);
1389
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1390
0
                return -1;
1391
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1392
0
            res->s.l = b->core.l_qseq;
1393
0
            res->is_str = 1;
1394
0
            return 0;
1395
0
        }
1396
0
        break;
1397
1398
0
    case 'r':
1399
0
        if (memcmp(str, "rlen", 4) == 0) {
1400
0
            *end = str+4;
1401
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1402
0
            return 0;
1403
0
        } else if (memcmp(str, "rname", 5) == 0) {
1404
0
            *end = str+5;
1405
0
            res->is_str = 1;
1406
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1407
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1408
0
            return 0;
1409
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1410
0
            *end = str+5;
1411
0
            res->is_str = 1;
1412
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1413
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1414
0
            return 0;
1415
0
        } else if (memcmp(str, "refid", 5) == 0) {
1416
0
            *end = str+5;
1417
0
            res->d = b->core.tid;
1418
0
            return 0;
1419
0
        }
1420
0
        break;
1421
1422
0
    case 's':
1423
0
        if (memcmp(str, "seq", 3) == 0) {
1424
0
            *end = str+3;
1425
0
            ks_clear(&res->s);
1426
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1427
0
                return -1;
1428
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1429
0
            res->s.s[b->core.l_qseq] = 0;
1430
0
            res->s.l = b->core.l_qseq;
1431
0
            res->is_str = 1;
1432
0
            return 0;
1433
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1434
0
            int sclen = 0;
1435
0
            uint32_t *cigar = bam_get_cigar(b);
1436
0
            int ncigar = b->core.n_cigar;
1437
0
            int left = 0;
1438
1439
            // left
1440
0
            if (ncigar > 0
1441
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1442
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1443
0
            else if (ncigar > 1
1444
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1445
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1446
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1447
1448
            // right
1449
0
            if (ncigar-1 > left
1450
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1451
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1452
0
            else if (ncigar-2 > left
1453
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1454
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1455
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1456
1457
0
            *end = str+5;
1458
0
            res->d = sclen;
1459
0
            return 0;
1460
0
        }
1461
0
        break;
1462
1463
0
    case 't':
1464
0
        if (memcmp(str, "tlen", 4) == 0) {
1465
0
            *end = str+4;
1466
0
            res->d = b->core.isize;
1467
0
            return 0;
1468
0
        }
1469
0
        break;
1470
1471
0
    case '[':
1472
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1473
            /* aux tags */
1474
0
            *end = str+4;
1475
1476
0
            uint8_t *aux = bam_aux_get(b, str+1);
1477
0
            if (aux) {
1478
                // we define the truth of a tag to be its presence, even if 0.
1479
0
                res->is_true = 1;
1480
0
                switch (*aux) {
1481
0
                case 'Z':
1482
0
                case 'H':
1483
0
                    res->is_str = 1;
1484
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1485
0
                    break;
1486
1487
0
                case 'A':
1488
0
                    res->is_str = 1;
1489
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1490
0
                    break;
1491
1492
0
                case 'i': case 'I':
1493
0
                case 's': case 'S':
1494
0
                case 'c': case 'C':
1495
0
                    res->is_str = 0;
1496
0
                    res->d = bam_aux2i(aux);
1497
0
                    break;
1498
1499
0
                case 'f':
1500
0
                case 'd':
1501
0
                    res->is_str = 0;
1502
0
                    res->d = bam_aux2f(aux);
1503
0
                    break;
1504
1505
0
                default:
1506
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1507
0
                                  *aux);
1508
0
                    return -1;
1509
0
                }
1510
0
                return 0;
1511
1512
0
            } else {
1513
                // hence absent tags are always false (and strings)
1514
0
                res->is_str = 1;
1515
0
                res->s.l = 0;
1516
0
                res->d = 0;
1517
0
                res->is_true = 0;
1518
0
                return 0;
1519
0
            }
1520
0
        }
1521
0
        break;
1522
0
    }
1523
1524
    // All successful matches in switch should return 0.
1525
    // So if we didn't match, it's a parse error.
1526
0
    return -1;
1527
0
}
1528
1529
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1530
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1531
0
{
1532
0
    hb_pair hb = {h, b};
1533
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1534
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1535
0
        hts_log_error("Couldn't process filter expression");
1536
0
        hts_expr_val_free(&res);
1537
0
        return -1;
1538
0
    }
1539
1540
0
    int t = res.is_true;
1541
0
    hts_expr_val_free(&res);
1542
1543
0
    return t;
1544
0
}
1545
1546
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1547
0
{
1548
0
    htsFile *fp = fpv;
1549
0
    bam1_t *b = bv;
1550
0
    int pass_filter, ret;
1551
1552
0
    do {
1553
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1554
0
        if (ret < 0)
1555
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1556
1557
0
        if (bam_tag2cigar(b, 1, 1) < 0)
1558
0
            return -2;
1559
1560
0
        *tid = b->core.tid;
1561
0
        *beg = b->core.pos;
1562
0
        *end = bam_endpos(b);
1563
1564
0
        if (fp->filter) {
1565
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1566
0
            if (pass_filter < 0)
1567
0
                return -2;
1568
0
        } else {
1569
0
            pass_filter = 1;
1570
0
        }
1571
0
    } while (pass_filter == 0);
1572
1573
0
    return ret;
1574
0
}
1575
1576
static int cram_pseek(void *fp, int64_t offset, int whence)
1577
0
{
1578
0
    cram_fd *fd =  (cram_fd *)fp;
1579
1580
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1581
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1582
0
        return -1;
1583
1584
0
    fd->curr_position = offset;
1585
1586
0
    if (fd->ctr) {
1587
0
        cram_free_container(fd->ctr);
1588
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1589
0
            cram_free_container(fd->ctr_mt);
1590
1591
0
        fd->ctr = NULL;
1592
0
        fd->ctr_mt = NULL;
1593
0
        fd->ooc = 0;
1594
0
    }
1595
1596
0
    return 0;
1597
0
}
1598
1599
/*
1600
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1601
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1602
 *   container previously fetched. It was designed like this to integrate with the functionality
1603
 *   of the iterator stepping logic.
1604
 */
1605
1606
static int64_t cram_ptell(void *fp)
1607
0
{
1608
0
    cram_fd *fd = (cram_fd *)fp;
1609
0
    cram_container *c;
1610
0
    cram_slice *s;
1611
0
    int64_t ret = -1L;
1612
1613
0
    if (fd) {
1614
0
        if ((c = fd->ctr) != NULL) {
1615
0
            if ((s = c->slice) != NULL && s->max_rec) {
1616
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1617
0
                    fd->curr_position += c->offset + c->length;
1618
0
            }
1619
0
        }
1620
0
        ret = fd->curr_position;
1621
0
    }
1622
1623
0
    return ret;
1624
0
}
1625
1626
static int bam_pseek(void *fp, int64_t offset, int whence)
1627
0
{
1628
0
    BGZF *fd = (BGZF *)fp;
1629
1630
0
    return bgzf_seek(fd, offset, whence);
1631
0
}
1632
1633
static int64_t bam_ptell(void *fp)
1634
0
{
1635
0
    BGZF *fd = (BGZF *)fp;
1636
0
    if (!fd)
1637
0
        return -1L;
1638
1639
0
    return bgzf_tell(fd);
1640
0
}
1641
1642
1643
1644
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1645
0
{
1646
0
    switch (fp->format.format) {
1647
0
    case bam:
1648
0
    case sam:
1649
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1650
1651
0
    case cram: {
1652
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1653
1654
        // Cons up a fake "index" just pointing at the associated cram_fd:
1655
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1656
0
        if (idx == NULL) return NULL;
1657
0
        idx->fmt = HTS_FMT_CRAI;
1658
0
        idx->cram = fp->fp.cram;
1659
0
        return (hts_idx_t *) idx;
1660
0
        }
1661
1662
0
    default:
1663
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1664
0
    }
1665
0
}
1666
1667
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1668
0
{
1669
0
    return index_load(fp, fn, fnidx, flags);
1670
0
}
1671
1672
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1673
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1674
0
}
1675
1676
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1677
0
{
1678
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1679
0
}
1680
1681
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1682
0
{
1683
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1684
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1685
0
    if (iter == NULL) return NULL;
1686
1687
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1688
    // the readrec function:
1689
0
    iter->is_cram = 1;
1690
0
    iter->read_rest = 1;
1691
0
    iter->off = NULL;
1692
0
    iter->bins.a = NULL;
1693
0
    iter->readrec = readrec;
1694
1695
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1696
0
        cram_range r = { tid, beg+1, end };
1697
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1698
1699
0
        iter->curr_off = 0;
1700
        // The following fields are not required by hts_itr_next(), but are
1701
        // filled in in case user code wants to look at them.
1702
0
        iter->tid = tid;
1703
0
        iter->beg = beg;
1704
0
        iter->end = end;
1705
1706
0
        switch (ret) {
1707
0
        case 0:
1708
0
            break;
1709
1710
0
        case -2:
1711
            // No data vs this ref, so mark iterator as completed.
1712
            // Same as HTS_IDX_NONE.
1713
0
            iter->finished = 1;
1714
0
            break;
1715
1716
0
        default:
1717
0
            free(iter);
1718
0
            return NULL;
1719
0
        }
1720
0
    }
1721
0
    else switch (tid) {
1722
0
    case HTS_IDX_REST:
1723
0
        iter->curr_off = 0;
1724
0
        break;
1725
0
    case HTS_IDX_NONE:
1726
0
        iter->curr_off = 0;
1727
0
        iter->finished = 1;
1728
0
        break;
1729
0
    default:
1730
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1731
0
        abort();
1732
0
        break;
1733
0
    }
1734
1735
0
    return iter;
1736
0
}
1737
1738
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1739
0
{
1740
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1741
0
    if (idx == NULL)
1742
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1743
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1744
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1745
0
    else
1746
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1747
0
}
1748
1749
static int cram_name2id(void *fdv, const char *ref)
1750
0
{
1751
0
    cram_fd *fd = (cram_fd *) fdv;
1752
0
    return sam_hdr_name2tid(fd->header, ref);
1753
0
}
1754
1755
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1756
0
{
1757
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1758
0
    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1759
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1760
0
                          sam_readrec);
1761
0
}
1762
1763
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1764
0
{
1765
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1766
0
    hts_reglist_t *r_list = NULL;
1767
0
    int r_count = 0;
1768
1769
0
    if (!cidx || !hdr)
1770
0
        return NULL;
1771
1772
0
    hts_itr_t *itr = NULL;
1773
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1774
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1775
0
        if (!r_list)
1776
0
            return NULL;
1777
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1778
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1779
0
    } else {
1780
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1781
0
        if (!r_list)
1782
0
            return NULL;
1783
0
        itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1784
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1785
0
    }
1786
1787
0
    if (!itr)
1788
0
        hts_reglist_free(r_list, r_count);
1789
1790
0
    return itr;
1791
0
}
1792
1793
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1794
0
{
1795
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1796
1797
0
    if(!cidx || !hdr || !reglist)
1798
0
        return NULL;
1799
1800
0
    if (cidx->fmt == HTS_FMT_CRAI)
1801
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1802
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1803
0
    else
1804
0
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1805
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1806
0
}
1807
1808
/**********************
1809
 *** SAM header I/O ***
1810
 **********************/
1811
1812
#include "htslib/kseq.h"
1813
#include "htslib/kstring.h"
1814
1815
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1816
0
{
1817
0
    sam_hdr_t *bh = sam_hdr_init();
1818
0
    if (!bh) return NULL;
1819
1820
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1821
0
        sam_hdr_destroy(bh);
1822
0
        return NULL;
1823
0
    }
1824
1825
0
    return bh;
1826
0
}
1827
1828
// Minimal sanitisation of a header to ensure.
1829
// - null terminated string.
1830
// - all lines start with @ (also implies no blank lines).
1831
//
1832
// Much more could be done, but currently is not, including:
1833
// - checking header types are known (HD, SQ, etc).
1834
// - syntax (eg checking tab separated fields).
1835
// - validating n_targets matches @SQ records.
1836
// - validating target lengths against @SQ records.
1837
6.06k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1838
6.06k
    if (!h)
1839
51
        return NULL;
1840
1841
    // Special case for empty headers.
1842
6.01k
    if (h->l_text == 0)
1843
939
        return h;
1844
1845
5.07k
    size_t i;
1846
5.07k
    unsigned int lnum = 0;
1847
5.07k
    char *cp = h->text, last = '\n';
1848
31.3M
    for (i = 0; i < h->l_text; i++) {
1849
        // NB: l_text excludes terminating nul.  This finds early ones.
1850
31.3M
        if (cp[i] == 0)
1851
2.27k
            break;
1852
1853
        // Error on \n[^@], including duplicate newlines
1854
31.3M
        if (last == '\n') {
1855
152k
            lnum++;
1856
152k
            if (cp[i] != '@') {
1857
0
                hts_log_error("Malformed SAM header at line %u", lnum);
1858
0
                sam_hdr_destroy(h);
1859
0
                return NULL;
1860
0
            }
1861
152k
        }
1862
1863
31.3M
        last = cp[i];
1864
31.3M
    }
1865
1866
5.07k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1867
2.27k
        size_t j = i;
1868
14.7k
        while (j < h->l_text && cp[j] == '\0') j++;
1869
2.27k
        if (j < h->l_text)
1870
2.27k
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1871
2.27k
    }
1872
1873
    // Add trailing newline and/or trailing nul if required.
1874
5.07k
    if (last != '\n') {
1875
2.26k
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1876
1877
2.26k
        if (h->l_text < 2 || i >= h->l_text - 2) {
1878
189
            if (h->l_text >= SIZE_MAX - 2) {
1879
0
                hts_log_error("No room for extra newline");
1880
0
                sam_hdr_destroy(h);
1881
0
                return NULL;
1882
0
            }
1883
1884
189
            cp = realloc(h->text, (size_t) h->l_text+2);
1885
189
            if (!cp) {
1886
0
                sam_hdr_destroy(h);
1887
0
                return NULL;
1888
0
            }
1889
189
            h->text = cp;
1890
189
        }
1891
2.26k
        cp[i++] = '\n';
1892
1893
        // l_text may be larger already due to multiple nul padding
1894
2.26k
        if (h->l_text < i)
1895
0
            h->l_text = i;
1896
2.26k
        cp[h->l_text] = '\0';
1897
2.26k
    }
1898
1899
5.07k
    return h;
1900
5.07k
}
1901
1902
5.01k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1903
5.01k
    sam_hdr_t* h = sam_hdr_init();
1904
5.01k
    if (!h)
1905
0
        return NULL;
1906
1907
5.01k
    if (sam_hdr_build_from_sam_file(h, fp) != 0) {
1908
405
        sam_hdr_destroy(h);
1909
405
        return NULL;
1910
405
    }
1911
1912
4.60k
    if (fp->bam_header)
1913
0
        sam_hdr_destroy(fp->bam_header);
1914
4.60k
    fp->bam_header = sam_hdr_sanitise(h);
1915
4.60k
    fp->bam_header->ref_count = 1;
1916
1917
4.60k
    return fp->bam_header;
1918
5.01k
}
1919
1920
sam_hdr_t *sam_hdr_read(htsFile *fp)
1921
8.37k
{
1922
8.37k
    sam_hdr_t *h = NULL;
1923
8.37k
    if (!fp) {
1924
0
        errno = EINVAL;
1925
0
        return NULL;
1926
0
    }
1927
1928
8.37k
    switch (fp->format.format) {
1929
153
    case bam:
1930
153
        h = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1931
153
        break;
1932
1933
1.30k
    case cram:
1934
1.30k
        h = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
1935
1.30k
        break;
1936
1937
5.01k
    case sam:
1938
5.01k
        h = sam_hdr_create(fp);
1939
5.01k
        break;
1940
1941
24
    case fastq_format:
1942
1.89k
    case fasta_format:
1943
1.89k
        return sam_hdr_init();
1944
1945
0
    case empty_format:
1946
0
        errno = EPIPE;
1947
0
        return NULL;
1948
1949
0
    default:
1950
0
        errno = EFTYPE;
1951
0
        return NULL;
1952
8.37k
    }
1953
    //only sam,bam and cram reaches here
1954
6.47k
    if (h && !fp->bam_header) { //set except for sam which already has it
1955
        //for cram, it is the o/p header as for rest and not the internal header
1956
1.41k
        fp->bam_header = h;
1957
1.41k
        sam_hdr_incr_ref(fp->bam_header);
1958
1.41k
    }
1959
6.47k
    return h;
1960
8.37k
}
1961
1962
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
1963
7.91k
{
1964
7.91k
    if (!fp || !h) {
1965
0
        errno = EINVAL;
1966
0
        return -1;
1967
0
    }
1968
1969
7.91k
    switch (fp->format.format) {
1970
2.63k
    case binary_format:
1971
2.63k
        fp->format.category = sequence_data;
1972
2.63k
        fp->format.format = bam;
1973
        /* fall-through */
1974
2.63k
    case bam:
1975
2.63k
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1976
2.63k
        break;
1977
1978
2.63k
    case cram: {
1979
2.63k
        cram_fd *fd = fp->fp.cram;
1980
2.63k
        if (cram_set_header2(fd, h) < 0) return -1;
1981
2.50k
        if (fp->fn_aux)
1982
0
            cram_load_reference(fd, fp->fn_aux);
1983
2.50k
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1984
2.50k
        }
1985
2.50k
        break;
1986
1987
2.63k
    case text_format:
1988
2.63k
        fp->format.category = sequence_data;
1989
2.63k
        fp->format.format = sam;
1990
        /* fall-through */
1991
2.63k
    case sam: {
1992
2.63k
        if (!h->hrecs && !h->text)
1993
0
            return 0;
1994
2.63k
        char *text;
1995
2.63k
        kstring_t hdr_ks = { 0, 0, NULL };
1996
2.63k
        size_t l_text;
1997
2.63k
        ssize_t bytes;
1998
2.63k
        int r = 0, no_sq = 0;
1999
2000
2.63k
        if (h->hrecs) {
2001
2.63k
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2002
0
                return -1;
2003
2.63k
            text = hdr_ks.s;
2004
2.63k
            l_text = hdr_ks.l;
2005
2.63k
        } else {
2006
0
            const char *p = NULL;
2007
0
            do {
2008
0
                const char *q = p == NULL ? h->text : p + 4;
2009
0
                p = strstr(q, "@SQ\t");
2010
0
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2011
0
            no_sq = p == NULL;
2012
0
            text = h->text;
2013
0
            l_text = h->l_text;
2014
0
        }
2015
2016
2.63k
        if (fp->is_bgzf) {
2017
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2018
2.63k
        } else {
2019
2.63k
            bytes = hwrite(fp->fp.hfile, text, l_text);
2020
2.63k
        }
2021
2.63k
        free(hdr_ks.s);
2022
2.63k
        if (bytes != l_text)
2023
0
            return -1;
2024
2025
2.63k
        if (no_sq) {
2026
0
            int i;
2027
0
            for (i = 0; i < h->n_targets; ++i) {
2028
0
                fp->line.l = 0;
2029
0
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2030
0
                r |= kputs(h->target_name[i], &fp->line) < 0;
2031
0
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2032
0
                r |= kputw(h->target_len[i], &fp->line) < 0;
2033
0
                r |= kputc('\n', &fp->line) < 0;
2034
0
                if (r != 0)
2035
0
                    return -1;
2036
2037
0
                if (fp->is_bgzf) {
2038
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2039
0
                } else {
2040
0
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2041
0
                }
2042
0
                if (bytes != fp->line.l)
2043
0
                    return -1;
2044
0
            }
2045
0
        }
2046
2.63k
        if (fp->is_bgzf) {
2047
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2048
2.63k
        } else {
2049
2.63k
            if (hflush(fp->fp.hfile) != 0) return -1;
2050
2.63k
        }
2051
2.63k
        }
2052
2.63k
        break;
2053
2054
2.63k
    case fastq_format:
2055
0
    case fasta_format:
2056
        // Nothing to output; FASTQ has no file headers.
2057
0
        return 0;
2058
0
        break;
2059
2060
0
    default:
2061
0
        errno = EBADF;
2062
0
        return -1;
2063
7.91k
    }
2064
    //only sam,bam and cram reaches here
2065
7.78k
    if (h) {    //the new header
2066
7.78k
        sam_hdr_t *tmp = fp->bam_header;
2067
7.78k
        fp->bam_header = sam_hdr_dup(h);
2068
7.78k
        sam_hdr_destroy(tmp);
2069
7.78k
        if (!fp->bam_header && h)
2070
0
            return -1;  //failed to duplicate
2071
7.78k
    }
2072
7.78k
    return 0;
2073
7.78k
}
2074
2075
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2076
0
{
2077
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2078
0
    size_t new_l_text;
2079
0
    if (!h || !key)
2080
0
        return -1;
2081
2082
0
    if (h->l_text > 3) {
2083
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2084
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2085
0
            *p = '\0'; // for strstr call
2086
2087
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2088
2089
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2090
0
                *p = '\n'; // change back
2091
2092
                // mark the key:val
2093
0
                beg = q;
2094
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2095
0
                end = q;
2096
2097
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2098
0
                    && strlen(val) == end - beg - 4)
2099
0
                     return 0; // val is the same, no need to change
2100
2101
0
            } else {
2102
0
                beg = end = p;
2103
0
                *p = '\n';
2104
0
            }
2105
0
        }
2106
0
    }
2107
0
    if (beg == NULL) { // no @HD
2108
0
        new_l_text = h->l_text;
2109
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2110
0
            return -1;
2111
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2112
0
        if (val) {
2113
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2114
0
                return -1;
2115
0
            new_l_text += strlen(val) + 4;
2116
0
        }
2117
0
        newtext = (char*)malloc(new_l_text + 1);
2118
0
        if (!newtext) return -1;
2119
2120
0
        if (val)
2121
0
            snprintf(newtext, new_l_text + 1,
2122
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2123
0
        else
2124
0
            snprintf(newtext, new_l_text + 1,
2125
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2126
0
    } else { // has @HD but different or no key
2127
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2128
0
        if (val) {
2129
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2130
0
                return -1;
2131
0
            new_l_text += strlen(val) + 4;
2132
0
        }
2133
0
        newtext = (char*)malloc(new_l_text + 1);
2134
0
        if (!newtext) return -1;
2135
2136
0
        if (val) {
2137
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2138
0
                    (int) (beg - h->text), h->text, key, val, end);
2139
0
        } else { //delete key
2140
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2141
0
                    (int) (beg - h->text), h->text, end);
2142
0
        }
2143
0
    }
2144
0
    free(h->text);
2145
0
    h->text = newtext;
2146
0
    h->l_text = new_l_text;
2147
0
    return 0;
2148
0
}
2149
2150
2151
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2152
0
{
2153
0
    if (!h || !key)
2154
0
        return -1;
2155
2156
0
    if (!h->hrecs)
2157
0
        return old_sam_hdr_change_HD(h, key, val);
2158
2159
0
    if (val) {
2160
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2161
0
            return -1;
2162
0
    } else {
2163
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2164
0
            return -1;
2165
0
    }
2166
0
    return sam_hdr_rebuild(h);
2167
0
}
2168
2169
/* releases existing header and sets new one; increments ref count if not
2170
duplicating */
2171
int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate)
2172
0
{
2173
0
    if (!fp)
2174
0
        return -1;
2175
2176
0
    if (duplicate) {
2177
0
        sam_hdr_t *tmp = fp->bam_header;
2178
0
        fp->bam_header = sam_hdr_dup(h);
2179
0
        sam_hdr_destroy(tmp);
2180
0
        if (!fp->bam_header && h)
2181
0
            return -1;  //duplicate failed
2182
0
    } else {
2183
0
        if (fp->bam_header != h) {  //if not the same
2184
0
            sam_hdr_destroy(fp->bam_header);
2185
0
            fp->bam_header = h;
2186
0
            sam_hdr_incr_ref(fp->bam_header);
2187
0
        }
2188
0
    }
2189
2190
0
    return 0;
2191
0
}
2192
2193
//return the bam_header, user has to use sam_hdr_incr_ref where ever required
2194
sam_hdr_t* sam_hdr_get(samFile* fp)
2195
0
{
2196
0
    if (!fp)
2197
0
        return NULL;
2198
0
    return fp->bam_header;
2199
0
}
2200
2201
/**********************
2202
 *** SAM record I/O ***
2203
 **********************/
2204
2205
// The speed of this code can vary considerably depending on minor code
2206
// changes elsewhere as some of the tight loops are particularly prone to
2207
// speed changes when the instruction blocks are split over a 32-byte
2208
// boundary.  To protect against this, we explicitly specify an alignment
2209
// for this function.  If this is insufficient, we may also wish to
2210
// consider alignment of blocks within this function via
2211
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2212
// However it's not very portable.
2213
// Instead we break into separate functions so we can explicitly specify
2214
// use __attribute__((aligned(32))) instead and force consistent loop
2215
// alignment.
2216
357k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2217
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2218
357k
    if (*n > INT32_MAX*0.666) {
2219
0
        errno = ENOMEM;
2220
0
        return -1;
2221
0
    }
2222
2223
357k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2224
357k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2225
0
        hts_log_error("Out of memory");
2226
0
        return -1;
2227
0
    }
2228
2229
357k
    (*n)+=*n>>1;
2230
357k
    return 0;
2231
357k
}
2232
2233
2234
// This ensures that q always ends up at the next comma after
2235
// reading a number even if it's followed by junk.  It
2236
// prevents the possibility of trying to read more than n items.
2237
18.2M
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2238
2239
HTS_ALIGN32
2240
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2241
33.3k
                               uint32_t *nalloc, int *overflow) {
2242
2.31M
    while (*q == ',') {
2243
2.28M
        if ((*nused)++ >= (*nalloc)) {
2244
567
            if (grow_B_array(b, nalloc, 1) < 0)
2245
0
                return NULL;
2246
567
        }
2247
2.28M
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2248
2.28M
        b->l_data++;
2249
2.28M
    }
2250
33.3k
    return q;
2251
33.3k
}
2252
2253
HTS_ALIGN32
2254
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2255
26.7k
                               uint32_t *nalloc, int *overflow) {
2256
1.30M
    while (*q == ',') {
2257
1.27M
        if ((*nused)++ >= (*nalloc)) {
2258
2.57k
            if (grow_B_array(b, nalloc, 1) < 0)
2259
0
                return NULL;
2260
2.57k
        }
2261
1.27M
        if (q[1] != '-') {
2262
1.24M
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2263
1.24M
            b->l_data++;
2264
1.24M
        } else {
2265
26.6k
            *overflow = 1;
2266
26.6k
            q++;
2267
26.6k
            skip_to_comma_(q);
2268
26.6k
        }
2269
1.27M
    }
2270
26.7k
    return q;
2271
26.7k
}
2272
2273
HTS_ALIGN32
2274
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2275
11.1k
                               uint32_t *nalloc, int *overflow) {
2276
4.26M
    while (*q == ',') {
2277
4.25M
        if ((*nused)++ >= (*nalloc)) {
2278
6.16k
            if (grow_B_array(b, nalloc, 2) < 0)
2279
0
                return NULL;
2280
6.16k
        }
2281
4.25M
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2282
4.25M
                  b->data + b->l_data);
2283
4.25M
        b->l_data += 2;
2284
4.25M
    }
2285
11.1k
    return q;
2286
11.1k
}
2287
2288
HTS_ALIGN32
2289
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2290
6.21k
                               uint32_t *nalloc, int *overflow) {
2291
6.49M
    while (*q == ',') {
2292
6.49M
        if ((*nused)++ >= (*nalloc)) {
2293
14.4k
            if (grow_B_array(b, nalloc, 2) < 0)
2294
0
                return NULL;
2295
14.4k
        }
2296
6.49M
        if (q[1] != '-') {
2297
6.39M
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2298
6.39M
                      b->data + b->l_data);
2299
6.39M
            b->l_data += 2;
2300
6.39M
        } else {
2301
98.7k
            *overflow = 1;
2302
98.7k
            q++;
2303
98.7k
            skip_to_comma_(q);
2304
98.7k
        }
2305
6.49M
    }
2306
6.21k
    return q;
2307
6.21k
}
2308
2309
HTS_ALIGN32
2310
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2311
33.3k
                               uint32_t *nalloc, int *overflow) {
2312
16.0M
    while (*q == ',') {
2313
16.0M
        if ((*nused)++ >= (*nalloc)) {
2314
164
            if (grow_B_array(b, nalloc, 4) < 0)
2315
0
                return NULL;
2316
164
        }
2317
16.0M
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2318
16.0M
                  b->data + b->l_data);
2319
16.0M
        b->l_data += 4;
2320
16.0M
    }
2321
33.3k
    return q;
2322
33.3k
}
2323
2324
HTS_ALIGN32
2325
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2326
97.8k
                               uint32_t *nalloc, int *overflow) {
2327
4.22M
    while (*q == ',') {
2328
4.13M
        if ((*nused)++ >= (*nalloc)) {
2329
279k
            if (grow_B_array(b, nalloc, 4) < 0)
2330
0
                return NULL;
2331
279k
        }
2332
4.13M
        if (q[1] != '-') {
2333
3.97M
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2334
3.97M
                      b->data + b->l_data);
2335
3.97M
            b->l_data += 4;
2336
3.97M
        } else {
2337
159k
            *overflow = 1;
2338
159k
            q++;
2339
159k
            skip_to_comma_(q);
2340
159k
        }
2341
4.13M
    }
2342
97.8k
    return q;
2343
97.8k
}
2344
2345
HTS_ALIGN32
2346
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2347
16.4k
                               uint32_t *nalloc, int *overflow) {
2348
432k
    while (*q == ',') {
2349
416k
        if ((*nused)++ >= (*nalloc)) {
2350
53.7k
            if (grow_B_array(b, nalloc, 4) < 0)
2351
0
                return NULL;
2352
53.7k
        }
2353
416k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2354
416k
        b->l_data += 4;
2355
416k
    }
2356
16.4k
    return q;
2357
16.4k
}
2358
2359
HTS_ALIGN32
2360
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2361
                              char **end, bam1_t *b,
2362
225k
                              int *ctr) {
2363
    // Protect against infinite recursion when dealing with invalid input.
2364
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2365
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2366
    //
2367
    // Loop detection is the safest solution incase there are other
2368
    // strange corner cases with malformed inputs.
2369
225k
    if (++(*ctr) > 2) {
2370
5
        hts_log_error("Malformed data in B:%c array", type);
2371
5
        return -1;
2372
5
    }
2373
2374
225k
    int orig_l = b->l_data;
2375
225k
    char *q = in;
2376
225k
    int32_t size;
2377
225k
    size_t bytes;
2378
225k
    int overflow = 0;
2379
2380
225k
    size = aux_type2size(type);
2381
225k
    if (size <= 0 || size > 4) {
2382
1
        hts_log_error("Unrecognized type B:%c", type);
2383
1
        return -1;
2384
1
    }
2385
2386
    // Ensure space for type + values.
2387
    // The first pass through here we don't know the number of entries and
2388
    // nalloc == 0.  We start with a small working set and then parse the
2389
    // data, growing as needed.
2390
    //
2391
    // If we have a second pass through we do know the number of entries
2392
    // and nalloc is already known.  We have no need to expand the bam data.
2393
225k
    if (!nalloc)
2394
154k
         nalloc=7;
2395
2396
    // Ensure allocated memory is big enough (for current nalloc estimate)
2397
225k
    bytes = (size_t) nalloc * (size_t) size;
2398
225k
    if (bytes / size != nalloc
2399
225k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2400
0
        hts_log_error("Out of memory");
2401
0
        return -1;
2402
0
    }
2403
2404
225k
    uint32_t nused = 0;
2405
2406
225k
    b->data[b->l_data++] = 'B';
2407
225k
    b->data[b->l_data++] = type;
2408
    // 32-bit B-array length is inserted later once we know it.
2409
225k
    int b_len_idx = b->l_data;
2410
225k
    b->l_data += sizeof(uint32_t);
2411
2412
225k
    if (type == 'c') {
2413
33.3k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2414
0
            return -1;
2415
191k
    } else if (type == 'C') {
2416
26.7k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2417
0
            return -1;
2418
165k
    } else if (type == 's') {
2419
11.1k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2420
0
            return -1;
2421
153k
    } else if (type == 'S') {
2422
6.21k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2423
0
            return -1;
2424
147k
    } else if (type == 'i') {
2425
33.3k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2426
0
            return -1;
2427
114k
    } else if (type == 'I') {
2428
97.8k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2429
0
            return -1;
2430
97.8k
    } else if (type == 'f') {
2431
16.4k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2432
0
            return -1;
2433
16.4k
    }
2434
225k
    if (*q != '\t' && *q != '\0') {
2435
        // Unknown B array type or junk in the numbers
2436
183
        hts_log_error("Malformed B:%c", type);
2437
183
        return -1;
2438
183
    }
2439
224k
    i32_to_le(nused, b->data + b_len_idx);
2440
2441
224k
    if (!overflow) {
2442
154k
        *end = q;
2443
154k
        return 0;
2444
154k
    } else {
2445
70.6k
        int64_t max = 0, min = 0, val;
2446
        // Given type was incorrect.  Try to rescue the situation.
2447
70.6k
        char *r = q;
2448
70.6k
        q = in;
2449
70.6k
        overflow = 0;
2450
70.6k
        b->l_data = orig_l;
2451
        // Find out what range of values is present
2452
16.8M
        while (q < r) {
2453
16.7M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2454
16.7M
            if (max < val) max = val;
2455
16.7M
            if (min > val) min = val;
2456
16.7M
            skip_to_comma_(q);
2457
16.7M
        }
2458
        // Retry with appropriate type
2459
70.6k
        if (!overflow) {
2460
70.6k
            if (min < 0) {
2461
70.4k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2462
32.6k
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2463
37.7k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2464
4.40k
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2465
33.3k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2466
33.3k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2467
33.3k
                }
2468
70.4k
            } else {
2469
189
                if (max < UINT8_MAX) {
2470
10
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2471
179
                } else if (max <= UINT16_MAX) {
2472
0
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2473
179
                } else if (max <= UINT32_MAX) {
2474
177
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2475
177
                }
2476
189
            }
2477
70.6k
        }
2478
        // If here then at least one of the values is too big to store
2479
50
        hts_log_error("Numeric value in B array out of allowed range");
2480
50
        return -1;
2481
70.6k
    }
2482
224k
#undef skip_to_comma_
2483
224k
}
2484
2485
HTS_ALIGN32
2486
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2487
154k
{
2488
154k
    int ctr = 0;
2489
154k
    uint32_t nalloc = 0;
2490
154k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2491
154k
}
2492
2493
345k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2494
345k
    if (*v >= '1' && *v <= '9') {
2495
98.8k
        return hts_str2uint(v, rv, 16, overflow);
2496
98.8k
    }
2497
246k
    else if (*v == '0') {
2498
        // handle single-digit "0" directly; otherwise it's hex or octal
2499
86.5k
        if (v[1] == '\t') { *rv = v+1; return 0; }
2500
2.02k
        else {
2501
2.02k
            unsigned long val = strtoul(v, rv, 0);
2502
2.02k
            if (val > 65535) { *overflow = 1; return 65535; }
2503
2.02k
            return val;
2504
2.02k
        }
2505
86.5k
    }
2506
159k
    else {
2507
        // TODO implement symbolic flag letters
2508
159k
        *rv = v;
2509
159k
        return 0;
2510
159k
    }
2511
345k
}
2512
2513
// Parse tag line and append to bam object b.
2514
// Shared by both SAM and FASTQ parsers.
2515
//
2516
// The difference between the two is how lenient we are to recognising
2517
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2518
// non-SAM looking strings.
2519
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2520
344k
                            khash_t(tag) *tag_whitelist) {
2521
344k
    int overflow = 0;
2522
344k
    int checkpoint;
2523
344k
    char logbuf[40];
2524
344k
    char *q = start, *p = end;
2525
2526
344k
#define _parse_err(cond, ...)                   \
2527
8.56M
    do {                                        \
2528
18.6M
        if (cond) {                             \
2529
666
            if (lenient) {                      \
2530
0
                while (q < p && !isspace_c(*q))   \
2531
0
                    q++;                        \
2532
0
                while (q < p && isspace_c(*q))    \
2533
0
                    q++;                        \
2534
0
                b->l_data = checkpoint;         \
2535
0
                goto loop;                      \
2536
666
            } else {                            \
2537
666
                hts_log_error(__VA_ARGS__);     \
2538
666
                goto err_ret;                   \
2539
666
            }                                   \
2540
666
        }                                       \
2541
8.56M
    } while (0)
2542
2543
8.06M
    while (q < p) loop: {
2544
8.06M
        char type;
2545
8.06M
        checkpoint = b->l_data;
2546
8.06M
        if (p - q < 5) {
2547
27
            if (lenient) {
2548
0
                break;
2549
27
            } else {
2550
27
                hts_log_error("Incomplete aux field");
2551
27
                goto err_ret;
2552
27
            }
2553
27
        }
2554
4.03M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2555
2556
4.03M
        if (lenient && (q[2] | q[4]) != ':') {
2557
0
            while (q < p && !isspace_c(*q))
2558
0
                q++;
2559
0
            while (q < p && isspace_c(*q))
2560
0
                q++;
2561
0
            continue;
2562
0
        }
2563
2564
4.03M
        if (tag_whitelist) {
2565
0
            int tt = q[0]*256 + q[1];
2566
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2567
0
                while (q < p && *q != '\t')
2568
0
                    q++;
2569
0
                continue;
2570
0
            }
2571
0
        }
2572
2573
        // Copy over id
2574
4.03M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2575
4.03M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2576
4.03M
        q += 3; type = *q++; ++q; // q points to value
2577
4.03M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2578
3.21M
            _parse_err(*q <= '\t', "incomplete aux field");
2579
2580
        // Ensure enough space for a double + type allocated.
2581
4.03M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2582
2583
4.03M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2584
1.16M
            b->data[b->l_data++] = 'A';
2585
1.16M
            b->data[b->l_data++] = *q++;
2586
2.87M
        } else if (type == 'i' || type == 'I') {
2587
1.81M
            if (*q == '-') {
2588
1.49M
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2589
1.49M
                if (x >= INT8_MIN) {
2590
796k
                    b->data[b->l_data++] = 'c';
2591
796k
                    b->data[b->l_data++] = x;
2592
796k
                } else if (x >= INT16_MIN) {
2593
199k
                    b->data[b->l_data++] = 's';
2594
199k
                    i16_to_le(x, b->data + b->l_data);
2595
199k
                    b->l_data += 2;
2596
503k
                } else {
2597
503k
                    b->data[b->l_data++] = 'i';
2598
503k
                    i32_to_le(x, b->data + b->l_data);
2599
503k
                    b->l_data += 4;
2600
503k
                }
2601
1.49M
            } else {
2602
317k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2603
317k
                if (x <= UINT8_MAX) {
2604
185k
                    b->data[b->l_data++] = 'C';
2605
185k
                    b->data[b->l_data++] = x;
2606
185k
                } else if (x <= UINT16_MAX) {
2607
104k
                    b->data[b->l_data++] = 'S';
2608
104k
                    u16_to_le(x, b->data + b->l_data);
2609
104k
                    b->l_data += 2;
2610
104k
                } else {
2611
27.0k
                    b->data[b->l_data++] = 'I';
2612
27.0k
                    u32_to_le(x, b->data + b->l_data);
2613
27.0k
                    b->l_data += 4;
2614
27.0k
                }
2615
317k
            }
2616
1.81M
        } else if (type == 'f') {
2617
41.9k
            b->data[b->l_data++] = 'f';
2618
41.9k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2619
41.9k
            b->l_data += sizeof(float);
2620
1.01M
        } else if (type == 'd') {
2621
42.9k
            b->data[b->l_data++] = 'd';
2622
42.9k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2623
42.9k
            b->l_data += sizeof(double);
2624
969k
        } else if (type == 'Z' || type == 'H') {
2625
814k
            char *end = strchr(q, '\t');
2626
814k
            if (!end) end = q + strlen(q);
2627
814k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2628
814k
                       "hex field does not have an even number of digits");
2629
814k
            b->data[b->l_data++] = type;
2630
814k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2631
814k
            memcpy(b->data + b->l_data, q, end - q);
2632
814k
            b->l_data += end - q;
2633
814k
            b->data[b->l_data++] = '\0';
2634
814k
            q = end;
2635
814k
        } else if (type == 'B') {
2636
154k
            type = *q++; // q points to the first ',' following the typing byte
2637
154k
            _parse_err(*q && *q != ',' && *q != '\t',
2638
154k
                       "B aux field type not followed by ','");
2639
2640
154k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2641
239
                goto err_ret;
2642
154k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2643
2644
21.4M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2645
4.03M
        q++;
2646
4.03M
    }
2647
2648
343k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2649
343k
#undef _parse_err
2650
2651
343k
    return 0;
2652
2653
932
err_ret:
2654
932
    return -2;
2655
343k
}
2656
2657
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2658
345k
{
2659
1.42M
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2660
2661
345k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2662
2663
// Macro that operates on 64-bits at a time.
2664
345k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2665
345k
    do {                                                        \
2666
274k
        uint64_u *from8 = (uint64_u *)(from);                   \
2667
274k
        uint64_u *to8 = (uint64_u *)(to);                       \
2668
274k
        uint64_t uflow = 0;                                     \
2669
274k
        size_t l8 = (l)>>3, i;                                  \
2670
274k
        for (i = 0; i < l8; i++) {                              \
2671
0
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2672
0
            uflow |= to8[i];                                    \
2673
0
        }                                                       \
2674
277k
        for (i<<=3; i < (l); ++i) {                             \
2675
2.86k
            to[i] = from[i] - (n);                              \
2676
2.86k
            uflow |= to[i];                                     \
2677
2.86k
        }                                                       \
2678
274k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2679
274k
    } while (0)
2680
2681
#else
2682
2683
// Basic version which operates a byte at a time
2684
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2685
        uint8_t uflow = 0;                                   \
2686
        for (i = 0; i < (l); ++i) {                          \
2687
            (to)[i] = (from)[i] - (n);                       \
2688
            uflow |= (uint8_t) (to)[i];                      \
2689
        }                                                    \
2690
        failed = (uflow & 0x80) > 0;                         \
2691
    } while (0)
2692
2693
#endif
2694
2695
604k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2696
4.38M
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2697
1.22M
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2698
2699
345k
    uint8_t *t;
2700
2701
345k
    char *p = s->s, *q;
2702
345k
    int i, overflow = 0;
2703
345k
    char logbuf[40];
2704
345k
    hts_pos_t cigreflen;
2705
345k
    bam1_core_t *c = &b->core;
2706
2707
345k
    b->l_data = 0;
2708
345k
    memset(c, 0, 32);
2709
2710
    // qname
2711
345k
    q = _read_token(p);
2712
2713
345k
    _parse_warn(p - q <= 1, "empty query name");
2714
345k
    _parse_err(p - q > 255, "query name too long");
2715
    // resize large enough for name + extranul
2716
345k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2717
345k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2718
2719
345k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2720
345k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2721
345k
    b->l_data += c->l_extranul;
2722
2723
345k
    c->l_qname = p - q + c->l_extranul;
2724
2725
    // flag
2726
345k
    c->flag = parse_sam_flag(p, &p, &overflow);
2727
345k
    if (*p++ != '\t') goto err_ret; // malformated flag
2728
2729
    // chr
2730
345k
    q = _read_token(p);
2731
345k
    if (strcmp(q, "*")) {
2732
309k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2733
309k
        c->tid = bam_name2id(h, q);
2734
309k
        _parse_err(c->tid < -1, "failed to parse header");
2735
309k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2736
309k
    } else c->tid = -1;
2737
2738
    // pos
2739
345k
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
2740
345k
    if (*p++ != '\t') goto err_ret;
2741
345k
    if (c->pos < 0 && c->tid >= 0) {
2742
42.3k
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2743
42.3k
        c->tid = -1;
2744
42.3k
    }
2745
345k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2746
2747
    // mapq
2748
345k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2749
345k
    if (*p++ != '\t') goto err_ret;
2750
    // cigar
2751
344k
    if (*p != '*') {
2752
302k
        uint32_t *cigar = NULL;
2753
302k
        int old_l_data = b->l_data;
2754
302k
        int n_cigar = bam_parse_cigar(p, &p, b);
2755
302k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2756
301k
        cigar = (uint32_t *)(b->data + old_l_data);
2757
2758
        // can't use bam_endpos() directly as some fields not yet set up
2759
301k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2760
301k
        if (cigreflen == 0) cigreflen = 1;
2761
301k
    } else {
2762
42.9k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2763
42.9k
        c->flag |= BAM_FUNMAP;
2764
42.9k
        q = _read_token(p);
2765
42.9k
        cigreflen = 1;
2766
42.9k
    }
2767
344k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2768
344k
               "read ends beyond highest supported position");
2769
344k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2770
    // mate chr
2771
344k
    q = _read_token(p);
2772
344k
    if (strcmp(q, "=") == 0) {
2773
2
        c->mtid = c->tid;
2774
344k
    } else if (strcmp(q, "*") == 0) {
2775
0
        c->mtid = -1;
2776
344k
    } else {
2777
344k
        c->mtid = bam_name2id(h, q);
2778
344k
        _parse_err(c->mtid < -1, "failed to parse header");
2779
344k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2780
344k
    }
2781
    // mpos
2782
344k
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
2783
344k
    if (*p++ != '\t') goto err_ret;
2784
344k
    if (c->mpos < 0 && c->mtid >= 0) {
2785
138k
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2786
138k
        c->mtid = -1;
2787
138k
    }
2788
    // tlen
2789
344k
    c->isize = hts_str2int(p, &p, 63, &overflow);
2790
344k
    if (*p++ != '\t') goto err_ret;
2791
344k
    _parse_err(overflow, "number outside allowed range");
2792
    // seq
2793
344k
    q = _read_token(p);
2794
344k
    if (strcmp(q, "*")) {
2795
260k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2796
260k
        c->l_qseq = p - q - 1;
2797
260k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2798
260k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2799
260k
        i = (c->l_qseq + 1) >> 1;
2800
260k
        _get_mem(uint8_t, &t, b, i);
2801
2802
260k
        unsigned int lqs2 = c->l_qseq&~1, i;
2803
277k
        for (i = 0; i < lqs2; i+=2)
2804
17.7k
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2805
330k
        for (; i < c->l_qseq; ++i)
2806
70.6k
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2807
260k
    } else c->l_qseq = 0;
2808
    // qual
2809
688k
    _get_mem(uint8_t, &t, b, c->l_qseq);
2810
688k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2811
69.6k
        memset(t, 0xff, c->l_qseq);
2812
69.6k
        p += 2;
2813
274k
    } else {
2814
274k
        int failed = 0;
2815
274k
        _parse_err(s->l - (p - s->s) < c->l_qseq
2816
274k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2817
274k
                   "SEQ and QUAL are of different length");
2818
274k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2819
274k
        _parse_err(failed, "invalid QUAL character");
2820
274k
        p += c->l_qseq + 1;
2821
274k
    }
2822
2823
    // aux
2824
344k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
2825
932
        goto err_ret;
2826
2827
343k
    if (bam_tag2cigar(b, 1, 1) < 0)
2828
0
        return -2;
2829
343k
    return 0;
2830
2831
0
#undef _parse_warn
2832
0
#undef _parse_err
2833
0
#undef _get_mem
2834
0
#undef _read_token
2835
2.58k
err_ret:
2836
2.58k
    return -2;
2837
343k
}
2838
2839
302k
static uint32_t read_ncigar(const char *q) {
2840
302k
    uint32_t n_cigar = 0;
2841
3.40M
    for (; *q && *q != '\t'; ++q)
2842
3.10M
        if (!isdigit_c(*q)) ++n_cigar;
2843
302k
    if (!n_cigar) {
2844
104
        hts_log_error("No CIGAR operations");
2845
104
        return 0;
2846
104
    }
2847
301k
    if (n_cigar >= 2147483647) {
2848
0
        hts_log_error("Too many CIGAR operations");
2849
0
        return 0;
2850
0
    }
2851
2852
301k
    return n_cigar;
2853
301k
}
2854
2855
/*! @function
2856
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2857
 @param  in      [in]  pointer to the source string
2858
 @param  a_cigar [out]  address of the destination uint32_t buffer
2859
 @return         number of processed input characters; 0 on error
2860
 */
2861
301k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2862
301k
    int i, overflow = 0;
2863
301k
    const char *p = in;
2864
931k
    for (i = 0; i < n_cigar; i++) {
2865
630k
        uint32_t len;
2866
630k
        int op;
2867
630k
        char *q;
2868
630k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2869
630k
        if (q == p) {
2870
123
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2871
123
            return 0;
2872
123
        }
2873
630k
        if (overflow) {
2874
24
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2875
24
            return 0;
2876
24
        }
2877
630k
        p = q;
2878
630k
        op = bam_cigar_table[(unsigned char)*p++];
2879
630k
        if (op < 0) {
2880
207
            hts_log_error("Unrecognized CIGAR operator");
2881
207
            return 0;
2882
207
        }
2883
629k
        a_cigar[i] = len;
2884
629k
        a_cigar[i] |= op;
2885
629k
    }
2886
2887
301k
    return p-in;
2888
301k
}
2889
2890
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2891
0
    size_t n_cigar = 0;
2892
0
    int diff;
2893
2894
0
    if (!in || !a_cigar || !a_mem) {
2895
0
        hts_log_error("NULL pointer arguments");
2896
0
        return -1;
2897
0
    }
2898
0
    if (end) *end = (char *)in;
2899
2900
0
    if (*in == '*') {
2901
0
        if (end) (*end)++;
2902
0
        return 0;
2903
0
    }
2904
0
    n_cigar = read_ncigar(in);
2905
0
    if (!n_cigar) return 0;
2906
0
    if (n_cigar > *a_mem) {
2907
0
        uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2908
0
        if (a_tmp) {
2909
0
            *a_cigar = a_tmp;
2910
0
            *a_mem = n_cigar;
2911
0
        } else {
2912
0
            hts_log_error("Memory allocation error");
2913
0
            return -1;
2914
0
        }
2915
0
    }
2916
2917
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2918
0
    if (end) *end = (char *)in+diff;
2919
2920
0
    return n_cigar;
2921
0
}
2922
2923
302k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2924
302k
    size_t n_cigar = 0;
2925
302k
    int diff;
2926
2927
302k
    if (!in || !b) {
2928
0
        hts_log_error("NULL pointer arguments");
2929
0
        return -1;
2930
0
    }
2931
302k
    if (end) *end = (char *)in;
2932
2933
302k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
2934
302k
    if (!n_cigar && b->core.n_cigar == 0) {
2935
104
        if (end) *end = (char *)in+1;
2936
104
        return 0;
2937
104
    }
2938
2939
301k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
2940
301k
    if (cig_diff > 0 &&
2941
301k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
2942
0
        hts_log_error("Memory allocation error");
2943
0
        return -1;
2944
0
    }
2945
2946
301k
    uint32_t *cig = bam_get_cigar(b);
2947
301k
    if ((uint8_t *)cig != b->data + b->l_data) {
2948
        // Modifying an BAM existing BAM record
2949
0
        uint8_t  *seq = bam_get_seq(b);
2950
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
2951
0
    }
2952
2953
301k
    if (n_cigar) {
2954
301k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
2955
354
            return -1;
2956
301k
    } else {
2957
0
        diff = 1; // handle "*"
2958
0
    }
2959
2960
301k
    b->l_data += cig_diff * sizeof(uint32_t);
2961
301k
    b->core.n_cigar = n_cigar;
2962
301k
    if (end) *end = (char *)in + diff;
2963
2964
301k
    return n_cigar;
2965
301k
}
2966
2967
/*
2968
 * -----------------------------------------------------------------------------
2969
 * SAM threading
2970
 */
2971
// Size of SAM text block (reading)
2972
0
#define SAM_NBYTES 240000
2973
2974
// Number of BAM records (writing, up to NB_mem in size)
2975
0
#define SAM_NBAM 1000
2976
2977
struct SAM_state;
2978
2979
// Output job - a block of BAM records
2980
typedef struct sp_bams {
2981
    struct sp_bams *next;
2982
    int serial;
2983
2984
    bam1_t *bams;
2985
    int nbams, abams; // used and alloc for bams[] array
2986
    size_t bam_mem;   // very approximate total size
2987
2988
    struct SAM_state *fd;
2989
} sp_bams;
2990
2991
// Input job - a block of SAM text
2992
typedef struct sp_lines {
2993
    struct sp_lines *next;
2994
    int serial;
2995
2996
    char *data;
2997
    int data_size;
2998
    int alloc;
2999
3000
    struct SAM_state *fd;
3001
    sp_bams *bams;
3002
} sp_lines;
3003
3004
enum sam_cmd {
3005
    SAM_NONE = 0,
3006
    SAM_CLOSE,
3007
    SAM_CLOSE_DONE,
3008
    SAM_AT_EOF,
3009
};
3010
3011
typedef struct SAM_state {
3012
    sam_hdr_t *h;
3013
3014
    hts_tpool *p;
3015
    int own_pool;
3016
    pthread_mutex_t lines_m;
3017
    hts_tpool_process *q;
3018
    pthread_t dispatcher;
3019
    int dispatcher_set;
3020
3021
    sp_lines *lines;
3022
    sp_bams *bams;
3023
3024
    sp_bams *curr_bam;
3025
    int curr_idx;
3026
    int serial;
3027
3028
    // Be warned: moving these mutexes around in this struct can reduce
3029
    // threading performance by up to 70%!
3030
    pthread_mutex_t command_m;
3031
    pthread_cond_t command_c;
3032
    enum sam_cmd command;
3033
3034
    // One of the E* errno codes
3035
    int errcode;
3036
3037
    htsFile *fp;
3038
} SAM_state;
3039
3040
// Returns a SAM_state struct from a generic hFILE.
3041
//
3042
// Returns NULL on failure.
3043
0
static SAM_state *sam_state_create(htsFile *fp) {
3044
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3045
    // be a redirect call with an additional 'S' mode.  This in turn would
3046
    // correctly set the designed format to sam instead of a generic
3047
    // text_format.
3048
0
    if (fp->format.format != sam && fp->format.format != text_format)
3049
0
        return NULL;
3050
3051
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3052
0
    if (!fd)
3053
0
        return NULL;
3054
3055
0
    fp->state = fd;
3056
0
    fd->fp = fp;
3057
3058
0
    return fd;
3059
0
}
3060
3061
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3062
static void *sam_format_worker(void *arg);
3063
3064
0
static void sam_state_err(SAM_state *fd, int errcode) {
3065
0
    pthread_mutex_lock(&fd->command_m);
3066
0
    if (!fd->errcode)
3067
0
        fd->errcode = errcode;
3068
0
    pthread_mutex_unlock(&fd->command_m);
3069
0
}
3070
3071
0
static void sam_free_sp_bams(sp_bams *b) {
3072
0
    if (!b)
3073
0
        return;
3074
3075
0
    if (b->bams) {
3076
0
        int i;
3077
0
        for (i = 0; i < b->abams; i++) {
3078
0
            if (b->bams[i].data)
3079
0
                free(b->bams[i].data);
3080
0
        }
3081
0
        free(b->bams);
3082
0
    }
3083
0
    free(b);
3084
0
}
3085
3086
// Destroys the state produce by sam_state_create.
3087
9.31k
int sam_state_destroy(htsFile *fp) {
3088
9.31k
    int ret = 0;
3089
3090
9.31k
    if (!fp->state)
3091
9.31k
        return 0;
3092
3093
0
    SAM_state *fd = fp->state;
3094
0
    if (fd->p) {
3095
0
        if (fd->h) {
3096
            // Notify sam_dispatcher we're closing
3097
0
            pthread_mutex_lock(&fd->command_m);
3098
0
            if (fd->command != SAM_CLOSE_DONE)
3099
0
                fd->command = SAM_CLOSE;
3100
0
            pthread_cond_signal(&fd->command_c);
3101
0
            ret = -fd->errcode;
3102
0
            if (fd->q)
3103
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3104
3105
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3106
0
                for (;;) {
3107
                    // Avoid deadlocks with dispatcher
3108
0
                    if (fd->command == SAM_CLOSE_DONE)
3109
0
                        break;
3110
0
                    hts_tpool_wake_dispatch(fd->q);
3111
0
                    pthread_mutex_unlock(&fd->command_m);
3112
0
                    hts_usleep(10000);
3113
0
                    pthread_mutex_lock(&fd->command_m);
3114
0
                }
3115
0
            }
3116
0
            pthread_mutex_unlock(&fd->command_m);
3117
3118
0
            if (fp->is_write) {
3119
                // Dispatch the last partial block.
3120
0
                sp_bams *gb = fd->curr_bam;
3121
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3122
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3123
3124
                // Flush and drain output
3125
0
                if (fd->q)
3126
0
                    hts_tpool_process_flush(fd->q);
3127
0
                pthread_mutex_lock(&fd->command_m);
3128
0
                if (!ret) ret = -fd->errcode;
3129
0
                pthread_mutex_unlock(&fd->command_m);
3130
3131
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3132
0
                    hts_usleep(10000);
3133
0
                    pthread_mutex_lock(&fd->command_m);
3134
0
                    ret = -fd->errcode;
3135
                    // not empty but shutdown implies error
3136
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3137
0
                        ret = EIO;
3138
0
                    pthread_mutex_unlock(&fd->command_m);
3139
0
                }
3140
0
                if (fd->q)
3141
0
                    hts_tpool_process_shutdown(fd->q);
3142
0
            }
3143
3144
            // Wait for it to acknowledge
3145
0
            if (fd->dispatcher_set)
3146
0
                pthread_join(fd->dispatcher, NULL);
3147
0
            if (!ret) ret = -fd->errcode;
3148
0
        }
3149
3150
        // Tidy up memory
3151
0
        if (fd->q)
3152
0
            hts_tpool_process_destroy(fd->q);
3153
3154
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3155
0
            hts_tpool_destroy(fd->p);
3156
0
            fd->p = NULL;
3157
0
        }
3158
0
        pthread_mutex_destroy(&fd->lines_m);
3159
0
        pthread_mutex_destroy(&fd->command_m);
3160
0
        pthread_cond_destroy(&fd->command_c);
3161
3162
0
        sp_lines *l = fd->lines;
3163
0
        while (l) {
3164
0
            sp_lines *n = l->next;
3165
0
            free(l->data);
3166
0
            free(l);
3167
0
            l = n;
3168
0
        }
3169
3170
0
        sp_bams *b = fd->bams;
3171
0
        while (b) {
3172
0
            if (fd->curr_bam == b)
3173
0
                fd->curr_bam = NULL;
3174
0
            sp_bams *n = b->next;
3175
0
            sam_free_sp_bams(b);
3176
0
            b = n;
3177
0
        }
3178
3179
0
        if (fd->curr_bam)
3180
0
            sam_free_sp_bams(fd->curr_bam);
3181
3182
        // Decrement counter by one, maybe destroying too.
3183
        // This is to permit the caller using bam_hdr_destroy
3184
        // before sam_close without triggering decode errors
3185
        // in the background threads.
3186
0
        bam_hdr_destroy(fd->h);
3187
0
    }
3188
3189
0
    free(fp->state);
3190
0
    fp->state = NULL;
3191
0
    return ret;
3192
9.31k
}
3193
3194
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3195
0
static void cleanup_sp_lines(void *arg) {
3196
0
    sp_lines *gl = (sp_lines *)arg;
3197
0
    if (!gl) return;
3198
3199
    // Should always be true for lines passed to / from thread workers.
3200
0
    assert(gl->next == NULL);
3201
3202
0
    free(gl->data);
3203
0
    sam_free_sp_bams(gl->bams);
3204
0
    free(gl);
3205
0
}
3206
3207
// Run from one of the worker threads.
3208
// Convert a passed in array of lines to array of BAMs, returning
3209
// the result back to the thread queue.
3210
0
static void *sam_parse_worker(void *arg) {
3211
0
    sp_lines *gl = (sp_lines *)arg;
3212
0
    sp_bams *gb = NULL;
3213
0
    char *lines = gl->data;
3214
0
    int i;
3215
0
    bam1_t *b;
3216
0
    SAM_state *fd = gl->fd;
3217
3218
    // Use a block of BAM structs we had earlier if available.
3219
0
    pthread_mutex_lock(&fd->lines_m);
3220
0
    if (fd->bams) {
3221
0
        gb = fd->bams;
3222
0
        fd->bams = gb->next;
3223
0
    }
3224
0
    pthread_mutex_unlock(&fd->lines_m);
3225
3226
0
    if (gb == NULL) {
3227
0
        gb = calloc(1, sizeof(*gb));
3228
0
        if (!gb) {
3229
0
            return NULL;
3230
0
        }
3231
0
        gb->abams = 100;
3232
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3233
0
        if (!gb->bams) {
3234
0
            sam_state_err(fd, ENOMEM);
3235
0
            goto err;
3236
0
        }
3237
0
        gb->nbams = 0;
3238
0
        gb->bam_mem = 0;
3239
0
    }
3240
0
    gb->serial = gl->serial;
3241
0
    gb->next = NULL;
3242
3243
0
    b = (bam1_t *)gb->bams;
3244
0
    if (!b) {
3245
0
        sam_state_err(fd, ENOMEM);
3246
0
        goto err;
3247
0
    }
3248
3249
0
    i = 0;
3250
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3251
0
    while (cp < cp_end) {
3252
0
        if (i >= gb->abams) {
3253
0
            int old_abams = gb->abams;
3254
0
            gb->abams *= 2;
3255
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3256
0
            if (!b) {
3257
0
                gb->abams /= 2;
3258
0
                sam_state_err(fd, ENOMEM);
3259
0
                goto err;
3260
0
            }
3261
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3262
0
            gb->bams = b;
3263
0
        }
3264
3265
        // Ideally we'd get sam_parse1 to return the number of
3266
        // bytes decoded and to be able to stop on newline as
3267
        // well as \0.
3268
        //
3269
        // We can then avoid the additional strchr loop.
3270
        // It's around 6% of our CPU cost, albeit threadable.
3271
        //
3272
        // However this is an API change so for now we copy.
3273
3274
0
        char *nl = strchr(cp, '\n');
3275
0
        char *line_end;
3276
0
        if (nl) {
3277
0
            line_end = nl;
3278
0
            if (line_end > cp && *(line_end - 1) == '\r')
3279
0
                line_end--;
3280
0
            nl++;
3281
0
        } else {
3282
0
            nl = line_end = cp_end;
3283
0
        }
3284
0
        *line_end = '\0';
3285
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3286
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3287
0
            sam_state_err(fd, errno ? errno : EIO);
3288
0
            cleanup_sp_lines(gl);
3289
0
            goto err;
3290
0
        }
3291
3292
0
        cp = nl;
3293
0
        i++;
3294
0
    }
3295
0
    gb->nbams = i;
3296
3297
0
    pthread_mutex_lock(&fd->lines_m);
3298
0
    gl->next = fd->lines;
3299
0
    fd->lines = gl;
3300
0
    pthread_mutex_unlock(&fd->lines_m);
3301
0
    return gb;
3302
3303
0
 err:
3304
0
    sam_free_sp_bams(gb);
3305
0
    return NULL;
3306
0
}
3307
3308
0
static void *sam_parse_eof(void *arg) {
3309
0
    return NULL;
3310
0
}
3311
3312
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3313
0
static void cleanup_sp_bams(void *arg) {
3314
0
    sam_free_sp_bams((sp_bams *) arg);
3315
0
}
3316
3317
// Runs in its own thread.
3318
// Reads a block of text (SAM) and sends a new job to the thread queue to
3319
// translate this to BAM.
3320
0
static void *sam_dispatcher_read(void *vp) {
3321
0
    htsFile *fp = vp;
3322
0
    kstring_t line = {0};
3323
0
    int line_frag = 0;
3324
0
    SAM_state *fd = fp->state;
3325
0
    sp_lines *l = NULL;
3326
3327
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3328
    // matter as it will grow if necessary).
3329
0
    if (ks_resize(&line, 1000) < 0)
3330
0
        goto err;
3331
3332
0
    for (;;) {
3333
        // Check for command
3334
0
        pthread_mutex_lock(&fd->command_m);
3335
0
        switch (fd->command) {
3336
3337
0
        case SAM_CLOSE:
3338
0
            pthread_cond_signal(&fd->command_c);
3339
0
            pthread_mutex_unlock(&fd->command_m);
3340
0
            hts_tpool_process_shutdown(fd->q);
3341
0
            goto tidyup;
3342
3343
0
        default:
3344
0
            break;
3345
0
        }
3346
0
        pthread_mutex_unlock(&fd->command_m);
3347
3348
0
        pthread_mutex_lock(&fd->lines_m);
3349
0
        if (fd->lines) {
3350
            // reuse existing line buffer
3351
0
            l = fd->lines;
3352
0
            fd->lines = l->next;
3353
0
        }
3354
0
        pthread_mutex_unlock(&fd->lines_m);
3355
3356
0
        if (l == NULL) {
3357
            // none to reuse, to create a new one
3358
0
            l = calloc(1, sizeof(*l));
3359
0
            if (!l)
3360
0
                goto err;
3361
0
            l->alloc = SAM_NBYTES;
3362
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3363
0
            if (!l->data) {
3364
0
                free(l);
3365
0
                l = NULL;
3366
0
                goto err;
3367
0
            }
3368
0
            l->fd = fd;
3369
0
        }
3370
0
        l->next = NULL;
3371
3372
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3373
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3374
0
            if (!rp)
3375
0
                goto err;
3376
0
            l->alloc = line_frag+SAM_NBYTES/2;
3377
0
            l->data = rp;
3378
0
        }
3379
0
        memcpy(l->data, line.s, line_frag);
3380
3381
0
        l->data_size = line_frag;
3382
0
        ssize_t nbytes;
3383
0
    longer_line:
3384
0
        if (fp->is_bgzf)
3385
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3386
0
        else
3387
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3388
0
        if (nbytes < 0) {
3389
0
            sam_state_err(fd, errno ? errno : EIO);
3390
0
            goto err;
3391
0
        } else if (nbytes == 0)
3392
0
            break; // EOF
3393
0
        l->data_size += nbytes;
3394
3395
        // trim to last \n. Maybe \r\n, but that's still fine
3396
0
        if (nbytes == l->alloc - line_frag) {
3397
0
            char *cp_end = l->data + l->data_size;
3398
0
            char *cp = cp_end-1;
3399
3400
0
            while (cp > (char *)l->data && *cp != '\n')
3401
0
                cp--;
3402
3403
            // entire buffer is part of a single line
3404
0
            if (cp == l->data) {
3405
0
                line_frag = l->data_size;
3406
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3407
0
                if (!rp)
3408
0
                    goto err;
3409
0
                l->alloc *= 2;
3410
0
                l->data = rp;
3411
0
                assert(l->alloc >= l->data_size);
3412
0
                assert(l->alloc >= line_frag);
3413
0
                assert(l->alloc >= l->alloc - line_frag);
3414
0
                goto longer_line;
3415
0
            }
3416
0
            cp++;
3417
3418
            // line holds the remainder of our line.
3419
0
            if (ks_resize(&line, cp_end - cp) < 0)
3420
0
                goto err;
3421
0
            memcpy(line.s, cp, cp_end - cp);
3422
0
            line_frag = cp_end - cp;
3423
0
            l->data_size = l->alloc - line_frag;
3424
0
        } else {
3425
            // out of buffer
3426
0
            line_frag = 0;
3427
0
        }
3428
3429
0
        l->serial = fd->serial++;
3430
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3431
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3432
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3433
0
            goto err;
3434
0
        pthread_mutex_lock(&fd->command_m);
3435
0
        if (fd->command == SAM_CLOSE) {
3436
0
            pthread_mutex_unlock(&fd->command_m);
3437
0
            l = NULL;
3438
0
            goto tidyup;
3439
0
        }
3440
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3441
0
        pthread_mutex_unlock(&fd->command_m);
3442
0
    }
3443
3444
    // Submit a NULL sp_bams entry to act as an EOF marker
3445
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3446
0
        goto err;
3447
3448
    // At EOF, wait for close request.
3449
    // (In future if we add support for seek, this is where we need to catch it.)
3450
0
    for (;;) {
3451
0
        pthread_mutex_lock(&fd->command_m);
3452
0
        if (fd->command == SAM_NONE)
3453
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3454
0
        switch (fd->command) {
3455
0
        case SAM_CLOSE:
3456
0
            pthread_cond_signal(&fd->command_c);
3457
0
            pthread_mutex_unlock(&fd->command_m);
3458
0
            hts_tpool_process_shutdown(fd->q);
3459
0
            goto tidyup;
3460
3461
0
        default:
3462
0
            pthread_mutex_unlock(&fd->command_m);
3463
0
            break;
3464
0
        }
3465
0
    }
3466
3467
0
 tidyup:
3468
0
    pthread_mutex_lock(&fd->command_m);
3469
0
    fd->command = SAM_CLOSE_DONE;
3470
0
    pthread_cond_signal(&fd->command_c);
3471
0
    pthread_mutex_unlock(&fd->command_m);
3472
3473
0
    if (l) {
3474
0
        pthread_mutex_lock(&fd->lines_m);
3475
0
        l->next = fd->lines;
3476
0
        fd->lines = l;
3477
0
        pthread_mutex_unlock(&fd->lines_m);
3478
0
    }
3479
0
    free(line.s);
3480
3481
0
    return NULL;
3482
3483
0
 err:
3484
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3485
0
    hts_tpool_process_shutdown(fd->q);
3486
0
    goto tidyup;
3487
0
}
3488
3489
// Runs in its own thread.
3490
// Takes encoded blocks of SAM off the thread results queue and writes them
3491
// to our output stream.
3492
0
static void *sam_dispatcher_write(void *vp) {
3493
0
    htsFile *fp = vp;
3494
0
    SAM_state *fd = fp->state;
3495
0
    hts_tpool_result *r;
3496
3497
    // Iterates until result queue is shutdown, where it returns NULL.
3498
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3499
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3500
0
        if (!gl) {
3501
0
            sam_state_err(fd, ENOMEM);
3502
0
            goto err;
3503
0
        }
3504
3505
0
        if (fp->idx) {
3506
0
            sp_bams *gb = gl->bams;
3507
0
            int i = 0, count = 0;
3508
0
            while (i < gl->data_size) {
3509
0
                int j = i;
3510
0
                while (i < gl->data_size && gl->data[i] != '\n')
3511
0
                    i++;
3512
0
                if (i < gl->data_size)
3513
0
                    i++;
3514
3515
0
                if (fp->is_bgzf) {
3516
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3517
0
                        goto err;
3518
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3519
0
                        goto err;
3520
0
                } else {
3521
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3522
0
                        goto err;
3523
0
                }
3524
3525
0
                bam1_t *b = &gb->bams[count++];
3526
0
                if (fp->format.compression == bgzf) {
3527
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3528
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3529
0
                                      bgzf_tell(fp->fp.bgzf),
3530
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3531
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3532
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3533
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3534
0
                        goto err;
3535
0
                    }
3536
0
                } else {
3537
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3538
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3539
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3540
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3541
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3542
0
                        goto err;
3543
0
                    }
3544
0
                }
3545
0
            }
3546
3547
0
            assert(count == gb->nbams);
3548
3549
            // Add bam array to free-list
3550
0
            pthread_mutex_lock(&fd->lines_m);
3551
0
            gb->next = fd->bams;
3552
0
            fd->bams = gl->bams;
3553
0
            gl->bams = NULL;
3554
0
            pthread_mutex_unlock(&fd->lines_m);
3555
0
        } else {
3556
0
            if (fp->is_bgzf) {
3557
                // We keep track of how much in the current block we have
3558
                // remaining => R.  We look for the last newline in input
3559
                // [i] to [i+R], backwards => position N.
3560
                //
3561
                // If we find a newline, we write out bytes i to N.
3562
                // We know we cannot fit the next record in this bgzf block,
3563
                // so we flush what we have and copy input N to i+R into
3564
                // the start of a new block, and recompute a new R for that.
3565
                //
3566
                // If we don't find a newline (i==N) then we cannot extend
3567
                // the current block at all, so flush whatever is in it now
3568
                // if it ends on a newline.
3569
                // We still copy i(==N) to i+R to the next block and
3570
                // continue as before with a new R.
3571
                //
3572
                // The only exception on the flush is when we run out of
3573
                // data in the input.  In that case we skip it as we don't
3574
                // yet know if the next record will fit.
3575
                //
3576
                // Both conditions share the same code here:
3577
                // - Look for newline (pos N)
3578
                // - Write i to N (which maybe 0)
3579
                // - Flush if block ends on newline and not end of input
3580
                // - write N to i+R
3581
3582
0
                int i = 0;
3583
0
                BGZF *fb = fp->fp.bgzf;
3584
0
                while (i < gl->data_size) {
3585
                    // remaining space in block
3586
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3587
0
                    int eod = 0;
3588
0
                    if (R > gl->data_size-i)
3589
0
                        R = gl->data_size-i, eod = 1;
3590
3591
                    // Find last newline in input data
3592
0
                    int N = i + R;
3593
0
                    while (--N > i) {
3594
0
                        if (gl->data[N] == '\n')
3595
0
                            break;
3596
0
                    }
3597
3598
0
                    if (N != i) {
3599
                        // Found a newline
3600
0
                        N++;
3601
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3602
0
                            goto err;
3603
0
                    }
3604
3605
                    // Flush bgzf block
3606
0
                    int b_off = fb->block_offset;
3607
0
                    if (!eod && b_off &&
3608
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3609
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3610
0
                            goto err;
3611
3612
                    // Copy from N onwards into next block
3613
0
                    if (i+R > N)
3614
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3615
0
                            != i+R - N)
3616
0
                            goto err;
3617
3618
0
                    i = i+R;
3619
0
                }
3620
0
            } else {
3621
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3622
0
                    goto err;
3623
0
            }
3624
0
        }
3625
3626
0
        hts_tpool_delete_result(r, 0);
3627
3628
        // Also updated by main thread
3629
0
        pthread_mutex_lock(&fd->lines_m);
3630
0
        gl->next = fd->lines;
3631
0
        fd->lines = gl;
3632
0
        pthread_mutex_unlock(&fd->lines_m);
3633
0
    }
3634
3635
0
    sam_state_err(fd, 0); // success
3636
0
    hts_tpool_process_shutdown(fd->q);
3637
0
    return NULL;
3638
3639
0
 err:
3640
0
    sam_state_err(fd, errno ? errno : EIO);
3641
0
    return (void *)-1;
3642
0
}
3643
3644
// Run from one of the worker threads.
3645
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3646
// of text SAM records (sp_lines).
3647
0
static void *sam_format_worker(void *arg) {
3648
0
    sp_bams *gb = (sp_bams *)arg;
3649
0
    sp_lines *gl = NULL;
3650
0
    int i;
3651
0
    SAM_state *fd = gb->fd;
3652
0
    htsFile *fp = fd->fp;
3653
3654
    // Use a block of SAM strings we had earlier if available.
3655
0
    pthread_mutex_lock(&fd->lines_m);
3656
0
    if (fd->lines) {
3657
0
        gl = fd->lines;
3658
0
        fd->lines = gl->next;
3659
0
    }
3660
0
    pthread_mutex_unlock(&fd->lines_m);
3661
3662
0
    if (gl == NULL) {
3663
0
        gl = calloc(1, sizeof(*gl));
3664
0
        if (!gl) {
3665
0
            sam_state_err(fd, ENOMEM);
3666
0
            return NULL;
3667
0
        }
3668
0
        gl->alloc = gl->data_size = 0;
3669
0
        gl->data = NULL;
3670
0
    }
3671
0
    gl->serial = gb->serial;
3672
0
    gl->next = NULL;
3673
3674
0
    kstring_t ks = {0, gl->alloc, gl->data};
3675
3676
0
    for (i = 0; i < gb->nbams; i++) {
3677
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3678
0
            sam_state_err(fd, errno ? errno : EIO);
3679
0
            goto err;
3680
0
        }
3681
0
        kputc('\n', &ks);
3682
0
    }
3683
3684
0
    pthread_mutex_lock(&fd->lines_m);
3685
0
    gl->data_size = ks.l;
3686
0
    gl->alloc = ks.m;
3687
0
    gl->data = ks.s;
3688
3689
0
    if (fp->idx) {
3690
        // Keep hold of the bam array a little longer as
3691
        // sam_dispatcher_write needs to use them for building the index.
3692
0
        gl->bams = gb;
3693
0
    } else {
3694
        // Add bam array to free-list
3695
0
        gb->next = fd->bams;
3696
0
        fd->bams = gb;
3697
0
    }
3698
0
    pthread_mutex_unlock(&fd->lines_m);
3699
3700
0
    return gl;
3701
3702
0
 err:
3703
    // Possible race between this and fd->curr_bam.
3704
    // Easier to not free and leave it on the input list so it
3705
    // gets freed there instead?
3706
    // sam_free_sp_bams(gb);
3707
0
    if (gl) {
3708
0
        free(gl->data);
3709
0
        free(gl);
3710
0
    }
3711
0
    return NULL;
3712
0
}
3713
3714
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3715
0
    if (fp->state)
3716
0
        return 0;
3717
3718
0
    if (!(fp->state = sam_state_create(fp)))
3719
0
        return -1;
3720
0
    SAM_state *fd = (SAM_state *)fp->state;
3721
3722
0
    pthread_mutex_init(&fd->lines_m, NULL);
3723
0
    pthread_mutex_init(&fd->command_m, NULL);
3724
0
    pthread_cond_init(&fd->command_c, NULL);
3725
0
    fd->p = p->pool;
3726
0
    int qsize = p->qsize;
3727
0
    if (!qsize)
3728
0
        qsize = 2*hts_tpool_size(fd->p);
3729
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3730
0
    if (!fd->q) {
3731
0
        sam_state_destroy(fp);
3732
0
        return -1;
3733
0
    }
3734
3735
0
    if (fp->format.compression == bgzf)
3736
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3737
3738
0
    return 0;
3739
0
}
3740
3741
0
int sam_set_threads(htsFile *fp, int nthreads) {
3742
0
    if (nthreads <= 0)
3743
0
        return 0;
3744
3745
0
    htsThreadPool p;
3746
0
    p.pool = hts_tpool_init(nthreads);
3747
0
    p.qsize = nthreads*2;
3748
3749
0
    int ret = sam_set_thread_pool(fp, &p);
3750
0
    if (ret < 0)
3751
0
        return ret;
3752
3753
0
    SAM_state *fd = (SAM_state *)fp->state;
3754
0
    fd->own_pool = 1;
3755
3756
0
    return 0;
3757
0
}
3758
3759
0
#define UMI_TAGS 5
3760
typedef struct {
3761
    kstring_t name;
3762
    kstring_t comment; // NB: pointer into name, do not free
3763
    kstring_t seq;
3764
    kstring_t qual;
3765
    int casava;
3766
    int aux;
3767
    int rnum;
3768
    char BC[3];         // aux tag ID for barcode
3769
    char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
3770
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3771
    char nprefix;
3772
    int sra_names;
3773
    regex_t regex;
3774
} fastq_state;
3775
3776
// Initialise fastq state.
3777
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3778
1.89k
static fastq_state *fastq_state_init(int name_char) {
3779
1.89k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3780
1.89k
    if (!x)
3781
0
        return NULL;
3782
1.89k
    strcpy(x->BC, "BC");
3783
1.89k
    x->nprefix = name_char;
3784
    // Default Illumina naming convention
3785
1.89k
    char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)";
3786
1.89k
    if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3787
0
        free(x);
3788
0
        return NULL;
3789
0
    }
3790
3791
1.89k
    return x;
3792
1.89k
}
3793
3794
2.53k
void fastq_state_destroy(htsFile *fp) {
3795
2.53k
    if (fp->state) {
3796
1.89k
        fastq_state *x = (fastq_state *)fp->state;
3797
1.89k
        if (x->tags)
3798
0
            kh_destroy(tag, x->tags);
3799
1.89k
        ks_free(&x->name);
3800
1.89k
        ks_free(&x->seq);
3801
1.89k
        ks_free(&x->qual);
3802
1.89k
        regfree(&x->regex);
3803
1.89k
        free(fp->state);
3804
1.89k
    }
3805
2.53k
}
3806
3807
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3808
0
    va_list args;
3809
3810
0
    if (!fp)
3811
0
        return -1;
3812
0
    if (!fp->state)
3813
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3814
0
                                           ? '@' : '>')))
3815
0
            return -1;
3816
3817
0
    fastq_state *x = (fastq_state *)fp->state;
3818
3819
0
    switch (opt) {
3820
0
    case FASTQ_OPT_CASAVA:
3821
0
        x->casava = 1;
3822
0
        break;
3823
3824
0
    case FASTQ_OPT_NAME2:
3825
0
        x->sra_names = 1;
3826
0
        break;
3827
3828
0
    case FASTQ_OPT_AUX: {
3829
0
        va_start(args, opt);
3830
0
        x->aux = 1;
3831
0
        char *tag = va_arg(args, char *);
3832
0
        va_end(args);
3833
0
        if (tag && strcmp(tag, "1") != 0) {
3834
0
            if (!x->tags)
3835
0
                if (!(x->tags = kh_init(tag)))
3836
0
                    return -1;
3837
3838
0
            size_t i, tlen = strlen(tag);
3839
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
3840
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
3841
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
3842
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
3843
0
                    break;
3844
0
                }
3845
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
3846
0
                kh_put(tag, x->tags, tcode, &ret);
3847
0
                if (ret < 0)
3848
0
                    return -1;
3849
0
            }
3850
0
        }
3851
0
        break;
3852
0
    }
3853
3854
0
    case FASTQ_OPT_BARCODE: {
3855
0
        va_start(args, opt);
3856
0
        char *bc = va_arg(args, char *);
3857
0
        va_end(args);
3858
0
        strncpy(x->BC, bc, 2);
3859
0
        x->BC[2] = 0;
3860
0
        break;
3861
0
    }
3862
3863
0
    case FASTQ_OPT_UMI: {
3864
        // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3865
0
        va_start(args, opt);
3866
0
        char *bc = va_arg(args, char *), *bc_orig = bc;
3867
0
        va_end(args);
3868
0
        if (!bc || strcmp(bc, "1") == 0)
3869
0
            bc = "RX";
3870
0
        int ntags = 0, err = 0;
3871
0
        for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
3872
0
            if (!isalpha(bc[0]) || !isalnum_c(bc[1])) {
3873
0
                err = 1;
3874
0
                break;
3875
0
            }
3876
3877
0
            strncpy(x->UMI[ntags], bc, 3);
3878
0
            bc += 2;
3879
0
            if (*bc && *bc != ',') {
3880
0
                err = 1;
3881
0
                break;
3882
0
            }
3883
0
            bc+=(*bc==',');
3884
0
            x->UMI[ntags][2] = 0;
3885
0
        }
3886
0
        for (; ntags < UMI_TAGS; ntags++)
3887
0
            x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
3888
3889
3890
0
        if (err)
3891
0
            hts_log_warning("Bad UMI tag list '%s'", bc_orig);
3892
3893
0
        break;
3894
0
    }
3895
3896
0
    case FASTQ_OPT_UMI_REGEX: {
3897
0
        va_start(args, opt);
3898
0
        char *re = va_arg(args, char *);
3899
0
        va_end(args);
3900
3901
0
        regfree(&x->regex);
3902
0
        if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3903
0
            hts_log_error("Regular expression '%s' is not supported", re);
3904
0
            return -1;
3905
0
        }
3906
0
        break;
3907
0
    }
3908
3909
0
    case FASTQ_OPT_RNUM:
3910
0
        x->rnum = 1;
3911
0
        break;
3912
3913
0
    default:
3914
0
        break;
3915
0
    }
3916
0
    return 0;
3917
0
}
3918
3919
17.2M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
3920
17.2M
    fastq_state *x = (fastq_state *)fp->state;
3921
17.2M
    size_t i, l;
3922
17.2M
    int ret = 0;
3923
3924
17.2M
    if (fp->format.format == fasta_format && fp->line.s) {
3925
        // For FASTA we've already read the >name line; steal it
3926
        // Not the most efficient, but we don't optimise for fasta reading.
3927
17.2M
        if (fp->line.l == 0)
3928
900
            return -1; // EOF
3929
3930
17.2M
        free(x->name.s);
3931
17.2M
        x->name = fp->line;
3932
17.2M
        fp->line.l = fp->line.m = 0;
3933
17.2M
        fp->line.s = NULL;
3934
17.2M
    } else {
3935
        // Read a FASTQ format entry.
3936
1.90k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
3937
1.90k
        if (ret == -1)
3938
0
            return -1;  // EOF
3939
1.90k
        else if (ret < -1)
3940
18
            return ret; // ERR
3941
1.90k
    }
3942
3943
    // Name
3944
17.2M
    if (*x->name.s != x->nprefix)
3945
3
        return -2;
3946
3947
    // Reverse the SRA strangeness of putting the run_name.number before
3948
    // the read name.
3949
17.2M
    i = 0;
3950
17.2M
    char *name = x->name.s+1;
3951
17.2M
    if (x->sra_names) {
3952
0
        char *cp = strpbrk(x->name.s, " \t");
3953
0
        if (cp) {
3954
0
            while (*cp == ' ' || *cp == '\t')
3955
0
                cp++;
3956
0
            *--cp = '@';
3957
0
            i = cp - x->name.s;
3958
0
            name = cp+1;
3959
0
        }
3960
0
    }
3961
3962
17.2M
    l = x->name.l;
3963
17.2M
    char *s = x->name.s;
3964
53.6M
    while (i < l && !isspace_c(s[i]))
3965
36.3M
        i++;
3966
17.2M
    if (i < l) {
3967
121k
        s[i] = 0;
3968
121k
        x->name.l = i++;
3969
121k
    }
3970
3971
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
3972
17.7M
    while (i < l && isspace_c(s[i]))
3973
508k
        i++;
3974
17.2M
    x->comment.s = s+i;
3975
17.2M
    x->comment.l = l - i;
3976
3977
    // Seq
3978
17.2M
    x->seq.l = 0;
3979
131M
    for (;;) {
3980
131M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
3981
1.77k
            if (fp->format.format == fastq_format || ret < -1)
3982
852
                return -2;
3983
131M
        if (ret == -1 ||
3984
131M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
3985
17.2M
            break;
3986
113M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
3987
0
            return -2;
3988
113M
    }
3989
3990
    // Qual
3991
17.2M
    if (fp->format.format == fastq_format) {
3992
21
        size_t remainder = x->seq.l;
3993
21
        x->qual.l = 0;
3994
10.3k
        do {
3995
10.3k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
3996
6
                return -2;
3997
10.3k
            if (fp->line.l > remainder)
3998
9
                return -2;
3999
10.3k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
4000
0
                return -2;
4001
10.3k
            remainder -= fp->line.l;
4002
10.3k
        } while (remainder > 0);
4003
4004
        // Decr qual
4005
179k
        for (i = 0; i < x->qual.l; i++)
4006
179k
            x->qual.s[i] -= '!';
4007
6
    }
4008
4009
17.2M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4010
17.2M
    if (x->name.l > 2 &&
4011
1.35M
        x->name.s[x->name.l-2] == '/' &&
4012
71.9k
        isdigit_c(x->name.s[x->name.l-1])) {
4013
65.5k
        switch(x->name.s[x->name.l-1]) {
4014
6.66k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4015
10.6k
        case '2': flag |= BAM_FREAD2 | pflag; break;
4016
48.1k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4017
65.5k
        }
4018
65.5k
        x->name.s[x->name.l-=2] = 0;
4019
65.5k
    }
4020
4021
    // Strip Illumina formatted UMI off read-name
4022
17.2M
    char UMI_seq[256]; // maximum length in spec
4023
17.2M
    size_t UMI_len = 0;
4024
17.2M
    if (x->UMI[0][0]) {
4025
0
        regmatch_t match[3];
4026
0
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
4027
0
            && match[0].rm_so >= 0     // whole regex
4028
0
            && match[1].rm_so >= 0) {  // bracketted UMI component
4029
0
            UMI_len = match[1].rm_eo - match[1].rm_so;
4030
0
            if (UMI_len > 255) {
4031
0
                hts_log_error("SAM read name is too long");
4032
0
                return -2;
4033
0
            }
4034
4035
            // The SAMTags spec recommends (but not requires) separating
4036
            // barcodes with hyphen ('-').
4037
0
            size_t i;
4038
0
            for (i = 0; i < UMI_len; i++)
4039
0
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
4040
0
                    ? x->name.s[i+match[1].rm_so]
4041
0
                    : '-';
4042
4043
            // Move any trailing #num earlier in the name
4044
0
            if (UMI_len) {
4045
0
                UMI_seq[UMI_len++] = 0;
4046
4047
0
                x->name.l = match[1].rm_so;
4048
0
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
4049
0
                    x->name.l--; // remove colon too
4050
0
                char *cp = x->name.s + match[1].rm_eo;
4051
0
                while (*cp)
4052
0
                    x->name.s[x->name.l++] = *cp++;
4053
0
                x->name.s[x->name.l] = 0;
4054
0
            }
4055
0
        }
4056
0
    }
4057
4058
    // Convert to BAM
4059
17.2M
    ret = bam_set1(b,
4060
17.2M
                   x->name.s + x->name.l - name, name,
4061
17.2M
                   flag,
4062
17.2M
                   -1, -1, 0, // ref '*', pos, mapq,
4063
17.2M
                   0, NULL,     // no cigar,
4064
17.2M
                   -1, -1, 0,    // mate
4065
17.2M
                   x->seq.l, x->seq.s, x->qual.s,
4066
17.2M
                   0);
4067
17.2M
    if (ret < 0) return -2;
4068
4069
    // Add UMI tag if removed from read-name above
4070
17.2M
    if (UMI_len) {
4071
0
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
4072
0
            ret = -2;
4073
0
    }
4074
4075
    // Identify Illumina CASAVA strings.
4076
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4077
17.2M
    char *barcode = NULL;
4078
17.2M
    int barcode_len = 0;
4079
17.2M
    kstring_t *kc = &x->comment;
4080
17.2M
    char *endptr;
4081
17.2M
    if (x->casava &&
4082
        // \d:[YN]:\d+:[ACGTN]+
4083
0
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4084
0
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4085
0
        && *endptr == ':') {
4086
4087
        // read num
4088
0
        switch(kc->s[0]) {
4089
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4090
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4091
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4092
0
        }
4093
4094
0
        if (kc->s[2] == 'Y')
4095
0
            b->core.flag |= BAM_FQCFAIL;
4096
4097
        // Barcode, maybe numeric in which case we skip it
4098
0
        if (!isdigit_c(endptr[1])) {
4099
0
            barcode = endptr+1;
4100
0
            for (i = barcode - kc->s; i < kc->l; i++)
4101
0
                if (isspace_c(kc->s[i]))
4102
0
                    break;
4103
4104
0
            kc->s[i] = 0;
4105
0
            barcode_len = i+1-(barcode - kc->s);
4106
0
        }
4107
0
    }
4108
4109
17.2M
    if (ret >= 0 && barcode_len)
4110
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4111
0
            ret = -2;
4112
4113
17.2M
    if (!x->aux)
4114
17.2M
        return ret;
4115
4116
    // Identify any SAM style aux tags in comments too.
4117
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4118
0
        ret = -2;
4119
4120
0
    return ret;
4121
17.2M
}
4122
4123
// Internal component of sam_read1 below
4124
431
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4125
431
    int ret = bam_read1(fp->fp.bgzf, b);
4126
431
    if (h && ret >= 0) {
4127
367
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4128
355
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4129
15
            errno = ERANGE;
4130
15
            return -3;
4131
15
        }
4132
367
    }
4133
416
    return ret;
4134
431
}
4135
4136
// Internal component of sam_read1 below
4137
1.17k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4138
1.17k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4139
1.17k
    if (ret < 0)
4140
1.17k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4141
4142
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4143
0
        return -2;
4144
4145
0
    return ret;
4146
0
}
4147
4148
// Internal component of sam_read1 below
4149
347k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4150
347k
    int ret;
4151
4152
    // Consume 1st line after header parsing as it wasn't using peek
4153
347k
    if (fp->line.l != 0) {
4154
0
        ret = sam_parse1(&fp->line, h, b);
4155
0
        fp->line.l = 0;
4156
0
        return ret;
4157
0
    }
4158
4159
347k
    if (fp->state) {
4160
0
        SAM_state *fd = (SAM_state *)fp->state;
4161
4162
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4163
            // We don't support multi-threaded SAM parsing with seeks yet.
4164
0
            int ret;
4165
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4166
0
                errno = -ret;
4167
0
                return -2;
4168
0
            }
4169
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4170
0
                return -2;
4171
0
            fp->fp.bgzf->seeked = 0;
4172
0
            goto err_recover;
4173
0
        }
4174
4175
0
        if (!fd->h) {
4176
0
            fd->h = h;
4177
0
            fd->h->ref_count++;
4178
            // Ensure hrecs is initialised now as we don't want multiple
4179
            // threads trying to do this simultaneously.
4180
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4181
0
                return -2;
4182
4183
            // We can only do this once we've got a header
4184
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4185
0
                               fp) != 0)
4186
0
                return -2;
4187
0
            fd->dispatcher_set = 1;
4188
0
        }
4189
4190
0
        if (fd->h != h) {
4191
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4192
0
            return -2;
4193
0
        }
4194
4195
0
        sp_bams *gb = fd->curr_bam;
4196
0
        if (!gb) {
4197
0
            if (fd->errcode) {
4198
                // In case reader failed
4199
0
                errno = fd->errcode;
4200
0
                return -2;
4201
0
            }
4202
4203
0
            pthread_mutex_lock(&fd->command_m);
4204
0
            int cmd = fd->command;
4205
0
            pthread_mutex_unlock(&fd->command_m);
4206
0
            if (cmd == SAM_AT_EOF)
4207
0
                return -1;
4208
4209
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4210
0
            if (!r)
4211
0
                return -2;
4212
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4213
0
            hts_tpool_delete_result(r, 0);
4214
0
        }
4215
0
        if (!gb) {
4216
0
            pthread_mutex_lock(&fd->command_m);
4217
0
            fd->command = SAM_AT_EOF;
4218
0
            pthread_mutex_unlock(&fd->command_m);
4219
0
            return fd->errcode ? -2 : -1;
4220
0
        }
4221
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4222
0
        if (fd->curr_idx < gb->nbams)
4223
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4224
0
                return -2;
4225
0
        if (fd->curr_idx == gb->nbams) {
4226
0
            pthread_mutex_lock(&fd->lines_m);
4227
0
            gb->next = fd->bams;
4228
0
            fd->bams = gb;
4229
0
            pthread_mutex_unlock(&fd->lines_m);
4230
4231
0
            fd->curr_bam = NULL;
4232
0
            fd->curr_idx = 0;
4233
        // Consider prefetching next record?  I.e.
4234
        // } else {
4235
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4236
0
        }
4237
4238
0
        ret = 0;
4239
4240
347k
    } else  {
4241
347k
    err_recover:
4242
347k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4243
347k
        if (ret < 0) return ret;
4244
4245
345k
        ret = sam_parse1(&fp->line, h, b);
4246
345k
        fp->line.l = 0;
4247
345k
        if (ret < 0) {
4248
2.58k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4249
2.58k
            if (h && h->ignore_sam_err) goto err_recover;
4250
2.58k
        }
4251
345k
    }
4252
4253
345k
    return ret;
4254
347k
}
4255
4256
// Returns 0 on success,
4257
//        -1 on EOF,
4258
//       <-1 on error
4259
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4260
17.6M
{
4261
17.6M
    int ret, pass_filter;
4262
4263
17.6M
    do {
4264
17.6M
        switch (fp->format.format) {
4265
431
        case bam:
4266
431
            ret = sam_read1_bam(fp, h, b);
4267
431
            break;
4268
4269
1.17k
        case cram:
4270
1.17k
            ret = sam_read1_cram(fp, h, &b);
4271
1.17k
            break;
4272
4273
347k
        case sam:
4274
347k
            ret = sam_read1_sam(fp, h, b);
4275
347k
            break;
4276
4277
17.2M
        case fasta_format:
4278
17.2M
        case fastq_format: {
4279
17.2M
            fastq_state *x = (fastq_state *)fp->state;
4280
17.2M
            if (!x) {
4281
1.89k
                if (!(fp->state = fastq_state_init(fp->format.format
4282
1.89k
                                                   == fastq_format ? '@' : '>')))
4283
0
                    return -2;
4284
1.89k
            }
4285
4286
17.2M
            return fastq_parse1(fp, b);
4287
17.2M
        }
4288
4289
0
        case empty_format:
4290
0
            errno = EPIPE;
4291
0
            return -3;
4292
4293
0
        default:
4294
0
            errno = EFTYPE;
4295
0
            return -3;
4296
17.6M
        }
4297
4298
349k
        pass_filter = (ret >= 0 && fp->filter)
4299
349k
            ? sam_passes_filter(h, b, fp->filter)
4300
349k
            : 1;
4301
349k
    } while (pass_filter == 0);
4302
4303
349k
    return pass_filter < 0 ? -2 : ret;
4304
17.6M
}
4305
4306
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4307
// this code isn't vectorised and runs far slower than is necessary (even
4308
// with the restrict keyword being used).
4309
static inline void HTS_OPT3
4310
485
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4311
485
    uint32_t i;
4312
61.8k
    for (i = 0; i < len; i++)
4313
61.3k
        a[i] = b[i]+33;
4314
485
}
4315
4316
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4317
5.87M
{
4318
5.87M
    int i, r = 0;
4319
5.87M
    uint8_t *s, *end;
4320
5.87M
    const bam1_core_t *c = &b->core;
4321
4322
5.87M
    if (c->l_qname == 0)
4323
0
        return -1;
4324
5.87M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4325
5.87M
    r |= kputc_('\t', str); // query name
4326
5.87M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4327
5.87M
    if (c->tid >= 0) { // chr
4328
76.2k
        r |= kputs(h->target_name[c->tid] , str);
4329
76.2k
        r |= kputc_('\t', str);
4330
5.80M
    } else r |= kputsn_("*\t", 2, str);
4331
5.87M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4332
5.87M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4333
5.87M
    if (c->n_cigar) { // cigar
4334
100k
        uint32_t *cigar = bam_get_cigar(b);
4335
4.33M
        for (i = 0; i < c->n_cigar; ++i) {
4336
4.23M
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4337
4.23M
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4338
4.23M
        }
4339
5.77M
    } else r |= kputc_('*', str);
4340
5.87M
    r |= kputc_('\t', str);
4341
5.87M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4342
6.14k
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4343
5.64k
    else {
4344
5.64k
        r |= kputs(h->target_name[c->mtid], str);
4345
5.64k
        r |= kputc_('\t', str);
4346
5.64k
    }
4347
5.87M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4348
5.87M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4349
5.87M
    if (c->l_qseq) { // seq and qual
4350
175k
        uint8_t *s = bam_get_seq(b);
4351
175k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4352
175k
        char *cp = str->s + str->l;
4353
4354
        // Sequence, 2 bases at a time
4355
175k
        nibble2base(s, cp, c->l_qseq);
4356
175k
        cp[c->l_qseq] = '\t';
4357
175k
        cp += c->l_qseq+1;
4358
4359
        // Quality
4360
175k
        s = bam_get_qual(b);
4361
175k
        i = 0;
4362
175k
        if (s[0] == 0xff) {
4363
175k
            cp[i++] = '*';
4364
175k
        } else {
4365
485
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4366
485
            i = c->l_qseq;
4367
485
        }
4368
175k
        cp[i] = 0;
4369
175k
        cp += i;
4370
175k
        str->l = cp - str->s;
4371
5.70M
    } else r |= kputsn_("*\t*", 3, str);
4372
4373
5.87M
    s = bam_get_aux(b); // aux
4374
5.87M
    end = b->data + b->l_data;
4375
4376
7.17M
    while (end - s >= 4) {
4377
1.29M
        r |= kputc_('\t', str);
4378
1.29M
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4379
22
            goto bad_aux;
4380
1.29M
    }
4381
5.87M
    r |= kputsn("", 0, str); // nul terminate
4382
5.87M
    if (r < 0) goto mem_err;
4383
4384
5.87M
    return str->l;
4385
4386
22
 bad_aux:
4387
22
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4388
22
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4389
22
    errno = EINVAL;
4390
22
    return -1;
4391
4392
0
 mem_err:
4393
0
    hts_log_error("Out of memory");
4394
0
    errno = ENOMEM;
4395
0
    return -1;
4396
5.87M
}
4397
4398
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4399
5.87M
{
4400
5.87M
    str->l = 0;
4401
5.87M
    return sam_format1_append(h, b, str);
4402
5.87M
}
4403
4404
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4405
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4406
0
{
4407
0
    unsigned flag = b->core.flag;
4408
0
    int i, e = 0, len = b->core.l_qseq;
4409
0
    uint8_t *seq, *qual;
4410
4411
0
    str->l = 0;
4412
4413
    // Name
4414
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4415
0
        return -1;
4416
4417
    // UMI tag
4418
0
    if (x && *x->UMI[0]) {
4419
        // Temporary copy of '#num' if present
4420
0
        char plex[256];
4421
0
        size_t len = str->l;
4422
0
        while (len && str->s[len] != ':' && str->s[len] != '#')
4423
0
            len--;
4424
4425
0
        if (str->s[len] == '#' && str->l - len < 255) {
4426
0
            memcpy(plex, &str->s[len], str->l - len);
4427
0
            plex[str->l - len] = 0;
4428
0
            str->l = len;
4429
0
        } else {
4430
0
            *plex = 0;
4431
0
        }
4432
4433
0
        uint8_t *bc = NULL;
4434
0
        int n;
4435
0
        for (n = 0; !bc && n < UMI_TAGS; n++)
4436
0
            bc = bam_aux_get(b, x->UMI[n]);
4437
0
        if (bc && *bc == 'Z') {
4438
0
            int err = kputc(':', str) < 0;
4439
            // Replace any non-alpha with '+'
4440
0
            while (*++bc)
4441
0
                err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
4442
0
            if (err)
4443
0
                return -1;
4444
0
        }
4445
4446
0
        if (*plex && kputs(plex, str) < 0)
4447
0
            return -1;
4448
0
    }
4449
4450
    // /1 or /2 suffix
4451
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4452
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4453
0
        if (r12 == BAM_FREAD1) {
4454
0
            if (kputs("/1", str) == EOF)
4455
0
                return -1;
4456
0
        } else if (r12 == BAM_FREAD2) {
4457
0
            if (kputs("/2", str) == EOF)
4458
0
                return -1;
4459
0
        }
4460
0
    }
4461
4462
    // Illumina CASAVA tag.
4463
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4464
0
    if (x && x->casava) {
4465
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4466
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4467
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4468
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4469
0
                     bc ? (char *)bc+1 : "0") < 0)
4470
0
            return -1;
4471
4472
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4473
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4474
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4475
0
            str->s[str->l-1] = '0';
4476
0
            str->s[str->l] = 0;
4477
0
            bc = NULL;
4478
0
        }
4479
4480
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4481
0
        if (bc) {
4482
0
            int l = strlen((char *)bc+1);
4483
0
            char *c = (char *)str->s + str->l - l;
4484
0
            for (i = 0; i < l; i++) {
4485
0
                if (!isalpha_c(c[i]))
4486
0
                    c[i] = '+';
4487
0
                else if (islower_c(c[i]))
4488
0
                    c[i] = toupper_c(c[i]);
4489
0
            }
4490
0
        }
4491
0
    }
4492
4493
    // Aux tags
4494
0
    if (x && x->aux) {
4495
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4496
0
        while (s && end - s >= 4) {
4497
0
            int tt = s[0]*256 + s[1];
4498
0
            if (x->tags == NULL ||
4499
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4500
0
                e |= kputc_('\t', str) < 0;
4501
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4502
0
                    return -1;
4503
0
            } else {
4504
0
                s = skip_aux(s+2, end);
4505
0
            }
4506
0
        }
4507
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4508
0
    }
4509
4510
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4511
0
    e |= kputc_('\n', str) < 0;
4512
4513
    // Seq line
4514
0
    seq = bam_get_seq(b);
4515
0
    if (flag & BAM_FREVERSE)
4516
0
        for (i = len-1; i >= 0; i--)
4517
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4518
0
    else
4519
0
        for (i = 0; i < len; i++)
4520
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4521
4522
4523
    // Qual line
4524
0
    if (x->nprefix == '@') {
4525
0
        kputsn("\n+\n", 3, str);
4526
0
        qual = bam_get_qual(b);
4527
0
        if (qual[0] == 0xff)
4528
0
            for (i = 0; i < len; i++)
4529
0
                e |= kputc_('B', str) < 0;
4530
0
        else if (flag & BAM_FREVERSE)
4531
0
            for (i = len-1; i >= 0; i--)
4532
0
                e |= kputc_(33 + qual[i], str) < 0;
4533
0
        else
4534
0
            for (i = 0; i < len; i++)
4535
0
                e |= kputc_(33 + qual[i], str) < 0;
4536
4537
0
    }
4538
0
    e |= kputc('\n', str) < 0;
4539
4540
0
    return e ? -1 : str->l;
4541
0
}
4542
4543
// Sadly we need to be able to modify the bam_hdr here so we can
4544
// reference count the structure.
4545
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4546
17.6M
{
4547
17.6M
    switch (fp->format.format) {
4548
0
    case binary_format:
4549
0
        fp->format.category = sequence_data;
4550
0
        fp->format.format = bam;
4551
        /* fall-through */
4552
5.87M
    case bam:
4553
5.87M
        return bam_write_idx1(fp, h, b);
4554
4555
5.87M
    case cram:
4556
5.87M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4557
4558
0
    case text_format:
4559
0
        fp->format.category = sequence_data;
4560
0
        fp->format.format = sam;
4561
        /* fall-through */
4562
5.87M
    case sam:
4563
5.87M
        if (fp->state) {
4564
0
            SAM_state *fd = (SAM_state *)fp->state;
4565
4566
            // Threaded output
4567
0
            if (!fd->h) {
4568
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4569
                // just data pointed to by it (which is a bit weasely still),
4570
                // but out cached pointer must be non-const as we want to
4571
                // destroy it later on and sam_hdr_destroy takes non-const.
4572
                //
4573
                // We do this because some tools do sam_hdr_destroy; sam_close
4574
                // while others do sam_close; sam_hdr_destroy.  The former is
4575
                // an issue as we need the header still when flushing.
4576
0
                fd->h = (sam_hdr_t *)h;
4577
0
                fd->h->ref_count++;
4578
4579
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4580
0
                                   fp) != 0)
4581
0
                    return -2;
4582
0
                fd->dispatcher_set = 1;
4583
0
            }
4584
4585
0
            if (fd->h != h) {
4586
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4587
0
                return -2;
4588
0
            }
4589
4590
            // Find a suitable BAM array to copy to
4591
0
            sp_bams *gb = fd->curr_bam;
4592
0
            if (!gb) {
4593
0
                pthread_mutex_lock(&fd->lines_m);
4594
0
                if (fd->bams) {
4595
0
                    fd->curr_bam = gb = fd->bams;
4596
0
                    fd->bams = gb->next;
4597
0
                    gb->next = NULL;
4598
0
                    gb->nbams = 0;
4599
0
                    gb->bam_mem = 0;
4600
0
                    pthread_mutex_unlock(&fd->lines_m);
4601
0
                } else {
4602
0
                    pthread_mutex_unlock(&fd->lines_m);
4603
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4604
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4605
0
                        free(gb);
4606
0
                        return -1;
4607
0
                    }
4608
0
                    gb->nbams = 0;
4609
0
                    gb->abams = SAM_NBAM;
4610
0
                    gb->bam_mem = 0;
4611
0
                    gb->fd = fd;
4612
0
                    fd->curr_idx = 0;
4613
0
                    fd->curr_bam = gb;
4614
0
                }
4615
0
            }
4616
4617
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4618
0
                return -2;
4619
0
            gb->bam_mem += b->l_data + sizeof(*b);
4620
4621
            // Dispatch if full
4622
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4623
0
                gb->serial = fd->serial++;
4624
0
                pthread_mutex_lock(&fd->command_m);
4625
0
                if (fd->errcode != 0) {
4626
0
                    pthread_mutex_unlock(&fd->command_m);
4627
0
                    return -fd->errcode;
4628
0
                }
4629
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4630
0
                                        cleanup_sp_bams,
4631
0
                                        cleanup_sp_lines, 0) < 0) {
4632
0
                    pthread_mutex_unlock(&fd->command_m);
4633
0
                    return -1;
4634
0
                }
4635
0
                pthread_mutex_unlock(&fd->command_m);
4636
0
                fd->curr_bam = NULL;
4637
0
            }
4638
4639
            // Dummy value as we don't know how long it really is.
4640
            // We could track file sizes via a SAM_state field, but I don't think
4641
            // it is necessary.
4642
0
            return 1;
4643
5.87M
        } else {
4644
5.87M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4645
5.87M
            kputc('\n', &fp->line);
4646
5.87M
            if (fp->is_bgzf) {
4647
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4648
0
                    return -1;
4649
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4650
5.87M
            } else {
4651
5.87M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4652
5.87M
            }
4653
4654
5.87M
            if (fp->idx) {
4655
0
                if (fp->format.compression == bgzf) {
4656
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4657
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4658
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4659
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4660
0
                        return -1;
4661
0
                    }
4662
0
                } else {
4663
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4664
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4665
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4666
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4667
0
                        return -1;
4668
0
                    }
4669
0
                }
4670
0
            }
4671
4672
5.87M
            return fp->line.l;
4673
5.87M
        }
4674
4675
4676
0
    case fasta_format:
4677
0
    case fastq_format: {
4678
0
        fastq_state *x = (fastq_state *)fp->state;
4679
0
        if (!x) {
4680
0
            if (!(fp->state = fastq_state_init(fp->format.format
4681
0
                                               == fastq_format ? '@' : '>')))
4682
0
                return -2;
4683
0
        }
4684
4685
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4686
0
            return -1;
4687
0
        if (fp->is_bgzf) {
4688
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4689
0
                return -1;
4690
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4691
0
                return -1;
4692
0
        } else {
4693
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4694
0
                return -1;
4695
0
        }
4696
0
        return fp->line.l;
4697
0
    }
4698
4699
0
    default:
4700
0
        errno = EBADF;
4701
0
        return -1;
4702
17.6M
    }
4703
17.6M
}
4704
4705
/************************
4706
 *** Auxiliary fields ***
4707
 ************************/
4708
#ifndef HTS_LITTLE_ENDIAN
4709
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4710
    int tsz = aux_type2size(type);
4711
4712
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4713
4714
    switch (tsz) {
4715
        case 'H': case 'Z': case 1:  // Trivial
4716
            memcpy(out, in, len);
4717
            break;
4718
4719
#define aux_val_to_le(type_t, store_le) do {                            \
4720
        type_t v;                                                       \
4721
        size_t i;                                                       \
4722
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4723
            memcpy(&v, in + i, sizeof(type_t));                         \
4724
            store_le(v, out);                                           \
4725
        }                                                               \
4726
    } while (0)
4727
4728
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4729
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4730
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4731
4732
#undef aux_val_to_le
4733
4734
        case 'B': { // Recurse!
4735
            uint32_t n;
4736
            if (len < 5) return -1;
4737
            memcpy(&n, in + 1, 4);
4738
            out[0] = in[0];
4739
            u32_to_le(n, out + 1);
4740
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4741
        }
4742
4743
        default: // Unknown type code
4744
            return -1;
4745
    }
4746
4747
4748
4749
    return 0;
4750
}
4751
#endif
4752
4753
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4754
0
{
4755
0
    uint32_t new_len;
4756
4757
0
    assert(b->l_data >= 0);
4758
0
    new_len = b->l_data + 3 + len;
4759
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4760
4761
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4762
4763
0
    b->data[b->l_data] = tag[0];
4764
0
    b->data[b->l_data + 1] = tag[1];
4765
0
    b->data[b->l_data + 2] = type;
4766
4767
0
#ifdef HTS_LITTLE_ENDIAN
4768
0
    memcpy(b->data + b->l_data + 3, data, len);
4769
#else
4770
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4771
        errno = EINVAL;
4772
        return -1;
4773
    }
4774
#endif
4775
4776
0
    b->l_data = new_len;
4777
4778
0
    return 0;
4779
4780
0
 nomem:
4781
0
    errno = ENOMEM;
4782
0
    return -1;
4783
0
}
4784
4785
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4786
2.89M
{
4787
2.89M
    int size;
4788
2.89M
    uint32_t n;
4789
2.89M
    if (s >= end) return end;
4790
2.89M
    size = aux_type2size(*s); ++s; // skip type
4791
2.89M
    switch (size) {
4792
595k
    case 'Z':
4793
613k
    case 'H':
4794
613k
        s = memchr(s, 0, end-s);
4795
613k
        return s ? s+1 : end;
4796
94.3k
    case 'B':
4797
94.3k
        if (end - s < 5) return NULL;
4798
94.3k
        size = aux_type2size(*s); ++s;
4799
94.3k
        n = le_to_u32(s);
4800
94.3k
        s += 4;
4801
94.3k
        if (size == 0 || end - s < size * n) return NULL;
4802
94.3k
        return s + size * n;
4803
84
    case 0:
4804
84
        return NULL;
4805
2.18M
    default:
4806
2.18M
        if (end - s < size) return NULL;
4807
2.18M
        return s + size;
4808
2.89M
    }
4809
2.89M
}
4810
4811
uint8_t *bam_aux_first(const bam1_t *b)
4812
6.04M
{
4813
6.04M
    uint8_t *s = bam_get_aux(b);
4814
6.04M
    uint8_t *end = b->data + b->l_data;
4815
6.04M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4816
241k
    return s+2;
4817
6.04M
}
4818
4819
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4820
2.82M
{
4821
2.82M
    uint8_t *end = b->data + b->l_data;
4822
2.82M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4823
2.82M
    if (next == NULL) goto bad_aux;
4824
2.82M
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4825
2.65M
    return next+2;
4826
4827
84
 bad_aux:
4828
84
    hts_log_error("Corrupted aux data for read %s flag %d",
4829
84
                  bam_get_qname(b), b->core.flag);
4830
84
    errno = EINVAL;
4831
84
    return NULL;
4832
2.82M
}
4833
4834
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4835
6.04M
{
4836
6.04M
    uint8_t *s;
4837
8.87M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4838
2.89M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4839
            // Check the tag value is valid and complete
4840
73.6k
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4841
73.6k
            if (e == NULL) goto bad_aux;
4842
73.6k
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4843
4844
73.6k
            return s;
4845
73.6k
        }
4846
4847
    // errno now as set by bam_aux_first()/bam_aux_next()
4848
5.97M
    return NULL;
4849
4850
0
 bad_aux:
4851
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4852
0
                  bam_get_qname(b), b->core.flag);
4853
0
    errno = EINVAL;
4854
0
    return NULL;
4855
6.04M
}
4856
4857
int bam_aux_del(bam1_t *b, uint8_t *s)
4858
0
{
4859
0
    s = bam_aux_remove(b, s);
4860
0
    return (s || errno == ENOENT)? 0 : -1;
4861
0
}
4862
4863
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4864
0
{
4865
0
    uint8_t *end = b->data + b->l_data;
4866
0
    uint8_t *next = skip_aux(s, end);
4867
0
    if (next == NULL) goto bad_aux;
4868
4869
0
    b->l_data -= next - (s-2);
4870
0
    if (next >= end) { errno = ENOENT; return NULL; }
4871
4872
0
    memmove(s-2, next, end - next);
4873
0
    return s;
4874
4875
0
 bad_aux:
4876
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4877
0
                  bam_get_qname(b), b->core.flag);
4878
0
    errno = EINVAL;
4879
0
    return NULL;
4880
0
}
4881
4882
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4883
0
{
4884
    // FIXME: This is not at all efficient!
4885
0
    size_t ln = len >= 0 ? len : strlen(data) + 1;
4886
0
    size_t old_ln = 0;
4887
0
    int need_nul = ln == 0 || data[ln - 1] != '\0';
4888
0
    int save_errno = errno;
4889
0
    int new_tag = 0;
4890
0
    uint8_t *s = bam_aux_get(b,tag), *e;
4891
4892
0
    if (s) {  // Replacing existing tag
4893
0
        char type = *s;
4894
0
        if (type != 'Z') {
4895
0
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4896
0
            errno = EINVAL;
4897
0
            return -1;
4898
0
        }
4899
0
        s++;
4900
0
        e = memchr(s, '\0', b->data + b->l_data - s);
4901
0
        old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4902
0
        s -= 3;
4903
0
    } else {
4904
0
        if (errno != ENOENT) { // Invalid aux data, give up
4905
0
            return -1;
4906
0
        } else { // Tag doesn't exist - put it on the end
4907
0
            errno = save_errno;
4908
0
            s = b->data + b->l_data;
4909
0
            new_tag = 3;
4910
0
        }
4911
0
    }
4912
4913
0
    if (old_ln < ln + need_nul + new_tag) {
4914
0
        ptrdiff_t s_offset = s - b->data;
4915
0
        if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
4916
0
            return -1;
4917
0
        s = b->data + s_offset;
4918
0
    }
4919
0
    if (!new_tag) {
4920
0
        memmove(s + 3 + ln + need_nul,
4921
0
                s + 3 + old_ln,
4922
0
                b->l_data - (s + 3 - b->data) - old_ln);
4923
0
    }
4924
0
    b->l_data += new_tag + ln + need_nul - old_ln;
4925
4926
0
    s[0] = tag[0];
4927
0
    s[1] = tag[1];
4928
0
    s[2] = 'Z';
4929
0
    memmove(s+3,data,ln);
4930
0
    if (need_nul) s[3 + ln] = '\0';
4931
0
    return 0;
4932
0
}
4933
4934
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4935
0
{
4936
0
    uint32_t sz, old_sz = 0, new = 0;
4937
0
    uint8_t *s, type;
4938
4939
0
    if (val < INT32_MIN || val > UINT32_MAX) {
4940
0
        errno = EOVERFLOW;
4941
0
        return -1;
4942
0
    }
4943
0
    if (val < INT16_MIN)       { type = 'i'; sz = 4; }
4944
0
    else if (val < INT8_MIN)   { type = 's'; sz = 2; }
4945
0
    else if (val < 0)          { type = 'c'; sz = 1; }
4946
0
    else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
4947
0
    else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
4948
0
    else                       { type = 'I'; sz = 4; }
4949
4950
0
    s = bam_aux_get(b, tag);
4951
0
    if (s) {  // Tag present - how big was the old one?
4952
0
        switch (*s) {
4953
0
            case 'c': case 'C': old_sz = 1; break;
4954
0
            case 's': case 'S': old_sz = 2; break;
4955
0
            case 'i': case 'I': old_sz = 4; break;
4956
0
            default: errno = EINVAL; return -1;  // Not an integer
4957
0
        }
4958
0
    } else {
4959
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4960
0
            s = b->data + b->l_data;
4961
0
            new = 1;
4962
0
        }  else { // Invalid aux data, give up.
4963
0
            return -1;
4964
0
        }
4965
0
    }
4966
4967
0
    if (new || old_sz < sz) {
4968
        // Make room for new tag
4969
0
        ptrdiff_t s_offset = s - b->data;
4970
0
        if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
4971
0
            return -1;
4972
0
        s =  b->data + s_offset;
4973
0
        if (new) { // Add tag id
4974
0
            *s++ = tag[0];
4975
0
            *s++ = tag[1];
4976
0
        } else {   // Shift following data so we have space
4977
0
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
4978
0
        }
4979
0
    } else {
4980
        // Reuse old space.  Data value may be bigger than necessary but
4981
        // we avoid having to move everything else
4982
0
        sz = old_sz;
4983
0
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
4984
0
        assert(type > 0);
4985
0
    }
4986
0
    *s++ = type;
4987
0
#ifdef HTS_LITTLE_ENDIAN
4988
0
    memcpy(s, &val, sz);
4989
#else
4990
    switch (sz) {
4991
        case 4:  u32_to_le(val, s); break;
4992
        case 2:  u16_to_le(val, s); break;
4993
        default: *s = val; break;
4994
    }
4995
#endif
4996
0
    b->l_data += (new ? 3 : 0) + sz - old_sz;
4997
0
    return 0;
4998
0
}
4999
5000
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
5001
0
{
5002
0
    uint8_t *s = bam_aux_get(b, tag);
5003
0
    int shrink = 0, new = 0;
5004
5005
0
    if (s) { // Tag present - what was it?
5006
0
        switch (*s) {
5007
0
            case 'f': break;
5008
0
            case 'd': shrink = 1; break;
5009
0
            default: errno = EINVAL; return -1;  // Not a float
5010
0
        }
5011
0
    } else {
5012
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5013
0
            new = 1;
5014
0
        }  else { // Invalid aux data, give up.
5015
0
            return -1;
5016
0
        }
5017
0
    }
5018
5019
0
    if (new) { // Ensure there's room
5020
0
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
5021
0
            return -1;
5022
0
        s = b->data + b->l_data;
5023
0
        *s++ = tag[0];
5024
0
        *s++ = tag[1];
5025
0
    } else if (shrink) { // Convert non-standard double tag to float
5026
0
        memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
5027
0
        b->l_data -= 4;
5028
0
    }
5029
0
    *s++ = 'f';
5030
0
    float_to_le(val, s);
5031
0
    if (new) b->l_data += 7;
5032
5033
0
    return 0;
5034
0
}
5035
5036
int bam_aux_update_array(bam1_t *b, const char tag[2],
5037
                         uint8_t type, uint32_t items, void *data)
5038
0
{
5039
0
    uint8_t *s = bam_aux_get(b, tag);
5040
0
    size_t old_sz = 0, new_sz;
5041
0
    int new = 0;
5042
5043
0
    if (s) { // Tag present
5044
0
        if (*s != 'B') { errno = EINVAL; return -1; }
5045
0
        old_sz = aux_type2size(s[1]);
5046
0
        if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
5047
0
        old_sz *= le_to_u32(s + 2);
5048
0
    } else {
5049
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5050
0
            s = b->data + b->l_data;
5051
0
            new = 1;
5052
0
        }  else { // Invalid aux data, give up.
5053
0
            return -1;
5054
0
        }
5055
0
    }
5056
5057
0
    new_sz = aux_type2size(type);
5058
0
    if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
5059
0
    if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
5060
0
    new_sz *= items;
5061
5062
0
    if (new || old_sz < new_sz) {
5063
        // Make room for new tag
5064
0
        ptrdiff_t s_offset = s - b->data;
5065
0
        if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
5066
0
            return -1;
5067
0
        s =  b->data + s_offset;
5068
0
    }
5069
0
    if (new) { // Add tag id and type
5070
0
        *s++ = tag[0];
5071
0
        *s++ = tag[1];
5072
0
        *s = 'B';
5073
0
        b->l_data += 8 + new_sz;
5074
0
    } else if (old_sz != new_sz) { // shift following data if necessary
5075
0
        memmove(s + 6 + new_sz, s + 6 + old_sz,
5076
0
                b->l_data - ((s + 6 + old_sz) - b->data));
5077
0
        b->l_data -= old_sz;
5078
0
        b->l_data += new_sz;
5079
0
    }
5080
5081
0
    s[1] = type;
5082
0
    u32_to_le(items, s + 2);
5083
0
    if (new_sz > 0) {
5084
0
#ifdef HTS_LITTLE_ENDIAN
5085
0
        memcpy(s + 6, data, new_sz);
5086
#else
5087
        return aux_to_le(type, s + 6, data, new_sz);
5088
#endif
5089
0
    }
5090
0
    return 0;
5091
0
}
5092
5093
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
5094
                                      uint32_t idx)
5095
0
{
5096
0
    switch (type) {
5097
0
        case 'c': return le_to_i8(s + idx);
5098
0
        case 'C': return s[idx];
5099
0
        case 's': return le_to_i16(s + 2 * idx);
5100
0
        case 'S': return le_to_u16(s + 2 * idx);
5101
0
        case 'i': return le_to_i32(s + 4 * idx);
5102
0
        case 'I': return le_to_u32(s + 4 * idx);
5103
0
        default:
5104
0
            errno = EINVAL;
5105
0
            return 0;
5106
0
    }
5107
0
}
5108
5109
int64_t bam_aux2i(const uint8_t *s)
5110
0
{
5111
0
    int type;
5112
0
    type = *s++;
5113
0
    return get_int_aux_val(type, s, 0);
5114
0
}
5115
5116
double bam_aux2f(const uint8_t *s)
5117
0
{
5118
0
    int type;
5119
0
    type = *s++;
5120
0
    if (type == 'd') return le_to_double(s);
5121
0
    else if (type == 'f') return le_to_float(s);
5122
0
    else return get_int_aux_val(type, s, 0);
5123
0
}
5124
5125
char bam_aux2A(const uint8_t *s)
5126
0
{
5127
0
    int type;
5128
0
    type = *s++;
5129
0
    if (type == 'A') return *(char*)s;
5130
0
    errno = EINVAL;
5131
0
    return 0;
5132
0
}
5133
5134
char *bam_aux2Z(const uint8_t *s)
5135
0
{
5136
0
    int type;
5137
0
    type = *s++;
5138
0
    if (type == 'Z' || type == 'H') return (char*)s;
5139
0
    errno = EINVAL;
5140
0
    return 0;
5141
0
}
5142
5143
uint32_t bam_auxB_len(const uint8_t *s)
5144
0
{
5145
0
    if (s[0] != 'B') {
5146
0
        errno = EINVAL;
5147
0
        return 0;
5148
0
    }
5149
0
    return le_to_u32(s + 2);
5150
0
}
5151
5152
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
5153
0
{
5154
0
    uint32_t len = bam_auxB_len(s);
5155
0
    if (idx >= len) {
5156
0
        errno = ERANGE;
5157
0
        return 0;
5158
0
    }
5159
0
    return get_int_aux_val(s[1], s + 6, idx);
5160
0
}
5161
5162
double bam_auxB2f(const uint8_t *s, uint32_t idx)
5163
0
{
5164
0
    uint32_t len = bam_auxB_len(s);
5165
0
    if (idx >= len) {
5166
0
        errno = ERANGE;
5167
0
        return 0.0;
5168
0
    }
5169
0
    if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
5170
0
    else return get_int_aux_val(s[1], s + 6, idx);
5171
0
}
5172
5173
int sam_open_mode(char *mode, const char *fn, const char *format)
5174
0
{
5175
    // TODO Parse "bam5" etc for compression level
5176
0
    if (format == NULL) {
5177
        // Try to pick a format based on the filename extension
5178
0
        char extension[HTS_MAX_EXT_LEN];
5179
0
        if (find_file_extension(fn, extension) < 0) return -1;
5180
0
        return sam_open_mode(mode, fn, extension);
5181
0
    }
5182
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5183
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5184
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5185
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5186
0
    else if (strcasecmp(format, "fastq") == 0 ||
5187
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5188
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5189
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5190
0
    else if (strcasecmp(format, "fasta") == 0 ||
5191
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5192
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5193
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5194
0
    else return -1;
5195
5196
0
    return 0;
5197
0
}
5198
5199
// A version of sam_open_mode that can handle ,key=value options.
5200
// The format string is allocated and returned, to be freed by the caller.
5201
// Prefix should be "r" or "w",
5202
char *sam_open_mode_opts(const char *fn,
5203
                         const char *mode,
5204
                         const char *format)
5205
0
{
5206
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5207
0
                             (mode   ? strlen(mode)   : 1) + 12);
5208
0
    char *opts, *cp;
5209
0
    int format_len;
5210
5211
0
    if (!mode_opts)
5212
0
        return NULL;
5213
5214
0
    strcpy(mode_opts, mode ? mode : "r");
5215
0
    cp = mode_opts + strlen(mode_opts);
5216
5217
0
    if (format == NULL) {
5218
        // Try to pick a format based on the filename extension
5219
0
        char extension[HTS_MAX_EXT_LEN];
5220
0
        if (find_file_extension(fn, extension) < 0) {
5221
0
            free(mode_opts);
5222
0
            return NULL;
5223
0
        }
5224
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5225
0
            return mode_opts;
5226
0
        } else {
5227
0
            free(mode_opts);
5228
0
            return NULL;
5229
0
        }
5230
0
    }
5231
5232
0
    if ((opts = strchr(format, ','))) {
5233
0
        format_len = opts-format;
5234
0
    } else {
5235
0
        opts="";
5236
0
        format_len = strlen(format);
5237
0
    }
5238
5239
0
    if (strncmp(format, "bam", format_len) == 0) {
5240
0
        *cp++ = 'b';
5241
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5242
0
        *cp++ = 'c';
5243
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5244
0
        *cp++ = 'c';
5245
0
        strcpy(cp, ",VERSION=2.1");
5246
0
        cp += 12;
5247
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5248
0
        *cp++ = 'c';
5249
0
        strcpy(cp, ",VERSION=3.0");
5250
0
        cp += 12;
5251
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5252
0
        ; // format mode=""
5253
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5254
0
        *cp++ = 'z';
5255
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5256
0
               strncmp(format, "fq", format_len) == 0) {
5257
0
        *cp++ = 'f';
5258
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5259
0
               strncmp(format, "fq.gz", format_len) == 0) {
5260
0
        *cp++ = 'f';
5261
0
        *cp++ = 'z';
5262
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5263
0
               strncmp(format, "fa", format_len) == 0) {
5264
0
        *cp++ = 'F';
5265
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5266
0
               strncmp(format, "fa", format_len) == 0) {
5267
0
        *cp++ = 'F';
5268
0
        *cp++ = 'z';
5269
0
    } else {
5270
0
        free(mode_opts);
5271
0
        return NULL;
5272
0
    }
5273
5274
0
    strcpy(cp, opts);
5275
5276
0
    return mode_opts;
5277
0
}
5278
5279
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5280
int bam_str2flag(const char *str)
5281
0
{
5282
0
    char *end, *beg = (char*) str;
5283
0
    long int flag = strtol(str, &end, 0);
5284
0
    if ( end!=str ) return flag;    // the conversion was successful
5285
0
    flag = 0;
5286
0
    while ( *str )
5287
0
    {
5288
0
        end = beg;
5289
0
        while ( *end && *end!=',' ) end++;
5290
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5291
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5292
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5293
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5294
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5295
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5296
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5297
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5298
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5299
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5300
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5301
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5302
0
        else return -1;
5303
0
        if ( !*end ) break;
5304
0
        beg = end + 1;
5305
0
    }
5306
0
    return flag;
5307
0
}
5308
5309
char *bam_flag2str(int flag)
5310
0
{
5311
0
    kstring_t str = {0,0,0};
5312
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5313
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5314
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5315
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5316
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5317
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5318
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5319
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5320
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5321
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5322
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5323
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5324
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5325
0
    return str.s;
5326
0
}
5327
5328
5329
/**************************
5330
 *** Pileup and Mpileup ***
5331
 **************************/
5332
5333
#if !defined(BAM_NO_PILEUP)
5334
5335
#include <assert.h>
5336
5337
/*******************
5338
 *** Memory pool ***
5339
 *******************/
5340
5341
typedef struct {
5342
    int k, y;
5343
    hts_pos_t x, end;
5344
} cstate_t;
5345
5346
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5347
5348
typedef struct __linkbuf_t {
5349
    bam1_t b;
5350
    hts_pos_t beg, end;
5351
    cstate_t s;
5352
    struct __linkbuf_t *next;
5353
    bam_pileup_cd cd;
5354
} lbnode_t;
5355
5356
typedef struct {
5357
    int cnt, n, max;
5358
    lbnode_t **buf;
5359
} mempool_t;
5360
5361
static mempool_t *mp_init(void)
5362
0
{
5363
0
    mempool_t *mp;
5364
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5365
0
    return mp;
5366
0
}
5367
static void mp_destroy(mempool_t *mp)
5368
0
{
5369
0
    int k;
5370
0
    for (k = 0; k < mp->n; ++k) {
5371
0
        free(mp->buf[k]->b.data);
5372
0
        free(mp->buf[k]);
5373
0
    }
5374
0
    free(mp->buf);
5375
0
    free(mp);
5376
0
}
5377
static inline lbnode_t *mp_alloc(mempool_t *mp)
5378
0
{
5379
0
    ++mp->cnt;
5380
0
    if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5381
0
    else return mp->buf[--mp->n];
5382
0
}
5383
static inline void mp_free(mempool_t *mp, lbnode_t *p)
5384
0
{
5385
0
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
5386
0
    if (mp->n == mp->max) {
5387
0
        mp->max = mp->max? mp->max<<1 : 256;
5388
0
        mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
5389
0
    }
5390
0
    mp->buf[mp->n++] = p;
5391
0
}
5392
5393
/**********************
5394
 *** CIGAR resolver ***
5395
 **********************/
5396
5397
/* s->k: the index of the CIGAR operator that has just been processed.
5398
   s->x: the reference coordinate of the start of s->k
5399
   s->y: the query coordinate of the start of s->k
5400
 */
5401
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5402
0
{
5403
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5404
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5405
5406
0
    bam1_t *b = p->b;
5407
0
    bam1_core_t *c = &b->core;
5408
0
    uint32_t *cigar = bam_get_cigar(b);
5409
0
    int k;
5410
    // determine the current CIGAR operation
5411
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5412
0
    if (s->k == -1) { // never processed
5413
0
        p->qpos = 0;
5414
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5415
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5416
0
        } else { // find the first match or deletion
5417
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5418
0
                int op = _cop(cigar[k]);
5419
0
                int l = _cln(cigar[k]);
5420
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5421
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5422
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5423
0
            }
5424
0
            assert(k < c->n_cigar);
5425
0
            s->k = k;
5426
0
        }
5427
0
    } else { // the read has been processed before
5428
0
        int op, l = _cln(cigar[s->k]);
5429
0
        if (pos - s->x >= l) { // jump to the next operation
5430
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5431
0
            op = _cop(cigar[s->k+1]);
5432
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5433
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5434
0
                s->x += l;
5435
0
                ++s->k;
5436
0
            } else { // find the next M/D/N/=/X
5437
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5438
0
                s->x += l;
5439
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5440
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5441
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5442
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5443
0
                }
5444
0
                s->k = k;
5445
0
            }
5446
0
            assert(s->k < c->n_cigar); // otherwise a bug
5447
0
        } // else, do nothing
5448
0
    }
5449
0
    { // collect pileup information
5450
0
        int op, l;
5451
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5452
0
        p->is_del = p->indel = p->is_refskip = 0;
5453
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5454
0
            int op2 = _cop(cigar[s->k+1]);
5455
0
            int l2 = _cln(cigar[s->k+1]);
5456
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5457
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5458
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5459
                // and rely on is_del=1 as we would for 3D.
5460
0
                p->indel = -(int)l2;
5461
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5462
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5463
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5464
0
                    else break;
5465
0
                }
5466
0
            } else if (op2 == BAM_CINS) {
5467
0
                p->indel = l2;
5468
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5469
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5470
0
                    if (op2 == BAM_CINS) p->indel += l2;
5471
0
                    else if (op2 != BAM_CPAD) break;
5472
0
                }
5473
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
5474
0
                int l3 = 0;
5475
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5476
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5477
0
                    if (op2 == BAM_CINS) l3 += l2;
5478
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5479
0
                }
5480
0
                if (l3 > 0) p->indel = l3;
5481
0
            }
5482
0
        }
5483
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5484
0
            p->qpos = s->y + (pos - s->x);
5485
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5486
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5487
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5488
0
        } // cannot be other operations; otherwise a bug
5489
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5490
0
    }
5491
0
    p->cigar_ind = s->k;
5492
0
    return 1;
5493
0
}
5494
5495
/*******************************
5496
 *** Expansion of insertions ***
5497
 *******************************/
5498
5499
/*
5500
 * Fills out the kstring with the padded insertion sequence for the current
5501
 * location in 'p'.  If this is not an insertion site, the string is blank.
5502
 *
5503
 * This variant handles base modifications, but only when "m" is non-NULL.
5504
 *
5505
 * Returns the number of inserted base on success, with string length being
5506
 *        accessable via ins->l;
5507
 *        -1 on failure.
5508
 */
5509
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5510
                          hts_base_mod_state *m,
5511
0
                          kstring_t *ins, int *del_len) {
5512
0
    int j, k, indel, nb = 0;
5513
0
    uint32_t *cigar;
5514
5515
0
    if (p->indel <= 0) {
5516
0
        if (ks_resize(ins, 1) < 0)
5517
0
            return -1;
5518
0
        ins->l = 0;
5519
0
        ins->s[0] = '\0';
5520
0
        return 0;
5521
0
    }
5522
5523
0
    if (del_len)
5524
0
        *del_len = 0;
5525
5526
    // Measure indel length including pads
5527
0
    indel = 0;
5528
0
    k = p->cigar_ind+1;
5529
0
    cigar = bam_get_cigar(p->b);
5530
0
    while (k < p->b->core.n_cigar) {
5531
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5532
0
        case BAM_CPAD:
5533
0
        case BAM_CINS:
5534
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5535
0
            break;
5536
0
        default:
5537
0
            k = p->b->core.n_cigar;
5538
0
            break;
5539
0
        }
5540
0
        k++;
5541
0
    }
5542
0
    nb = ins->l = indel;
5543
5544
    // Produce sequence
5545
0
    if (ks_resize(ins, indel+1) < 0)
5546
0
        return -1;
5547
0
    indel = 0;
5548
0
    k = p->cigar_ind+1;
5549
0
    j = 1;
5550
0
    while (k < p->b->core.n_cigar) {
5551
0
        int l, c;
5552
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5553
0
        case BAM_CPAD:
5554
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5555
0
                ins->s[indel++] = '*';
5556
0
            break;
5557
0
        case BAM_CINS:
5558
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5559
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5560
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5561
0
                                            p->qpos + j - p->is_del)]
5562
0
                    : 'N';
5563
0
                ins->s[indel++] = c;
5564
0
                int nm;
5565
0
                hts_base_mod mod[256];
5566
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5567
0
                                                m, mod, 256)) > 0) {
5568
0
                    int o_indel = indel;
5569
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5570
0
                        return -1;
5571
0
                    ins->s[indel++] = '[';
5572
0
                    int j;
5573
0
                    for (j = 0; j < nm; j++) {
5574
0
                        char qual[20];
5575
0
                        if (mod[j].qual >= 0)
5576
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5577
0
                        else
5578
0
                            *qual=0;
5579
0
                        if (mod[j].modified_base < 0)
5580
                            // ChEBI
5581
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5582
0
                                              "%c(%d)%s",
5583
0
                                              "+-"[mod[j].strand],
5584
0
                                              -mod[j].modified_base,
5585
0
                                              qual);
5586
0
                        else
5587
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5588
0
                                              "%c%c%s",
5589
0
                                              "+-"[mod[j].strand],
5590
0
                                              mod[j].modified_base,
5591
0
                                              qual);
5592
0
                    }
5593
0
                    ins->s[indel++] = ']';
5594
0
                    ins->l += indel - o_indel; // grow by amount we used
5595
0
                }
5596
0
            }
5597
0
            break;
5598
0
        case BAM_CDEL:
5599
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5600
0
            if (del_len)
5601
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5602
            // fall through
5603
0
        default:
5604
0
            k = p->b->core.n_cigar;
5605
0
            break;
5606
0
        }
5607
0
        k++;
5608
0
    }
5609
0
    ins->s[indel] = '\0';
5610
0
    ins->l = indel; // string length
5611
5612
0
    return nb;      // base length
5613
0
}
5614
5615
/*
5616
 * Fills out the kstring with the padded insertion sequence for the current
5617
 * location in 'p'.  If this is not an insertion site, the string is blank.
5618
 *
5619
 * This is the original interface with no capability for reporting base
5620
 * modifications.
5621
 *
5622
 * Returns the length of insertion string on success;
5623
 *        -1 on failure.
5624
 */
5625
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5626
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5627
0
}
5628
5629
/***********************
5630
 *** Pileup iterator ***
5631
 ***********************/
5632
5633
// Dictionary of overlapping reads
5634
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5635
typedef khash_t(olap_hash) olap_hash_t;
5636
5637
struct bam_plp_s {
5638
    mempool_t *mp;
5639
    lbnode_t *head, *tail;
5640
    int32_t tid, max_tid;
5641
    hts_pos_t pos, max_pos;
5642
    int is_eof, max_plp, error, maxcnt;
5643
    uint64_t id;
5644
    bam_pileup1_t *plp;
5645
    // for the "auto" interface only
5646
    bam1_t *b;
5647
    bam_plp_auto_f func;
5648
    void *data;
5649
    olap_hash_t *overlaps;
5650
5651
    // For notification of creation and destruction events
5652
    // and associated client-owned pointer.
5653
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5654
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5655
};
5656
5657
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5658
0
{
5659
0
    bam_plp_t iter;
5660
0
    iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5661
0
    iter->mp = mp_init();
5662
0
    iter->head = iter->tail = mp_alloc(iter->mp);
5663
0
    iter->max_tid = iter->max_pos = -1;
5664
0
    iter->maxcnt = 8000;
5665
0
    if (func) {
5666
0
        iter->func = func;
5667
0
        iter->data = data;
5668
0
        iter->b = bam_init1();
5669
0
    }
5670
0
    return iter;
5671
0
}
5672
5673
int bam_plp_init_overlaps(bam_plp_t iter)
5674
0
{
5675
0
    iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
5676
0
    return iter->overlaps ? 0 : -1;
5677
0
}
5678
5679
void bam_plp_destroy(bam_plp_t iter)
5680
0
{
5681
0
    lbnode_t *p, *pnext;
5682
0
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5683
0
    for (p = iter->head; p != NULL; p = pnext) {
5684
0
        if (iter->plp_destruct && p != iter->tail)
5685
0
            iter->plp_destruct(iter->data, &p->b, &p->cd);
5686
0
        pnext = p->next;
5687
0
        mp_free(iter->mp, p);
5688
0
    }
5689
0
    mp_destroy(iter->mp);
5690
0
    if (iter->b) bam_destroy1(iter->b);
5691
0
    free(iter->plp);
5692
0
    free(iter);
5693
0
}
5694
5695
void bam_plp_constructor(bam_plp_t plp,
5696
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5697
0
    plp->plp_construct = func;
5698
0
}
5699
5700
void bam_plp_destructor(bam_plp_t plp,
5701
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5702
0
    plp->plp_destruct = func;
5703
0
}
5704
5705
//---------------------------------
5706
//---  Tweak overlapping reads
5707
//---------------------------------
5708
5709
/**
5710
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5711
 *  cigar_iref2iseq_next() - get the next CMATCH base
5712
 *  @cigar:       pointer to current cigar block (rw)
5713
 *  @cigar_max:   pointer just beyond the last cigar block
5714
 *  @icig:        position within the current cigar block (rw)
5715
 *  @iseq:        position in the sequence (rw)
5716
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5717
 *
5718
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5719
 *  or -2 on error.
5720
 */
5721
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5722
                                      const uint32_t *cigar_max,
5723
                                      hts_pos_t *icig,
5724
                                      hts_pos_t *iseq,
5725
                                      hts_pos_t *iref)
5726
0
{
5727
0
    hts_pos_t pos = *iref;
5728
0
    if ( pos < 0 ) return -1;
5729
0
    *icig = 0;
5730
0
    *iseq = 0;
5731
0
    *iref = 0;
5732
0
    while ( *cigar<cigar_max )
5733
0
    {
5734
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5735
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5736
5737
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5738
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5739
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5740
0
        {
5741
0
            pos -= ncig;
5742
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5743
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5744
0
            continue;
5745
0
        }
5746
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5747
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5748
0
        {
5749
0
            pos -= ncig;
5750
0
            if ( pos<0 ) pos = 0;
5751
0
            (*cigar)++; *icig = 0; *iref += ncig;
5752
0
            continue;
5753
0
        }
5754
0
        hts_log_error("Unexpected cigar %d", cig);
5755
0
        return -2;
5756
0
    }
5757
0
    *iseq = -1;
5758
0
    return -1;
5759
0
}
5760
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5761
                                       const uint32_t *cigar_max,
5762
                                       hts_pos_t *icig,
5763
                                       hts_pos_t *iseq,
5764
                                       hts_pos_t *iref)
5765
0
{
5766
0
    while ( *cigar < cigar_max )
5767
0
    {
5768
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5769
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5770
5771
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5772
0
        {
5773
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5774
0
            (*iseq)++; (*icig)++; (*iref)++;
5775
0
            return BAM_CMATCH;
5776
0
        }
5777
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5778
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5779
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5780
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5781
0
        hts_log_error("Unexpected cigar %d", cig);
5782
0
        return -2;
5783
0
    }
5784
0
    *iseq = -1;
5785
0
    *iref = -1;
5786
0
    return -1;
5787
0
}
5788
5789
// Given overlapping read 'a' (left) and 'b' (right) on the same
5790
// template, adjust quality values to zero for either a or b.
5791
// Note versions 1.12 and earlier always removed quality from 'b' for
5792
// matching bases.  Now we select a or b semi-randomly based on name hash.
5793
// Returns 0 on success,
5794
//        -1 on failure
5795
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5796
0
{
5797
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5798
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5799
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5800
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5801
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5802
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5803
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5804
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5805
5806
0
    hts_pos_t iref   = b->core.pos;
5807
0
    hts_pos_t a_iref = iref - a->core.pos;
5808
0
    hts_pos_t b_iref = iref - b->core.pos;
5809
5810
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5811
0
                                    &a_icig, &a_iseq, &a_iref);
5812
0
    if ( a_ret<0 )
5813
        // no overlap or error
5814
0
        return a_ret<-1 ? -1:0;
5815
5816
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5817
0
                                    &b_icig, &b_iseq, &b_iref);
5818
0
    if ( b_ret<0 )
5819
        // no overlap or error
5820
0
        return b_ret<-1 ? -1:0;
5821
5822
    // Determine which seq is the one getting modified qualities.
5823
0
    uint8_t amul, bmul;
5824
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5825
0
        amul = 1;
5826
0
        bmul = 0;
5827
0
    } else {
5828
0
        amul = 0;
5829
0
        bmul = 1;
5830
0
    }
5831
5832
    // Loop over the overlapping region nulling qualities in either
5833
    // seq a or b.
5834
0
    int err = 0;
5835
0
    while ( 1 ) {
5836
        // Step to next matching reference position in a and b
5837
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5838
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5839
0
                                         &a_icig, &a_iseq, &a_iref);
5840
0
        if ( a_ret<0 ) { // done
5841
0
            err = a_ret<-1?-1:0;
5842
0
            break;
5843
0
        }
5844
5845
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5846
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5847
0
                                         &b_iseq, &b_iref);
5848
0
        if ( b_ret<0 ) { // done
5849
0
            err = b_ret<-1?-1:0;
5850
0
            break;
5851
0
        }
5852
5853
0
        if ( iref < a_iref + a->core.pos )
5854
0
            iref = a_iref + a->core.pos;
5855
5856
0
        if ( iref < b_iref + b->core.pos )
5857
0
            iref = b_iref + b->core.pos;
5858
5859
0
        iref++;
5860
5861
        // If A or B has a deletion then we catch up the other to this point.
5862
        // We also amend quality values using the same rules for mismatch.
5863
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5864
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5865
0
                && b_cigar > bam_get_cigar(b)
5866
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5867
                // Del in B means it's moved on further than A
5868
0
                do {
5869
0
                    a_qual[a_iseq] = amul
5870
0
                        ? a_qual[a_iseq]*0.8
5871
0
                        : 0;
5872
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5873
0
                                                 &a_icig, &a_iseq, &a_iref);
5874
0
                    if (a_ret < 0)
5875
0
                        return -(a_ret<-1); // 0 or -1
5876
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5877
0
            } else if (a_cigar > bam_get_cigar(a)
5878
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5879
                // Del in A means it's moved on further than B
5880
0
                do {
5881
0
                    b_qual[b_iseq] = bmul
5882
0
                        ? b_qual[b_iseq]*0.8
5883
0
                        : 0;
5884
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5885
0
                                                 &b_icig, &b_iseq, &b_iref);
5886
0
                    if (b_ret < 0)
5887
0
                        return -(b_ret<-1); // 0 or -1
5888
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5889
0
            } else {
5890
                // Anything else, eg ref-skip, we don't support here
5891
0
                continue;
5892
0
            }
5893
0
        }
5894
5895
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5896
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5897
        //         a_cigar-bam_get_cigar(a), a_icig,
5898
        //         b_cigar-bam_get_cigar(b), b_icig,
5899
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5900
        //         a_iseq, b_iseq);
5901
5902
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5903
            // Fell off end of sequence, bad CIGAR?
5904
0
            return -1;
5905
5906
        // We're finally at the same ref base in both a and b.
5907
        // Check if the bases match (confident) or mismatch
5908
        // (not so confident).
5909
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5910
            // We are very confident about this base.  Use sum of quals
5911
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
5912
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5913
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5914
0
        } else {
5915
            // Not so confident about anymore given the mismatch.
5916
            // Reduce qual for lowest quality base.
5917
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5918
                // A highest qual base; keep
5919
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5920
0
                b_qual[b_iseq] = 0;
5921
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5922
                // B highest qual base; keep
5923
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5924
0
                a_qual[a_iseq] = 0;
5925
0
            } else {
5926
                // Both equal, so pick randomly
5927
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5928
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5929
0
            }
5930
0
        }
5931
0
    }
5932
5933
0
    return err;
5934
0
}
5935
5936
// Fix overlapping reads. Simple soft-clipping did not give good results.
5937
// Lowering qualities of unwanted bases is more selective and works better.
5938
//
5939
// Returns 0 on success, -1 on failure
5940
static int overlap_push(bam_plp_t iter, lbnode_t *node)
5941
0
{
5942
0
    if ( !iter->overlaps ) return 0;
5943
5944
    // mapped mates and paired reads only
5945
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5946
5947
    // no overlap possible, unless some wild cigar
5948
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5949
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5950
0
         && node->b.core.mpos >= node->end) // for those wild cigars
5951
0
       ) return 0;
5952
5953
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5954
0
    if ( kitr==kh_end(iter->overlaps) )
5955
0
    {
5956
        // Only add reads where the mate is still to arrive
5957
0
        if (node->b.core.mpos >= node->b.core.pos ||
5958
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5959
0
            int ret;
5960
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5961
0
            if (ret < 0) return -1;
5962
0
            kh_value(iter->overlaps, kitr) = node;
5963
0
        }
5964
0
    }
5965
0
    else
5966
0
    {
5967
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
5968
0
        int err = tweak_overlap_quality(&a->b, &node->b);
5969
0
        kh_del(olap_hash, iter->overlaps, kitr);
5970
0
        assert(a->end-1 == a->s.end);
5971
0
        return err;
5972
0
    }
5973
0
    return 0;
5974
0
}
5975
5976
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
5977
0
{
5978
0
    if ( !iter->overlaps ) return;
5979
5980
0
    khiter_t kitr;
5981
0
    if ( b )
5982
0
    {
5983
0
        kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
5984
0
        if ( kitr!=kh_end(iter->overlaps) )
5985
0
            kh_del(olap_hash, iter->overlaps, kitr);
5986
0
    }
5987
0
    else
5988
0
    {
5989
        // remove all
5990
0
        for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
5991
0
            if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
5992
0
    }
5993
0
}
5994
5995
5996
5997
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
5998
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
5999
// buffer yet (the current position is still the maximum position across all buffered reads).
6000
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6001
0
{
6002
0
    if (iter->error) { *_n_plp = -1; return NULL; }
6003
0
    *_n_plp = 0;
6004
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
6005
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
6006
0
        int n_plp = 0;
6007
        // write iter->plp at iter->pos
6008
0
        lbnode_t **pptr = &iter->head;
6009
0
        while (*pptr != iter->tail) {
6010
0
            lbnode_t *p = *pptr;
6011
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6012
0
                overlap_remove(iter, &p->b);
6013
0
                if (iter->plp_destruct)
6014
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6015
0
                *pptr = p->next; mp_free(iter->mp, p);
6016
0
            }
6017
0
            else {
6018
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6019
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6020
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6021
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
6022
0
                    }
6023
0
                    iter->plp[n_plp].b = &p->b;
6024
0
                    iter->plp[n_plp].cd = p->cd;
6025
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6026
0
                }
6027
0
                pptr = &(*pptr)->next;
6028
0
            }
6029
0
        }
6030
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6031
        // update iter->tid and iter->pos
6032
0
        if (iter->head != iter->tail) {
6033
0
            if (iter->tid > iter->head->b.core.tid) {
6034
0
                hts_log_error("Unsorted input. Pileup aborts");
6035
0
                iter->error = 1;
6036
0
                *_n_plp = -1;
6037
0
                return NULL;
6038
0
            }
6039
0
        }
6040
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6041
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6042
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6043
0
            iter->pos = iter->head->beg; // jump to the next position
6044
0
        } else ++iter->pos; // scan contiguously
6045
        // return
6046
0
        if (n_plp) return iter->plp;
6047
0
        if (iter->is_eof && iter->head == iter->tail) break;
6048
0
    }
6049
0
    return NULL;
6050
0
}
6051
6052
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6053
0
{
6054
0
    hts_pos_t pos64 = 0;
6055
0
    const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
6056
0
    if (pos64 < INT_MAX) {
6057
0
        *_pos = pos64;
6058
0
    } else {
6059
0
        hts_log_error("Position %"PRId64" too large", pos64);
6060
0
        *_pos = INT_MAX;
6061
0
        iter->error = 1;
6062
0
        *_n_plp = -1;
6063
0
        return NULL;
6064
0
    }
6065
0
    return p;
6066
0
}
6067
6068
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
6069
0
{
6070
0
    if (iter->error) return -1;
6071
0
    if (b) {
6072
0
        if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
6073
        // Skip only unmapped reads here, any additional filtering must be done in iter->func
6074
0
        if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
6075
0
        if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
6076
0
        {
6077
0
            overlap_remove(iter, b);
6078
0
            return 0;
6079
0
        }
6080
0
        if (bam_copy1(&iter->tail->b, b) == NULL)
6081
0
            return -1;
6082
0
        iter->tail->b.id = iter->id++;
6083
0
        iter->tail->beg = b->core.pos;
6084
        // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
6085
0
        iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
6086
0
        iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
6087
0
        if (b->core.tid < iter->max_tid) {
6088
0
            hts_log_error("The input is not sorted (chromosomes out of order)");
6089
0
            iter->error = 1;
6090
0
            return -1;
6091
0
        }
6092
0
        if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
6093
0
            hts_log_error("The input is not sorted (reads out of order)");
6094
0
            iter->error = 1;
6095
0
            return -1;
6096
0
        }
6097
0
        iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
6098
0
        if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
6099
0
            lbnode_t *next = mp_alloc(iter->mp);
6100
0
            if (!next) {
6101
0
                iter->error = 1;
6102
0
                return -1;
6103
0
            }
6104
0
            if (iter->plp_construct) {
6105
0
                if (iter->plp_construct(iter->data, &iter->tail->b,
6106
0
                                        &iter->tail->cd) < 0) {
6107
0
                    mp_free(iter->mp, next);
6108
0
                    iter->error = 1;
6109
0
                    return -1;
6110
0
                }
6111
0
            }
6112
0
            if (overlap_push(iter, iter->tail) < 0) {
6113
0
                mp_free(iter->mp, next);
6114
0
                iter->error = 1;
6115
0
                return -1;
6116
0
            }
6117
0
            iter->tail->next = next;
6118
0
            iter->tail = iter->tail->next;
6119
0
        }
6120
0
    } else iter->is_eof = 1;
6121
0
    return 0;
6122
0
}
6123
6124
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6125
0
{
6126
0
    const bam_pileup1_t *plp;
6127
0
    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
6128
0
    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6129
0
    else { // no pileup line can be obtained; read alignments
6130
0
        *_n_plp = 0;
6131
0
        if (iter->is_eof) return 0;
6132
0
        int ret;
6133
0
        while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
6134
0
            if (bam_plp_push(iter, iter->b) < 0) {
6135
0
                *_n_plp = -1;
6136
0
                return 0;
6137
0
            }
6138
0
            if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6139
            // otherwise no pileup line can be returned; read the next alignment.
6140
0
        }
6141
0
        if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
6142
0
        if (bam_plp_push(iter, 0) < 0) {
6143
0
            *_n_plp = -1;
6144
0
            return 0;
6145
0
        }
6146
0
        if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6147
0
        return 0;
6148
0
    }
6149
0
}
6150
6151
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6152
0
{
6153
0
    hts_pos_t pos64 = 0;
6154
0
    const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
6155
0
    if (pos64 < INT_MAX) {
6156
0
        *_pos = pos64;
6157
0
    } else {
6158
0
        hts_log_error("Position %"PRId64" too large", pos64);
6159
0
        *_pos = INT_MAX;
6160
0
        iter->error = 1;
6161
0
        *_n_plp = -1;
6162
0
        return NULL;
6163
0
    }
6164
0
    return p;
6165
0
}
6166
6167
void bam_plp_reset(bam_plp_t iter)
6168
0
{
6169
0
    overlap_remove(iter, NULL);
6170
0
    iter->max_tid = iter->max_pos = -1;
6171
0
    iter->tid = iter->pos = 0;
6172
0
    iter->is_eof = 0;
6173
0
    while (iter->head != iter->tail) {
6174
0
        lbnode_t *p = iter->head;
6175
0
        iter->head = p->next;
6176
0
        mp_free(iter->mp, p);
6177
0
    }
6178
0
}
6179
6180
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
6181
0
{
6182
0
    iter->maxcnt = maxcnt;
6183
0
}
6184
6185
/************************
6186
 *** Mpileup iterator ***
6187
 ************************/
6188
6189
struct bam_mplp_s {
6190
    int n;
6191
    int32_t min_tid, *tid;
6192
    hts_pos_t min_pos, *pos;
6193
    bam_plp_t *iter;
6194
    int *n_plp;
6195
    const bam_pileup1_t **plp;
6196
};
6197
6198
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
6199
0
{
6200
0
    int i;
6201
0
    bam_mplp_t iter;
6202
0
    iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
6203
0
    iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
6204
0
    iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
6205
0
    iter->n_plp = (int*)calloc(n, sizeof(int));
6206
0
    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
6207
0
    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
6208
0
    iter->n = n;
6209
0
    iter->min_pos = HTS_POS_MAX;
6210
0
    iter->min_tid = (uint32_t)-1;
6211
0
    for (i = 0; i < n; ++i) {
6212
0
        iter->iter[i] = bam_plp_init(func, data[i]);
6213
0
        iter->pos[i] = iter->min_pos;
6214
0
        iter->tid[i] = iter->min_tid;
6215
0
    }
6216
0
    return iter;
6217
0
}
6218
6219
int bam_mplp_init_overlaps(bam_mplp_t iter)
6220
0
{
6221
0
    int i, r = 0;
6222
0
    for (i = 0; i < iter->n; ++i)
6223
0
        r |= bam_plp_init_overlaps(iter->iter[i]);
6224
0
    return r == 0 ? 0 : -1;
6225
0
}
6226
6227
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
6228
0
{
6229
0
    int i;
6230
0
    for (i = 0; i < iter->n; ++i)
6231
0
        iter->iter[i]->maxcnt = maxcnt;
6232
0
}
6233
6234
void bam_mplp_destroy(bam_mplp_t iter)
6235
0
{
6236
0
    int i;
6237
0
    for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
6238
0
    free(iter->iter); free(iter->pos); free(iter->tid);
6239
0
    free(iter->n_plp); free(iter->plp);
6240
0
    free(iter);
6241
0
}
6242
6243
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
6244
0
{
6245
0
    int i, ret = 0;
6246
0
    hts_pos_t new_min_pos = HTS_POS_MAX;
6247
0
    uint32_t new_min_tid = (uint32_t)-1;
6248
0
    for (i = 0; i < iter->n; ++i) {
6249
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6250
0
            int tid;
6251
0
            hts_pos_t pos;
6252
0
            iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
6253
0
            if ( iter->iter[i]->error ) return -1;
6254
0
            if (iter->plp[i]) {
6255
0
                iter->tid[i] = tid;
6256
0
                iter->pos[i] = pos;
6257
0
            } else {
6258
0
                iter->tid[i] = 0;
6259
0
                iter->pos[i] = 0;
6260
0
            }
6261
0
        }
6262
0
        if (iter->plp[i]) {
6263
0
            if (iter->tid[i] < new_min_tid) {
6264
0
                new_min_tid = iter->tid[i];
6265
0
                new_min_pos = iter->pos[i];
6266
0
            } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
6267
0
                new_min_pos = iter->pos[i];
6268
0
            }
6269
0
        }
6270
0
    }
6271
0
    iter->min_pos = new_min_pos;
6272
0
    iter->min_tid = new_min_tid;
6273
0
    if (new_min_pos == HTS_POS_MAX) return 0;
6274
0
    *_tid = new_min_tid; *_pos = new_min_pos;
6275
0
    for (i = 0; i < iter->n; ++i) {
6276
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6277
0
            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
6278
0
            ++ret;
6279
0
        } else n_plp[i] = 0, plp[i] = 0;
6280
0
    }
6281
0
    return ret;
6282
0
}
6283
6284
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
6285
0
{
6286
0
    hts_pos_t pos64 = 0;
6287
0
    int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
6288
0
    if (ret >= 0) {
6289
0
        if (pos64 < INT_MAX) {
6290
0
            *_pos = pos64;
6291
0
        } else {
6292
0
            hts_log_error("Position %"PRId64" too large", pos64);
6293
0
            *_pos = INT_MAX;
6294
0
            return -1;
6295
0
        }
6296
0
    }
6297
0
    return ret;
6298
0
}
6299
6300
void bam_mplp_reset(bam_mplp_t iter)
6301
0
{
6302
0
    int i;
6303
0
    iter->min_pos = HTS_POS_MAX;
6304
0
    iter->min_tid = (uint32_t)-1;
6305
0
    for (i = 0; i < iter->n; ++i) {
6306
0
        bam_plp_reset(iter->iter[i]);
6307
0
        iter->pos[i] = HTS_POS_MAX;
6308
0
        iter->tid[i] = (uint32_t)-1;
6309
0
        iter->n_plp[i] = 0;
6310
0
        iter->plp[i] = NULL;
6311
0
    }
6312
0
}
6313
6314
void bam_mplp_constructor(bam_mplp_t iter,
6315
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6316
0
    int i;
6317
0
    for (i = 0; i < iter->n; ++i)
6318
0
        bam_plp_constructor(iter->iter[i], func);
6319
0
}
6320
6321
void bam_mplp_destructor(bam_mplp_t iter,
6322
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6323
0
    int i;
6324
0
    for (i = 0; i < iter->n; ++i)
6325
0
        bam_plp_destructor(iter->iter[i], func);
6326
0
}
6327
6328
#endif // ~!defined(BAM_NO_PILEUP)