Coverage Report

Created: 2026-01-13 07:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/sam.c
Line
Count
Source
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
#include <regex.h>
40
41
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
42
#include "fuzz_settings.h"
43
#endif
44
45
// Suppress deprecation message for cigar_tab, which we initialise
46
#include "htslib/hts_defs.h"
47
#undef HTS_DEPRECATED
48
#define HTS_DEPRECATED(message)
49
50
#include "htslib/sam.h"
51
#include "htslib/bgzf.h"
52
#include "cram/cram.h"
53
#include "hts_internal.h"
54
#include "sam_internal.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "htslib/hts_expr.h"
58
#include "header.h"
59
60
#include "htslib/khash.h"
61
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
62
KHASH_SET_INIT_INT(tag)
63
64
#ifndef EFTYPE
65
0
#define EFTYPE ENOEXEC
66
#endif
67
#ifndef EOVERFLOW
68
#define EOVERFLOW ERANGE
69
#endif
70
71
/**********************
72
 *** BAM header I/O ***
73
 **********************/
74
75
HTSLIB_EXPORT
76
const int8_t bam_cigar_table[256] = {
77
    // 0 .. 47
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
81
82
    // 48 .. 63  (including =)
83
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
84
85
    // 64 .. 79  (including MIDNHB)
86
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
87
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
88
89
    // 80 .. 95  (including SPX)
90
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
91
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
92
93
    // 96 .. 127
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96
97
    // 128 .. 255
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
105
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
106
};
107
108
sam_hdr_t *sam_hdr_init(void)
109
28.4k
{
110
28.4k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
111
28.4k
    if (bh == NULL) return NULL;
112
113
28.4k
    bh->cigar_tab = bam_cigar_table;
114
28.4k
    return bh;
115
28.4k
}
116
117
void sam_hdr_destroy(sam_hdr_t *bh)
118
65.2k
{
119
65.2k
    int32_t i;
120
121
65.2k
    if (bh == NULL) return;
122
123
36.9k
    if (bh->ref_count > 0) {
124
8.49k
        --bh->ref_count;
125
8.49k
        return;
126
8.49k
    }
127
128
28.4k
    if (bh->target_name) {
129
41.9k
        for (i = 0; i < bh->n_targets; ++i)
130
25.2k
            free(bh->target_name[i]);
131
16.6k
        free(bh->target_name);
132
16.6k
        free(bh->target_len);
133
16.6k
    }
134
28.4k
    free(bh->text);
135
28.4k
    if (bh->hrecs)
136
17.4k
        sam_hrecs_free(bh->hrecs);
137
28.4k
    if (bh->sdict)
138
1.78k
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
139
28.4k
    free(bh);
140
28.4k
}
141
142
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
143
// references before sam_hdr_t::hrecs is populated
144
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
145
0
{
146
0
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
147
0
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
148
0
    int i;
149
0
    if (!dest_long_refs) return -1;
150
151
0
    for (i = 0; i < h->n_targets; i++) {
152
0
        int ret;
153
0
        khiter_t ksrc, kdest;
154
0
        if (h->target_len[i] < UINT32_MAX) continue;
155
0
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
156
0
        if (ksrc == kh_end(src_long_refs)) continue;
157
0
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
158
0
        if (ret < 0) {
159
0
            kh_destroy(s2i, dest_long_refs);
160
0
            return -1;
161
0
        }
162
0
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
163
0
    }
164
165
0
    h->sdict = dest_long_refs;
166
0
    return 0;
167
0
}
168
169
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
170
16.3k
{
171
16.3k
    if (h0 == NULL) return NULL;
172
16.3k
    sam_hdr_t *h;
173
16.3k
    if ((h = sam_hdr_init()) == NULL) return NULL;
174
    // copy the simple data
175
16.3k
    h->n_targets = 0;
176
16.3k
    h->ignore_sam_err = h0->ignore_sam_err;
177
16.3k
    h->l_text = 0;
178
179
    // Then the pointery stuff
180
181
16.3k
    if (!h0->hrecs) {
182
6
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
183
6
        if (!h->target_len) goto fail;
184
6
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
185
6
        if (!h->target_name) goto fail;
186
187
6
        int i;
188
12
        for (i = 0; i < h0->n_targets; ++i) {
189
6
            h->target_len[i] = h0->target_len[i];
190
6
            h->target_name[i] = strdup(h0->target_name[i]);
191
6
            if (!h->target_name[i]) break;
192
6
        }
193
6
        h->n_targets = i;
194
6
        if (i < h0->n_targets) goto fail;
195
196
6
        if (h0->sdict) {
197
0
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
198
0
        }
199
6
    }
200
201
16.3k
    if (h0->hrecs) {
202
16.3k
        kstring_t tmp = { 0, 0, NULL };
203
16.3k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
204
0
            free(ks_release(&tmp));
205
0
            goto fail;
206
0
        }
207
208
16.3k
        h->l_text = tmp.l;
209
16.3k
        h->text   = ks_release(&tmp);
210
211
16.3k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
212
0
            goto fail;
213
16.3k
    } else {
214
6
        h->l_text = h0->text ? h0->l_text : 0;
215
6
        h->text = malloc(h->l_text + 1);
216
6
        if (!h->text) goto fail;
217
6
        if (h0->text)
218
6
            memcpy(h->text, h0->text, h->l_text);
219
6
        h->text[h->l_text] = '\0';
220
6
    }
221
222
16.3k
    return h;
223
224
0
 fail:
225
0
    sam_hdr_destroy(h);
226
0
    return NULL;
227
16.3k
}
228
229
sam_hdr_t *bam_hdr_read(BGZF *fp)
230
261
{
231
261
    sam_hdr_t *h;
232
261
    uint8_t buf[4];
233
261
    int magic_len, has_EOF;
234
261
    int32_t i, name_len, num_names = 0;
235
261
    size_t bufsize;
236
261
    ssize_t bytes;
237
    // check EOF
238
261
    has_EOF = bgzf_check_EOF(fp);
239
261
    if (has_EOF < 0) {
240
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
241
261
    } else if (has_EOF == 0) {
242
261
        hts_log_warning("EOF marker is absent. The input is probably truncated");
243
261
    }
244
    // read "BAM1"
245
261
    magic_len = bgzf_read(fp, buf, 4);
246
261
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
247
0
        hts_log_error("Invalid BAM binary header");
248
0
        return 0;
249
0
    }
250
261
    h = sam_hdr_init();
251
261
    if (!h) goto nomem;
252
253
    // read plain text and the number of reference sequences
254
261
    bytes = bgzf_read(fp, buf, 4);
255
261
    if (bytes != 4) goto read_err;
256
261
    h->l_text = le_to_u32(buf);
257
258
261
    bufsize = h->l_text + 1;
259
261
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
260
261
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
261
261
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
262
261
#endif
263
261
    h->text = (char*)malloc(bufsize);
264
261
    if (!h->text) goto nomem;
265
261
    h->text[h->l_text] = 0; // make sure it is NULL terminated
266
261
    bytes = bgzf_read(fp, h->text, h->l_text);
267
261
    if (bytes != h->l_text) goto read_err;
268
269
255
    bytes = bgzf_read(fp, &h->n_targets, 4);
270
255
    if (bytes != 4) goto read_err;
271
255
    if (fp->is_be) ed_swap_4p(&h->n_targets);
272
273
255
    if (h->n_targets < 0) goto invalid;
274
275
    // read reference sequence names and lengths
276
252
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
277
252
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
278
0
        goto nomem;
279
252
#endif
280
252
    if (h->n_targets > 0) {
281
165
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
282
165
        if (!h->target_name) goto nomem;
283
165
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
284
165
        if (!h->target_len) goto nomem;
285
165
    }
286
87
    else {
287
87
        h->target_name = NULL;
288
87
        h->target_len = NULL;
289
87
    }
290
291
813
    for (i = 0; i != h->n_targets; ++i) {
292
615
        bytes = bgzf_read(fp, &name_len, 4);
293
615
        if (bytes != 4) goto read_err;
294
615
        if (fp->is_be) ed_swap_4p(&name_len);
295
615
        if (name_len <= 0) goto invalid;
296
297
606
        h->target_name[i] = (char*)malloc(name_len);
298
606
        if (!h->target_name[i]) goto nomem;
299
606
        num_names++;
300
301
606
        bytes = bgzf_read(fp, h->target_name[i], name_len);
302
606
        if (bytes != name_len) goto read_err;
303
304
561
        if (h->target_name[i][name_len - 1] != '\0') {
305
            /* Fix missing NUL-termination.  Is this being too nice?
306
               We could alternatively bail out with an error. */
307
306
            char *new_name;
308
306
            if (name_len == INT32_MAX) goto invalid;
309
306
            new_name = realloc(h->target_name[i], name_len + 1);
310
306
            if (new_name == NULL) goto nomem;
311
306
            h->target_name[i] = new_name;
312
306
            h->target_name[i][name_len] = '\0';
313
306
        }
314
315
561
        bytes = bgzf_read(fp, &h->target_len[i], 4);
316
561
        if (bytes != 4) goto read_err;
317
561
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
318
561
    }
319
198
    return h;
320
321
0
 nomem:
322
0
    hts_log_error("Out of memory");
323
0
    goto clean;
324
325
51
 read_err:
326
51
    if (bytes < 0) {
327
27
        hts_log_error("Error reading BGZF stream");
328
27
    } else {
329
24
        hts_log_error("Truncated BAM header");
330
24
    }
331
51
    goto clean;
332
333
12
 invalid:
334
12
    hts_log_error("Invalid BAM binary header");
335
336
63
 clean:
337
63
    if (h != NULL) {
338
63
        h->n_targets = num_names; // ensure we free only allocated target_names
339
63
        sam_hdr_destroy(h);
340
63
    }
341
63
    return NULL;
342
12
}
343
344
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
345
3.65k
{
346
3.65k
    int32_t i, name_len, x;
347
3.65k
    kstring_t hdr_ks = { 0, 0, NULL };
348
3.65k
    char *text;
349
3.65k
    uint32_t l_text;
350
351
3.65k
    if (!h) return -1;
352
353
3.65k
    if (h->hrecs) {
354
3.65k
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
355
3.65k
        if (hdr_ks.l > UINT32_MAX) {
356
0
            hts_log_error("Header too long for BAM format");
357
0
            free(hdr_ks.s);
358
0
            return -1;
359
3.65k
        } else if (hdr_ks.l > INT32_MAX) {
360
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
361
0
            hts_log_warning("Output file may not be portable");
362
0
        }
363
3.65k
        text = hdr_ks.s;
364
3.65k
        l_text = hdr_ks.l;
365
3.65k
    } else {
366
2
        if (h->l_text > UINT32_MAX) {
367
0
            hts_log_error("Header too long for BAM format");
368
0
            return -1;
369
2
        } else if (h->l_text > INT32_MAX) {
370
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
371
0
            hts_log_warning("Output file may not be portable");
372
0
        }
373
2
        text = h->text;
374
2
        l_text = h->l_text;
375
2
    }
376
    // write "BAM1"
377
3.65k
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
378
    // write plain text and the number of reference sequences
379
3.65k
    if (fp->is_be) {
380
0
        x = ed_swap_4(l_text);
381
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
382
0
        if (l_text) {
383
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
384
0
        }
385
0
        x = ed_swap_4(h->n_targets);
386
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
387
3.65k
    } else {
388
3.65k
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
389
3.65k
        if (l_text) {
390
2.44k
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
391
2.44k
        }
392
3.65k
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
393
3.65k
    }
394
3.65k
    free(hdr_ks.s);
395
    // write sequence names and lengths
396
6.85k
    for (i = 0; i != h->n_targets; ++i) {
397
3.20k
        char *p = h->target_name[i];
398
3.20k
        name_len = strlen(p) + 1;
399
3.20k
        if (fp->is_be) {
400
0
            x = ed_swap_4(name_len);
401
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
402
3.20k
        } else {
403
3.20k
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
404
3.20k
        }
405
3.20k
        if (bgzf_write(fp, p, name_len) < 0) return -1;
406
3.20k
        if (fp->is_be) {
407
0
            x = ed_swap_4(h->target_len[i]);
408
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
409
3.20k
        } else {
410
3.20k
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
411
3.20k
        }
412
3.20k
    }
413
3.65k
    if (bgzf_flush(fp) < 0) return -1;
414
3.65k
    return 0;
415
3.65k
}
416
417
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
418
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
419
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
420
0
}
421
422
/*************************
423
 *** BAM alignment I/O ***
424
 *************************/
425
426
bam1_t *bam_init1(void)
427
1.41M
{
428
1.41M
    return (bam1_t*)calloc(1, sizeof(bam1_t));
429
1.41M
}
430
431
int sam_realloc_bam_data(bam1_t *b, size_t desired)
432
1.46M
{
433
1.46M
    uint32_t new_m_data;
434
1.46M
    uint8_t *new_data;
435
1.46M
    new_m_data = desired;
436
1.46M
    kroundup32(new_m_data); // next power of 2
437
1.46M
    new_m_data += 32; // reduces malloc arena migrations?
438
1.46M
    if (new_m_data < desired) {
439
0
        errno = ENOMEM; // Not strictly true but we can't store the size
440
0
        return -1;
441
0
    }
442
1.46M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
443
1.46M
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
444
6
        errno = ENOMEM;
445
6
        return -1;
446
6
    }
447
1.46M
#endif
448
1.46M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
449
1.46M
        new_data = realloc(b->data, new_m_data);
450
1.46M
    } else {
451
0
        if ((new_data = malloc(new_m_data)) != NULL) {
452
0
            if (b->l_data > 0)
453
0
                memcpy(new_data, b->data,
454
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
455
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
456
0
        }
457
0
    }
458
1.46M
    if (!new_data) return -1;
459
1.46M
    b->data = new_data;
460
1.46M
    b->m_data = new_m_data;
461
1.46M
    return 0;
462
1.46M
}
463
464
void bam_destroy1(bam1_t *b)
465
23.4M
{
466
23.4M
    if (b == 0) return;
467
1.41M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
468
1.41M
        free(b->data);
469
1.41M
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
470
            // In case of reuse
471
0
            b->data = NULL;
472
0
            b->m_data = 0;
473
0
            b->l_data = 0;
474
0
        }
475
1.41M
    }
476
477
1.41M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
478
1.41M
        free(b);
479
1.41M
}
480
481
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
482
9.88M
{
483
9.88M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
484
9.88M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
485
9.88M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
486
9.88M
    bdst->l_data = bsrc->l_data;
487
9.88M
    bdst->id = bsrc->id;
488
9.88M
    return bdst;
489
9.88M
}
490
491
bam1_t *bam_dup1(const bam1_t *bsrc)
492
1.40M
{
493
1.40M
    if (bsrc == NULL) return NULL;
494
1.40M
    bam1_t *bdst = bam_init1();
495
1.40M
    if (bdst == NULL) return NULL;
496
1.40M
    if (bam_copy1(bdst, bsrc) == NULL) {
497
0
        bam_destroy1(bdst);
498
0
        return NULL;
499
0
    }
500
1.40M
    return bdst;
501
1.40M
}
502
503
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
504
                             hts_pos_t *rlen, hts_pos_t *qlen)
505
655
{
506
655
    int k;
507
655
    *rlen = *qlen = 0;
508
25.5k
    for (k = 0; k < n_cigar; ++k) {
509
24.8k
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
510
24.8k
        int len = bam_cigar_oplen(cigar[k]);
511
24.8k
        if (type & 1) *qlen += len;
512
24.8k
        if (type & 2) *rlen += len;
513
24.8k
    }
514
655
}
515
516
static int subtract_check_underflow(size_t length, size_t *limit)
517
146M
{
518
146M
    if (length <= *limit) {
519
146M
        *limit -= length;
520
146M
        return 0;
521
146M
    }
522
523
0
    return -1;
524
146M
}
525
526
int bam_set1(bam1_t *bam,
527
             size_t l_qname, const char *qname,
528
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
529
             size_t n_cigar, const uint32_t *cigar,
530
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
531
             size_t l_seq, const char *seq, const char *qual,
532
             size_t l_aux)
533
29.2M
{
534
    // use a default qname "*" if none is provided
535
29.2M
    if (l_qname == 0) {
536
27.2M
        l_qname = 1;
537
27.2M
        qname = "*";
538
27.2M
    }
539
540
    // note: the qname is stored nul terminated and padded as described in the
541
    // documentation for the bam1_t struct.
542
29.2M
    size_t qname_nuls = 4 - l_qname % 4;
543
544
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
545
    // can't use bam_endpos() directly as some fields not yet set up.
546
29.2M
    hts_pos_t rlen = 0, qlen = 0;
547
29.2M
    if (!(flag & BAM_FUNMAP)) {
548
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
549
0
    }
550
29.2M
    if (rlen == 0) {
551
29.2M
        rlen = 1;
552
29.2M
    }
553
554
    // validate parameters
555
29.2M
    if (l_qname > 254) {
556
159
        hts_log_error("Query name too long");
557
159
        errno = EINVAL;
558
159
        return -1;
559
159
    }
560
29.2M
    if (HTS_POS_MAX - rlen <= pos) {
561
0
        hts_log_error("Read ends beyond highest supported position");
562
0
        errno = EINVAL;
563
0
        return -1;
564
0
    }
565
29.2M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
566
0
        hts_log_error("Mapped query must have a CIGAR");
567
0
        errno = EINVAL;
568
0
        return -1;
569
0
    }
570
29.2M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
571
0
        hts_log_error("CIGAR and query sequence are of different length");
572
0
        errno = EINVAL;
573
0
        return -1;
574
0
    }
575
576
29.2M
    size_t limit = INT32_MAX;
577
29.2M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
578
29.2M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
579
29.2M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
580
29.2M
    u    += subtract_check_underflow(l_seq, &limit);
581
29.2M
    u    += subtract_check_underflow(l_aux, &limit);
582
29.2M
    if (u != 0) {
583
0
        hts_log_error("Size overflow");
584
0
        errno = EINVAL;
585
0
        return -1;
586
0
    }
587
588
    // re-allocate the data buffer as needed.
589
29.2M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
590
29.2M
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
591
0
        return -1;
592
0
    }
593
594
29.2M
    bam->l_data = (int)data_len;
595
29.2M
    bam->core.pos = pos;
596
29.2M
    bam->core.tid = tid;
597
29.2M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
598
29.2M
    bam->core.qual = mapq;
599
29.2M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
600
29.2M
    bam->core.flag = flag;
601
29.2M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
602
29.2M
    bam->core.n_cigar = (uint32_t)n_cigar;
603
29.2M
    bam->core.l_qseq = (int32_t)l_seq;
604
29.2M
    bam->core.mtid = mtid;
605
29.2M
    bam->core.mpos = mpos;
606
29.2M
    bam->core.isize = isize;
607
608
29.2M
    uint8_t *cp = bam->data;
609
29.2M
    strncpy((char *)cp, qname, l_qname);
610
29.2M
    int i;
611
116M
    for (i = 0; i < qname_nuls; i++) {
612
86.9M
        cp[l_qname + i] = '\0';
613
86.9M
    }
614
29.2M
    cp += l_qname + qname_nuls;
615
616
29.2M
    if (n_cigar > 0) {
617
0
        memcpy(cp, cigar, n_cigar * 4);
618
0
    }
619
29.2M
    cp += n_cigar * 4;
620
621
1.32G
#define NN 16
622
29.2M
    const uint8_t *useq = (uint8_t *)seq;
623
137M
    for (i = 0; i + NN < l_seq; i += NN) {
624
108M
        int j;
625
108M
        const uint8_t *u2 = useq+i;
626
974M
        for (j = 0; j < NN/2; j++)
627
866M
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
628
108M
        cp += NN/2;
629
108M
    }
630
31.7M
    for (; i + 1 < l_seq; i += 2) {
631
2.46M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
632
2.46M
    }
633
634
29.6M
    for (; i < l_seq; i++) {
635
369k
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
636
369k
    }
637
638
29.2M
    if (qual) {
639
15
        memcpy(cp, qual, l_seq);
640
15
    }
641
29.2M
    else {
642
29.2M
        memset(cp, '\xff', l_seq);
643
29.2M
    }
644
645
29.2M
    return (int)data_len;
646
29.2M
}
647
648
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
649
9.91M
{
650
9.91M
    int k;
651
9.91M
    hts_pos_t l;
652
14.6M
    for (k = l = 0; k < n_cigar; ++k)
653
4.71M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
654
4.25M
            l += bam_cigar_oplen(cigar[k]);
655
9.91M
    return l;
656
9.91M
}
657
658
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
659
259k
{
660
259k
    int k;
661
259k
    hts_pos_t l;
662
16.2M
    for (k = l = 0; k < n_cigar; ++k)
663
16.0M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
664
14.8M
            l += bam_cigar_oplen(cigar[k]);
665
259k
    return l;
666
259k
}
667
668
hts_pos_t bam_endpos(const bam1_t *b)
669
1.51k
{
670
1.51k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
671
1.51k
    if (rlen == 0) rlen = 1;
672
1.51k
    return b->core.pos + rlen;
673
1.51k
}
674
675
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
676
392k
{
677
392k
    bam1_core_t *c = &b->core;
678
679
    // Bail out as fast as possible for the easy case
680
392k
    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
681
392k
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
682
243k
        return 0;
683
684
    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
685
    // but this is much less likely so do as a secondary check.
686
149k
    if (c->tid < 0 || c->pos < 0)
687
81.6k
        return 0;
688
689
    // Do we have a CG tag?
690
67.4k
    uint8_t *CG = bam_aux_get(b, "CG");
691
67.4k
    int saved_errno = errno;
692
67.4k
    if (!CG) {
693
65.3k
        if (errno != ENOENT) return -1;  // Bad aux data
694
65.3k
        errno = saved_errno; // restore errno on expected no-CG-tag case
695
65.3k
        return 0;
696
65.3k
    }
697
698
    // Now we start with the serious work migrating CG to CIGAR
699
2.11k
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
700
2.11k
        *cigar0, CG_len, fake_bytes;
701
2.11k
    cigar0 = bam_get_cigar(b);
702
2.11k
    fake_bytes = c->n_cigar * 4;
703
2.11k
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
704
596
        return 0; // not of type B,I
705
1.51k
    CG_len = le_to_u32(CG + 2);
706
    // don't move if the real CIGAR length is shorter than the fake cigar length
707
1.51k
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
708
709
    // move from the CG tag to the right position
710
1.51k
    cigar_st = (uint8_t*)cigar0 - b->data;
711
1.51k
    c->n_cigar = CG_len;
712
1.51k
    n_cigar4 = c->n_cigar * 4;
713
1.51k
    CG_st = CG - b->data - 2;
714
1.51k
    CG_en = CG_st + 8 + n_cigar4;
715
1.51k
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
716
    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
717
1.51k
    b->l_data = b->l_data - fake_bytes + n_cigar4;
718
    // insert c->n_cigar-fake_bytes empty space to make room
719
1.51k
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
720
    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
721
1.51k
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
722
1.51k
    if (ori_len > CG_en) // move data after the CG tag
723
234
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
724
1.51k
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
725
1.51k
    if (recal_bin)
726
1.51k
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
727
1.51k
    if (give_warning)
728
1.51k
        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
729
1.51k
    return 1;
730
1.51k
}
731
732
static inline int aux_type2size(uint8_t type)
733
3.33M
{
734
3.33M
    switch (type) {
735
1.72M
    case 'A': case 'c': case 'C':
736
1.72M
        return 1;
737
210k
    case 's': case 'S':
738
210k
        return 2;
739
688k
    case 'i': case 'I': case 'f':
740
688k
        return 4;
741
13.7k
    case 'd':
742
13.7k
        return 8;
743
693k
    case 'Z': case 'H': case 'B':
744
693k
        return type;
745
259
    default:
746
259
        return 0;
747
3.33M
    }
748
3.33M
}
749
750
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
751
0
{
752
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
753
0
    uint32_t i;
754
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
755
0
}
756
757
// Fix bad records where qname is not terminated correctly.
758
293
static int fixup_missing_qname_nul(bam1_t *b) {
759
293
    bam1_core_t *c = &b->core;
760
761
    // Note this is called before c->l_extranul is added to c->l_qname
762
293
    if (c->l_extranul > 0) {
763
271
        b->data[c->l_qname++] = '\0';
764
271
        c->l_extranul--;
765
271
    } else {
766
22
        if (b->l_data > INT_MAX - 4) return -1;
767
22
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
768
22
        b->l_data += 4;
769
22
        b->data[c->l_qname++] = '\0';
770
22
        c->l_extranul = 3;
771
22
    }
772
293
    return 0;
773
293
}
774
775
/*
776
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
777
 * in multi-threaded handling.  This may be worth considering for htslib2.
778
 */
779
int bam_read1(BGZF *fp, bam1_t *b)
780
769
{
781
769
    bam1_core_t *c = &b->core;
782
769
    int32_t block_len, ret, i;
783
769
    uint32_t new_l_data;
784
769
    uint8_t tmp[32], *x;
785
786
769
    b->l_data = 0;
787
788
769
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
789
0
        if (ret == 0) return -1; // normal end-of-file
790
0
        else return -2; // truncated
791
0
    }
792
769
    if (fp->is_be)
793
0
        ed_swap_4p(&block_len);
794
769
    if (block_len < 32) return -4;  // block_len includes core data
795
752
    if (fp->block_length - fp->block_offset > 32) {
796
        // Avoid bgzf_read and a temporary copy to a local buffer
797
750
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
798
750
        fp->block_offset += 32;
799
750
    } else {
800
2
        x = tmp;
801
2
        if (bgzf_read(fp, x, 32) != 32) return -3;
802
2
    }
803
804
750
    c->tid        = le_to_u32(x);
805
750
    c->pos        = le_to_i32(x+4);
806
750
    uint32_t x2   = le_to_u32(x+8);
807
750
    c->bin        = x2>>16;
808
750
    c->qual       = x2>>8&0xff;
809
750
    c->l_qname    = x2&0xff;
810
750
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
811
750
    uint32_t x3   = le_to_u32(x+12);
812
750
    c->flag       = x3>>16;
813
750
    c->n_cigar    = x3&0xffff;
814
750
    c->l_qseq     = le_to_u32(x+16);
815
750
    c->mtid       = le_to_u32(x+20);
816
750
    c->mpos       = le_to_i32(x+24);
817
750
    c->isize      = le_to_i32(x+28);
818
819
750
    new_l_data = block_len - 32 + c->l_extranul;
820
750
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
821
741
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
822
741
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
823
38
        return -4;
824
703
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
825
697
    b->l_data = new_l_data;
826
827
697
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
828
693
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
829
293
        if (fixup_missing_qname_nul(b) < 0) return -4;
830
293
    }
831
1.48k
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
832
693
    c->l_qname += c->l_extranul;
833
693
    if (b->l_data < c->l_qname ||
834
693
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
835
22
        return -4;
836
671
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
837
671
    if (bam_tag2cigar(b, 0, 0) < 0)
838
0
        return -4;
839
840
    // TODO: consider making this conditional
841
671
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
842
655
        hts_pos_t rlen, qlen;
843
655
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
844
655
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
845
655
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
846
        // Sanity check for broken CIGAR alignments
847
655
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
848
20
            hts_log_error("CIGAR and query sequence lengths differ for %s",
849
20
                    bam_get_qname(b));
850
20
            return -4;
851
20
        }
852
655
    }
853
854
651
    return 4 + block_len;
855
671
}
856
857
int bam_write1(BGZF *fp, const bam1_t *b)
858
9.88M
{
859
9.88M
    const bam1_core_t *c = &b->core;
860
9.88M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
861
9.88M
    int i, ok;
862
9.88M
    if (c->l_qname - c->l_extranul > 255) {
863
0
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
864
0
        errno = EOVERFLOW;
865
0
        return -1;
866
0
    }
867
9.88M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
868
9.88M
    if (c->pos > INT_MAX ||
869
9.88M
        c->mpos > INT_MAX ||
870
9.88M
        c->isize < INT_MIN || c->isize > INT_MAX) {
871
130
        hts_log_error("Positional data is too large for BAM format");
872
130
        return -1;
873
130
    }
874
9.88M
    x[0] = c->tid;
875
9.88M
    x[1] = c->pos;
876
9.88M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
877
9.88M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
878
9.88M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
879
9.88M
    x[4] = c->l_qseq;
880
9.88M
    x[5] = c->mtid;
881
9.88M
    x[6] = c->mpos;
882
9.88M
    x[7] = c->isize;
883
9.88M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
884
9.88M
    if (fp->is_be) {
885
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
886
0
        y = block_len;
887
0
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
888
0
        swap_data(c, b->l_data, b->data, 1);
889
9.88M
    } else {
890
9.88M
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
891
9.88M
    }
892
9.88M
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
893
9.88M
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
894
9.88M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
895
9.88M
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
896
9.88M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
897
28
        uint8_t buf[8];
898
28
        uint32_t cigar_st, cigar_en, cigar[2];
899
28
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
900
28
        if (cigreflen >= (1<<28)) {
901
            // Length of reference covered is greater than the biggest
902
            // CIGAR operation currently allowed.
903
10
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
904
10
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
905
10
                          bam_get_qname(b), c->n_cigar, cigreflen);
906
10
            return -1;
907
10
        }
908
18
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
909
18
        cigar_en = cigar_st + c->n_cigar * 4;
910
18
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
911
18
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
912
18
        u32_to_le(cigar[0], buf);
913
18
        u32_to_le(cigar[1], buf + 4);
914
18
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
915
18
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
916
18
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
917
18
        u32_to_le(c->n_cigar, buf);
918
18
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
919
18
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
920
18
    }
921
9.88M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
922
9.88M
    return ok? 4 + block_len : -1;
923
9.88M
}
924
925
/*
926
 * Write a BAM file and append to the in-memory index simultaneously.
927
 */
928
9.88M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
929
9.88M
    BGZF *bfp = fp->fp.bgzf;
930
931
9.88M
    if (!fp->idx)
932
9.88M
        return bam_write1(bfp, b);
933
934
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
935
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
936
0
        return -1;
937
0
    if (!bfp->mt)
938
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
939
940
0
    int ret = bam_write1(bfp, b);
941
0
    if (ret < 0)
942
0
        return -1;
943
944
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
945
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
946
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
947
0
        ret = -1;
948
0
    }
949
950
0
    return ret;
951
0
}
952
953
/*
954
 * Set the qname in a BAM record
955
 */
956
int bam_set_qname(bam1_t *rec, const char *qname)
957
0
{
958
0
    if (!rec) return -1;
959
0
    if (!qname || !*qname) return -1;
960
961
0
    size_t old_len = rec->core.l_qname;
962
0
    size_t new_len = strlen(qname) + 1;
963
0
    if (new_len < 1 || new_len > 255) return -1;
964
965
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
966
967
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
968
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
969
970
    // Make room
971
0
    if (new_len + extranul != rec->core.l_qname)
972
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
973
    // Copy in new name and pad if needed
974
0
    memcpy(rec->data, qname, new_len);
975
0
    int n;
976
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
977
978
0
    rec->l_data = new_data_len;
979
0
    rec->core.l_qname = new_len + extranul;
980
0
    rec->core.l_extranul = extranul;
981
982
0
    return 0;
983
0
}
984
985
/********************
986
 *** BAM indexing ***
987
 ********************/
988
989
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
990
0
{
991
0
    int n_lvls, i, fmt, ret;
992
0
    bam1_t *b;
993
0
    hts_idx_t *idx;
994
0
    sam_hdr_t *h;
995
0
    h = sam_hdr_read(fp);
996
0
    if (h == NULL) return NULL;
997
0
    if (min_shift > 0) {
998
0
        hts_pos_t max_len = 0;
999
0
        for (i = 0; i < h->n_targets; ++i) {
1000
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
1001
0
            if (max_len < len) max_len = len;
1002
0
        }
1003
0
        n_lvls = 0;
1004
0
        hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1005
0
        fmt = HTS_FMT_CSI;
1006
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1007
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1008
0
    b = bam_init1();
1009
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
1010
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
1011
0
        if (ret < 0) { // unsorted or doesn't fit
1012
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
1013
0
            goto err;
1014
0
        }
1015
0
    }
1016
0
    if (ret < -1) goto err; // corrupted BAM file
1017
1018
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
1019
0
    sam_hdr_destroy(h);
1020
0
    bam_destroy1(b);
1021
0
    return idx;
1022
1023
0
err:
1024
0
    bam_destroy1(b);
1025
0
    hts_idx_destroy(idx);
1026
0
    return NULL;
1027
0
}
1028
1029
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1030
0
{
1031
0
    hts_idx_t *idx;
1032
0
    htsFile *fp;
1033
0
    int ret = 0;
1034
1035
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1036
0
    if (nthreads)
1037
0
        hts_set_threads(fp, nthreads);
1038
1039
0
    switch (fp->format.format) {
1040
0
    case cram:
1041
1042
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1043
0
        break;
1044
1045
0
    case bam:
1046
0
    case sam:
1047
0
        if (fp->format.compression != bgzf) {
1048
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1049
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1050
0
            ret = -1;
1051
0
            break;
1052
0
        }
1053
0
        idx = sam_index(fp, min_shift);
1054
0
        if (idx) {
1055
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1056
0
            if (ret < 0) ret = -4;
1057
0
            hts_idx_destroy(idx);
1058
0
        }
1059
0
        else ret = -1;
1060
0
        break;
1061
1062
0
    default:
1063
0
        ret = -3;
1064
0
        break;
1065
0
    }
1066
0
    hts_close(fp);
1067
1068
0
    return ret;
1069
0
}
1070
1071
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1072
0
{
1073
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1074
0
}
1075
1076
int sam_index_build(const char *fn, int min_shift)
1077
0
{
1078
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1079
0
}
1080
1081
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1082
#undef bam_index_build
1083
int bam_index_build(const char *fn, int min_shift)
1084
0
{
1085
0
    return sam_index_build2(fn, NULL, min_shift);
1086
0
}
1087
1088
// Initialise fp->idx for the current format type.
1089
// This must be called after the header has been written but no other data.
1090
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1091
0
    fp->fnidx = fnidx;
1092
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1093
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1094
0
        int n_lvls, fmt = HTS_FMT_CSI;
1095
0
        if (min_shift > 0) {
1096
0
            int64_t max_len = 0;
1097
0
            int i;
1098
0
            for (i = 0; i < h->n_targets; ++i)
1099
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1100
0
            n_lvls = 0;
1101
0
            hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1102
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1103
1104
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1105
0
        return fp->idx ? 0 : -1;
1106
0
    }
1107
1108
0
    if (fp->format.format == cram) {
1109
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1110
0
        return fp->fp.cram->idxfp ? 0 : -1;
1111
0
    }
1112
1113
0
    return -1;
1114
0
}
1115
1116
// Finishes an index. Call after the last record has been written.
1117
// Returns 0 on success, <0 on failure.
1118
0
int sam_idx_save(htsFile *fp) {
1119
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1120
0
        fp->format.format == vcf || fp->format.format == sam) {
1121
0
        int ret;
1122
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1123
0
            errno = -ret;
1124
0
            return -1;
1125
0
        }
1126
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1127
0
            return -1;
1128
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1129
1130
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1131
0
            return -1;
1132
1133
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1134
1135
0
    } else if (fp->format.format == cram) {
1136
        // flushed and closed by cram_close
1137
0
    }
1138
1139
0
    return 0;
1140
0
}
1141
1142
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1143
0
{
1144
0
    htsFile *fp = (htsFile *)fpv;
1145
0
    bam1_t *b = bv;
1146
0
    fp->line.l = 0;
1147
0
    int ret = sam_read1(fp, fp->bam_header, b);
1148
0
    if (ret >= 0) {
1149
0
        *tid = b->core.tid;
1150
0
        *beg = b->core.pos;
1151
0
        *end = bam_endpos(b);
1152
0
    }
1153
0
    return ret;
1154
0
}
1155
1156
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1157
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1158
0
{
1159
0
    htsFile *fp = (htsFile *)fpv;
1160
0
    bam1_t *b = bv;
1161
0
    fp->line.l = 0;
1162
0
    int ret = sam_read1(fp, fp->bam_header, b);
1163
0
    return ret;
1164
0
}
1165
1166
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1167
// samtools/bam.c.
1168
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1169
0
{
1170
0
    const char *rg;
1171
0
    kstring_t lib = { 0, 0, NULL };
1172
0
    rg = (char *)bam_aux_get(b, "RG");
1173
1174
0
    if (!rg)
1175
0
        return NULL;
1176
0
    else
1177
0
        rg++;
1178
1179
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1180
0
        return NULL;
1181
1182
0
    static char LB_text[1024];
1183
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1184
1185
0
    memcpy(LB_text, lib.s, len);
1186
0
    LB_text[len] = 0;
1187
1188
0
    free(lib.s);
1189
1190
0
    return LB_text;
1191
0
}
1192
1193
1194
// Bam record pointer and SAM header combined
1195
typedef struct {
1196
    const sam_hdr_t *h;
1197
    const bam1_t *b;
1198
} hb_pair;
1199
1200
// Looks up variable names in str and replaces them with their value.
1201
// Also supports aux tags.
1202
//
1203
// Note the expression parser deliberately overallocates str size so it
1204
// is safe to use memcmp over strcmp.
1205
static int bam_sym_lookup(void *data, char *str, char **end,
1206
0
                          hts_expr_val_t *res) {
1207
0
    hb_pair *hb = (hb_pair *)data;
1208
0
    const bam1_t *b = hb->b;
1209
1210
0
    res->is_str = 0;
1211
0
    switch(*str) {
1212
0
    case 'c':
1213
0
        if (memcmp(str, "cigar", 5) == 0) {
1214
0
            *end = str+5;
1215
0
            res->is_str = 1;
1216
0
            ks_clear(&res->s);
1217
0
            uint32_t *cigar = bam_get_cigar(b);
1218
0
            int i, n = b->core.n_cigar, r = 0;
1219
0
            if (n) {
1220
0
                for (i = 0; i < n; i++) {
1221
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1222
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1223
0
                }
1224
0
                r |= kputs("", &res->s) < 0;
1225
0
            } else {
1226
0
                r |= kputs("*", &res->s) < 0;
1227
0
            }
1228
0
            return r ? -1 : 0;
1229
0
        }
1230
0
        break;
1231
1232
0
    case 'e':
1233
0
        if (memcmp(str, "endpos", 6) == 0) {
1234
0
            *end = str+6;
1235
0
            res->d = bam_endpos(b);
1236
0
            return 0;
1237
0
        }
1238
0
        break;
1239
1240
0
    case 'f':
1241
0
        if (memcmp(str, "flag", 4) == 0) {
1242
0
            str = *end = str+4;
1243
0
            if (*str != '.') {
1244
0
                res->d = b->core.flag;
1245
0
                return 0;
1246
0
            } else {
1247
0
                str++;
1248
0
                if (!memcmp(str, "paired", 6)) {
1249
0
                    *end = str+6;
1250
0
                    res->d = b->core.flag & BAM_FPAIRED;
1251
0
                    return 0;
1252
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1253
0
                    *end = str+11;
1254
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1255
0
                    return 0;
1256
0
                } else if (!memcmp(str, "unmap", 5)) {
1257
0
                    *end = str+5;
1258
0
                    res->d = b->core.flag & BAM_FUNMAP;
1259
0
                    return 0;
1260
0
                } else if (!memcmp(str, "munmap", 6)) {
1261
0
                    *end = str+6;
1262
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1263
0
                    return 0;
1264
0
                } else if (!memcmp(str, "reverse", 7)) {
1265
0
                    *end = str+7;
1266
0
                    res->d = b->core.flag & BAM_FREVERSE;
1267
0
                    return 0;
1268
0
                } else if (!memcmp(str, "mreverse", 8)) {
1269
0
                    *end = str+8;
1270
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1271
0
                    return 0;
1272
0
                } else if (!memcmp(str, "read1", 5)) {
1273
0
                    *end = str+5;
1274
0
                    res->d = b->core.flag & BAM_FREAD1;
1275
0
                    return 0;
1276
0
                } else if (!memcmp(str, "read2", 5)) {
1277
0
                    *end = str+5;
1278
0
                    res->d = b->core.flag & BAM_FREAD2;
1279
0
                    return 0;
1280
0
                } else if (!memcmp(str, "secondary", 9)) {
1281
0
                    *end = str+9;
1282
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1283
0
                    return 0;
1284
0
                } else if (!memcmp(str, "qcfail", 6)) {
1285
0
                    *end = str+6;
1286
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1287
0
                    return 0;
1288
0
                } else if (!memcmp(str, "dup", 3)) {
1289
0
                    *end = str+3;
1290
0
                    res->d = b->core.flag & BAM_FDUP;
1291
0
                    return 0;
1292
0
                } else if (!memcmp(str, "supplementary", 13)) {
1293
0
                    *end = str+13;
1294
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1295
0
                    return 0;
1296
0
                } else {
1297
0
                    hts_log_error("Unrecognised flag string");
1298
0
                    return -1;
1299
0
                }
1300
0
            }
1301
0
        }
1302
0
        break;
1303
1304
0
    case 'h':
1305
0
        if (memcmp(str, "hclen", 5) == 0) {
1306
0
            int hclen = 0;
1307
0
            uint32_t *cigar = bam_get_cigar(b);
1308
0
            uint32_t ncigar = b->core.n_cigar;
1309
1310
            // left
1311
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1312
0
                hclen = bam_cigar_oplen(cigar[0]);
1313
1314
            // right
1315
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1316
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1317
1318
0
            *end = str+5;
1319
0
            res->d = hclen;
1320
0
            return 0;
1321
0
        }
1322
0
        break;
1323
1324
0
    case 'l':
1325
0
        if (memcmp(str, "library", 7) == 0) {
1326
0
            *end = str+7;
1327
0
            res->is_str = 1;
1328
0
            const char *lib = bam_get_library(hb->h, b);
1329
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1330
0
            return 0;
1331
0
        }
1332
0
        break;
1333
1334
0
    case 'm':
1335
0
        if (memcmp(str, "mapq", 4) == 0) {
1336
0
            *end = str+4;
1337
0
            res->d = b->core.qual;
1338
0
            return 0;
1339
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1340
0
            *end = str+4;
1341
0
            res->d = b->core.mpos+1;
1342
0
            return 0;
1343
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1344
0
            *end = str+6;
1345
0
            res->is_str = 1;
1346
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1347
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1348
0
            return 0;
1349
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1350
0
            *end = str+6;
1351
0
            res->d = b->core.mtid;
1352
0
            return 0;
1353
0
        }
1354
0
        break;
1355
1356
0
    case 'n':
1357
0
        if (memcmp(str, "ncigar", 6) == 0) {
1358
0
            *end = str+6;
1359
0
            res->d = b->core.n_cigar;
1360
0
            return 0;
1361
0
        }
1362
0
        break;
1363
1364
0
    case 'p':
1365
0
        if (memcmp(str, "pos", 3) == 0) {
1366
0
            *end = str+3;
1367
0
            res->d = b->core.pos+1;
1368
0
            return 0;
1369
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1370
0
            *end = str+5;
1371
0
            res->d = b->core.mpos+1;
1372
0
            return 0;
1373
0
        }
1374
0
        break;
1375
1376
0
    case 'q':
1377
0
        if (memcmp(str, "qlen", 4) == 0) {
1378
0
            *end = str+4;
1379
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1380
0
            return 0;
1381
0
        } else if (memcmp(str, "qname", 5) == 0) {
1382
0
            *end = str+5;
1383
0
            res->is_str = 1;
1384
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1385
0
            return 0;
1386
0
        } else if (memcmp(str, "qual", 4) == 0) {
1387
0
            *end = str+4;
1388
0
            ks_clear(&res->s);
1389
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1390
0
                return -1;
1391
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1392
0
            res->s.l = b->core.l_qseq;
1393
0
            res->is_str = 1;
1394
0
            return 0;
1395
0
        }
1396
0
        break;
1397
1398
0
    case 'r':
1399
0
        if (memcmp(str, "rlen", 4) == 0) {
1400
0
            *end = str+4;
1401
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1402
0
            return 0;
1403
0
        } else if (memcmp(str, "rname", 5) == 0) {
1404
0
            *end = str+5;
1405
0
            res->is_str = 1;
1406
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1407
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1408
0
            return 0;
1409
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1410
0
            *end = str+5;
1411
0
            res->is_str = 1;
1412
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1413
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1414
0
            return 0;
1415
0
        } else if (memcmp(str, "refid", 5) == 0) {
1416
0
            *end = str+5;
1417
0
            res->d = b->core.tid;
1418
0
            return 0;
1419
0
        }
1420
0
        break;
1421
1422
0
    case 's':
1423
0
        if (memcmp(str, "seq", 3) == 0) {
1424
0
            *end = str+3;
1425
0
            ks_clear(&res->s);
1426
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1427
0
                return -1;
1428
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1429
0
            res->s.s[b->core.l_qseq] = 0;
1430
0
            res->s.l = b->core.l_qseq;
1431
0
            res->is_str = 1;
1432
0
            return 0;
1433
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1434
0
            int sclen = 0;
1435
0
            uint32_t *cigar = bam_get_cigar(b);
1436
0
            int ncigar = b->core.n_cigar;
1437
0
            int left = 0;
1438
1439
            // left
1440
0
            if (ncigar > 0
1441
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1442
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1443
0
            else if (ncigar > 1
1444
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1445
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1446
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1447
1448
            // right
1449
0
            if (ncigar-1 > left
1450
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1451
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1452
0
            else if (ncigar-2 > left
1453
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1454
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1455
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1456
1457
0
            *end = str+5;
1458
0
            res->d = sclen;
1459
0
            return 0;
1460
0
        }
1461
0
        break;
1462
1463
0
    case 't':
1464
0
        if (memcmp(str, "tlen", 4) == 0) {
1465
0
            *end = str+4;
1466
0
            res->d = b->core.isize;
1467
0
            return 0;
1468
0
        }
1469
0
        break;
1470
1471
0
    case '[':
1472
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1473
            /* aux tags */
1474
0
            *end = str+4;
1475
1476
0
            uint8_t *aux = bam_aux_get(b, str+1);
1477
0
            if (aux) {
1478
                // we define the truth of a tag to be its presence, even if 0.
1479
0
                res->is_true = 1;
1480
0
                switch (*aux) {
1481
0
                case 'Z':
1482
0
                case 'H':
1483
0
                    res->is_str = 1;
1484
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1485
0
                    break;
1486
1487
0
                case 'A':
1488
0
                    res->is_str = 1;
1489
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1490
0
                    break;
1491
1492
0
                case 'i': case 'I':
1493
0
                case 's': case 'S':
1494
0
                case 'c': case 'C':
1495
0
                    res->is_str = 0;
1496
0
                    res->d = bam_aux2i(aux);
1497
0
                    break;
1498
1499
0
                case 'f':
1500
0
                case 'd':
1501
0
                    res->is_str = 0;
1502
0
                    res->d = bam_aux2f(aux);
1503
0
                    break;
1504
1505
0
                default:
1506
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1507
0
                                  *aux);
1508
0
                    return -1;
1509
0
                }
1510
0
                return 0;
1511
1512
0
            } else {
1513
                // hence absent tags are always false (and strings)
1514
0
                res->is_str = 1;
1515
0
                res->s.l = 0;
1516
0
                res->d = 0;
1517
0
                res->is_true = 0;
1518
0
                return 0;
1519
0
            }
1520
0
        }
1521
0
        break;
1522
0
    }
1523
1524
    // All successful matches in switch should return 0.
1525
    // So if we didn't match, it's a parse error.
1526
0
    return -1;
1527
0
}
1528
1529
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1530
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1531
0
{
1532
0
    hb_pair hb = {h, b};
1533
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1534
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1535
0
        hts_log_error("Couldn't process filter expression");
1536
0
        hts_expr_val_free(&res);
1537
0
        return -1;
1538
0
    }
1539
1540
0
    int t = res.is_true;
1541
0
    hts_expr_val_free(&res);
1542
1543
0
    return t;
1544
0
}
1545
1546
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1547
0
{
1548
0
    htsFile *fp = fpv;
1549
0
    bam1_t *b = bv;
1550
0
    int pass_filter, ret;
1551
1552
0
    do {
1553
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1554
0
        if (ret < 0)
1555
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1556
1557
0
        if (bam_tag2cigar(b, 1, 1) < 0)
1558
0
            return -2;
1559
1560
0
        *tid = b->core.tid;
1561
0
        *beg = b->core.pos;
1562
0
        *end = bam_endpos(b);
1563
1564
0
        if (fp->filter) {
1565
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1566
0
            if (pass_filter < 0)
1567
0
                return -2;
1568
0
        } else {
1569
0
            pass_filter = 1;
1570
0
        }
1571
0
    } while (pass_filter == 0);
1572
1573
0
    return ret;
1574
0
}
1575
1576
static int cram_pseek(void *fp, int64_t offset, int whence)
1577
0
{
1578
0
    cram_fd *fd =  (cram_fd *)fp;
1579
1580
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1581
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1582
0
        return -1;
1583
1584
0
    fd->curr_position = offset;
1585
1586
0
    if (fd->ctr) {
1587
0
        cram_free_container(fd->ctr);
1588
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1589
0
            cram_free_container(fd->ctr_mt);
1590
1591
0
        fd->ctr = NULL;
1592
0
        fd->ctr_mt = NULL;
1593
0
        fd->ooc = 0;
1594
0
    }
1595
1596
0
    return 0;
1597
0
}
1598
1599
/*
1600
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1601
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1602
 *   container previously fetched. It was designed like this to integrate with the functionality
1603
 *   of the iterator stepping logic.
1604
 */
1605
1606
static int64_t cram_ptell(void *fp)
1607
0
{
1608
0
    cram_fd *fd = (cram_fd *)fp;
1609
0
    cram_container *c;
1610
0
    cram_slice *s;
1611
0
    int64_t ret = -1L;
1612
1613
0
    if (fd) {
1614
0
        if ((c = fd->ctr) != NULL) {
1615
0
            if ((s = c->slice) != NULL && s->max_rec) {
1616
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1617
0
                    fd->curr_position += c->offset + c->length;
1618
0
            }
1619
0
        }
1620
0
        ret = fd->curr_position;
1621
0
    }
1622
1623
0
    return ret;
1624
0
}
1625
1626
static int bam_pseek(void *fp, int64_t offset, int whence)
1627
0
{
1628
0
    BGZF *fd = (BGZF *)fp;
1629
1630
0
    return bgzf_seek(fd, offset, whence);
1631
0
}
1632
1633
static int64_t bam_ptell(void *fp)
1634
0
{
1635
0
    BGZF *fd = (BGZF *)fp;
1636
0
    if (!fd)
1637
0
        return -1L;
1638
1639
0
    return bgzf_tell(fd);
1640
0
}
1641
1642
1643
1644
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1645
0
{
1646
0
    switch (fp->format.format) {
1647
0
    case bam:
1648
0
    case sam:
1649
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1650
1651
0
    case cram: {
1652
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1653
1654
        // Cons up a fake "index" just pointing at the associated cram_fd:
1655
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1656
0
        if (idx == NULL) return NULL;
1657
0
        idx->fmt = HTS_FMT_CRAI;
1658
0
        idx->cram = fp->fp.cram;
1659
0
        return (hts_idx_t *) idx;
1660
0
        }
1661
1662
0
    default:
1663
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1664
0
    }
1665
0
}
1666
1667
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1668
0
{
1669
0
    return index_load(fp, fn, fnidx, flags);
1670
0
}
1671
1672
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1673
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1674
0
}
1675
1676
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1677
0
{
1678
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1679
0
}
1680
1681
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1682
0
{
1683
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1684
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1685
0
    if (iter == NULL) return NULL;
1686
1687
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1688
    // the readrec function:
1689
0
    iter->is_cram = 1;
1690
0
    iter->read_rest = 1;
1691
0
    iter->off = NULL;
1692
0
    iter->bins.a = NULL;
1693
0
    iter->readrec = readrec;
1694
1695
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1696
0
        cram_range r = { tid, beg+1, end };
1697
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1698
1699
0
        iter->curr_off = 0;
1700
        // The following fields are not required by hts_itr_next(), but are
1701
        // filled in in case user code wants to look at them.
1702
0
        iter->tid = tid;
1703
0
        iter->beg = beg;
1704
0
        iter->end = end;
1705
1706
0
        switch (ret) {
1707
0
        case 0:
1708
0
            break;
1709
1710
0
        case -2:
1711
            // No data vs this ref, so mark iterator as completed.
1712
            // Same as HTS_IDX_NONE.
1713
0
            iter->finished = 1;
1714
0
            break;
1715
1716
0
        default:
1717
0
            free(iter);
1718
0
            return NULL;
1719
0
        }
1720
0
    }
1721
0
    else switch (tid) {
1722
0
    case HTS_IDX_REST:
1723
0
        iter->curr_off = 0;
1724
0
        break;
1725
0
    case HTS_IDX_NONE:
1726
0
        iter->curr_off = 0;
1727
0
        iter->finished = 1;
1728
0
        break;
1729
0
    default:
1730
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1731
0
        abort();
1732
0
        break;
1733
0
    }
1734
1735
0
    return iter;
1736
0
}
1737
1738
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1739
0
{
1740
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1741
0
    if (idx == NULL)
1742
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1743
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1744
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1745
0
    else
1746
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1747
0
}
1748
1749
static int cram_name2id(void *fdv, const char *ref)
1750
0
{
1751
0
    cram_fd *fd = (cram_fd *) fdv;
1752
0
    return sam_hdr_name2tid(fd->header, ref);
1753
0
}
1754
1755
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1756
0
{
1757
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1758
0
    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1759
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1760
0
                          sam_readrec);
1761
0
}
1762
1763
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1764
0
{
1765
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1766
0
    hts_reglist_t *r_list = NULL;
1767
0
    int r_count = 0;
1768
1769
0
    if (!cidx || !hdr)
1770
0
        return NULL;
1771
1772
0
    hts_itr_t *itr = NULL;
1773
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1774
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1775
0
        if (!r_list)
1776
0
            return NULL;
1777
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1778
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1779
0
    } else {
1780
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1781
0
        if (!r_list)
1782
0
            return NULL;
1783
0
        itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1784
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1785
0
    }
1786
1787
0
    if (!itr)
1788
0
        hts_reglist_free(r_list, r_count);
1789
1790
0
    return itr;
1791
0
}
1792
1793
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1794
0
{
1795
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1796
1797
0
    if(!cidx || !hdr || !reglist)
1798
0
        return NULL;
1799
1800
0
    if (cidx->fmt == HTS_FMT_CRAI)
1801
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1802
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1803
0
    else
1804
0
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1805
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1806
0
}
1807
1808
/**********************
1809
 *** SAM header I/O ***
1810
 **********************/
1811
1812
#include "htslib/kseq.h"
1813
#include "htslib/kstring.h"
1814
1815
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1816
0
{
1817
0
    sam_hdr_t *bh = sam_hdr_init();
1818
0
    if (!bh) return NULL;
1819
1820
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1821
0
        sam_hdr_destroy(bh);
1822
0
        return NULL;
1823
0
    }
1824
1825
0
    return bh;
1826
0
}
1827
1828
// Minimal sanitisation of a header to ensure.
1829
// - null terminated string.
1830
// - all lines start with @ (also implies no blank lines).
1831
//
1832
// Much more could be done, but currently is not, including:
1833
// - checking header types are known (HD, SQ, etc).
1834
// - syntax (eg checking tab separated fields).
1835
// - validating n_targets matches @SQ records.
1836
// - validating target lengths against @SQ records.
1837
8.55k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1838
8.55k
    if (!h)
1839
63
        return NULL;
1840
1841
    // Special case for empty headers.
1842
8.49k
    if (h->l_text == 0)
1843
1.22k
        return h;
1844
1845
7.27k
    size_t i;
1846
7.27k
    unsigned int lnum = 0;
1847
7.27k
    char *cp = h->text, last = '\n';
1848
32.0M
    for (i = 0; i < h->l_text; i++) {
1849
        // NB: l_text excludes terminating nul.  This finds early ones.
1850
32.0M
        if (cp[i] == 0)
1851
2.97k
            break;
1852
1853
        // Error on \n[^@], including duplicate newlines
1854
32.0M
        if (last == '\n') {
1855
157k
            lnum++;
1856
157k
            if (cp[i] != '@') {
1857
0
                hts_log_error("Malformed SAM header at line %u", lnum);
1858
0
                sam_hdr_destroy(h);
1859
0
                return NULL;
1860
0
            }
1861
157k
        }
1862
1863
32.0M
        last = cp[i];
1864
32.0M
    }
1865
1866
7.27k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1867
2.97k
        size_t j = i;
1868
21.0k
        while (j < h->l_text && cp[j] == '\0') j++;
1869
2.97k
        if (j < h->l_text)
1870
2.95k
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1871
2.97k
    }
1872
1873
    // Add trailing newline and/or trailing nul if required.
1874
7.27k
    if (last != '\n') {
1875
2.94k
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1876
1877
2.94k
        if (h->l_text < 2 || i >= h->l_text - 2) {
1878
285
            if (h->l_text >= SIZE_MAX - 2) {
1879
0
                hts_log_error("No room for extra newline");
1880
0
                sam_hdr_destroy(h);
1881
0
                return NULL;
1882
0
            }
1883
1884
285
            cp = realloc(h->text, (size_t) h->l_text+2);
1885
285
            if (!cp) {
1886
0
                sam_hdr_destroy(h);
1887
0
                return NULL;
1888
0
            }
1889
285
            h->text = cp;
1890
285
        }
1891
2.94k
        cp[i++] = '\n';
1892
1893
        // l_text may be larger already due to multiple nul padding
1894
2.94k
        if (h->l_text < i)
1895
0
            h->l_text = i;
1896
2.94k
        cp[h->l_text] = '\0';
1897
2.94k
    }
1898
1899
7.27k
    return h;
1900
7.27k
}
1901
1902
6.73k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1903
6.73k
    sam_hdr_t* h = sam_hdr_init();
1904
6.73k
    if (!h)
1905
0
        return NULL;
1906
1907
6.73k
    if (sam_hdr_build_from_sam_file(h, fp) != 0) {
1908
426
        sam_hdr_destroy(h);
1909
426
        return NULL;
1910
426
    }
1911
1912
6.30k
    if (fp->bam_header)
1913
0
        sam_hdr_destroy(fp->bam_header);
1914
6.30k
    fp->bam_header = sam_hdr_sanitise(h);
1915
6.30k
    fp->bam_header->ref_count = 1;
1916
1917
6.30k
    return fp->bam_header;
1918
6.73k
}
1919
1920
sam_hdr_t *sam_hdr_read(htsFile *fp)
1921
11.4k
{
1922
11.4k
    sam_hdr_t *h = NULL;
1923
11.4k
    if (!fp) {
1924
0
        errno = EINVAL;
1925
0
        return NULL;
1926
0
    }
1927
1928
11.4k
    switch (fp->format.format) {
1929
261
    case bam:
1930
261
        h = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1931
261
        break;
1932
1933
1.98k
    case cram:
1934
1.98k
        h = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
1935
1.98k
        break;
1936
1937
6.73k
    case sam:
1938
6.73k
        h = sam_hdr_create(fp);
1939
6.73k
        break;
1940
1941
33
    case fastq_format:
1942
2.46k
    case fasta_format:
1943
2.46k
        return sam_hdr_init();
1944
1945
0
    case empty_format:
1946
0
        errno = EPIPE;
1947
0
        return NULL;
1948
1949
0
    default:
1950
0
        errno = EFTYPE;
1951
0
        return NULL;
1952
11.4k
    }
1953
    //only sam,bam and cram reaches here
1954
8.98k
    if (h && !fp->bam_header) { //set except for sam which already has it
1955
        //for cram, it is the o/p header as for rest and not the internal header
1956
2.18k
        fp->bam_header = h;
1957
2.18k
        sam_hdr_incr_ref(fp->bam_header);
1958
2.18k
    }
1959
8.98k
    return h;
1960
11.4k
}
1961
1962
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
1963
10.9k
{
1964
10.9k
    if (!fp || !h) {
1965
0
        errno = EINVAL;
1966
0
        return -1;
1967
0
    }
1968
1969
10.9k
    switch (fp->format.format) {
1970
3.65k
    case binary_format:
1971
3.65k
        fp->format.category = sequence_data;
1972
3.65k
        fp->format.format = bam;
1973
        /* fall-through */
1974
3.65k
    case bam:
1975
3.65k
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1976
3.65k
        break;
1977
1978
3.65k
    case cram: {
1979
3.65k
        cram_fd *fd = fp->fp.cram;
1980
3.65k
        if (cram_set_header2(fd, h) < 0) return -1;
1981
3.42k
        if (fp->fn_aux)
1982
0
            cram_load_reference(fd, fp->fn_aux);
1983
3.42k
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1984
3.42k
        }
1985
3.42k
        break;
1986
1987
3.65k
    case text_format:
1988
3.65k
        fp->format.category = sequence_data;
1989
3.65k
        fp->format.format = sam;
1990
        /* fall-through */
1991
3.65k
    case sam: {
1992
3.65k
        if (!h->hrecs && !h->text)
1993
0
            return 0;
1994
3.65k
        char *text;
1995
3.65k
        kstring_t hdr_ks = { 0, 0, NULL };
1996
3.65k
        size_t l_text;
1997
3.65k
        ssize_t bytes;
1998
3.65k
        int r = 0, no_sq = 0;
1999
2000
3.65k
        if (h->hrecs) {
2001
3.65k
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2002
0
                return -1;
2003
3.65k
            text = hdr_ks.s;
2004
3.65k
            l_text = hdr_ks.l;
2005
3.65k
        } else {
2006
2
            const char *p = NULL;
2007
2
            do {
2008
2
                const char *q = p == NULL ? h->text : p + 4;
2009
2
                p = strstr(q, "@SQ\t");
2010
2
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2011
2
            no_sq = p == NULL;
2012
2
            text = h->text;
2013
2
            l_text = h->l_text;
2014
2
        }
2015
2016
3.65k
        if (fp->is_bgzf) {
2017
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2018
3.65k
        } else {
2019
3.65k
            bytes = hwrite(fp->fp.hfile, text, l_text);
2020
3.65k
        }
2021
3.65k
        free(hdr_ks.s);
2022
3.65k
        if (bytes != l_text)
2023
0
            return -1;
2024
2025
3.65k
        if (no_sq) {
2026
2
            int i;
2027
4
            for (i = 0; i < h->n_targets; ++i) {
2028
2
                fp->line.l = 0;
2029
2
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2030
2
                r |= kputs(h->target_name[i], &fp->line) < 0;
2031
2
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2032
2
                r |= kputw(h->target_len[i], &fp->line) < 0;
2033
2
                r |= kputc('\n', &fp->line) < 0;
2034
2
                if (r != 0)
2035
0
                    return -1;
2036
2037
2
                if (fp->is_bgzf) {
2038
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2039
2
                } else {
2040
2
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2041
2
                }
2042
2
                if (bytes != fp->line.l)
2043
0
                    return -1;
2044
2
            }
2045
2
        }
2046
3.65k
        if (fp->is_bgzf) {
2047
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2048
3.65k
        } else {
2049
3.65k
            if (hflush(fp->fp.hfile) != 0) return -1;
2050
3.65k
        }
2051
3.65k
        }
2052
3.65k
        break;
2053
2054
3.65k
    case fastq_format:
2055
0
    case fasta_format:
2056
        // Nothing to output; FASTQ has no file headers.
2057
0
        return 0;
2058
0
        break;
2059
2060
0
    default:
2061
0
        errno = EBADF;
2062
0
        return -1;
2063
10.9k
    }
2064
    //only sam,bam and cram reaches here
2065
10.7k
    if (h) {    //the new header
2066
10.7k
        sam_hdr_t *tmp = fp->bam_header;
2067
10.7k
        fp->bam_header = sam_hdr_dup(h);
2068
10.7k
        sam_hdr_destroy(tmp);
2069
10.7k
        if (!fp->bam_header && h)
2070
0
            return -1;  //failed to duplicate
2071
10.7k
    }
2072
10.7k
    return 0;
2073
10.7k
}
2074
2075
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2076
0
{
2077
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2078
0
    size_t new_l_text;
2079
0
    if (!h || !key)
2080
0
        return -1;
2081
2082
0
    if (h->l_text > 3) {
2083
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2084
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2085
0
            *p = '\0'; // for strstr call
2086
2087
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2088
2089
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2090
0
                *p = '\n'; // change back
2091
2092
                // mark the key:val
2093
0
                beg = q;
2094
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2095
0
                end = q;
2096
2097
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2098
0
                    && strlen(val) == end - beg - 4)
2099
0
                     return 0; // val is the same, no need to change
2100
2101
0
            } else {
2102
0
                beg = end = p;
2103
0
                *p = '\n';
2104
0
            }
2105
0
        }
2106
0
    }
2107
0
    if (beg == NULL) { // no @HD
2108
0
        new_l_text = h->l_text;
2109
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2110
0
            return -1;
2111
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2112
0
        if (val) {
2113
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2114
0
                return -1;
2115
0
            new_l_text += strlen(val) + 4;
2116
0
        }
2117
0
        newtext = (char*)malloc(new_l_text + 1);
2118
0
        if (!newtext) return -1;
2119
2120
0
        if (val)
2121
0
            snprintf(newtext, new_l_text + 1,
2122
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2123
0
        else
2124
0
            snprintf(newtext, new_l_text + 1,
2125
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2126
0
    } else { // has @HD but different or no key
2127
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2128
0
        if (val) {
2129
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2130
0
                return -1;
2131
0
            new_l_text += strlen(val) + 4;
2132
0
        }
2133
0
        newtext = (char*)malloc(new_l_text + 1);
2134
0
        if (!newtext) return -1;
2135
2136
0
        if (val) {
2137
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2138
0
                    (int) (beg - h->text), h->text, key, val, end);
2139
0
        } else { //delete key
2140
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2141
0
                    (int) (beg - h->text), h->text, end);
2142
0
        }
2143
0
    }
2144
0
    free(h->text);
2145
0
    h->text = newtext;
2146
0
    h->l_text = new_l_text;
2147
0
    return 0;
2148
0
}
2149
2150
2151
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2152
0
{
2153
0
    if (!h || !key)
2154
0
        return -1;
2155
2156
0
    if (!h->hrecs)
2157
0
        return old_sam_hdr_change_HD(h, key, val);
2158
2159
0
    if (val) {
2160
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2161
0
            return -1;
2162
0
    } else {
2163
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2164
0
            return -1;
2165
0
    }
2166
0
    return sam_hdr_rebuild(h);
2167
0
}
2168
2169
/* releases existing header and sets new one; increments ref count if not
2170
duplicating */
2171
int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate)
2172
0
{
2173
0
    if (!fp)
2174
0
        return -1;
2175
2176
0
    if (duplicate) {
2177
0
        sam_hdr_t *tmp = fp->bam_header;
2178
0
        fp->bam_header = sam_hdr_dup(h);
2179
0
        sam_hdr_destroy(tmp);
2180
0
        if (!fp->bam_header && h)
2181
0
            return -1;  //duplicate failed
2182
0
    } else {
2183
0
        if (fp->bam_header != h) {  //if not the same
2184
0
            sam_hdr_destroy(fp->bam_header);
2185
0
            fp->bam_header = h;
2186
0
            sam_hdr_incr_ref(fp->bam_header);
2187
0
        }
2188
0
    }
2189
2190
0
    return 0;
2191
0
}
2192
2193
//return the bam_header, user has to use sam_hdr_incr_ref where ever required
2194
sam_hdr_t* sam_hdr_get(samFile* fp)
2195
0
{
2196
0
    if (!fp)
2197
0
        return NULL;
2198
0
    return fp->bam_header;
2199
0
}
2200
2201
/**********************
2202
 *** SAM record I/O ***
2203
 **********************/
2204
2205
// The speed of this code can vary considerably depending on minor code
2206
// changes elsewhere as some of the tight loops are particularly prone to
2207
// speed changes when the instruction blocks are split over a 32-byte
2208
// boundary.  To protect against this, we explicitly specify an alignment
2209
// for this function.  If this is insufficient, we may also wish to
2210
// consider alignment of blocks within this function via
2211
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2212
// However it's not very portable.
2213
// Instead we break into separate functions so we can explicitly specify
2214
// use __attribute__((aligned(32))) instead and force consistent loop
2215
// alignment.
2216
305k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2217
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2218
305k
    if (*n > INT32_MAX*0.666) {
2219
0
        errno = ENOMEM;
2220
0
        return -1;
2221
0
    }
2222
2223
305k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2224
305k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2225
0
        hts_log_error("Out of memory");
2226
0
        return -1;
2227
0
    }
2228
2229
305k
    (*n)+=*n>>1;
2230
305k
    return 0;
2231
305k
}
2232
2233
2234
// This ensures that q always ends up at the next comma after
2235
// reading a number even if it's followed by junk.  It
2236
// prevents the possibility of trying to read more than n items.
2237
18.7M
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2238
2239
HTS_ALIGN32
2240
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2241
30.6k
                               uint32_t *nalloc, int *overflow) {
2242
2.27M
    while (*q == ',') {
2243
2.24M
        if ((*nused)++ >= (*nalloc)) {
2244
730
            if (grow_B_array(b, nalloc, 1) < 0)
2245
0
                return NULL;
2246
730
        }
2247
2.24M
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2248
2.24M
        b->l_data++;
2249
2.24M
    }
2250
30.6k
    return q;
2251
30.6k
}
2252
2253
HTS_ALIGN32
2254
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2255
28.6k
                               uint32_t *nalloc, int *overflow) {
2256
1.38M
    while (*q == ',') {
2257
1.35M
        if ((*nused)++ >= (*nalloc)) {
2258
3.90k
            if (grow_B_array(b, nalloc, 1) < 0)
2259
0
                return NULL;
2260
3.90k
        }
2261
1.35M
        if (q[1] != '-') {
2262
1.32M
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2263
1.32M
            b->l_data++;
2264
1.32M
        } else {
2265
31.0k
            *overflow = 1;
2266
31.0k
            q++;
2267
31.0k
            skip_to_comma_(q);
2268
31.0k
        }
2269
1.35M
    }
2270
28.6k
    return q;
2271
28.6k
}
2272
2273
HTS_ALIGN32
2274
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2275
11.3k
                               uint32_t *nalloc, int *overflow) {
2276
3.72M
    while (*q == ',') {
2277
3.71M
        if ((*nused)++ >= (*nalloc)) {
2278
5.02k
            if (grow_B_array(b, nalloc, 2) < 0)
2279
0
                return NULL;
2280
5.02k
        }
2281
3.71M
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2282
3.71M
                  b->data + b->l_data);
2283
3.71M
        b->l_data += 2;
2284
3.71M
    }
2285
11.3k
    return q;
2286
11.3k
}
2287
2288
HTS_ALIGN32
2289
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2290
6.62k
                               uint32_t *nalloc, int *overflow) {
2291
7.65M
    while (*q == ',') {
2292
7.64M
        if ((*nused)++ >= (*nalloc)) {
2293
14.2k
            if (grow_B_array(b, nalloc, 2) < 0)
2294
0
                return NULL;
2295
14.2k
        }
2296
7.64M
        if (q[1] != '-') {
2297
7.52M
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2298
7.52M
                      b->data + b->l_data);
2299
7.52M
            b->l_data += 2;
2300
7.52M
        } else {
2301
115k
            *overflow = 1;
2302
115k
            q++;
2303
115k
            skip_to_comma_(q);
2304
115k
        }
2305
7.64M
    }
2306
6.62k
    return q;
2307
6.62k
}
2308
2309
HTS_ALIGN32
2310
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2311
28.4k
                               uint32_t *nalloc, int *overflow) {
2312
16.0M
    while (*q == ',') {
2313
16.0M
        if ((*nused)++ >= (*nalloc)) {
2314
174
            if (grow_B_array(b, nalloc, 4) < 0)
2315
0
                return NULL;
2316
174
        }
2317
16.0M
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2318
16.0M
                  b->data + b->l_data);
2319
16.0M
        b->l_data += 4;
2320
16.0M
    }
2321
28.4k
    return q;
2322
28.4k
}
2323
2324
HTS_ALIGN32
2325
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2326
87.4k
                               uint32_t *nalloc, int *overflow) {
2327
3.81M
    while (*q == ',') {
2328
3.72M
        if ((*nused)++ >= (*nalloc)) {
2329
235k
            if (grow_B_array(b, nalloc, 4) < 0)
2330
0
                return NULL;
2331
235k
        }
2332
3.72M
        if (q[1] != '-') {
2333
3.59M
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2334
3.59M
                      b->data + b->l_data);
2335
3.59M
            b->l_data += 4;
2336
3.59M
        } else {
2337
138k
            *overflow = 1;
2338
138k
            q++;
2339
138k
            skip_to_comma_(q);
2340
138k
        }
2341
3.72M
    }
2342
87.4k
    return q;
2343
87.4k
}
2344
2345
HTS_ALIGN32
2346
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2347
15.0k
                               uint32_t *nalloc, int *overflow) {
2348
371k
    while (*q == ',') {
2349
356k
        if ((*nused)++ >= (*nalloc)) {
2350
45.9k
            if (grow_B_array(b, nalloc, 4) < 0)
2351
0
                return NULL;
2352
45.9k
        }
2353
356k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2354
356k
        b->l_data += 4;
2355
356k
    }
2356
15.0k
    return q;
2357
15.0k
}
2358
2359
HTS_ALIGN32
2360
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2361
                              char **end, bam1_t *b,
2362
208k
                              int *ctr) {
2363
    // Protect against infinite recursion when dealing with invalid input.
2364
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2365
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2366
    //
2367
    // Loop detection is the safest solution incase there are other
2368
    // strange corner cases with malformed inputs.
2369
208k
    if (++(*ctr) > 2) {
2370
12
        hts_log_error("Malformed data in B:%c array", type);
2371
12
        return -1;
2372
12
    }
2373
2374
208k
    int orig_l = b->l_data;
2375
208k
    char *q = in;
2376
208k
    int32_t size;
2377
208k
    size_t bytes;
2378
208k
    int overflow = 0;
2379
2380
208k
    size = aux_type2size(type);
2381
208k
    if (size <= 0 || size > 4) {
2382
7
        hts_log_error("Unrecognized type B:%c", type);
2383
7
        return -1;
2384
7
    }
2385
2386
    // Ensure space for type + values.
2387
    // The first pass through here we don't know the number of entries and
2388
    // nalloc == 0.  We start with a small working set and then parse the
2389
    // data, growing as needed.
2390
    //
2391
    // If we have a second pass through we do know the number of entries
2392
    // and nalloc is already known.  We have no need to expand the bam data.
2393
208k
    if (!nalloc)
2394
145k
         nalloc=7;
2395
2396
    // Ensure allocated memory is big enough (for current nalloc estimate)
2397
208k
    bytes = (size_t) nalloc * (size_t) size;
2398
208k
    if (bytes / size != nalloc
2399
208k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2400
0
        hts_log_error("Out of memory");
2401
0
        return -1;
2402
0
    }
2403
2404
208k
    uint32_t nused = 0;
2405
2406
208k
    b->data[b->l_data++] = 'B';
2407
208k
    b->data[b->l_data++] = type;
2408
    // 32-bit B-array length is inserted later once we know it.
2409
208k
    int b_len_idx = b->l_data;
2410
208k
    b->l_data += sizeof(uint32_t);
2411
2412
208k
    if (type == 'c') {
2413
30.6k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2414
0
            return -1;
2415
177k
    } else if (type == 'C') {
2416
28.6k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2417
0
            return -1;
2418
148k
    } else if (type == 's') {
2419
11.3k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2420
0
            return -1;
2421
137k
    } else if (type == 'S') {
2422
6.62k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2423
0
            return -1;
2424
130k
    } else if (type == 'i') {
2425
28.4k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2426
0
            return -1;
2427
102k
    } else if (type == 'I') {
2428
87.4k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2429
0
            return -1;
2430
87.4k
    } else if (type == 'f') {
2431
15.0k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2432
0
            return -1;
2433
15.0k
    }
2434
208k
    if (*q != '\t' && *q != '\0') {
2435
        // Unknown B array type or junk in the numbers
2436
211
        hts_log_error("Malformed B:%c", type);
2437
211
        return -1;
2438
211
    }
2439
208k
    i32_to_le(nused, b->data + b_len_idx);
2440
2441
208k
    if (!overflow) {
2442
145k
        *end = q;
2443
145k
        return 0;
2444
145k
    } else {
2445
63.0k
        int64_t max = 0, min = 0, val;
2446
        // Given type was incorrect.  Try to rescue the situation.
2447
63.0k
        char *r = q;
2448
63.0k
        q = in;
2449
63.0k
        overflow = 0;
2450
63.0k
        b->l_data = orig_l;
2451
        // Find out what range of values is present
2452
17.3M
        while (q < r) {
2453
17.3M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2454
17.3M
            if (max < val) max = val;
2455
17.3M
            if (min > val) min = val;
2456
17.3M
            skip_to_comma_(q);
2457
17.3M
        }
2458
        // Retry with appropriate type
2459
63.0k
        if (!overflow) {
2460
62.9k
            if (min < 0) {
2461
62.4k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2462
29.7k
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2463
32.6k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2464
4.23k
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2465
28.4k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2466
28.3k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2467
28.3k
                }
2468
62.4k
            } else {
2469
584
                if (max < UINT8_MAX) {
2470
20
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2471
564
                } else if (max <= UINT16_MAX) {
2472
156
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2473
408
                } else if (max <= UINT32_MAX) {
2474
402
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2475
402
                }
2476
584
            }
2477
62.9k
        }
2478
        // If here then at least one of the values is too big to store
2479
58
        hts_log_error("Numeric value in B array out of allowed range");
2480
58
        return -1;
2481
63.0k
    }
2482
208k
#undef skip_to_comma_
2483
208k
}
2484
2485
HTS_ALIGN32
2486
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2487
145k
{
2488
145k
    int ctr = 0;
2489
145k
    uint32_t nalloc = 0;
2490
145k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2491
145k
}
2492
2493
395k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2494
395k
    if (*v >= '1' && *v <= '9') {
2495
118k
        return hts_str2uint(v, rv, 16, overflow);
2496
118k
    }
2497
277k
    else if (*v == '0') {
2498
        // handle single-digit "0" directly; otherwise it's hex or octal
2499
96.0k
        if (v[1] == '\t') { *rv = v+1; return 0; }
2500
1.40k
        else {
2501
1.40k
            unsigned long val = strtoul(v, rv, 0);
2502
1.40k
            if (val > 65535) { *overflow = 1; return 65535; }
2503
1.39k
            return val;
2504
1.40k
        }
2505
96.0k
    }
2506
180k
    else {
2507
        // TODO implement symbolic flag letters
2508
180k
        *rv = v;
2509
180k
        return 0;
2510
180k
    }
2511
395k
}
2512
2513
// Parse tag line and append to bam object b.
2514
// Shared by both SAM and FASTQ parsers.
2515
//
2516
// The difference between the two is how lenient we are to recognising
2517
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2518
// non-SAM looking strings.
2519
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2520
393k
                            khash_t(tag) *tag_whitelist) {
2521
393k
    int overflow = 0;
2522
393k
    int checkpoint;
2523
393k
    char logbuf[40];
2524
393k
    char *q = start, *p = end;
2525
2526
393k
#define _parse_err(cond, ...)                   \
2527
8.69M
    do {                                        \
2528
18.9M
        if (cond) {                             \
2529
764
            if (lenient) {                      \
2530
0
                while (q < p && !isspace_c(*q))   \
2531
0
                    q++;                        \
2532
0
                while (q < p && isspace_c(*q))    \
2533
0
                    q++;                        \
2534
0
                b->l_data = checkpoint;         \
2535
0
                goto loop;                      \
2536
764
            } else {                            \
2537
764
                hts_log_error(__VA_ARGS__);     \
2538
764
                goto err_ret;                   \
2539
764
            }                                   \
2540
764
        }                                       \
2541
8.69M
    } while (0)
2542
2543
8.16M
    while (q < p) loop: {
2544
8.16M
        char type;
2545
8.16M
        checkpoint = b->l_data;
2546
8.16M
        if (p - q < 5) {
2547
61
            if (lenient) {
2548
0
                break;
2549
61
            } else {
2550
61
                hts_log_error("Incomplete aux field");
2551
61
                goto err_ret;
2552
61
            }
2553
61
        }
2554
4.08M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2555
2556
4.08M
        if (lenient && (q[2] | q[4]) != ':') {
2557
0
            while (q < p && !isspace_c(*q))
2558
0
                q++;
2559
0
            while (q < p && isspace_c(*q))
2560
0
                q++;
2561
0
            continue;
2562
0
        }
2563
2564
4.08M
        if (tag_whitelist) {
2565
0
            int tt = q[0]*256 + q[1];
2566
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2567
0
                while (q < p && *q != '\t')
2568
0
                    q++;
2569
0
                continue;
2570
0
            }
2571
0
        }
2572
2573
        // Copy over id
2574
4.08M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2575
4.08M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2576
4.08M
        q += 3; type = *q++; ++q; // q points to value
2577
4.08M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2578
3.30M
            _parse_err(*q <= '\t', "incomplete aux field");
2579
2580
        // Ensure enough space for a double + type allocated.
2581
4.08M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2582
2583
4.08M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2584
1.19M
            b->data[b->l_data++] = 'A';
2585
1.19M
            b->data[b->l_data++] = *q++;
2586
2.88M
        } else if (type == 'i' || type == 'I') {
2587
1.87M
            if (*q == '-') {
2588
1.52M
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2589
1.52M
                if (x >= INT8_MIN) {
2590
809k
                    b->data[b->l_data++] = 'c';
2591
809k
                    b->data[b->l_data++] = x;
2592
809k
                } else if (x >= INT16_MIN) {
2593
203k
                    b->data[b->l_data++] = 's';
2594
203k
                    i16_to_le(x, b->data + b->l_data);
2595
203k
                    b->l_data += 2;
2596
514k
                } else {
2597
514k
                    b->data[b->l_data++] = 'i';
2598
514k
                    i32_to_le(x, b->data + b->l_data);
2599
514k
                    b->l_data += 4;
2600
514k
                }
2601
1.52M
            } else {
2602
345k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2603
345k
                if (x <= UINT8_MAX) {
2604
206k
                    b->data[b->l_data++] = 'C';
2605
206k
                    b->data[b->l_data++] = x;
2606
206k
                } else if (x <= UINT16_MAX) {
2607
107k
                    b->data[b->l_data++] = 'S';
2608
107k
                    u16_to_le(x, b->data + b->l_data);
2609
107k
                    b->l_data += 2;
2610
107k
                } else {
2611
31.6k
                    b->data[b->l_data++] = 'I';
2612
31.6k
                    u32_to_le(x, b->data + b->l_data);
2613
31.6k
                    b->l_data += 4;
2614
31.6k
                }
2615
345k
            }
2616
1.87M
        } else if (type == 'f') {
2617
47.4k
            b->data[b->l_data++] = 'f';
2618
47.4k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2619
47.4k
            b->l_data += sizeof(float);
2620
964k
        } else if (type == 'd') {
2621
43.7k
            b->data[b->l_data++] = 'd';
2622
43.7k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2623
43.7k
            b->l_data += sizeof(double);
2624
920k
        } else if (type == 'Z' || type == 'H') {
2625
775k
            char *end = strchr(q, '\t');
2626
775k
            if (!end) end = q + strlen(q);
2627
775k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2628
775k
                       "hex field does not have an even number of digits");
2629
775k
            b->data[b->l_data++] = type;
2630
775k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2631
775k
            memcpy(b->data + b->l_data, q, end - q);
2632
775k
            b->l_data += end - q;
2633
775k
            b->data[b->l_data++] = '\0';
2634
775k
            q = end;
2635
775k
        } else if (type == 'B') {
2636
145k
            type = *q++; // q points to the first ',' following the typing byte
2637
145k
            _parse_err(*q && *q != ',' && *q != '\t',
2638
145k
                       "B aux field type not followed by ','");
2639
2640
145k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2641
288
                goto err_ret;
2642
145k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2643
2644
22.1M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2645
4.07M
        q++;
2646
4.07M
    }
2647
2648
392k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2649
392k
#undef _parse_err
2650
2651
392k
    return 0;
2652
2653
1.11k
err_ret:
2654
1.11k
    return -2;
2655
392k
}
2656
2657
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2658
395k
{
2659
1.62M
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2660
2661
395k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2662
2663
// Macro that operates on 64-bits at a time.
2664
395k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2665
395k
    do {                                                        \
2666
319k
        uint64_u *from8 = (uint64_u *)(from);                   \
2667
319k
        uint64_u *to8 = (uint64_u *)(to);                       \
2668
319k
        uint64_t uflow = 0;                                     \
2669
319k
        size_t l8 = (l)>>3, i;                                  \
2670
319k
        for (i = 0; i < l8; i++) {                              \
2671
141
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2672
141
            uflow |= to8[i];                                    \
2673
141
        }                                                       \
2674
323k
        for (i<<=3; i < (l); ++i) {                             \
2675
3.80k
            to[i] = from[i] - (n);                              \
2676
3.80k
            uflow |= to[i];                                     \
2677
3.80k
        }                                                       \
2678
319k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2679
319k
    } while (0)
2680
2681
#else
2682
2683
// Basic version which operates a byte at a time
2684
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2685
        uint8_t uflow = 0;                                   \
2686
        for (i = 0; i < (l); ++i) {                          \
2687
            (to)[i] = (from)[i] - (n);                       \
2688
            uflow |= (uint8_t) (to)[i];                      \
2689
        }                                                    \
2690
        failed = (uflow & 0x80) > 0;                         \
2691
    } while (0)
2692
2693
#endif
2694
2695
698k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2696
5.09M
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2697
1.43M
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2698
2699
395k
    uint8_t *t;
2700
2701
395k
    char *p = s->s, *q;
2702
395k
    int i, overflow = 0;
2703
395k
    char logbuf[40];
2704
395k
    hts_pos_t cigreflen;
2705
395k
    bam1_core_t *c = &b->core;
2706
2707
395k
    b->l_data = 0;
2708
395k
    memset(c, 0, 32);
2709
2710
    // qname
2711
395k
    q = _read_token(p);
2712
2713
395k
    _parse_warn(p - q <= 1, "empty query name");
2714
395k
    _parse_err(p - q > 255, "query name too long");
2715
    // resize large enough for name + extranul
2716
395k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2717
395k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2718
2719
395k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2720
395k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2721
395k
    b->l_data += c->l_extranul;
2722
2723
395k
    c->l_qname = p - q + c->l_extranul;
2724
2725
    // flag
2726
395k
    c->flag = parse_sam_flag(p, &p, &overflow);
2727
395k
    if (*p++ != '\t') goto err_ret; // malformated flag
2728
2729
    // chr
2730
395k
    q = _read_token(p);
2731
395k
    if (strcmp(q, "*")) {
2732
360k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2733
360k
        c->tid = bam_name2id(h, q);
2734
360k
        _parse_err(c->tid < -1, "failed to parse header");
2735
360k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2736
360k
    } else c->tid = -1;
2737
2738
    // pos
2739
395k
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
2740
395k
    if (*p++ != '\t') goto err_ret;
2741
394k
    if (c->pos < 0 && c->tid >= 0) {
2742
63.3k
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2743
63.3k
        c->tid = -1;
2744
63.3k
    }
2745
394k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2746
2747
    // mapq
2748
394k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2749
394k
    if (*p++ != '\t') goto err_ret;
2750
    // cigar
2751
394k
    if (*p != '*') {
2752
349k
        uint32_t *cigar = NULL;
2753
349k
        int old_l_data = b->l_data;
2754
349k
        int n_cigar = bam_parse_cigar(p, &p, b);
2755
349k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2756
348k
        cigar = (uint32_t *)(b->data + old_l_data);
2757
2758
        // can't use bam_endpos() directly as some fields not yet set up
2759
348k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2760
348k
        if (cigreflen == 0) cigreflen = 1;
2761
348k
    } else {
2762
45.6k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2763
45.6k
        c->flag |= BAM_FUNMAP;
2764
45.6k
        q = _read_token(p);
2765
45.6k
        cigreflen = 1;
2766
45.6k
    }
2767
393k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2768
393k
               "read ends beyond highest supported position");
2769
393k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2770
    // mate chr
2771
393k
    q = _read_token(p);
2772
393k
    if (strcmp(q, "=") == 0) {
2773
1
        c->mtid = c->tid;
2774
393k
    } else if (strcmp(q, "*") == 0) {
2775
17
        c->mtid = -1;
2776
393k
    } else {
2777
393k
        c->mtid = bam_name2id(h, q);
2778
393k
        _parse_err(c->mtid < -1, "failed to parse header");
2779
393k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2780
393k
    }
2781
    // mpos
2782
393k
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
2783
393k
    if (*p++ != '\t') goto err_ret;
2784
393k
    if (c->mpos < 0 && c->mtid >= 0) {
2785
173k
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2786
173k
        c->mtid = -1;
2787
173k
    }
2788
    // tlen
2789
393k
    c->isize = hts_str2int(p, &p, 63, &overflow);
2790
393k
    if (*p++ != '\t') goto err_ret;
2791
393k
    _parse_err(overflow, "number outside allowed range");
2792
    // seq
2793
393k
    q = _read_token(p);
2794
393k
    if (strcmp(q, "*")) {
2795
305k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2796
305k
        c->l_qseq = p - q - 1;
2797
305k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2798
305k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2799
305k
        i = (c->l_qseq + 1) >> 1;
2800
305k
        _get_mem(uint8_t, &t, b, i);
2801
2802
305k
        unsigned int lqs2 = c->l_qseq&~1, i;
2803
324k
        for (i = 0; i < lqs2; i+=2)
2804
18.5k
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2805
380k
        for (; i < c->l_qseq; ++i)
2806
74.5k
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2807
305k
    } else c->l_qseq = 0;
2808
    // qual
2809
786k
    _get_mem(uint8_t, &t, b, c->l_qseq);
2810
786k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2811
73.6k
        memset(t, 0xff, c->l_qseq);
2812
73.6k
        p += 2;
2813
319k
    } else {
2814
319k
        int failed = 0;
2815
319k
        _parse_err(s->l - (p - s->s) < c->l_qseq
2816
319k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2817
319k
                   "SEQ and QUAL are of different length");
2818
319k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2819
319k
        _parse_err(failed, "invalid QUAL character");
2820
319k
        p += c->l_qseq + 1;
2821
319k
    }
2822
2823
    // aux
2824
393k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
2825
1.11k
        goto err_ret;
2826
2827
392k
    if (bam_tag2cigar(b, 1, 1) < 0)
2828
0
        return -2;
2829
392k
    return 0;
2830
2831
0
#undef _parse_warn
2832
0
#undef _parse_err
2833
0
#undef _get_mem
2834
0
#undef _read_token
2835
3.57k
err_ret:
2836
3.57k
    return -2;
2837
392k
}
2838
2839
349k
static uint32_t read_ncigar(const char *q) {
2840
349k
    uint32_t n_cigar = 0;
2841
3.77M
    for (; *q && *q != '\t'; ++q)
2842
3.42M
        if (!isdigit_c(*q)) ++n_cigar;
2843
349k
    if (!n_cigar) {
2844
127
        hts_log_error("No CIGAR operations");
2845
127
        return 0;
2846
127
    }
2847
348k
    if (n_cigar >= 2147483647) {
2848
0
        hts_log_error("Too many CIGAR operations");
2849
0
        return 0;
2850
0
    }
2851
2852
348k
    return n_cigar;
2853
348k
}
2854
2855
/*! @function
2856
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2857
 @param  in      [in]  pointer to the source string
2858
 @param  a_cigar [out]  address of the destination uint32_t buffer
2859
 @return         number of processed input characters; 0 on error
2860
 */
2861
348k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2862
348k
    int i, overflow = 0;
2863
348k
    const char *p = in;
2864
1.07M
    for (i = 0; i < n_cigar; i++) {
2865
723k
        uint32_t len;
2866
723k
        int op;
2867
723k
        char *q;
2868
723k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2869
723k
        if (q == p) {
2870
189
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2871
189
            return 0;
2872
189
        }
2873
723k
        if (overflow) {
2874
50
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2875
50
            return 0;
2876
50
        }
2877
723k
        p = q;
2878
723k
        op = bam_cigar_table[(unsigned char)*p++];
2879
723k
        if (op < 0) {
2880
329
            hts_log_error("Unrecognized CIGAR operator");
2881
329
            return 0;
2882
329
        }
2883
723k
        a_cigar[i] = len;
2884
723k
        a_cigar[i] |= op;
2885
723k
    }
2886
2887
348k
    return p-in;
2888
348k
}
2889
2890
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2891
0
    size_t n_cigar = 0;
2892
0
    int diff;
2893
2894
0
    if (!in || !a_cigar || !a_mem) {
2895
0
        hts_log_error("NULL pointer arguments");
2896
0
        return -1;
2897
0
    }
2898
0
    if (end) *end = (char *)in;
2899
2900
0
    if (*in == '*') {
2901
0
        if (end) (*end)++;
2902
0
        return 0;
2903
0
    }
2904
0
    n_cigar = read_ncigar(in);
2905
0
    if (!n_cigar) return 0;
2906
0
    if (n_cigar > *a_mem) {
2907
0
        uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2908
0
        if (a_tmp) {
2909
0
            *a_cigar = a_tmp;
2910
0
            *a_mem = n_cigar;
2911
0
        } else {
2912
0
            hts_log_error("Memory allocation error");
2913
0
            return -1;
2914
0
        }
2915
0
    }
2916
2917
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2918
0
    if (end) *end = (char *)in+diff;
2919
2920
0
    return n_cigar;
2921
0
}
2922
2923
349k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2924
349k
    size_t n_cigar = 0;
2925
349k
    int diff;
2926
2927
349k
    if (!in || !b) {
2928
0
        hts_log_error("NULL pointer arguments");
2929
0
        return -1;
2930
0
    }
2931
349k
    if (end) *end = (char *)in;
2932
2933
349k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
2934
349k
    if (!n_cigar && b->core.n_cigar == 0) {
2935
127
        if (end) *end = (char *)in+1;
2936
127
        return 0;
2937
127
    }
2938
2939
348k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
2940
348k
    if (cig_diff > 0 &&
2941
348k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
2942
0
        hts_log_error("Memory allocation error");
2943
0
        return -1;
2944
0
    }
2945
2946
348k
    uint32_t *cig = bam_get_cigar(b);
2947
348k
    if ((uint8_t *)cig != b->data + b->l_data) {
2948
        // Modifying an BAM existing BAM record
2949
0
        uint8_t  *seq = bam_get_seq(b);
2950
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
2951
0
    }
2952
2953
348k
    if (n_cigar) {
2954
348k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
2955
568
            return -1;
2956
348k
    } else {
2957
0
        diff = 1; // handle "*"
2958
0
    }
2959
2960
348k
    b->l_data += cig_diff * sizeof(uint32_t);
2961
348k
    b->core.n_cigar = n_cigar;
2962
348k
    if (end) *end = (char *)in + diff;
2963
2964
348k
    return n_cigar;
2965
348k
}
2966
2967
/*
2968
 * -----------------------------------------------------------------------------
2969
 * SAM threading
2970
 */
2971
// Size of SAM text block (reading)
2972
0
#define SAM_NBYTES 240000
2973
2974
// Number of BAM records (writing, up to NB_mem in size)
2975
0
#define SAM_NBAM 1000
2976
2977
struct SAM_state;
2978
2979
// Output job - a block of BAM records
2980
typedef struct sp_bams {
2981
    struct sp_bams *next;
2982
    int serial;
2983
2984
    bam1_t *bams;
2985
    int nbams, abams; // used and alloc for bams[] array
2986
    size_t bam_mem;   // very approximate total size
2987
2988
    struct SAM_state *fd;
2989
} sp_bams;
2990
2991
// Input job - a block of SAM text
2992
typedef struct sp_lines {
2993
    struct sp_lines *next;
2994
    int serial;
2995
2996
    char *data;
2997
    int data_size;
2998
    int alloc;
2999
3000
    struct SAM_state *fd;
3001
    sp_bams *bams;
3002
} sp_lines;
3003
3004
enum sam_cmd {
3005
    SAM_NONE = 0,
3006
    SAM_CLOSE,
3007
    SAM_CLOSE_DONE,
3008
    SAM_AT_EOF,
3009
};
3010
3011
typedef struct SAM_state {
3012
    sam_hdr_t *h;
3013
3014
    hts_tpool *p;
3015
    int own_pool;
3016
    pthread_mutex_t lines_m;
3017
    hts_tpool_process *q;
3018
    pthread_t dispatcher;
3019
    int dispatcher_set;
3020
3021
    sp_lines *lines;
3022
    sp_bams *bams;
3023
3024
    sp_bams *curr_bam;
3025
    int curr_idx;
3026
    int serial;
3027
3028
    // Be warned: moving these mutexes around in this struct can reduce
3029
    // threading performance by up to 70%!
3030
    pthread_mutex_t command_m;
3031
    pthread_cond_t command_c;
3032
    enum sam_cmd command;
3033
3034
    // One of the E* errno codes
3035
    int errcode;
3036
3037
    htsFile *fp;
3038
} SAM_state;
3039
3040
// Returns a SAM_state struct from a generic hFILE.
3041
//
3042
// Returns NULL on failure.
3043
0
static SAM_state *sam_state_create(htsFile *fp) {
3044
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3045
    // be a redirect call with an additional 'S' mode.  This in turn would
3046
    // correctly set the designed format to sam instead of a generic
3047
    // text_format.
3048
0
    if (fp->format.format != sam && fp->format.format != text_format)
3049
0
        return NULL;
3050
3051
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3052
0
    if (!fd)
3053
0
        return NULL;
3054
3055
0
    fp->state = fd;
3056
0
    fd->fp = fp;
3057
3058
0
    return fd;
3059
0
}
3060
3061
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3062
static void *sam_format_worker(void *arg);
3063
3064
0
static void sam_state_err(SAM_state *fd, int errcode) {
3065
0
    pthread_mutex_lock(&fd->command_m);
3066
0
    if (!fd->errcode)
3067
0
        fd->errcode = errcode;
3068
0
    pthread_mutex_unlock(&fd->command_m);
3069
0
}
3070
3071
0
static void sam_free_sp_bams(sp_bams *b) {
3072
0
    if (!b)
3073
0
        return;
3074
3075
0
    if (b->bams) {
3076
0
        int i;
3077
0
        for (i = 0; i < b->abams; i++) {
3078
0
            if (b->bams[i].data)
3079
0
                free(b->bams[i].data);
3080
0
        }
3081
0
        free(b->bams);
3082
0
    }
3083
0
    free(b);
3084
0
}
3085
3086
// Destroys the state produce by sam_state_create.
3087
12.6k
int sam_state_destroy(htsFile *fp) {
3088
12.6k
    int ret = 0;
3089
3090
12.6k
    if (!fp->state)
3091
12.6k
        return 0;
3092
3093
0
    SAM_state *fd = fp->state;
3094
0
    if (fd->p) {
3095
0
        if (fd->h) {
3096
            // Notify sam_dispatcher we're closing
3097
0
            pthread_mutex_lock(&fd->command_m);
3098
0
            if (fd->command != SAM_CLOSE_DONE)
3099
0
                fd->command = SAM_CLOSE;
3100
0
            pthread_cond_signal(&fd->command_c);
3101
0
            ret = -fd->errcode;
3102
0
            if (fd->q)
3103
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3104
3105
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3106
0
                for (;;) {
3107
                    // Avoid deadlocks with dispatcher
3108
0
                    if (fd->command == SAM_CLOSE_DONE)
3109
0
                        break;
3110
0
                    hts_tpool_wake_dispatch(fd->q);
3111
0
                    pthread_mutex_unlock(&fd->command_m);
3112
0
                    hts_usleep(10000);
3113
0
                    pthread_mutex_lock(&fd->command_m);
3114
0
                }
3115
0
            }
3116
0
            pthread_mutex_unlock(&fd->command_m);
3117
3118
0
            if (fp->is_write) {
3119
                // Dispatch the last partial block.
3120
0
                sp_bams *gb = fd->curr_bam;
3121
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3122
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3123
3124
                // Flush and drain output
3125
0
                if (fd->q)
3126
0
                    hts_tpool_process_flush(fd->q);
3127
0
                pthread_mutex_lock(&fd->command_m);
3128
0
                if (!ret) ret = -fd->errcode;
3129
0
                pthread_mutex_unlock(&fd->command_m);
3130
3131
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3132
0
                    hts_usleep(10000);
3133
0
                    pthread_mutex_lock(&fd->command_m);
3134
0
                    ret = -fd->errcode;
3135
                    // not empty but shutdown implies error
3136
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3137
0
                        ret = EIO;
3138
0
                    pthread_mutex_unlock(&fd->command_m);
3139
0
                }
3140
0
                if (fd->q)
3141
0
                    hts_tpool_process_shutdown(fd->q);
3142
0
            }
3143
3144
            // Wait for it to acknowledge
3145
0
            if (fd->dispatcher_set)
3146
0
                pthread_join(fd->dispatcher, NULL);
3147
0
            if (!ret) ret = -fd->errcode;
3148
0
        }
3149
3150
        // Tidy up memory
3151
0
        if (fd->q)
3152
0
            hts_tpool_process_destroy(fd->q);
3153
3154
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3155
0
            hts_tpool_destroy(fd->p);
3156
0
            fd->p = NULL;
3157
0
        }
3158
0
        pthread_mutex_destroy(&fd->lines_m);
3159
0
        pthread_mutex_destroy(&fd->command_m);
3160
0
        pthread_cond_destroy(&fd->command_c);
3161
3162
0
        sp_lines *l = fd->lines;
3163
0
        while (l) {
3164
0
            sp_lines *n = l->next;
3165
0
            free(l->data);
3166
0
            free(l);
3167
0
            l = n;
3168
0
        }
3169
3170
0
        sp_bams *b = fd->bams;
3171
0
        while (b) {
3172
0
            if (fd->curr_bam == b)
3173
0
                fd->curr_bam = NULL;
3174
0
            sp_bams *n = b->next;
3175
0
            sam_free_sp_bams(b);
3176
0
            b = n;
3177
0
        }
3178
3179
0
        if (fd->curr_bam)
3180
0
            sam_free_sp_bams(fd->curr_bam);
3181
3182
        // Decrement counter by one, maybe destroying too.
3183
        // This is to permit the caller using bam_hdr_destroy
3184
        // before sam_close without triggering decode errors
3185
        // in the background threads.
3186
0
        bam_hdr_destroy(fd->h);
3187
0
    }
3188
3189
0
    free(fp->state);
3190
0
    fp->state = NULL;
3191
0
    return ret;
3192
12.6k
}
3193
3194
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3195
0
static void cleanup_sp_lines(void *arg) {
3196
0
    sp_lines *gl = (sp_lines *)arg;
3197
0
    if (!gl) return;
3198
3199
    // Should always be true for lines passed to / from thread workers.
3200
0
    assert(gl->next == NULL);
3201
3202
0
    free(gl->data);
3203
0
    sam_free_sp_bams(gl->bams);
3204
0
    free(gl);
3205
0
}
3206
3207
// Run from one of the worker threads.
3208
// Convert a passed in array of lines to array of BAMs, returning
3209
// the result back to the thread queue.
3210
0
static void *sam_parse_worker(void *arg) {
3211
0
    sp_lines *gl = (sp_lines *)arg;
3212
0
    sp_bams *gb = NULL;
3213
0
    char *lines = gl->data;
3214
0
    int i;
3215
0
    bam1_t *b;
3216
0
    SAM_state *fd = gl->fd;
3217
3218
    // Use a block of BAM structs we had earlier if available.
3219
0
    pthread_mutex_lock(&fd->lines_m);
3220
0
    if (fd->bams) {
3221
0
        gb = fd->bams;
3222
0
        fd->bams = gb->next;
3223
0
    }
3224
0
    pthread_mutex_unlock(&fd->lines_m);
3225
3226
0
    if (gb == NULL) {
3227
0
        gb = calloc(1, sizeof(*gb));
3228
0
        if (!gb) {
3229
0
            return NULL;
3230
0
        }
3231
0
        gb->abams = 100;
3232
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3233
0
        if (!gb->bams) {
3234
0
            sam_state_err(fd, ENOMEM);
3235
0
            goto err;
3236
0
        }
3237
0
        gb->nbams = 0;
3238
0
        gb->bam_mem = 0;
3239
0
    }
3240
0
    gb->serial = gl->serial;
3241
0
    gb->next = NULL;
3242
3243
0
    b = (bam1_t *)gb->bams;
3244
0
    if (!b) {
3245
0
        sam_state_err(fd, ENOMEM);
3246
0
        goto err;
3247
0
    }
3248
3249
0
    i = 0;
3250
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3251
0
    while (cp < cp_end) {
3252
0
        if (i >= gb->abams) {
3253
0
            int old_abams = gb->abams;
3254
0
            gb->abams *= 2;
3255
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3256
0
            if (!b) {
3257
0
                gb->abams /= 2;
3258
0
                sam_state_err(fd, ENOMEM);
3259
0
                goto err;
3260
0
            }
3261
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3262
0
            gb->bams = b;
3263
0
        }
3264
3265
        // Ideally we'd get sam_parse1 to return the number of
3266
        // bytes decoded and to be able to stop on newline as
3267
        // well as \0.
3268
        //
3269
        // We can then avoid the additional strchr loop.
3270
        // It's around 6% of our CPU cost, albeit threadable.
3271
        //
3272
        // However this is an API change so for now we copy.
3273
3274
0
        char *nl = strchr(cp, '\n');
3275
0
        char *line_end;
3276
0
        if (nl) {
3277
0
            line_end = nl;
3278
0
            if (line_end > cp && *(line_end - 1) == '\r')
3279
0
                line_end--;
3280
0
            nl++;
3281
0
        } else {
3282
0
            nl = line_end = cp_end;
3283
0
        }
3284
0
        *line_end = '\0';
3285
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3286
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3287
0
            sam_state_err(fd, errno ? errno : EIO);
3288
0
            cleanup_sp_lines(gl);
3289
0
            goto err;
3290
0
        }
3291
3292
0
        cp = nl;
3293
0
        i++;
3294
0
    }
3295
0
    gb->nbams = i;
3296
3297
0
    pthread_mutex_lock(&fd->lines_m);
3298
0
    gl->next = fd->lines;
3299
0
    fd->lines = gl;
3300
0
    pthread_mutex_unlock(&fd->lines_m);
3301
0
    return gb;
3302
3303
0
 err:
3304
0
    sam_free_sp_bams(gb);
3305
0
    return NULL;
3306
0
}
3307
3308
0
static void *sam_parse_eof(void *arg) {
3309
0
    return NULL;
3310
0
}
3311
3312
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3313
0
static void cleanup_sp_bams(void *arg) {
3314
0
    sam_free_sp_bams((sp_bams *) arg);
3315
0
}
3316
3317
// Runs in its own thread.
3318
// Reads a block of text (SAM) and sends a new job to the thread queue to
3319
// translate this to BAM.
3320
0
static void *sam_dispatcher_read(void *vp) {
3321
0
    htsFile *fp = vp;
3322
0
    kstring_t line = {0};
3323
0
    int line_frag = 0;
3324
0
    SAM_state *fd = fp->state;
3325
0
    sp_lines *l = NULL;
3326
3327
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3328
    // matter as it will grow if necessary).
3329
0
    if (ks_resize(&line, 1000) < 0)
3330
0
        goto err;
3331
3332
0
    for (;;) {
3333
        // Check for command
3334
0
        pthread_mutex_lock(&fd->command_m);
3335
0
        switch (fd->command) {
3336
3337
0
        case SAM_CLOSE:
3338
0
            pthread_cond_signal(&fd->command_c);
3339
0
            pthread_mutex_unlock(&fd->command_m);
3340
0
            hts_tpool_process_shutdown(fd->q);
3341
0
            goto tidyup;
3342
3343
0
        default:
3344
0
            break;
3345
0
        }
3346
0
        pthread_mutex_unlock(&fd->command_m);
3347
3348
0
        pthread_mutex_lock(&fd->lines_m);
3349
0
        if (fd->lines) {
3350
            // reuse existing line buffer
3351
0
            l = fd->lines;
3352
0
            fd->lines = l->next;
3353
0
        }
3354
0
        pthread_mutex_unlock(&fd->lines_m);
3355
3356
0
        if (l == NULL) {
3357
            // none to reuse, to create a new one
3358
0
            l = calloc(1, sizeof(*l));
3359
0
            if (!l)
3360
0
                goto err;
3361
0
            l->alloc = SAM_NBYTES;
3362
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3363
0
            if (!l->data) {
3364
0
                free(l);
3365
0
                l = NULL;
3366
0
                goto err;
3367
0
            }
3368
0
            l->fd = fd;
3369
0
        }
3370
0
        l->next = NULL;
3371
3372
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3373
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3374
0
            if (!rp)
3375
0
                goto err;
3376
0
            l->alloc = line_frag+SAM_NBYTES/2;
3377
0
            l->data = rp;
3378
0
        }
3379
0
        memcpy(l->data, line.s, line_frag);
3380
3381
0
        l->data_size = line_frag;
3382
0
        ssize_t nbytes;
3383
0
    longer_line:
3384
0
        if (fp->is_bgzf)
3385
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3386
0
        else
3387
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3388
0
        if (nbytes < 0) {
3389
0
            sam_state_err(fd, errno ? errno : EIO);
3390
0
            goto err;
3391
0
        } else if (nbytes == 0)
3392
0
            break; // EOF
3393
0
        l->data_size += nbytes;
3394
3395
        // trim to last \n. Maybe \r\n, but that's still fine
3396
0
        if (nbytes == l->alloc - line_frag) {
3397
0
            char *cp_end = l->data + l->data_size;
3398
0
            char *cp = cp_end-1;
3399
3400
0
            while (cp > (char *)l->data && *cp != '\n')
3401
0
                cp--;
3402
3403
            // entire buffer is part of a single line
3404
0
            if (cp == l->data) {
3405
0
                line_frag = l->data_size;
3406
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3407
0
                if (!rp)
3408
0
                    goto err;
3409
0
                l->alloc *= 2;
3410
0
                l->data = rp;
3411
0
                assert(l->alloc >= l->data_size);
3412
0
                assert(l->alloc >= line_frag);
3413
0
                assert(l->alloc >= l->alloc - line_frag);
3414
0
                goto longer_line;
3415
0
            }
3416
0
            cp++;
3417
3418
            // line holds the remainder of our line.
3419
0
            if (ks_resize(&line, cp_end - cp) < 0)
3420
0
                goto err;
3421
0
            memcpy(line.s, cp, cp_end - cp);
3422
0
            line_frag = cp_end - cp;
3423
0
            l->data_size = l->alloc - line_frag;
3424
0
        } else {
3425
            // out of buffer
3426
0
            line_frag = 0;
3427
0
        }
3428
3429
0
        l->serial = fd->serial++;
3430
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3431
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3432
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3433
0
            goto err;
3434
0
        pthread_mutex_lock(&fd->command_m);
3435
0
        if (fd->command == SAM_CLOSE) {
3436
0
            pthread_mutex_unlock(&fd->command_m);
3437
0
            l = NULL;
3438
0
            goto tidyup;
3439
0
        }
3440
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3441
0
        pthread_mutex_unlock(&fd->command_m);
3442
0
    }
3443
3444
    // Submit a NULL sp_bams entry to act as an EOF marker
3445
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3446
0
        goto err;
3447
3448
    // At EOF, wait for close request.
3449
    // (In future if we add support for seek, this is where we need to catch it.)
3450
0
    for (;;) {
3451
0
        pthread_mutex_lock(&fd->command_m);
3452
0
        if (fd->command == SAM_NONE)
3453
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3454
0
        switch (fd->command) {
3455
0
        case SAM_CLOSE:
3456
0
            pthread_cond_signal(&fd->command_c);
3457
0
            pthread_mutex_unlock(&fd->command_m);
3458
0
            hts_tpool_process_shutdown(fd->q);
3459
0
            goto tidyup;
3460
3461
0
        default:
3462
0
            pthread_mutex_unlock(&fd->command_m);
3463
0
            break;
3464
0
        }
3465
0
    }
3466
3467
0
 tidyup:
3468
0
    pthread_mutex_lock(&fd->command_m);
3469
0
    fd->command = SAM_CLOSE_DONE;
3470
0
    pthread_cond_signal(&fd->command_c);
3471
0
    pthread_mutex_unlock(&fd->command_m);
3472
3473
0
    if (l) {
3474
0
        pthread_mutex_lock(&fd->lines_m);
3475
0
        l->next = fd->lines;
3476
0
        fd->lines = l;
3477
0
        pthread_mutex_unlock(&fd->lines_m);
3478
0
    }
3479
0
    free(line.s);
3480
3481
0
    return NULL;
3482
3483
0
 err:
3484
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3485
0
    hts_tpool_process_shutdown(fd->q);
3486
0
    goto tidyup;
3487
0
}
3488
3489
// Runs in its own thread.
3490
// Takes encoded blocks of SAM off the thread results queue and writes them
3491
// to our output stream.
3492
0
static void *sam_dispatcher_write(void *vp) {
3493
0
    htsFile *fp = vp;
3494
0
    SAM_state *fd = fp->state;
3495
0
    hts_tpool_result *r;
3496
3497
    // Iterates until result queue is shutdown, where it returns NULL.
3498
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3499
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3500
0
        if (!gl) {
3501
0
            sam_state_err(fd, ENOMEM);
3502
0
            goto err;
3503
0
        }
3504
3505
0
        if (fp->idx) {
3506
0
            sp_bams *gb = gl->bams;
3507
0
            int i = 0, count = 0;
3508
0
            while (i < gl->data_size) {
3509
0
                int j = i;
3510
0
                while (i < gl->data_size && gl->data[i] != '\n')
3511
0
                    i++;
3512
0
                if (i < gl->data_size)
3513
0
                    i++;
3514
3515
0
                if (fp->is_bgzf) {
3516
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3517
0
                        goto err;
3518
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3519
0
                        goto err;
3520
0
                } else {
3521
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3522
0
                        goto err;
3523
0
                }
3524
3525
0
                bam1_t *b = &gb->bams[count++];
3526
0
                if (fp->format.compression == bgzf) {
3527
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3528
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3529
0
                                      bgzf_tell(fp->fp.bgzf),
3530
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3531
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3532
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3533
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3534
0
                        goto err;
3535
0
                    }
3536
0
                } else {
3537
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3538
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3539
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3540
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3541
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3542
0
                        goto err;
3543
0
                    }
3544
0
                }
3545
0
            }
3546
3547
0
            assert(count == gb->nbams);
3548
3549
            // Add bam array to free-list
3550
0
            pthread_mutex_lock(&fd->lines_m);
3551
0
            gb->next = fd->bams;
3552
0
            fd->bams = gl->bams;
3553
0
            gl->bams = NULL;
3554
0
            pthread_mutex_unlock(&fd->lines_m);
3555
0
        } else {
3556
0
            if (fp->is_bgzf) {
3557
                // We keep track of how much in the current block we have
3558
                // remaining => R.  We look for the last newline in input
3559
                // [i] to [i+R], backwards => position N.
3560
                //
3561
                // If we find a newline, we write out bytes i to N.
3562
                // We know we cannot fit the next record in this bgzf block,
3563
                // so we flush what we have and copy input N to i+R into
3564
                // the start of a new block, and recompute a new R for that.
3565
                //
3566
                // If we don't find a newline (i==N) then we cannot extend
3567
                // the current block at all, so flush whatever is in it now
3568
                // if it ends on a newline.
3569
                // We still copy i(==N) to i+R to the next block and
3570
                // continue as before with a new R.
3571
                //
3572
                // The only exception on the flush is when we run out of
3573
                // data in the input.  In that case we skip it as we don't
3574
                // yet know if the next record will fit.
3575
                //
3576
                // Both conditions share the same code here:
3577
                // - Look for newline (pos N)
3578
                // - Write i to N (which maybe 0)
3579
                // - Flush if block ends on newline and not end of input
3580
                // - write N to i+R
3581
3582
0
                int i = 0;
3583
0
                BGZF *fb = fp->fp.bgzf;
3584
0
                while (i < gl->data_size) {
3585
                    // remaining space in block
3586
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3587
0
                    int eod = 0;
3588
0
                    if (R > gl->data_size-i)
3589
0
                        R = gl->data_size-i, eod = 1;
3590
3591
                    // Find last newline in input data
3592
0
                    int N = i + R;
3593
0
                    while (--N > i) {
3594
0
                        if (gl->data[N] == '\n')
3595
0
                            break;
3596
0
                    }
3597
3598
0
                    if (N != i) {
3599
                        // Found a newline
3600
0
                        N++;
3601
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3602
0
                            goto err;
3603
0
                    }
3604
3605
                    // Flush bgzf block
3606
0
                    int b_off = fb->block_offset;
3607
0
                    if (!eod && b_off &&
3608
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3609
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3610
0
                            goto err;
3611
3612
                    // Copy from N onwards into next block
3613
0
                    if (i+R > N)
3614
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3615
0
                            != i+R - N)
3616
0
                            goto err;
3617
3618
0
                    i = i+R;
3619
0
                }
3620
0
            } else {
3621
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3622
0
                    goto err;
3623
0
            }
3624
0
        }
3625
3626
0
        hts_tpool_delete_result(r, 0);
3627
3628
        // Also updated by main thread
3629
0
        pthread_mutex_lock(&fd->lines_m);
3630
0
        gl->next = fd->lines;
3631
0
        fd->lines = gl;
3632
0
        pthread_mutex_unlock(&fd->lines_m);
3633
0
    }
3634
3635
0
    sam_state_err(fd, 0); // success
3636
0
    hts_tpool_process_shutdown(fd->q);
3637
0
    return NULL;
3638
3639
0
 err:
3640
0
    sam_state_err(fd, errno ? errno : EIO);
3641
0
    return (void *)-1;
3642
0
}
3643
3644
// Run from one of the worker threads.
3645
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3646
// of text SAM records (sp_lines).
3647
0
static void *sam_format_worker(void *arg) {
3648
0
    sp_bams *gb = (sp_bams *)arg;
3649
0
    sp_lines *gl = NULL;
3650
0
    int i;
3651
0
    SAM_state *fd = gb->fd;
3652
0
    htsFile *fp = fd->fp;
3653
3654
    // Use a block of SAM strings we had earlier if available.
3655
0
    pthread_mutex_lock(&fd->lines_m);
3656
0
    if (fd->lines) {
3657
0
        gl = fd->lines;
3658
0
        fd->lines = gl->next;
3659
0
    }
3660
0
    pthread_mutex_unlock(&fd->lines_m);
3661
3662
0
    if (gl == NULL) {
3663
0
        gl = calloc(1, sizeof(*gl));
3664
0
        if (!gl) {
3665
0
            sam_state_err(fd, ENOMEM);
3666
0
            return NULL;
3667
0
        }
3668
0
        gl->alloc = gl->data_size = 0;
3669
0
        gl->data = NULL;
3670
0
    }
3671
0
    gl->serial = gb->serial;
3672
0
    gl->next = NULL;
3673
3674
0
    kstring_t ks = {0, gl->alloc, gl->data};
3675
3676
0
    for (i = 0; i < gb->nbams; i++) {
3677
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3678
0
            sam_state_err(fd, errno ? errno : EIO);
3679
0
            goto err;
3680
0
        }
3681
0
        kputc('\n', &ks);
3682
0
    }
3683
3684
0
    pthread_mutex_lock(&fd->lines_m);
3685
0
    gl->data_size = ks.l;
3686
0
    gl->alloc = ks.m;
3687
0
    gl->data = ks.s;
3688
3689
0
    if (fp->idx) {
3690
        // Keep hold of the bam array a little longer as
3691
        // sam_dispatcher_write needs to use them for building the index.
3692
0
        gl->bams = gb;
3693
0
    } else {
3694
        // Add bam array to free-list
3695
0
        gb->next = fd->bams;
3696
0
        fd->bams = gb;
3697
0
    }
3698
0
    pthread_mutex_unlock(&fd->lines_m);
3699
3700
0
    return gl;
3701
3702
0
 err:
3703
    // Possible race between this and fd->curr_bam.
3704
    // Easier to not free and leave it on the input list so it
3705
    // gets freed there instead?
3706
    // sam_free_sp_bams(gb);
3707
0
    if (gl) {
3708
0
        free(gl->data);
3709
0
        free(gl);
3710
0
    }
3711
0
    return NULL;
3712
0
}
3713
3714
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3715
0
    if (fp->state)
3716
0
        return 0;
3717
3718
0
    if (!(fp->state = sam_state_create(fp)))
3719
0
        return -1;
3720
0
    SAM_state *fd = (SAM_state *)fp->state;
3721
3722
0
    pthread_mutex_init(&fd->lines_m, NULL);
3723
0
    pthread_mutex_init(&fd->command_m, NULL);
3724
0
    pthread_cond_init(&fd->command_c, NULL);
3725
0
    fd->p = p->pool;
3726
0
    int qsize = p->qsize;
3727
0
    if (!qsize)
3728
0
        qsize = 2*hts_tpool_size(fd->p);
3729
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3730
0
    if (!fd->q) {
3731
0
        sam_state_destroy(fp);
3732
0
        return -1;
3733
0
    }
3734
3735
0
    if (fp->format.compression == bgzf)
3736
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3737
3738
0
    return 0;
3739
0
}
3740
3741
0
int sam_set_threads(htsFile *fp, int nthreads) {
3742
0
    if (nthreads <= 0)
3743
0
        return 0;
3744
3745
0
    htsThreadPool p;
3746
0
    p.pool = hts_tpool_init(nthreads);
3747
0
    p.qsize = nthreads*2;
3748
3749
0
    int ret = sam_set_thread_pool(fp, &p);
3750
0
    if (ret < 0)
3751
0
        return ret;
3752
3753
0
    SAM_state *fd = (SAM_state *)fp->state;
3754
0
    fd->own_pool = 1;
3755
3756
0
    return 0;
3757
0
}
3758
3759
0
#define UMI_TAGS 5
3760
typedef struct {
3761
    kstring_t name;
3762
    kstring_t comment; // NB: pointer into name, do not free
3763
    kstring_t seq;
3764
    kstring_t qual;
3765
    int casava;
3766
    int aux;
3767
    int rnum;
3768
    char BC[3];         // aux tag ID for barcode
3769
    char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
3770
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3771
    char nprefix;
3772
    int sra_names;
3773
    regex_t regex;
3774
} fastq_state;
3775
3776
// Initialise fastq state.
3777
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3778
2.46k
static fastq_state *fastq_state_init(int name_char) {
3779
2.46k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3780
2.46k
    if (!x)
3781
0
        return NULL;
3782
2.46k
    strcpy(x->BC, "BC");
3783
2.46k
    x->nprefix = name_char;
3784
    // Default Illumina naming convention
3785
2.46k
    char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)";
3786
2.46k
    if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3787
0
        free(x);
3788
0
        return NULL;
3789
0
    }
3790
3791
2.46k
    return x;
3792
2.46k
}
3793
3794
3.28k
void fastq_state_destroy(htsFile *fp) {
3795
3.28k
    if (fp->state) {
3796
2.46k
        fastq_state *x = (fastq_state *)fp->state;
3797
2.46k
        if (x->tags)
3798
0
            kh_destroy(tag, x->tags);
3799
2.46k
        ks_free(&x->name);
3800
2.46k
        ks_free(&x->seq);
3801
2.46k
        ks_free(&x->qual);
3802
2.46k
        regfree(&x->regex);
3803
2.46k
        free(fp->state);
3804
2.46k
    }
3805
3.28k
}
3806
3807
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3808
0
    va_list args;
3809
3810
0
    if (!fp)
3811
0
        return -1;
3812
0
    if (!fp->state)
3813
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3814
0
                                           ? '@' : '>')))
3815
0
            return -1;
3816
3817
0
    fastq_state *x = (fastq_state *)fp->state;
3818
3819
0
    switch (opt) {
3820
0
    case FASTQ_OPT_CASAVA:
3821
0
        x->casava = 1;
3822
0
        break;
3823
3824
0
    case FASTQ_OPT_NAME2:
3825
0
        x->sra_names = 1;
3826
0
        break;
3827
3828
0
    case FASTQ_OPT_AUX: {
3829
0
        va_start(args, opt);
3830
0
        x->aux = 1;
3831
0
        char *tag = va_arg(args, char *);
3832
0
        va_end(args);
3833
0
        if (tag && strcmp(tag, "1") != 0) {
3834
0
            if (!x->tags)
3835
0
                if (!(x->tags = kh_init(tag)))
3836
0
                    return -1;
3837
3838
0
            size_t i, tlen = strlen(tag);
3839
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
3840
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
3841
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
3842
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
3843
0
                    break;
3844
0
                }
3845
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
3846
0
                kh_put(tag, x->tags, tcode, &ret);
3847
0
                if (ret < 0)
3848
0
                    return -1;
3849
0
            }
3850
0
        }
3851
0
        break;
3852
0
    }
3853
3854
0
    case FASTQ_OPT_BARCODE: {
3855
0
        va_start(args, opt);
3856
0
        char *bc = va_arg(args, char *);
3857
0
        va_end(args);
3858
0
        strncpy(x->BC, bc, 2);
3859
0
        x->BC[2] = 0;
3860
0
        break;
3861
0
    }
3862
3863
0
    case FASTQ_OPT_UMI: {
3864
        // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3865
0
        va_start(args, opt);
3866
0
        char *bc = va_arg(args, char *), *bc_orig = bc;
3867
0
        va_end(args);
3868
0
        if (!bc || strcmp(bc, "1") == 0)
3869
0
            bc = "RX";
3870
0
        int ntags = 0, err = 0;
3871
0
        for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
3872
0
            if (!isalpha(bc[0]) || !isalnum_c(bc[1])) {
3873
0
                err = 1;
3874
0
                break;
3875
0
            }
3876
3877
0
            strncpy(x->UMI[ntags], bc, 3);
3878
0
            bc += 2;
3879
0
            if (*bc && *bc != ',') {
3880
0
                err = 1;
3881
0
                break;
3882
0
            }
3883
0
            bc+=(*bc==',');
3884
0
            x->UMI[ntags][2] = 0;
3885
0
        }
3886
0
        for (; ntags < UMI_TAGS; ntags++)
3887
0
            x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
3888
3889
3890
0
        if (err)
3891
0
            hts_log_warning("Bad UMI tag list '%s'", bc_orig);
3892
3893
0
        break;
3894
0
    }
3895
3896
0
    case FASTQ_OPT_UMI_REGEX: {
3897
0
        va_start(args, opt);
3898
0
        char *re = va_arg(args, char *);
3899
0
        va_end(args);
3900
3901
0
        regfree(&x->regex);
3902
0
        if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3903
0
            hts_log_error("Regular expression '%s' is not supported", re);
3904
0
            return -1;
3905
0
        }
3906
0
        break;
3907
0
    }
3908
3909
0
    case FASTQ_OPT_RNUM:
3910
0
        x->rnum = 1;
3911
0
        break;
3912
3913
0
    default:
3914
0
        break;
3915
0
    }
3916
0
    return 0;
3917
0
}
3918
3919
29.2M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
3920
29.2M
    fastq_state *x = (fastq_state *)fp->state;
3921
29.2M
    size_t i, l;
3922
29.2M
    int ret = 0;
3923
3924
29.2M
    if (fp->format.format == fasta_format && fp->line.s) {
3925
        // For FASTA we've already read the >name line; steal it
3926
        // Not the most efficient, but we don't optimise for fasta reading.
3927
29.2M
        if (fp->line.l == 0)
3928
1.18k
            return -1; // EOF
3929
3930
29.2M
        free(x->name.s);
3931
29.2M
        x->name = fp->line;
3932
29.2M
        fp->line.l = fp->line.m = 0;
3933
29.2M
        fp->line.s = NULL;
3934
29.2M
    } else {
3935
        // Read a FASTQ format entry.
3936
2.47k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
3937
2.47k
        if (ret == -1)
3938
0
            return -1;  // EOF
3939
2.47k
        else if (ret < -1)
3940
18
            return ret; // ERR
3941
2.47k
    }
3942
3943
    // Name
3944
29.2M
    if (*x->name.s != x->nprefix)
3945
9
        return -2;
3946
3947
    // Reverse the SRA strangeness of putting the run_name.number before
3948
    // the read name.
3949
29.2M
    i = 0;
3950
29.2M
    char *name = x->name.s+1;
3951
29.2M
    if (x->sra_names) {
3952
0
        char *cp = strpbrk(x->name.s, " \t");
3953
0
        if (cp) {
3954
0
            while (*cp == ' ' || *cp == '\t')
3955
0
                cp++;
3956
0
            *--cp = '@';
3957
0
            i = cp - x->name.s;
3958
0
            name = cp+1;
3959
0
        }
3960
0
    }
3961
3962
29.2M
    l = x->name.l;
3963
29.2M
    char *s = x->name.s;
3964
83.5M
    while (i < l && !isspace_c(s[i]))
3965
54.2M
        i++;
3966
29.2M
    if (i < l) {
3967
200k
        s[i] = 0;
3968
200k
        x->name.l = i++;
3969
200k
    }
3970
3971
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
3972
30.0M
    while (i < l && isspace_c(s[i]))
3973
838k
        i++;
3974
29.2M
    x->comment.s = s+i;
3975
29.2M
    x->comment.l = l - i;
3976
3977
    // Seq
3978
29.2M
    x->seq.l = 0;
3979
181M
    for (;;) {
3980
181M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
3981
2.29k
            if (fp->format.format == fastq_format || ret < -1)
3982
1.06k
                return -2;
3983
181M
        if (ret == -1 ||
3984
181M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
3985
29.2M
            break;
3986
152M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
3987
0
            return -2;
3988
152M
    }
3989
3990
    // Qual
3991
29.2M
    if (fp->format.format == fastq_format) {
3992
36
        size_t remainder = x->seq.l;
3993
36
        x->qual.l = 0;
3994
11.5k
        do {
3995
11.5k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
3996
9
                return -2;
3997
11.5k
            if (fp->line.l > remainder)
3998
12
                return -2;
3999
11.5k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
4000
0
                return -2;
4001
11.5k
            remainder -= fp->line.l;
4002
11.5k
        } while (remainder > 0);
4003
4004
        // Decr qual
4005
182k
        for (i = 0; i < x->qual.l; i++)
4006
182k
            x->qual.s[i] -= '!';
4007
15
    }
4008
4009
29.2M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4010
29.2M
    if (x->name.l > 2 &&
4011
1.66M
        x->name.s[x->name.l-2] == '/' &&
4012
80.3k
        isdigit_c(x->name.s[x->name.l-1])) {
4013
71.6k
        switch(x->name.s[x->name.l-1]) {
4014
10.7k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4015
10.5k
        case '2': flag |= BAM_FREAD2 | pflag; break;
4016
50.3k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4017
71.6k
        }
4018
71.6k
        x->name.s[x->name.l-=2] = 0;
4019
71.6k
    }
4020
4021
    // Strip Illumina formatted UMI off read-name
4022
29.2M
    char UMI_seq[256]; // maximum length in spec
4023
29.2M
    size_t UMI_len = 0;
4024
29.2M
    if (x->UMI[0][0]) {
4025
0
        regmatch_t match[3];
4026
0
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
4027
0
            && match[0].rm_so >= 0     // whole regex
4028
0
            && match[1].rm_so >= 0) {  // bracketted UMI component
4029
0
            UMI_len = match[1].rm_eo - match[1].rm_so;
4030
0
            if (UMI_len > 255) {
4031
0
                hts_log_error("SAM read name is too long");
4032
0
                return -2;
4033
0
            }
4034
4035
            // The SAMTags spec recommends (but not requires) separating
4036
            // barcodes with hyphen ('-').
4037
0
            size_t i;
4038
0
            for (i = 0; i < UMI_len; i++)
4039
0
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
4040
0
                    ? x->name.s[i+match[1].rm_so]
4041
0
                    : '-';
4042
4043
            // Move any trailing #num earlier in the name
4044
0
            if (UMI_len) {
4045
0
                UMI_seq[UMI_len++] = 0;
4046
4047
0
                x->name.l = match[1].rm_so;
4048
0
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
4049
0
                    x->name.l--; // remove colon too
4050
0
                char *cp = x->name.s + match[1].rm_eo;
4051
0
                while (*cp)
4052
0
                    x->name.s[x->name.l++] = *cp++;
4053
0
                x->name.s[x->name.l] = 0;
4054
0
            }
4055
0
        }
4056
0
    }
4057
4058
    // Convert to BAM
4059
29.2M
    ret = bam_set1(b,
4060
29.2M
                   x->name.s + x->name.l - name, name,
4061
29.2M
                   flag,
4062
29.2M
                   -1, -1, 0, // ref '*', pos, mapq,
4063
29.2M
                   0, NULL,     // no cigar,
4064
29.2M
                   -1, -1, 0,    // mate
4065
29.2M
                   x->seq.l, x->seq.s, x->qual.s,
4066
29.2M
                   0);
4067
29.2M
    if (ret < 0) return -2;
4068
4069
    // Add UMI tag if removed from read-name above
4070
29.2M
    if (UMI_len) {
4071
0
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
4072
0
            ret = -2;
4073
0
    }
4074
4075
    // Identify Illumina CASAVA strings.
4076
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4077
29.2M
    char *barcode = NULL;
4078
29.2M
    int barcode_len = 0;
4079
29.2M
    kstring_t *kc = &x->comment;
4080
29.2M
    char *endptr;
4081
29.2M
    if (x->casava &&
4082
        // \d:[YN]:\d+:[ACGTN]+
4083
0
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4084
0
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4085
0
        && *endptr == ':') {
4086
4087
        // read num
4088
0
        switch(kc->s[0]) {
4089
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4090
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4091
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4092
0
        }
4093
4094
0
        if (kc->s[2] == 'Y')
4095
0
            b->core.flag |= BAM_FQCFAIL;
4096
4097
        // Barcode, maybe numeric in which case we skip it
4098
0
        if (!isdigit_c(endptr[1])) {
4099
0
            barcode = endptr+1;
4100
0
            for (i = barcode - kc->s; i < kc->l; i++)
4101
0
                if (isspace_c(kc->s[i]))
4102
0
                    break;
4103
4104
0
            kc->s[i] = 0;
4105
0
            barcode_len = i+1-(barcode - kc->s);
4106
0
        }
4107
0
    }
4108
4109
29.2M
    if (ret >= 0 && barcode_len)
4110
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4111
0
            ret = -2;
4112
4113
29.2M
    if (!x->aux)
4114
29.2M
        return ret;
4115
4116
    // Identify any SAM style aux tags in comments too.
4117
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4118
0
        ret = -2;
4119
4120
0
    return ret;
4121
29.2M
}
4122
4123
// Internal component of sam_read1 below
4124
769
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4125
769
    int ret = bam_read1(fp->fp.bgzf, b);
4126
769
    if (h && ret >= 0) {
4127
651
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4128
628
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4129
26
            errno = ERANGE;
4130
26
            return -3;
4131
26
        }
4132
651
    }
4133
743
    return ret;
4134
769
}
4135
4136
// Internal component of sam_read1 below
4137
1.76k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4138
1.76k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4139
1.76k
    if (ret < 0)
4140
1.76k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4141
4142
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4143
0
        return -2;
4144
4145
0
    return ret;
4146
0
}
4147
4148
// Internal component of sam_read1 below
4149
398k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4150
398k
    int ret;
4151
4152
    // Consume 1st line after header parsing as it wasn't using peek
4153
398k
    if (fp->line.l != 0) {
4154
0
        ret = sam_parse1(&fp->line, h, b);
4155
0
        fp->line.l = 0;
4156
0
        return ret;
4157
0
    }
4158
4159
398k
    if (fp->state) {
4160
0
        SAM_state *fd = (SAM_state *)fp->state;
4161
4162
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4163
            // We don't support multi-threaded SAM parsing with seeks yet.
4164
0
            int ret;
4165
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4166
0
                errno = -ret;
4167
0
                return -2;
4168
0
            }
4169
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4170
0
                return -2;
4171
0
            fp->fp.bgzf->seeked = 0;
4172
0
            goto err_recover;
4173
0
        }
4174
4175
0
        if (!fd->h) {
4176
0
            fd->h = h;
4177
0
            fd->h->ref_count++;
4178
            // Ensure hrecs is initialised now as we don't want multiple
4179
            // threads trying to do this simultaneously.
4180
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4181
0
                return -2;
4182
4183
            // We can only do this once we've got a header
4184
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4185
0
                               fp) != 0)
4186
0
                return -2;
4187
0
            fd->dispatcher_set = 1;
4188
0
        }
4189
4190
0
        if (fd->h != h) {
4191
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4192
0
            return -2;
4193
0
        }
4194
4195
0
        sp_bams *gb = fd->curr_bam;
4196
0
        if (!gb) {
4197
0
            if (fd->errcode) {
4198
                // In case reader failed
4199
0
                errno = fd->errcode;
4200
0
                return -2;
4201
0
            }
4202
4203
0
            pthread_mutex_lock(&fd->command_m);
4204
0
            int cmd = fd->command;
4205
0
            pthread_mutex_unlock(&fd->command_m);
4206
0
            if (cmd == SAM_AT_EOF)
4207
0
                return -1;
4208
4209
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4210
0
            if (!r)
4211
0
                return -2;
4212
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4213
0
            hts_tpool_delete_result(r, 0);
4214
0
        }
4215
0
        if (!gb) {
4216
0
            pthread_mutex_lock(&fd->command_m);
4217
0
            fd->command = SAM_AT_EOF;
4218
0
            pthread_mutex_unlock(&fd->command_m);
4219
0
            return fd->errcode ? -2 : -1;
4220
0
        }
4221
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4222
0
        if (fd->curr_idx < gb->nbams)
4223
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4224
0
                return -2;
4225
0
        if (fd->curr_idx == gb->nbams) {
4226
0
            pthread_mutex_lock(&fd->lines_m);
4227
0
            gb->next = fd->bams;
4228
0
            fd->bams = gb;
4229
0
            pthread_mutex_unlock(&fd->lines_m);
4230
4231
0
            fd->curr_bam = NULL;
4232
0
            fd->curr_idx = 0;
4233
        // Consider prefetching next record?  I.e.
4234
        // } else {
4235
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4236
0
        }
4237
4238
0
        ret = 0;
4239
4240
398k
    } else  {
4241
398k
    err_recover:
4242
398k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4243
398k
        if (ret < 0) return ret;
4244
4245
395k
        ret = sam_parse1(&fp->line, h, b);
4246
395k
        fp->line.l = 0;
4247
395k
        if (ret < 0) {
4248
3.57k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4249
3.57k
            if (h && h->ignore_sam_err) goto err_recover;
4250
3.57k
        }
4251
395k
    }
4252
4253
395k
    return ret;
4254
398k
}
4255
4256
// Returns 0 on success,
4257
//        -1 on EOF,
4258
//       <-1 on error
4259
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4260
29.6M
{
4261
29.6M
    int ret, pass_filter;
4262
4263
29.6M
    do {
4264
29.6M
        switch (fp->format.format) {
4265
769
        case bam:
4266
769
            ret = sam_read1_bam(fp, h, b);
4267
769
            break;
4268
4269
1.76k
        case cram:
4270
1.76k
            ret = sam_read1_cram(fp, h, &b);
4271
1.76k
            break;
4272
4273
398k
        case sam:
4274
398k
            ret = sam_read1_sam(fp, h, b);
4275
398k
            break;
4276
4277
29.2M
        case fasta_format:
4278
29.2M
        case fastq_format: {
4279
29.2M
            fastq_state *x = (fastq_state *)fp->state;
4280
29.2M
            if (!x) {
4281
2.46k
                if (!(fp->state = fastq_state_init(fp->format.format
4282
2.46k
                                                   == fastq_format ? '@' : '>')))
4283
0
                    return -2;
4284
2.46k
            }
4285
4286
29.2M
            return fastq_parse1(fp, b);
4287
29.2M
        }
4288
4289
0
        case empty_format:
4290
0
            errno = EPIPE;
4291
0
            return -3;
4292
4293
0
        default:
4294
0
            errno = EFTYPE;
4295
0
            return -3;
4296
29.6M
        }
4297
4298
400k
        pass_filter = (ret >= 0 && fp->filter)
4299
400k
            ? sam_passes_filter(h, b, fp->filter)
4300
400k
            : 1;
4301
400k
    } while (pass_filter == 0);
4302
4303
400k
    return pass_filter < 0 ? -2 : ret;
4304
29.6M
}
4305
4306
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4307
// this code isn't vectorised and runs far slower than is necessary (even
4308
// with the restrict keyword being used).
4309
static inline void HTS_OPT3
4310
679
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4311
679
    uint32_t i;
4312
63.4k
    for (i = 0; i < len; i++)
4313
62.7k
        a[i] = b[i]+33;
4314
679
}
4315
4316
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4317
9.88M
{
4318
9.88M
    int i, r = 0;
4319
9.88M
    uint8_t *s, *end;
4320
9.88M
    const bam1_core_t *c = &b->core;
4321
4322
9.88M
    if (c->l_qname == 0)
4323
0
        return -1;
4324
9.88M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4325
9.88M
    r |= kputc_('\t', str); // query name
4326
9.88M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4327
9.88M
    if (c->tid >= 0) { // chr
4328
83.4k
        r |= kputs(h->target_name[c->tid] , str);
4329
83.4k
        r |= kputc_('\t', str);
4330
9.80M
    } else r |= kputsn_("*\t", 2, str);
4331
9.88M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4332
9.88M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4333
9.88M
    if (c->n_cigar) { // cigar
4334
116k
        uint32_t *cigar = bam_get_cigar(b);
4335
4.43M
        for (i = 0; i < c->n_cigar; ++i) {
4336
4.31M
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4337
4.31M
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4338
4.31M
        }
4339
9.76M
    } else r |= kputc_('*', str);
4340
9.88M
    r |= kputc_('\t', str);
4341
9.88M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4342
9.69k
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4343
8.76k
    else {
4344
8.76k
        r |= kputs(h->target_name[c->mtid], str);
4345
8.76k
        r |= kputc_('\t', str);
4346
8.76k
    }
4347
9.88M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4348
9.88M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4349
9.88M
    if (c->l_qseq) { // seq and qual
4350
276k
        uint8_t *s = bam_get_seq(b);
4351
276k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4352
276k
        char *cp = str->s + str->l;
4353
4354
        // Sequence, 2 bases at a time
4355
276k
        nibble2base(s, cp, c->l_qseq);
4356
276k
        cp[c->l_qseq] = '\t';
4357
276k
        cp += c->l_qseq+1;
4358
4359
        // Quality
4360
276k
        s = bam_get_qual(b);
4361
276k
        i = 0;
4362
276k
        if (s[0] == 0xff) {
4363
275k
            cp[i++] = '*';
4364
275k
        } else {
4365
679
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4366
679
            i = c->l_qseq;
4367
679
        }
4368
276k
        cp[i] = 0;
4369
276k
        cp += i;
4370
276k
        str->l = cp - str->s;
4371
9.60M
    } else r |= kputsn_("*\t*", 3, str);
4372
4373
9.88M
    s = bam_get_aux(b); // aux
4374
9.88M
    end = b->data + b->l_data;
4375
4376
11.1M
    while (end - s >= 4) {
4377
1.30M
        r |= kputc_('\t', str);
4378
1.30M
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4379
44
            goto bad_aux;
4380
1.30M
    }
4381
9.88M
    r |= kputsn("", 0, str); // nul terminate
4382
9.88M
    if (r < 0) goto mem_err;
4383
4384
9.88M
    return str->l;
4385
4386
44
 bad_aux:
4387
44
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4388
44
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4389
44
    errno = EINVAL;
4390
44
    return -1;
4391
4392
0
 mem_err:
4393
0
    hts_log_error("Out of memory");
4394
0
    errno = ENOMEM;
4395
0
    return -1;
4396
9.88M
}
4397
4398
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4399
9.88M
{
4400
9.88M
    str->l = 0;
4401
9.88M
    return sam_format1_append(h, b, str);
4402
9.88M
}
4403
4404
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4405
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4406
0
{
4407
0
    unsigned flag = b->core.flag;
4408
0
    int i, e = 0, len = b->core.l_qseq;
4409
0
    uint8_t *seq, *qual;
4410
4411
0
    str->l = 0;
4412
4413
    // Name
4414
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4415
0
        return -1;
4416
4417
    // UMI tag
4418
0
    if (x && *x->UMI[0]) {
4419
        // Temporary copy of '#num' if present
4420
0
        char plex[256];
4421
0
        size_t len = str->l;
4422
0
        while (len && str->s[len] != ':' && str->s[len] != '#')
4423
0
            len--;
4424
4425
0
        if (str->s[len] == '#' && str->l - len < 255) {
4426
0
            memcpy(plex, &str->s[len], str->l - len);
4427
0
            plex[str->l - len] = 0;
4428
0
            str->l = len;
4429
0
        } else {
4430
0
            *plex = 0;
4431
0
        }
4432
4433
0
        uint8_t *bc = NULL;
4434
0
        int n;
4435
0
        for (n = 0; !bc && n < UMI_TAGS; n++)
4436
0
            bc = bam_aux_get(b, x->UMI[n]);
4437
0
        if (bc && *bc == 'Z') {
4438
0
            int err = kputc(':', str) < 0;
4439
            // Replace any non-alpha with '+'
4440
0
            while (*++bc)
4441
0
                err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
4442
0
            if (err)
4443
0
                return -1;
4444
0
        }
4445
4446
0
        if (*plex && kputs(plex, str) < 0)
4447
0
            return -1;
4448
0
    }
4449
4450
    // /1 or /2 suffix
4451
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4452
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4453
0
        if (r12 == BAM_FREAD1) {
4454
0
            if (kputs("/1", str) == EOF)
4455
0
                return -1;
4456
0
        } else if (r12 == BAM_FREAD2) {
4457
0
            if (kputs("/2", str) == EOF)
4458
0
                return -1;
4459
0
        }
4460
0
    }
4461
4462
    // Illumina CASAVA tag.
4463
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4464
0
    if (x && x->casava) {
4465
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4466
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4467
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4468
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4469
0
                     bc ? (char *)bc+1 : "0") < 0)
4470
0
            return -1;
4471
4472
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4473
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4474
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4475
0
            str->s[str->l-1] = '0';
4476
0
            str->s[str->l] = 0;
4477
0
            bc = NULL;
4478
0
        }
4479
4480
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4481
0
        if (bc) {
4482
0
            int l = strlen((char *)bc+1);
4483
0
            char *c = (char *)str->s + str->l - l;
4484
0
            for (i = 0; i < l; i++) {
4485
0
                if (!isalpha_c(c[i]))
4486
0
                    c[i] = '+';
4487
0
                else if (islower_c(c[i]))
4488
0
                    c[i] = toupper_c(c[i]);
4489
0
            }
4490
0
        }
4491
0
    }
4492
4493
    // Aux tags
4494
0
    if (x && x->aux) {
4495
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4496
0
        while (s && end - s >= 4) {
4497
0
            int tt = s[0]*256 + s[1];
4498
0
            if (x->tags == NULL ||
4499
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4500
0
                e |= kputc_('\t', str) < 0;
4501
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4502
0
                    return -1;
4503
0
            } else {
4504
0
                s = skip_aux(s+2, end);
4505
0
            }
4506
0
        }
4507
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4508
0
    }
4509
4510
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4511
0
    e |= kputc_('\n', str) < 0;
4512
4513
    // Seq line
4514
0
    seq = bam_get_seq(b);
4515
0
    if (flag & BAM_FREVERSE)
4516
0
        for (i = len-1; i >= 0; i--)
4517
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4518
0
    else
4519
0
        for (i = 0; i < len; i++)
4520
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4521
4522
4523
    // Qual line
4524
0
    if (x->nprefix == '@') {
4525
0
        kputsn("\n+\n", 3, str);
4526
0
        qual = bam_get_qual(b);
4527
0
        if (qual[0] == 0xff)
4528
0
            for (i = 0; i < len; i++)
4529
0
                e |= kputc_('B', str) < 0;
4530
0
        else if (flag & BAM_FREVERSE)
4531
0
            for (i = len-1; i >= 0; i--)
4532
0
                e |= kputc_(33 + qual[i], str) < 0;
4533
0
        else
4534
0
            for (i = 0; i < len; i++)
4535
0
                e |= kputc_(33 + qual[i], str) < 0;
4536
4537
0
    }
4538
0
    e |= kputc('\n', str) < 0;
4539
4540
0
    return e ? -1 : str->l;
4541
0
}
4542
4543
// Sadly we need to be able to modify the bam_hdr here so we can
4544
// reference count the structure.
4545
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4546
29.6M
{
4547
29.6M
    switch (fp->format.format) {
4548
0
    case binary_format:
4549
0
        fp->format.category = sequence_data;
4550
0
        fp->format.format = bam;
4551
        /* fall-through */
4552
9.88M
    case bam:
4553
9.88M
        return bam_write_idx1(fp, h, b);
4554
4555
9.88M
    case cram:
4556
9.88M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4557
4558
0
    case text_format:
4559
0
        fp->format.category = sequence_data;
4560
0
        fp->format.format = sam;
4561
        /* fall-through */
4562
9.88M
    case sam:
4563
9.88M
        if (fp->state) {
4564
0
            SAM_state *fd = (SAM_state *)fp->state;
4565
4566
            // Threaded output
4567
0
            if (!fd->h) {
4568
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4569
                // just data pointed to by it (which is a bit weasely still),
4570
                // but out cached pointer must be non-const as we want to
4571
                // destroy it later on and sam_hdr_destroy takes non-const.
4572
                //
4573
                // We do this because some tools do sam_hdr_destroy; sam_close
4574
                // while others do sam_close; sam_hdr_destroy.  The former is
4575
                // an issue as we need the header still when flushing.
4576
0
                fd->h = (sam_hdr_t *)h;
4577
0
                fd->h->ref_count++;
4578
4579
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4580
0
                                   fp) != 0)
4581
0
                    return -2;
4582
0
                fd->dispatcher_set = 1;
4583
0
            }
4584
4585
0
            if (fd->h != h) {
4586
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4587
0
                return -2;
4588
0
            }
4589
4590
            // Find a suitable BAM array to copy to
4591
0
            sp_bams *gb = fd->curr_bam;
4592
0
            if (!gb) {
4593
0
                pthread_mutex_lock(&fd->lines_m);
4594
0
                if (fd->bams) {
4595
0
                    fd->curr_bam = gb = fd->bams;
4596
0
                    fd->bams = gb->next;
4597
0
                    gb->next = NULL;
4598
0
                    gb->nbams = 0;
4599
0
                    gb->bam_mem = 0;
4600
0
                    pthread_mutex_unlock(&fd->lines_m);
4601
0
                } else {
4602
0
                    pthread_mutex_unlock(&fd->lines_m);
4603
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4604
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4605
0
                        free(gb);
4606
0
                        return -1;
4607
0
                    }
4608
0
                    gb->nbams = 0;
4609
0
                    gb->abams = SAM_NBAM;
4610
0
                    gb->bam_mem = 0;
4611
0
                    gb->fd = fd;
4612
0
                    fd->curr_idx = 0;
4613
0
                    fd->curr_bam = gb;
4614
0
                }
4615
0
            }
4616
4617
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4618
0
                return -2;
4619
0
            gb->bam_mem += b->l_data + sizeof(*b);
4620
4621
            // Dispatch if full
4622
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4623
0
                gb->serial = fd->serial++;
4624
0
                pthread_mutex_lock(&fd->command_m);
4625
0
                if (fd->errcode != 0) {
4626
0
                    pthread_mutex_unlock(&fd->command_m);
4627
0
                    return -fd->errcode;
4628
0
                }
4629
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4630
0
                                        cleanup_sp_bams,
4631
0
                                        cleanup_sp_lines, 0) < 0) {
4632
0
                    pthread_mutex_unlock(&fd->command_m);
4633
0
                    return -1;
4634
0
                }
4635
0
                pthread_mutex_unlock(&fd->command_m);
4636
0
                fd->curr_bam = NULL;
4637
0
            }
4638
4639
            // Dummy value as we don't know how long it really is.
4640
            // We could track file sizes via a SAM_state field, but I don't think
4641
            // it is necessary.
4642
0
            return 1;
4643
9.88M
        } else {
4644
9.88M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4645
9.88M
            kputc('\n', &fp->line);
4646
9.88M
            if (fp->is_bgzf) {
4647
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4648
0
                    return -1;
4649
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4650
9.88M
            } else {
4651
9.88M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4652
9.88M
            }
4653
4654
9.88M
            if (fp->idx) {
4655
0
                if (fp->format.compression == bgzf) {
4656
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4657
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4658
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4659
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4660
0
                        return -1;
4661
0
                    }
4662
0
                } else {
4663
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4664
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4665
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4666
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4667
0
                        return -1;
4668
0
                    }
4669
0
                }
4670
0
            }
4671
4672
9.88M
            return fp->line.l;
4673
9.88M
        }
4674
4675
4676
0
    case fasta_format:
4677
0
    case fastq_format: {
4678
0
        fastq_state *x = (fastq_state *)fp->state;
4679
0
        if (!x) {
4680
0
            if (!(fp->state = fastq_state_init(fp->format.format
4681
0
                                               == fastq_format ? '@' : '>')))
4682
0
                return -2;
4683
0
        }
4684
4685
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4686
0
            return -1;
4687
0
        if (fp->is_bgzf) {
4688
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4689
0
                return -1;
4690
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4691
0
                return -1;
4692
0
        } else {
4693
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4694
0
                return -1;
4695
0
        }
4696
0
        return fp->line.l;
4697
0
    }
4698
4699
0
    default:
4700
0
        errno = EBADF;
4701
0
        return -1;
4702
29.6M
    }
4703
29.6M
}
4704
4705
/************************
4706
 *** Auxiliary fields ***
4707
 ************************/
4708
#ifndef HTS_LITTLE_ENDIAN
4709
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4710
    int tsz = aux_type2size(type);
4711
4712
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4713
4714
    switch (tsz) {
4715
        case 'H': case 'Z': case 1:  // Trivial
4716
            memcpy(out, in, len);
4717
            break;
4718
4719
#define aux_val_to_le(type_t, store_le) do {                            \
4720
        type_t v;                                                       \
4721
        size_t i;                                                       \
4722
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4723
            memcpy(&v, in + i, sizeof(type_t));                         \
4724
            store_le(v, out);                                           \
4725
        }                                                               \
4726
    } while (0)
4727
4728
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4729
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4730
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4731
4732
#undef aux_val_to_le
4733
4734
        case 'B': { // Recurse!
4735
            uint32_t n;
4736
            if (len < 5) return -1;
4737
            memcpy(&n, in + 1, 4);
4738
            out[0] = in[0];
4739
            u32_to_le(n, out + 1);
4740
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4741
        }
4742
4743
        default: // Unknown type code
4744
            return -1;
4745
    }
4746
4747
4748
4749
    return 0;
4750
}
4751
#endif
4752
4753
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4754
0
{
4755
0
    uint32_t new_len;
4756
4757
0
    assert(b->l_data >= 0);
4758
0
    new_len = b->l_data + 3 + len;
4759
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4760
4761
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4762
4763
0
    b->data[b->l_data] = tag[0];
4764
0
    b->data[b->l_data + 1] = tag[1];
4765
0
    b->data[b->l_data + 2] = type;
4766
4767
0
#ifdef HTS_LITTLE_ENDIAN
4768
0
    memcpy(b->data + b->l_data + 3, data, len);
4769
#else
4770
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4771
        errno = EINVAL;
4772
        return -1;
4773
    }
4774
#endif
4775
4776
0
    b->l_data = new_len;
4777
4778
0
    return 0;
4779
4780
0
 nomem:
4781
0
    errno = ENOMEM;
4782
0
    return -1;
4783
0
}
4784
4785
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4786
3.02M
{
4787
3.02M
    int size;
4788
3.02M
    uint32_t n;
4789
3.02M
    if (s >= end) return end;
4790
3.02M
    size = aux_type2size(*s); ++s; // skip type
4791
3.02M
    switch (size) {
4792
576k
    case 'Z':
4793
593k
    case 'H':
4794
593k
        s = memchr(s, 0, end-s);
4795
593k
        return s ? s+1 : end;
4796
100k
    case 'B':
4797
100k
        if (end - s < 5) return NULL;
4798
100k
        size = aux_type2size(*s); ++s;
4799
100k
        n = le_to_u32(s);
4800
100k
        s += 4;
4801
100k
        if (size == 0 || end - s < size * n) return NULL;
4802
100k
        return s + size * n;
4803
256
    case 0:
4804
256
        return NULL;
4805
2.32M
    default:
4806
2.32M
        if (end - s < size) return NULL;
4807
2.32M
        return s + size;
4808
3.02M
    }
4809
3.02M
}
4810
4811
uint8_t *bam_aux_first(const bam1_t *b)
4812
10.0M
{
4813
10.0M
    uint8_t *s = bam_get_aux(b);
4814
10.0M
    uint8_t *end = b->data + b->l_data;
4815
10.0M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4816
289k
    return s+2;
4817
10.0M
}
4818
4819
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4820
2.93M
{
4821
2.93M
    uint8_t *end = b->data + b->l_data;
4822
2.93M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4823
2.93M
    if (next == NULL) goto bad_aux;
4824
2.93M
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4825
2.73M
    return next+2;
4826
4827
256
 bad_aux:
4828
256
    hts_log_error("Corrupted aux data for read %s flag %d",
4829
256
                  bam_get_qname(b), b->core.flag);
4830
256
    errno = EINVAL;
4831
256
    return NULL;
4832
2.93M
}
4833
4834
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4835
10.0M
{
4836
10.0M
    uint8_t *s;
4837
13.0M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4838
3.02M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4839
            // Check the tag value is valid and complete
4840
87.9k
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4841
87.9k
            if (e == NULL) goto bad_aux;
4842
87.9k
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4843
4844
87.9k
            return s;
4845
87.9k
        }
4846
4847
    // errno now as set by bam_aux_first()/bam_aux_next()
4848
10.0M
    return NULL;
4849
4850
0
 bad_aux:
4851
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4852
0
                  bam_get_qname(b), b->core.flag);
4853
0
    errno = EINVAL;
4854
0
    return NULL;
4855
10.0M
}
4856
4857
int bam_aux_del(bam1_t *b, uint8_t *s)
4858
0
{
4859
0
    s = bam_aux_remove(b, s);
4860
0
    return (s || errno == ENOENT)? 0 : -1;
4861
0
}
4862
4863
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4864
0
{
4865
0
    uint8_t *end = b->data + b->l_data;
4866
0
    uint8_t *next = skip_aux(s, end);
4867
0
    if (next == NULL) goto bad_aux;
4868
4869
0
    b->l_data -= next - (s-2);
4870
0
    if (next >= end) { errno = ENOENT; return NULL; }
4871
4872
0
    memmove(s-2, next, end - next);
4873
0
    return s;
4874
4875
0
 bad_aux:
4876
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4877
0
                  bam_get_qname(b), b->core.flag);
4878
0
    errno = EINVAL;
4879
0
    return NULL;
4880
0
}
4881
4882
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4883
0
{
4884
    // FIXME: This is not at all efficient!
4885
0
    size_t ln = len >= 0 ? len : strlen(data) + 1;
4886
0
    size_t old_ln = 0;
4887
0
    int need_nul = ln == 0 || data[ln - 1] != '\0';
4888
0
    int save_errno = errno;
4889
0
    int new_tag = 0;
4890
0
    uint8_t *s = bam_aux_get(b,tag), *e;
4891
4892
0
    if (s) {  // Replacing existing tag
4893
0
        char type = *s;
4894
0
        if (type != 'Z') {
4895
0
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4896
0
            errno = EINVAL;
4897
0
            return -1;
4898
0
        }
4899
0
        s++;
4900
0
        e = memchr(s, '\0', b->data + b->l_data - s);
4901
0
        old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4902
0
        s -= 3;
4903
0
    } else {
4904
0
        if (errno != ENOENT) { // Invalid aux data, give up
4905
0
            return -1;
4906
0
        } else { // Tag doesn't exist - put it on the end
4907
0
            errno = save_errno;
4908
0
            s = b->data + b->l_data;
4909
0
            new_tag = 3;
4910
0
        }
4911
0
    }
4912
4913
0
    if (old_ln < ln + need_nul + new_tag) {
4914
0
        ptrdiff_t s_offset = s - b->data;
4915
0
        if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
4916
0
            return -1;
4917
0
        s = b->data + s_offset;
4918
0
    }
4919
0
    if (!new_tag) {
4920
0
        memmove(s + 3 + ln + need_nul,
4921
0
                s + 3 + old_ln,
4922
0
                b->l_data - (s + 3 - b->data) - old_ln);
4923
0
    }
4924
0
    b->l_data += new_tag + ln + need_nul - old_ln;
4925
4926
0
    s[0] = tag[0];
4927
0
    s[1] = tag[1];
4928
0
    s[2] = 'Z';
4929
0
    memmove(s+3,data,ln);
4930
0
    if (need_nul) s[3 + ln] = '\0';
4931
0
    return 0;
4932
0
}
4933
4934
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4935
0
{
4936
0
    uint32_t sz, old_sz = 0, new = 0;
4937
0
    uint8_t *s, type;
4938
4939
0
    if (val < INT32_MIN || val > UINT32_MAX) {
4940
0
        errno = EOVERFLOW;
4941
0
        return -1;
4942
0
    }
4943
0
    if (val < INT16_MIN)       { type = 'i'; sz = 4; }
4944
0
    else if (val < INT8_MIN)   { type = 's'; sz = 2; }
4945
0
    else if (val < 0)          { type = 'c'; sz = 1; }
4946
0
    else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
4947
0
    else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
4948
0
    else                       { type = 'I'; sz = 4; }
4949
4950
0
    s = bam_aux_get(b, tag);
4951
0
    if (s) {  // Tag present - how big was the old one?
4952
0
        switch (*s) {
4953
0
            case 'c': case 'C': old_sz = 1; break;
4954
0
            case 's': case 'S': old_sz = 2; break;
4955
0
            case 'i': case 'I': old_sz = 4; break;
4956
0
            default: errno = EINVAL; return -1;  // Not an integer
4957
0
        }
4958
0
    } else {
4959
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4960
0
            s = b->data + b->l_data;
4961
0
            new = 1;
4962
0
        }  else { // Invalid aux data, give up.
4963
0
            return -1;
4964
0
        }
4965
0
    }
4966
4967
0
    if (new || old_sz < sz) {
4968
        // Make room for new tag
4969
0
        ptrdiff_t s_offset = s - b->data;
4970
0
        if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
4971
0
            return -1;
4972
0
        s =  b->data + s_offset;
4973
0
        if (new) { // Add tag id
4974
0
            *s++ = tag[0];
4975
0
            *s++ = tag[1];
4976
0
        } else {   // Shift following data so we have space
4977
0
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
4978
0
        }
4979
0
    } else {
4980
        // Reuse old space.  Data value may be bigger than necessary but
4981
        // we avoid having to move everything else
4982
0
        sz = old_sz;
4983
0
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
4984
0
        assert(type > 0);
4985
0
    }
4986
0
    *s++ = type;
4987
0
#ifdef HTS_LITTLE_ENDIAN
4988
0
    memcpy(s, &val, sz);
4989
#else
4990
    switch (sz) {
4991
        case 4:  u32_to_le(val, s); break;
4992
        case 2:  u16_to_le(val, s); break;
4993
        default: *s = val; break;
4994
    }
4995
#endif
4996
0
    b->l_data += (new ? 3 : 0) + sz - old_sz;
4997
0
    return 0;
4998
0
}
4999
5000
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
5001
0
{
5002
0
    uint8_t *s = bam_aux_get(b, tag);
5003
0
    int shrink = 0, new = 0;
5004
5005
0
    if (s) { // Tag present - what was it?
5006
0
        switch (*s) {
5007
0
            case 'f': break;
5008
0
            case 'd': shrink = 1; break;
5009
0
            default: errno = EINVAL; return -1;  // Not a float
5010
0
        }
5011
0
    } else {
5012
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5013
0
            new = 1;
5014
0
        }  else { // Invalid aux data, give up.
5015
0
            return -1;
5016
0
        }
5017
0
    }
5018
5019
0
    if (new) { // Ensure there's room
5020
0
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
5021
0
            return -1;
5022
0
        s = b->data + b->l_data;
5023
0
        *s++ = tag[0];
5024
0
        *s++ = tag[1];
5025
0
    } else if (shrink) { // Convert non-standard double tag to float
5026
0
        memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
5027
0
        b->l_data -= 4;
5028
0
    }
5029
0
    *s++ = 'f';
5030
0
    float_to_le(val, s);
5031
0
    if (new) b->l_data += 7;
5032
5033
0
    return 0;
5034
0
}
5035
5036
int bam_aux_update_array(bam1_t *b, const char tag[2],
5037
                         uint8_t type, uint32_t items, void *data)
5038
0
{
5039
0
    uint8_t *s = bam_aux_get(b, tag);
5040
0
    size_t old_sz = 0, new_sz;
5041
0
    int new = 0;
5042
5043
0
    if (s) { // Tag present
5044
0
        if (*s != 'B') { errno = EINVAL; return -1; }
5045
0
        old_sz = aux_type2size(s[1]);
5046
0
        if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
5047
0
        old_sz *= le_to_u32(s + 2);
5048
0
    } else {
5049
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5050
0
            s = b->data + b->l_data;
5051
0
            new = 1;
5052
0
        }  else { // Invalid aux data, give up.
5053
0
            return -1;
5054
0
        }
5055
0
    }
5056
5057
0
    new_sz = aux_type2size(type);
5058
0
    if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
5059
0
    if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
5060
0
    new_sz *= items;
5061
5062
0
    if (new || old_sz < new_sz) {
5063
        // Make room for new tag
5064
0
        ptrdiff_t s_offset = s - b->data;
5065
0
        if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
5066
0
            return -1;
5067
0
        s =  b->data + s_offset;
5068
0
    }
5069
0
    if (new) { // Add tag id and type
5070
0
        *s++ = tag[0];
5071
0
        *s++ = tag[1];
5072
0
        *s = 'B';
5073
0
        b->l_data += 8 + new_sz;
5074
0
    } else if (old_sz != new_sz) { // shift following data if necessary
5075
0
        memmove(s + 6 + new_sz, s + 6 + old_sz,
5076
0
                b->l_data - ((s + 6 + old_sz) - b->data));
5077
0
        b->l_data -= old_sz;
5078
0
        b->l_data += new_sz;
5079
0
    }
5080
5081
0
    s[1] = type;
5082
0
    u32_to_le(items, s + 2);
5083
0
    if (new_sz > 0) {
5084
0
#ifdef HTS_LITTLE_ENDIAN
5085
0
        memcpy(s + 6, data, new_sz);
5086
#else
5087
        return aux_to_le(type, s + 6, data, new_sz);
5088
#endif
5089
0
    }
5090
0
    return 0;
5091
0
}
5092
5093
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
5094
                                      uint32_t idx)
5095
0
{
5096
0
    switch (type) {
5097
0
        case 'c': return le_to_i8(s + idx);
5098
0
        case 'C': return s[idx];
5099
0
        case 's': return le_to_i16(s + 2 * idx);
5100
0
        case 'S': return le_to_u16(s + 2 * idx);
5101
0
        case 'i': return le_to_i32(s + 4 * idx);
5102
0
        case 'I': return le_to_u32(s + 4 * idx);
5103
0
        default:
5104
0
            errno = EINVAL;
5105
0
            return 0;
5106
0
    }
5107
0
}
5108
5109
int64_t bam_aux2i(const uint8_t *s)
5110
0
{
5111
0
    int type;
5112
0
    type = *s++;
5113
0
    return get_int_aux_val(type, s, 0);
5114
0
}
5115
5116
double bam_aux2f(const uint8_t *s)
5117
0
{
5118
0
    int type;
5119
0
    type = *s++;
5120
0
    if (type == 'd') return le_to_double(s);
5121
0
    else if (type == 'f') return le_to_float(s);
5122
0
    else return get_int_aux_val(type, s, 0);
5123
0
}
5124
5125
char bam_aux2A(const uint8_t *s)
5126
0
{
5127
0
    int type;
5128
0
    type = *s++;
5129
0
    if (type == 'A') return *(char*)s;
5130
0
    errno = EINVAL;
5131
0
    return 0;
5132
0
}
5133
5134
char *bam_aux2Z(const uint8_t *s)
5135
0
{
5136
0
    int type;
5137
0
    type = *s++;
5138
0
    if (type == 'Z' || type == 'H') return (char*)s;
5139
0
    errno = EINVAL;
5140
0
    return 0;
5141
0
}
5142
5143
uint32_t bam_auxB_len(const uint8_t *s)
5144
0
{
5145
0
    if (s[0] != 'B') {
5146
0
        errno = EINVAL;
5147
0
        return 0;
5148
0
    }
5149
0
    return le_to_u32(s + 2);
5150
0
}
5151
5152
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
5153
0
{
5154
0
    uint32_t len = bam_auxB_len(s);
5155
0
    if (idx >= len) {
5156
0
        errno = ERANGE;
5157
0
        return 0;
5158
0
    }
5159
0
    return get_int_aux_val(s[1], s + 6, idx);
5160
0
}
5161
5162
double bam_auxB2f(const uint8_t *s, uint32_t idx)
5163
0
{
5164
0
    uint32_t len = bam_auxB_len(s);
5165
0
    if (idx >= len) {
5166
0
        errno = ERANGE;
5167
0
        return 0.0;
5168
0
    }
5169
0
    if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
5170
0
    else return get_int_aux_val(s[1], s + 6, idx);
5171
0
}
5172
5173
int sam_open_mode(char *mode, const char *fn, const char *format)
5174
0
{
5175
    // TODO Parse "bam5" etc for compression level
5176
0
    if (format == NULL) {
5177
        // Try to pick a format based on the filename extension
5178
0
        char extension[HTS_MAX_EXT_LEN];
5179
0
        if (find_file_extension(fn, extension) < 0) return -1;
5180
0
        return sam_open_mode(mode, fn, extension);
5181
0
    }
5182
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5183
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5184
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5185
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5186
0
    else if (strcasecmp(format, "fastq") == 0 ||
5187
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5188
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5189
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5190
0
    else if (strcasecmp(format, "fasta") == 0 ||
5191
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5192
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5193
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5194
0
    else return -1;
5195
5196
0
    return 0;
5197
0
}
5198
5199
// A version of sam_open_mode that can handle ,key=value options.
5200
// The format string is allocated and returned, to be freed by the caller.
5201
// Prefix should be "r" or "w",
5202
char *sam_open_mode_opts(const char *fn,
5203
                         const char *mode,
5204
                         const char *format)
5205
0
{
5206
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5207
0
                             (mode   ? strlen(mode)   : 1) + 12);
5208
0
    char *opts, *cp;
5209
0
    int format_len;
5210
5211
0
    if (!mode_opts)
5212
0
        return NULL;
5213
5214
0
    strcpy(mode_opts, mode ? mode : "r");
5215
0
    cp = mode_opts + strlen(mode_opts);
5216
5217
0
    if (format == NULL) {
5218
        // Try to pick a format based on the filename extension
5219
0
        char extension[HTS_MAX_EXT_LEN];
5220
0
        if (find_file_extension(fn, extension) < 0) {
5221
0
            free(mode_opts);
5222
0
            return NULL;
5223
0
        }
5224
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5225
0
            return mode_opts;
5226
0
        } else {
5227
0
            free(mode_opts);
5228
0
            return NULL;
5229
0
        }
5230
0
    }
5231
5232
0
    if ((opts = strchr(format, ','))) {
5233
0
        format_len = opts-format;
5234
0
    } else {
5235
0
        opts="";
5236
0
        format_len = strlen(format);
5237
0
    }
5238
5239
0
    if (strncmp(format, "bam", format_len) == 0) {
5240
0
        *cp++ = 'b';
5241
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5242
0
        *cp++ = 'c';
5243
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5244
0
        *cp++ = 'c';
5245
0
        strcpy(cp, ",VERSION=2.1");
5246
0
        cp += 12;
5247
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5248
0
        *cp++ = 'c';
5249
0
        strcpy(cp, ",VERSION=3.0");
5250
0
        cp += 12;
5251
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5252
0
        ; // format mode=""
5253
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5254
0
        *cp++ = 'z';
5255
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5256
0
               strncmp(format, "fq", format_len) == 0) {
5257
0
        *cp++ = 'f';
5258
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5259
0
               strncmp(format, "fq.gz", format_len) == 0) {
5260
0
        *cp++ = 'f';
5261
0
        *cp++ = 'z';
5262
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5263
0
               strncmp(format, "fa", format_len) == 0) {
5264
0
        *cp++ = 'F';
5265
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5266
0
               strncmp(format, "fa", format_len) == 0) {
5267
0
        *cp++ = 'F';
5268
0
        *cp++ = 'z';
5269
0
    } else {
5270
0
        free(mode_opts);
5271
0
        return NULL;
5272
0
    }
5273
5274
0
    strcpy(cp, opts);
5275
5276
0
    return mode_opts;
5277
0
}
5278
5279
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5280
int bam_str2flag(const char *str)
5281
0
{
5282
0
    char *end, *beg = (char*) str;
5283
0
    long int flag = strtol(str, &end, 0);
5284
0
    if ( end!=str ) return flag;    // the conversion was successful
5285
0
    flag = 0;
5286
0
    while ( *str )
5287
0
    {
5288
0
        end = beg;
5289
0
        while ( *end && *end!=',' ) end++;
5290
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5291
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5292
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5293
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5294
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5295
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5296
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5297
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5298
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5299
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5300
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5301
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5302
0
        else return -1;
5303
0
        if ( !*end ) break;
5304
0
        beg = end + 1;
5305
0
    }
5306
0
    return flag;
5307
0
}
5308
5309
char *bam_flag2str(int flag)
5310
0
{
5311
0
    kstring_t str = {0,0,0};
5312
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5313
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5314
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5315
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5316
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5317
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5318
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5319
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5320
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5321
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5322
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5323
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5324
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5325
0
    return str.s;
5326
0
}
5327
5328
5329
/**************************
5330
 *** Pileup and Mpileup ***
5331
 **************************/
5332
5333
#if !defined(BAM_NO_PILEUP)
5334
5335
#include <assert.h>
5336
5337
/*******************
5338
 *** Memory pool ***
5339
 *******************/
5340
5341
typedef struct {
5342
    int k, y;
5343
    hts_pos_t x, end;
5344
} cstate_t;
5345
5346
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5347
5348
typedef struct __linkbuf_t {
5349
    bam1_t b;
5350
    hts_pos_t beg, end;
5351
    cstate_t s;
5352
    struct __linkbuf_t *next;
5353
    bam_pileup_cd cd;
5354
} lbnode_t;
5355
5356
typedef struct {
5357
    int cnt, n, max;
5358
    lbnode_t **buf;
5359
} mempool_t;
5360
5361
static mempool_t *mp_init(void)
5362
0
{
5363
0
    mempool_t *mp;
5364
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5365
0
    return mp;
5366
0
}
5367
static void mp_destroy(mempool_t *mp)
5368
0
{
5369
0
    int k;
5370
0
    for (k = 0; k < mp->n; ++k) {
5371
0
        free(mp->buf[k]->b.data);
5372
0
        free(mp->buf[k]);
5373
0
    }
5374
0
    free(mp->buf);
5375
0
    free(mp);
5376
0
}
5377
static inline lbnode_t *mp_alloc(mempool_t *mp)
5378
0
{
5379
0
    ++mp->cnt;
5380
0
    if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5381
0
    else return mp->buf[--mp->n];
5382
0
}
5383
static inline void mp_free(mempool_t *mp, lbnode_t *p)
5384
0
{
5385
0
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
5386
0
    if (mp->n == mp->max) {
5387
0
        mp->max = mp->max? mp->max<<1 : 256;
5388
0
        mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
5389
0
    }
5390
0
    mp->buf[mp->n++] = p;
5391
0
}
5392
5393
/**********************
5394
 *** CIGAR resolver ***
5395
 **********************/
5396
5397
/* s->k: the index of the CIGAR operator that has just been processed.
5398
   s->x: the reference coordinate of the start of s->k
5399
   s->y: the query coordinate of the start of s->k
5400
 */
5401
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5402
0
{
5403
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5404
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5405
5406
0
    bam1_t *b = p->b;
5407
0
    bam1_core_t *c = &b->core;
5408
0
    uint32_t *cigar = bam_get_cigar(b);
5409
0
    int k;
5410
    // determine the current CIGAR operation
5411
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5412
0
    if (s->k == -1) { // never processed
5413
0
        p->qpos = 0;
5414
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5415
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5416
0
        } else { // find the first match or deletion
5417
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5418
0
                int op = _cop(cigar[k]);
5419
0
                int l = _cln(cigar[k]);
5420
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5421
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5422
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5423
0
            }
5424
0
            assert(k < c->n_cigar);
5425
0
            s->k = k;
5426
0
        }
5427
0
    } else { // the read has been processed before
5428
0
        int op, l = _cln(cigar[s->k]);
5429
0
        if (pos - s->x >= l) { // jump to the next operation
5430
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5431
0
            op = _cop(cigar[s->k+1]);
5432
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5433
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5434
0
                s->x += l;
5435
0
                ++s->k;
5436
0
            } else { // find the next M/D/N/=/X
5437
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5438
0
                s->x += l;
5439
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5440
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5441
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5442
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5443
0
                }
5444
0
                s->k = k;
5445
0
            }
5446
0
            assert(s->k < c->n_cigar); // otherwise a bug
5447
0
        } // else, do nothing
5448
0
    }
5449
0
    { // collect pileup information
5450
0
        int op, l;
5451
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5452
0
        p->is_del = p->indel = p->is_refskip = 0;
5453
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5454
0
            int op2 = _cop(cigar[s->k+1]);
5455
0
            int l2 = _cln(cigar[s->k+1]);
5456
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5457
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5458
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5459
                // and rely on is_del=1 as we would for 3D.
5460
0
                p->indel = -(int)l2;
5461
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5462
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5463
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5464
0
                    else break;
5465
0
                }
5466
0
            } else if (op2 == BAM_CINS) {
5467
0
                p->indel = l2;
5468
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5469
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5470
0
                    if (op2 == BAM_CINS) p->indel += l2;
5471
0
                    else if (op2 != BAM_CPAD) break;
5472
0
                }
5473
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
5474
0
                int l3 = 0;
5475
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5476
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5477
0
                    if (op2 == BAM_CINS) l3 += l2;
5478
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5479
0
                }
5480
0
                if (l3 > 0) p->indel = l3;
5481
0
            }
5482
0
        }
5483
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5484
0
            p->qpos = s->y + (pos - s->x);
5485
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5486
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5487
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5488
0
        } // cannot be other operations; otherwise a bug
5489
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5490
0
    }
5491
0
    p->cigar_ind = s->k;
5492
0
    return 1;
5493
0
}
5494
5495
/*******************************
5496
 *** Expansion of insertions ***
5497
 *******************************/
5498
5499
/*
5500
 * Fills out the kstring with the padded insertion sequence for the current
5501
 * location in 'p'.  If this is not an insertion site, the string is blank.
5502
 *
5503
 * This variant handles base modifications, but only when "m" is non-NULL.
5504
 *
5505
 * Returns the number of inserted base on success, with string length being
5506
 *        accessable via ins->l;
5507
 *        -1 on failure.
5508
 */
5509
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5510
                          hts_base_mod_state *m,
5511
0
                          kstring_t *ins, int *del_len) {
5512
0
    int j, k, indel, nb = 0;
5513
0
    uint32_t *cigar;
5514
5515
0
    if (p->indel <= 0) {
5516
0
        if (ks_resize(ins, 1) < 0)
5517
0
            return -1;
5518
0
        ins->l = 0;
5519
0
        ins->s[0] = '\0';
5520
0
        return 0;
5521
0
    }
5522
5523
0
    if (del_len)
5524
0
        *del_len = 0;
5525
5526
    // Measure indel length including pads
5527
0
    indel = 0;
5528
0
    k = p->cigar_ind+1;
5529
0
    cigar = bam_get_cigar(p->b);
5530
0
    while (k < p->b->core.n_cigar) {
5531
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5532
0
        case BAM_CPAD:
5533
0
        case BAM_CINS:
5534
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5535
0
            break;
5536
0
        default:
5537
0
            k = p->b->core.n_cigar;
5538
0
            break;
5539
0
        }
5540
0
        k++;
5541
0
    }
5542
0
    nb = ins->l = indel;
5543
5544
    // Produce sequence
5545
0
    if (ks_resize(ins, indel+1) < 0)
5546
0
        return -1;
5547
0
    indel = 0;
5548
0
    k = p->cigar_ind+1;
5549
0
    j = 1;
5550
0
    while (k < p->b->core.n_cigar) {
5551
0
        int l, c;
5552
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5553
0
        case BAM_CPAD:
5554
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5555
0
                ins->s[indel++] = '*';
5556
0
            break;
5557
0
        case BAM_CINS:
5558
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5559
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5560
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5561
0
                                            p->qpos + j - p->is_del)]
5562
0
                    : 'N';
5563
0
                ins->s[indel++] = c;
5564
0
                int nm;
5565
0
                hts_base_mod mod[256];
5566
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5567
0
                                                m, mod, 256)) > 0) {
5568
0
                    int o_indel = indel;
5569
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5570
0
                        return -1;
5571
0
                    ins->s[indel++] = '[';
5572
0
                    int j;
5573
0
                    for (j = 0; j < nm; j++) {
5574
0
                        char qual[20];
5575
0
                        if (mod[j].qual >= 0)
5576
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5577
0
                        else
5578
0
                            *qual=0;
5579
0
                        if (mod[j].modified_base < 0)
5580
                            // ChEBI
5581
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5582
0
                                              "%c(%d)%s",
5583
0
                                              "+-"[mod[j].strand],
5584
0
                                              -mod[j].modified_base,
5585
0
                                              qual);
5586
0
                        else
5587
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5588
0
                                              "%c%c%s",
5589
0
                                              "+-"[mod[j].strand],
5590
0
                                              mod[j].modified_base,
5591
0
                                              qual);
5592
0
                    }
5593
0
                    ins->s[indel++] = ']';
5594
0
                    ins->l += indel - o_indel; // grow by amount we used
5595
0
                }
5596
0
            }
5597
0
            break;
5598
0
        case BAM_CDEL:
5599
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5600
0
            if (del_len)
5601
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5602
            // fall through
5603
0
        default:
5604
0
            k = p->b->core.n_cigar;
5605
0
            break;
5606
0
        }
5607
0
        k++;
5608
0
    }
5609
0
    ins->s[indel] = '\0';
5610
0
    ins->l = indel; // string length
5611
5612
0
    return nb;      // base length
5613
0
}
5614
5615
/*
5616
 * Fills out the kstring with the padded insertion sequence for the current
5617
 * location in 'p'.  If this is not an insertion site, the string is blank.
5618
 *
5619
 * This is the original interface with no capability for reporting base
5620
 * modifications.
5621
 *
5622
 * Returns the length of insertion string on success;
5623
 *        -1 on failure.
5624
 */
5625
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5626
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5627
0
}
5628
5629
/***********************
5630
 *** Pileup iterator ***
5631
 ***********************/
5632
5633
// Dictionary of overlapping reads
5634
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5635
typedef khash_t(olap_hash) olap_hash_t;
5636
5637
struct bam_plp_s {
5638
    mempool_t *mp;
5639
    lbnode_t *head, *tail;
5640
    int32_t tid, max_tid;
5641
    hts_pos_t pos, max_pos;
5642
    int is_eof, max_plp, error, maxcnt;
5643
    uint64_t id;
5644
    bam_pileup1_t *plp;
5645
    // for the "auto" interface only
5646
    bam1_t *b;
5647
    bam_plp_auto_f func;
5648
    void *data;
5649
    olap_hash_t *overlaps;
5650
5651
    // For notification of creation and destruction events
5652
    // and associated client-owned pointer.
5653
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5654
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5655
};
5656
5657
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5658
0
{
5659
0
    bam_plp_t iter;
5660
0
    iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5661
0
    iter->mp = mp_init();
5662
0
    iter->head = iter->tail = mp_alloc(iter->mp);
5663
0
    iter->max_tid = iter->max_pos = -1;
5664
0
    iter->maxcnt = 8000;
5665
0
    if (func) {
5666
0
        iter->func = func;
5667
0
        iter->data = data;
5668
0
        iter->b = bam_init1();
5669
0
    }
5670
0
    return iter;
5671
0
}
5672
5673
int bam_plp_init_overlaps(bam_plp_t iter)
5674
0
{
5675
0
    iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
5676
0
    return iter->overlaps ? 0 : -1;
5677
0
}
5678
5679
void bam_plp_destroy(bam_plp_t iter)
5680
0
{
5681
0
    lbnode_t *p, *pnext;
5682
0
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5683
0
    for (p = iter->head; p != NULL; p = pnext) {
5684
0
        if (iter->plp_destruct && p != iter->tail)
5685
0
            iter->plp_destruct(iter->data, &p->b, &p->cd);
5686
0
        pnext = p->next;
5687
0
        mp_free(iter->mp, p);
5688
0
    }
5689
0
    mp_destroy(iter->mp);
5690
0
    if (iter->b) bam_destroy1(iter->b);
5691
0
    free(iter->plp);
5692
0
    free(iter);
5693
0
}
5694
5695
void bam_plp_constructor(bam_plp_t plp,
5696
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5697
0
    plp->plp_construct = func;
5698
0
}
5699
5700
void bam_plp_destructor(bam_plp_t plp,
5701
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5702
0
    plp->plp_destruct = func;
5703
0
}
5704
5705
//---------------------------------
5706
//---  Tweak overlapping reads
5707
//---------------------------------
5708
5709
/**
5710
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5711
 *  cigar_iref2iseq_next() - get the next CMATCH base
5712
 *  @cigar:       pointer to current cigar block (rw)
5713
 *  @cigar_max:   pointer just beyond the last cigar block
5714
 *  @icig:        position within the current cigar block (rw)
5715
 *  @iseq:        position in the sequence (rw)
5716
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5717
 *
5718
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5719
 *  or -2 on error.
5720
 */
5721
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5722
                                      const uint32_t *cigar_max,
5723
                                      hts_pos_t *icig,
5724
                                      hts_pos_t *iseq,
5725
                                      hts_pos_t *iref)
5726
0
{
5727
0
    hts_pos_t pos = *iref;
5728
0
    if ( pos < 0 ) return -1;
5729
0
    *icig = 0;
5730
0
    *iseq = 0;
5731
0
    *iref = 0;
5732
0
    while ( *cigar<cigar_max )
5733
0
    {
5734
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5735
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5736
5737
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5738
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5739
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5740
0
        {
5741
0
            pos -= ncig;
5742
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5743
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5744
0
            continue;
5745
0
        }
5746
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5747
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5748
0
        {
5749
0
            pos -= ncig;
5750
0
            if ( pos<0 ) pos = 0;
5751
0
            (*cigar)++; *icig = 0; *iref += ncig;
5752
0
            continue;
5753
0
        }
5754
0
        hts_log_error("Unexpected cigar %d", cig);
5755
0
        return -2;
5756
0
    }
5757
0
    *iseq = -1;
5758
0
    return -1;
5759
0
}
5760
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5761
                                       const uint32_t *cigar_max,
5762
                                       hts_pos_t *icig,
5763
                                       hts_pos_t *iseq,
5764
                                       hts_pos_t *iref)
5765
0
{
5766
0
    while ( *cigar < cigar_max )
5767
0
    {
5768
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5769
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5770
5771
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5772
0
        {
5773
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5774
0
            (*iseq)++; (*icig)++; (*iref)++;
5775
0
            return BAM_CMATCH;
5776
0
        }
5777
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5778
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5779
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5780
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5781
0
        hts_log_error("Unexpected cigar %d", cig);
5782
0
        return -2;
5783
0
    }
5784
0
    *iseq = -1;
5785
0
    *iref = -1;
5786
0
    return -1;
5787
0
}
5788
5789
// Given overlapping read 'a' (left) and 'b' (right) on the same
5790
// template, adjust quality values to zero for either a or b.
5791
// Note versions 1.12 and earlier always removed quality from 'b' for
5792
// matching bases.  Now we select a or b semi-randomly based on name hash.
5793
// Returns 0 on success,
5794
//        -1 on failure
5795
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5796
0
{
5797
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5798
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5799
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5800
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5801
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5802
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5803
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5804
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5805
5806
0
    hts_pos_t iref   = b->core.pos;
5807
0
    hts_pos_t a_iref = iref - a->core.pos;
5808
0
    hts_pos_t b_iref = iref - b->core.pos;
5809
5810
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5811
0
                                    &a_icig, &a_iseq, &a_iref);
5812
0
    if ( a_ret<0 )
5813
        // no overlap or error
5814
0
        return a_ret<-1 ? -1:0;
5815
5816
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5817
0
                                    &b_icig, &b_iseq, &b_iref);
5818
0
    if ( b_ret<0 )
5819
        // no overlap or error
5820
0
        return b_ret<-1 ? -1:0;
5821
5822
    // Determine which seq is the one getting modified qualities.
5823
0
    uint8_t amul, bmul;
5824
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5825
0
        amul = 1;
5826
0
        bmul = 0;
5827
0
    } else {
5828
0
        amul = 0;
5829
0
        bmul = 1;
5830
0
    }
5831
5832
    // Loop over the overlapping region nulling qualities in either
5833
    // seq a or b.
5834
0
    int err = 0;
5835
0
    while ( 1 ) {
5836
        // Step to next matching reference position in a and b
5837
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5838
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5839
0
                                         &a_icig, &a_iseq, &a_iref);
5840
0
        if ( a_ret<0 ) { // done
5841
0
            err = a_ret<-1?-1:0;
5842
0
            break;
5843
0
        }
5844
5845
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5846
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5847
0
                                         &b_iseq, &b_iref);
5848
0
        if ( b_ret<0 ) { // done
5849
0
            err = b_ret<-1?-1:0;
5850
0
            break;
5851
0
        }
5852
5853
0
        if ( iref < a_iref + a->core.pos )
5854
0
            iref = a_iref + a->core.pos;
5855
5856
0
        if ( iref < b_iref + b->core.pos )
5857
0
            iref = b_iref + b->core.pos;
5858
5859
0
        iref++;
5860
5861
        // If A or B has a deletion then we catch up the other to this point.
5862
        // We also amend quality values using the same rules for mismatch.
5863
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5864
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5865
0
                && b_cigar > bam_get_cigar(b)
5866
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5867
                // Del in B means it's moved on further than A
5868
0
                do {
5869
0
                    a_qual[a_iseq] = amul
5870
0
                        ? a_qual[a_iseq]*0.8
5871
0
                        : 0;
5872
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5873
0
                                                 &a_icig, &a_iseq, &a_iref);
5874
0
                    if (a_ret < 0)
5875
0
                        return -(a_ret<-1); // 0 or -1
5876
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5877
0
            } else if (a_cigar > bam_get_cigar(a)
5878
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5879
                // Del in A means it's moved on further than B
5880
0
                do {
5881
0
                    b_qual[b_iseq] = bmul
5882
0
                        ? b_qual[b_iseq]*0.8
5883
0
                        : 0;
5884
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5885
0
                                                 &b_icig, &b_iseq, &b_iref);
5886
0
                    if (b_ret < 0)
5887
0
                        return -(b_ret<-1); // 0 or -1
5888
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5889
0
            } else {
5890
                // Anything else, eg ref-skip, we don't support here
5891
0
                continue;
5892
0
            }
5893
0
        }
5894
5895
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5896
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5897
        //         a_cigar-bam_get_cigar(a), a_icig,
5898
        //         b_cigar-bam_get_cigar(b), b_icig,
5899
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5900
        //         a_iseq, b_iseq);
5901
5902
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5903
            // Fell off end of sequence, bad CIGAR?
5904
0
            return -1;
5905
5906
        // We're finally at the same ref base in both a and b.
5907
        // Check if the bases match (confident) or mismatch
5908
        // (not so confident).
5909
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5910
            // We are very confident about this base.  Use sum of quals
5911
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
5912
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5913
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5914
0
        } else {
5915
            // Not so confident about anymore given the mismatch.
5916
            // Reduce qual for lowest quality base.
5917
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5918
                // A highest qual base; keep
5919
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5920
0
                b_qual[b_iseq] = 0;
5921
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5922
                // B highest qual base; keep
5923
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5924
0
                a_qual[a_iseq] = 0;
5925
0
            } else {
5926
                // Both equal, so pick randomly
5927
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5928
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5929
0
            }
5930
0
        }
5931
0
    }
5932
5933
0
    return err;
5934
0
}
5935
5936
// Fix overlapping reads. Simple soft-clipping did not give good results.
5937
// Lowering qualities of unwanted bases is more selective and works better.
5938
//
5939
// Returns 0 on success, -1 on failure
5940
static int overlap_push(bam_plp_t iter, lbnode_t *node)
5941
0
{
5942
0
    if ( !iter->overlaps ) return 0;
5943
5944
    // mapped mates and paired reads only
5945
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5946
5947
    // no overlap possible, unless some wild cigar
5948
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5949
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5950
0
         && node->b.core.mpos >= node->end) // for those wild cigars
5951
0
       ) return 0;
5952
5953
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5954
0
    if ( kitr==kh_end(iter->overlaps) )
5955
0
    {
5956
        // Only add reads where the mate is still to arrive
5957
0
        if (node->b.core.mpos >= node->b.core.pos ||
5958
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5959
0
            int ret;
5960
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5961
0
            if (ret < 0) return -1;
5962
0
            kh_value(iter->overlaps, kitr) = node;
5963
0
        }
5964
0
    }
5965
0
    else
5966
0
    {
5967
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
5968
0
        int err = tweak_overlap_quality(&a->b, &node->b);
5969
0
        kh_del(olap_hash, iter->overlaps, kitr);
5970
0
        assert(a->end-1 == a->s.end);
5971
0
        return err;
5972
0
    }
5973
0
    return 0;
5974
0
}
5975
5976
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
5977
0
{
5978
0
    if ( !iter->overlaps ) return;
5979
5980
0
    khiter_t kitr;
5981
0
    if ( b )
5982
0
    {
5983
0
        kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
5984
0
        if ( kitr!=kh_end(iter->overlaps) )
5985
0
            kh_del(olap_hash, iter->overlaps, kitr);
5986
0
    }
5987
0
    else
5988
0
    {
5989
        // remove all
5990
0
        for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
5991
0
            if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
5992
0
    }
5993
0
}
5994
5995
5996
5997
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
5998
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
5999
// buffer yet (the current position is still the maximum position across all buffered reads).
6000
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6001
0
{
6002
0
    if (iter->error) { *_n_plp = -1; return NULL; }
6003
0
    *_n_plp = 0;
6004
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
6005
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
6006
0
        int n_plp = 0;
6007
        // write iter->plp at iter->pos
6008
0
        lbnode_t **pptr = &iter->head;
6009
0
        while (*pptr != iter->tail) {
6010
0
            lbnode_t *p = *pptr;
6011
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6012
0
                overlap_remove(iter, &p->b);
6013
0
                if (iter->plp_destruct)
6014
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6015
0
                *pptr = p->next; mp_free(iter->mp, p);
6016
0
            }
6017
0
            else {
6018
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6019
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6020
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6021
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
6022
0
                    }
6023
0
                    iter->plp[n_plp].b = &p->b;
6024
0
                    iter->plp[n_plp].cd = p->cd;
6025
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6026
0
                }
6027
0
                pptr = &(*pptr)->next;
6028
0
            }
6029
0
        }
6030
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6031
        // update iter->tid and iter->pos
6032
0
        if (iter->head != iter->tail) {
6033
0
            if (iter->tid > iter->head->b.core.tid) {
6034
0
                hts_log_error("Unsorted input. Pileup aborts");
6035
0
                iter->error = 1;
6036
0
                *_n_plp = -1;
6037
0
                return NULL;
6038
0
            }
6039
0
        }
6040
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6041
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6042
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6043
0
            iter->pos = iter->head->beg; // jump to the next position
6044
0
        } else ++iter->pos; // scan contiguously
6045
        // return
6046
0
        if (n_plp) return iter->plp;
6047
0
        if (iter->is_eof && iter->head == iter->tail) break;
6048
0
    }
6049
0
    return NULL;
6050
0
}
6051
6052
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6053
0
{
6054
0
    hts_pos_t pos64 = 0;
6055
0
    const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
6056
0
    if (pos64 < INT_MAX) {
6057
0
        *_pos = pos64;
6058
0
    } else {
6059
0
        hts_log_error("Position %"PRId64" too large", pos64);
6060
0
        *_pos = INT_MAX;
6061
0
        iter->error = 1;
6062
0
        *_n_plp = -1;
6063
0
        return NULL;
6064
0
    }
6065
0
    return p;
6066
0
}
6067
6068
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
6069
0
{
6070
0
    if (iter->error) return -1;
6071
0
    if (b) {
6072
0
        if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
6073
        // Skip only unmapped reads here, any additional filtering must be done in iter->func
6074
0
        if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
6075
0
        if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
6076
0
        {
6077
0
            overlap_remove(iter, b);
6078
0
            return 0;
6079
0
        }
6080
0
        if (bam_copy1(&iter->tail->b, b) == NULL)
6081
0
            return -1;
6082
0
        iter->tail->b.id = iter->id++;
6083
0
        iter->tail->beg = b->core.pos;
6084
        // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
6085
0
        iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
6086
0
        iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
6087
0
        if (b->core.tid < iter->max_tid) {
6088
0
            hts_log_error("The input is not sorted (chromosomes out of order)");
6089
0
            iter->error = 1;
6090
0
            return -1;
6091
0
        }
6092
0
        if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
6093
0
            hts_log_error("The input is not sorted (reads out of order)");
6094
0
            iter->error = 1;
6095
0
            return -1;
6096
0
        }
6097
0
        iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
6098
0
        if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
6099
0
            lbnode_t *next = mp_alloc(iter->mp);
6100
0
            if (!next) {
6101
0
                iter->error = 1;
6102
0
                return -1;
6103
0
            }
6104
0
            if (iter->plp_construct) {
6105
0
                if (iter->plp_construct(iter->data, &iter->tail->b,
6106
0
                                        &iter->tail->cd) < 0) {
6107
0
                    mp_free(iter->mp, next);
6108
0
                    iter->error = 1;
6109
0
                    return -1;
6110
0
                }
6111
0
            }
6112
0
            if (overlap_push(iter, iter->tail) < 0) {
6113
0
                mp_free(iter->mp, next);
6114
0
                iter->error = 1;
6115
0
                return -1;
6116
0
            }
6117
0
            iter->tail->next = next;
6118
0
            iter->tail = iter->tail->next;
6119
0
        }
6120
0
    } else iter->is_eof = 1;
6121
0
    return 0;
6122
0
}
6123
6124
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6125
0
{
6126
0
    const bam_pileup1_t *plp;
6127
0
    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
6128
0
    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6129
0
    else { // no pileup line can be obtained; read alignments
6130
0
        *_n_plp = 0;
6131
0
        if (iter->is_eof) return 0;
6132
0
        int ret;
6133
0
        while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
6134
0
            if (bam_plp_push(iter, iter->b) < 0) {
6135
0
                *_n_plp = -1;
6136
0
                return 0;
6137
0
            }
6138
0
            if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6139
            // otherwise no pileup line can be returned; read the next alignment.
6140
0
        }
6141
0
        if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
6142
0
        if (bam_plp_push(iter, 0) < 0) {
6143
0
            *_n_plp = -1;
6144
0
            return 0;
6145
0
        }
6146
0
        if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6147
0
        return 0;
6148
0
    }
6149
0
}
6150
6151
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6152
0
{
6153
0
    hts_pos_t pos64 = 0;
6154
0
    const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
6155
0
    if (pos64 < INT_MAX) {
6156
0
        *_pos = pos64;
6157
0
    } else {
6158
0
        hts_log_error("Position %"PRId64" too large", pos64);
6159
0
        *_pos = INT_MAX;
6160
0
        iter->error = 1;
6161
0
        *_n_plp = -1;
6162
0
        return NULL;
6163
0
    }
6164
0
    return p;
6165
0
}
6166
6167
void bam_plp_reset(bam_plp_t iter)
6168
0
{
6169
0
    overlap_remove(iter, NULL);
6170
0
    iter->max_tid = iter->max_pos = -1;
6171
0
    iter->tid = iter->pos = 0;
6172
0
    iter->is_eof = 0;
6173
0
    while (iter->head != iter->tail) {
6174
0
        lbnode_t *p = iter->head;
6175
0
        iter->head = p->next;
6176
0
        mp_free(iter->mp, p);
6177
0
    }
6178
0
}
6179
6180
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
6181
0
{
6182
0
    iter->maxcnt = maxcnt;
6183
0
}
6184
6185
/************************
6186
 *** Mpileup iterator ***
6187
 ************************/
6188
6189
struct bam_mplp_s {
6190
    int n;
6191
    int32_t min_tid, *tid;
6192
    hts_pos_t min_pos, *pos;
6193
    bam_plp_t *iter;
6194
    int *n_plp;
6195
    const bam_pileup1_t **plp;
6196
};
6197
6198
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
6199
0
{
6200
0
    int i;
6201
0
    bam_mplp_t iter;
6202
0
    iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
6203
0
    iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
6204
0
    iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
6205
0
    iter->n_plp = (int*)calloc(n, sizeof(int));
6206
0
    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
6207
0
    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
6208
0
    iter->n = n;
6209
0
    iter->min_pos = HTS_POS_MAX;
6210
0
    iter->min_tid = (uint32_t)-1;
6211
0
    for (i = 0; i < n; ++i) {
6212
0
        iter->iter[i] = bam_plp_init(func, data[i]);
6213
0
        iter->pos[i] = iter->min_pos;
6214
0
        iter->tid[i] = iter->min_tid;
6215
0
    }
6216
0
    return iter;
6217
0
}
6218
6219
int bam_mplp_init_overlaps(bam_mplp_t iter)
6220
0
{
6221
0
    int i, r = 0;
6222
0
    for (i = 0; i < iter->n; ++i)
6223
0
        r |= bam_plp_init_overlaps(iter->iter[i]);
6224
0
    return r == 0 ? 0 : -1;
6225
0
}
6226
6227
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
6228
0
{
6229
0
    int i;
6230
0
    for (i = 0; i < iter->n; ++i)
6231
0
        iter->iter[i]->maxcnt = maxcnt;
6232
0
}
6233
6234
void bam_mplp_destroy(bam_mplp_t iter)
6235
0
{
6236
0
    int i;
6237
0
    for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
6238
0
    free(iter->iter); free(iter->pos); free(iter->tid);
6239
0
    free(iter->n_plp); free(iter->plp);
6240
0
    free(iter);
6241
0
}
6242
6243
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
6244
0
{
6245
0
    int i, ret = 0;
6246
0
    hts_pos_t new_min_pos = HTS_POS_MAX;
6247
0
    uint32_t new_min_tid = (uint32_t)-1;
6248
0
    for (i = 0; i < iter->n; ++i) {
6249
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6250
0
            int tid;
6251
0
            hts_pos_t pos;
6252
0
            iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
6253
0
            if ( iter->iter[i]->error ) return -1;
6254
0
            if (iter->plp[i]) {
6255
0
                iter->tid[i] = tid;
6256
0
                iter->pos[i] = pos;
6257
0
            } else {
6258
0
                iter->tid[i] = 0;
6259
0
                iter->pos[i] = 0;
6260
0
            }
6261
0
        }
6262
0
        if (iter->plp[i]) {
6263
0
            if (iter->tid[i] < new_min_tid) {
6264
0
                new_min_tid = iter->tid[i];
6265
0
                new_min_pos = iter->pos[i];
6266
0
            } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
6267
0
                new_min_pos = iter->pos[i];
6268
0
            }
6269
0
        }
6270
0
    }
6271
0
    iter->min_pos = new_min_pos;
6272
0
    iter->min_tid = new_min_tid;
6273
0
    if (new_min_pos == HTS_POS_MAX) return 0;
6274
0
    *_tid = new_min_tid; *_pos = new_min_pos;
6275
0
    for (i = 0; i < iter->n; ++i) {
6276
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6277
0
            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
6278
0
            ++ret;
6279
0
        } else n_plp[i] = 0, plp[i] = 0;
6280
0
    }
6281
0
    return ret;
6282
0
}
6283
6284
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
6285
0
{
6286
0
    hts_pos_t pos64 = 0;
6287
0
    int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
6288
0
    if (ret >= 0) {
6289
0
        if (pos64 < INT_MAX) {
6290
0
            *_pos = pos64;
6291
0
        } else {
6292
0
            hts_log_error("Position %"PRId64" too large", pos64);
6293
0
            *_pos = INT_MAX;
6294
0
            return -1;
6295
0
        }
6296
0
    }
6297
0
    return ret;
6298
0
}
6299
6300
void bam_mplp_reset(bam_mplp_t iter)
6301
0
{
6302
0
    int i;
6303
0
    iter->min_pos = HTS_POS_MAX;
6304
0
    iter->min_tid = (uint32_t)-1;
6305
0
    for (i = 0; i < iter->n; ++i) {
6306
0
        bam_plp_reset(iter->iter[i]);
6307
0
        iter->pos[i] = HTS_POS_MAX;
6308
0
        iter->tid[i] = (uint32_t)-1;
6309
0
        iter->n_plp[i] = 0;
6310
0
        iter->plp[i] = NULL;
6311
0
    }
6312
0
}
6313
6314
void bam_mplp_constructor(bam_mplp_t iter,
6315
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6316
0
    int i;
6317
0
    for (i = 0; i < iter->n; ++i)
6318
0
        bam_plp_constructor(iter->iter[i], func);
6319
0
}
6320
6321
void bam_mplp_destructor(bam_mplp_t iter,
6322
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6323
0
    int i;
6324
0
    for (i = 0; i < iter->n; ++i)
6325
0
        bam_plp_destructor(iter->iter[i], func);
6326
0
}
6327
6328
#endif // ~!defined(BAM_NO_PILEUP)