Coverage Report

Created: 2024-02-25 06:34

/src/htslib/sam.c
Line
Count
Source (jump to first uncovered line)
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2023 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
// Suppress deprecation message for cigar_tab, which we initialise
45
#include "htslib/hts_defs.h"
46
#undef HTS_DEPRECATED
47
#define HTS_DEPRECATED(message)
48
49
#include "htslib/sam.h"
50
#include "htslib/bgzf.h"
51
#include "cram/cram.h"
52
#include "hts_internal.h"
53
#include "sam_internal.h"
54
#include "htslib/hfile.h"
55
#include "htslib/hts_endian.h"
56
#include "htslib/hts_expr.h"
57
#include "header.h"
58
59
#include "htslib/khash.h"
60
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
61
KHASH_SET_INIT_INT(tag)
62
63
#ifndef EFTYPE
64
0
#define EFTYPE ENOEXEC
65
#endif
66
#ifndef EOVERFLOW
67
#define EOVERFLOW ERANGE
68
#endif
69
70
/**********************
71
 *** BAM header I/O ***
72
 **********************/
73
74
HTSLIB_EXPORT
75
const int8_t bam_cigar_table[256] = {
76
    // 0 .. 47
77
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
81
    // 48 .. 63  (including =)
82
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
83
84
    // 64 .. 79  (including MIDNHB)
85
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
86
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
87
88
    // 80 .. 95  (including SPX)
89
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
90
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
91
92
    // 96 .. 127
93
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
96
    // 128 .. 255
97
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
105
};
106
107
sam_hdr_t *sam_hdr_init()
108
23.2k
{
109
23.2k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
110
23.2k
    if (bh == NULL) return NULL;
111
112
23.2k
    bh->cigar_tab = bam_cigar_table;
113
23.2k
    return bh;
114
23.2k
}
115
116
void sam_hdr_destroy(sam_hdr_t *bh)
117
68.1k
{
118
68.1k
    int32_t i;
119
120
68.1k
    if (bh == NULL) return;
121
122
29.7k
    if (bh->ref_count > 0) {
123
6.47k
        --bh->ref_count;
124
6.47k
        return;
125
6.47k
    }
126
127
23.2k
    if (bh->target_name) {
128
60.1k
        for (i = 0; i < bh->n_targets; ++i)
129
52.7k
            free(bh->target_name[i]);
130
7.42k
        free(bh->target_name);
131
7.42k
        free(bh->target_len);
132
7.42k
    }
133
23.2k
    free(bh->text);
134
23.2k
    if (bh->hrecs)
135
21.6k
        sam_hrecs_free(bh->hrecs);
136
23.2k
    if (bh->sdict)
137
23.2k
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
138
23.2k
    free(bh);
139
23.2k
}
140
141
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
142
// references before sam_hdr_t::hrecs is populated
143
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
144
140
{
145
140
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
146
140
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
147
140
    int i;
148
140
    if (!dest_long_refs) return -1;
149
150
4.09k
    for (i = 0; i < h->n_targets; i++) {
151
3.95k
        int ret;
152
3.95k
        khiter_t ksrc, kdest;
153
3.95k
        if (h->target_len[i] < UINT32_MAX) continue;
154
2.12k
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
155
2.12k
        if (ksrc == kh_end(src_long_refs)) continue;
156
2.12k
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
157
2.12k
        if (ret < 0) {
158
0
            kh_destroy(s2i, dest_long_refs);
159
0
            return -1;
160
0
        }
161
2.12k
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
162
2.12k
    }
163
164
140
    h->sdict = dest_long_refs;
165
140
    return 0;
166
140
}
167
168
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
169
8.56k
{
170
8.56k
    if (h0 == NULL) return NULL;
171
8.56k
    sam_hdr_t *h;
172
8.56k
    if ((h = sam_hdr_init()) == NULL) return NULL;
173
    // copy the simple data
174
8.56k
    h->n_targets = 0;
175
8.56k
    h->ignore_sam_err = h0->ignore_sam_err;
176
8.56k
    h->l_text = 0;
177
178
    // Then the pointery stuff
179
180
8.56k
    if (!h0->hrecs) {
181
332
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
182
332
        if (!h->target_len) goto fail;
183
332
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
184
332
        if (!h->target_name) goto fail;
185
186
332
        int i;
187
4.61k
        for (i = 0; i < h0->n_targets; ++i) {
188
4.28k
            h->target_len[i] = h0->target_len[i];
189
4.28k
            h->target_name[i] = strdup(h0->target_name[i]);
190
4.28k
            if (!h->target_name[i]) break;
191
4.28k
        }
192
332
        h->n_targets = i;
193
332
        if (i < h0->n_targets) goto fail;
194
195
332
        if (h0->sdict) {
196
140
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
197
140
        }
198
332
    }
199
200
8.56k
    if (h0->hrecs) {
201
8.22k
        kstring_t tmp = { 0, 0, NULL };
202
8.22k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
203
0
            free(ks_release(&tmp));
204
0
            goto fail;
205
0
        }
206
207
8.22k
        h->l_text = tmp.l;
208
8.22k
        h->text   = ks_release(&tmp);
209
210
8.22k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
211
0
            goto fail;
212
8.22k
    } else {
213
332
        h->l_text = h0->l_text;
214
332
        h->text = malloc(h->l_text + 1);
215
332
        if (!h->text) goto fail;
216
332
        memcpy(h->text, h0->text, h->l_text);
217
332
        h->text[h->l_text] = '\0';
218
332
    }
219
220
8.56k
    return h;
221
222
0
 fail:
223
0
    sam_hdr_destroy(h);
224
0
    return NULL;
225
8.56k
}
226
227
sam_hdr_t *bam_hdr_read(BGZF *fp)
228
801
{
229
801
    sam_hdr_t *h;
230
801
    uint8_t buf[4];
231
801
    int magic_len, has_EOF;
232
801
    int32_t i, name_len, num_names = 0;
233
801
    size_t bufsize;
234
801
    ssize_t bytes;
235
    // check EOF
236
801
    has_EOF = bgzf_check_EOF(fp);
237
801
    if (has_EOF < 0) {
238
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
239
801
    } else if (has_EOF == 0) {
240
801
        hts_log_warning("EOF marker is absent. The input is probably truncated");
241
801
    }
242
    // read "BAM1"
243
801
    magic_len = bgzf_read(fp, buf, 4);
244
801
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
245
0
        hts_log_error("Invalid BAM binary header");
246
0
        return 0;
247
0
    }
248
801
    h = sam_hdr_init();
249
801
    if (!h) goto nomem;
250
251
    // read plain text and the number of reference sequences
252
801
    bytes = bgzf_read(fp, buf, 4);
253
801
    if (bytes != 4) goto read_err;
254
801
    h->l_text = le_to_u32(buf);
255
256
801
    bufsize = h->l_text + 1;
257
801
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
258
801
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
259
801
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
260
801
#endif
261
801
    h->text = (char*)malloc(bufsize);
262
801
    if (!h->text) goto nomem;
263
801
    h->text[h->l_text] = 0; // make sure it is NULL terminated
264
801
    bytes = bgzf_read(fp, h->text, h->l_text);
265
801
    if (bytes != h->l_text) goto read_err;
266
267
759
    bytes = bgzf_read(fp, &h->n_targets, 4);
268
759
    if (bytes != 4) goto read_err;
269
759
    if (fp->is_be) ed_swap_4p(&h->n_targets);
270
271
759
    if (h->n_targets < 0) goto invalid;
272
273
    // read reference sequence names and lengths
274
759
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
275
759
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
276
6
        goto nomem;
277
753
#endif
278
753
    if (h->n_targets > 0) {
279
342
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
280
342
        if (!h->target_name) goto nomem;
281
342
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
282
342
        if (!h->target_len) goto nomem;
283
342
    }
284
411
    else {
285
411
        h->target_name = NULL;
286
411
        h->target_len = NULL;
287
411
    }
288
289
2.05k
    for (i = 0; i != h->n_targets; ++i) {
290
1.35k
        bytes = bgzf_read(fp, &name_len, 4);
291
1.35k
        if (bytes != 4) goto read_err;
292
1.35k
        if (fp->is_be) ed_swap_4p(&name_len);
293
1.35k
        if (name_len <= 0) goto invalid;
294
295
1.34k
        h->target_name[i] = (char*)malloc(name_len);
296
1.34k
        if (!h->target_name[i]) goto nomem;
297
1.34k
        num_names++;
298
299
1.34k
        bytes = bgzf_read(fp, h->target_name[i], name_len);
300
1.34k
        if (bytes != name_len) goto read_err;
301
302
1.29k
        if (h->target_name[i][name_len - 1] != '\0') {
303
            /* Fix missing NUL-termination.  Is this being too nice?
304
               We could alternatively bail out with an error. */
305
714
            char *new_name;
306
714
            if (name_len == INT32_MAX) goto invalid;
307
714
            new_name = realloc(h->target_name[i], name_len + 1);
308
714
            if (new_name == NULL) goto nomem;
309
714
            h->target_name[i] = new_name;
310
714
            h->target_name[i][name_len] = '\0';
311
714
        }
312
313
1.29k
        bytes = bgzf_read(fp, &h->target_len[i], 4);
314
1.29k
        if (bytes != 4) goto read_err;
315
1.29k
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
316
1.29k
    }
317
693
    return h;
318
319
6
 nomem:
320
6
    hts_log_error("Out of memory");
321
6
    goto clean;
322
323
87
 read_err:
324
87
    if (bytes < 0) {
325
27
        hts_log_error("Error reading BGZF stream");
326
60
    } else {
327
60
        hts_log_error("Truncated BAM header");
328
60
    }
329
87
    goto clean;
330
331
15
 invalid:
332
15
    hts_log_error("Invalid BAM binary header");
333
334
108
 clean:
335
108
    if (h != NULL) {
336
108
        h->n_targets = num_names; // ensure we free only allocated target_names
337
108
        sam_hdr_destroy(h);
338
108
    }
339
108
    return NULL;
340
15
}
341
342
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
343
4.31k
{
344
4.31k
    int32_t i, name_len, x;
345
4.31k
    kstring_t hdr_ks = { 0, 0, NULL };
346
4.31k
    char *text;
347
4.31k
    uint32_t l_text;
348
349
4.31k
    if (!h) return -1;
350
351
4.31k
    if (h->hrecs) {
352
3.98k
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
353
3.98k
        if (hdr_ks.l > UINT32_MAX) {
354
0
            hts_log_error("Header too long for BAM format");
355
0
            free(hdr_ks.s);
356
0
            return -1;
357
3.98k
        } else if (hdr_ks.l > INT32_MAX) {
358
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
359
0
            hts_log_warning("Output file may not be portable");
360
0
        }
361
3.98k
        text = hdr_ks.s;
362
3.98k
        l_text = hdr_ks.l;
363
3.98k
    } else {
364
332
        if (h->l_text > UINT32_MAX) {
365
0
            hts_log_error("Header too long for BAM format");
366
0
            return -1;
367
332
        } else if (h->l_text > INT32_MAX) {
368
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
369
0
            hts_log_warning("Output file may not be portable");
370
0
        }
371
332
        text = h->text;
372
332
        l_text = h->l_text;
373
332
    }
374
    // write "BAM1"
375
4.31k
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
376
    // write plain text and the number of reference sequences
377
4.31k
    if (fp->is_be) {
378
0
        x = ed_swap_4(l_text);
379
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
380
0
        if (l_text) {
381
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
382
0
        }
383
0
        x = ed_swap_4(h->n_targets);
384
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
385
4.31k
    } else {
386
4.31k
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
387
4.31k
        if (l_text) {
388
2.29k
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
389
2.29k
        }
390
4.31k
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
391
4.31k
    }
392
4.31k
    free(hdr_ks.s);
393
    // write sequence names and lengths
394
17.1k
    for (i = 0; i != h->n_targets; ++i) {
395
12.8k
        char *p = h->target_name[i];
396
12.8k
        name_len = strlen(p) + 1;
397
12.8k
        if (fp->is_be) {
398
0
            x = ed_swap_4(name_len);
399
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
400
12.8k
        } else {
401
12.8k
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
402
12.8k
        }
403
12.8k
        if (bgzf_write(fp, p, name_len) < 0) return -1;
404
12.8k
        if (fp->is_be) {
405
0
            x = ed_swap_4(h->target_len[i]);
406
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
407
12.8k
        } else {
408
12.8k
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
409
12.8k
        }
410
12.8k
    }
411
4.31k
    if (bgzf_flush(fp) < 0) return -1;
412
4.31k
    return 0;
413
4.31k
}
414
415
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
416
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
417
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
418
0
}
419
420
/*************************
421
 *** BAM alignment I/O ***
422
 *************************/
423
424
bam1_t *bam_init1()
425
1.28M
{
426
1.28M
    return (bam1_t*)calloc(1, sizeof(bam1_t));
427
1.28M
}
428
429
int sam_realloc_bam_data(bam1_t *b, size_t desired)
430
1.61M
{
431
1.61M
    uint32_t new_m_data;
432
1.61M
    uint8_t *new_data;
433
1.61M
    new_m_data = desired;
434
1.61M
    kroundup32(new_m_data);
435
1.61M
    if (new_m_data < desired) {
436
0
        errno = ENOMEM; // Not strictly true but we can't store the size
437
0
        return -1;
438
0
    }
439
1.61M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
440
1.61M
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
441
13
        errno = ENOMEM;
442
13
        return -1;
443
13
    }
444
1.61M
#endif
445
1.61M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
446
1.61M
        new_data = realloc(b->data, new_m_data);
447
1.61M
    } else {
448
0
        if ((new_data = malloc(new_m_data)) != NULL) {
449
0
            if (b->l_data > 0)
450
0
                memcpy(new_data, b->data,
451
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
452
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
453
0
        }
454
0
    }
455
1.61M
    if (!new_data) return -1;
456
1.61M
    b->data = new_data;
457
1.61M
    b->m_data = new_m_data;
458
1.61M
    return 0;
459
1.61M
}
460
461
void bam_destroy1(bam1_t *b)
462
19.8M
{
463
19.8M
    if (b == 0) return;
464
1.28M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
465
1.28M
        free(b->data);
466
1.28M
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
467
            // In case of reuse
468
0
            b->data = NULL;
469
0
            b->m_data = 0;
470
0
            b->l_data = 0;
471
0
        }
472
1.28M
    }
473
474
1.28M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
475
1.28M
        free(b);
476
1.28M
}
477
478
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
479
9.24M
{
480
9.24M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
481
9.24M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
482
9.24M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
483
9.24M
    bdst->l_data = bsrc->l_data;
484
9.24M
    bdst->id = bsrc->id;
485
9.24M
    return bdst;
486
9.24M
}
487
488
bam1_t *bam_dup1(const bam1_t *bsrc)
489
1.26M
{
490
1.26M
    if (bsrc == NULL) return NULL;
491
1.26M
    bam1_t *bdst = bam_init1();
492
1.26M
    if (bdst == NULL) return NULL;
493
1.26M
    if (bam_copy1(bdst, bsrc) == NULL) {
494
0
        bam_destroy1(bdst);
495
0
        return NULL;
496
0
    }
497
1.26M
    return bdst;
498
1.26M
}
499
500
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
501
                             hts_pos_t *rlen, hts_pos_t *qlen)
502
3.28k
{
503
3.28k
    int k;
504
3.28k
    *rlen = *qlen = 0;
505
376k
    for (k = 0; k < n_cigar; ++k) {
506
373k
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
507
373k
        int len = bam_cigar_oplen(cigar[k]);
508
373k
        if (type & 1) *qlen += len;
509
373k
        if (type & 2) *rlen += len;
510
373k
    }
511
3.28k
}
512
513
static int subtract_check_underflow(size_t length, size_t *limit)
514
137M
{
515
137M
    if (length <= *limit) {
516
137M
        *limit -= length;
517
137M
        return 0;
518
137M
    }
519
520
0
    return -1;
521
137M
}
522
523
int bam_set1(bam1_t *bam,
524
             size_t l_qname, const char *qname,
525
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
526
             size_t n_cigar, const uint32_t *cigar,
527
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
528
             size_t l_seq, const char *seq, const char *qual,
529
             size_t l_aux)
530
27.4M
{
531
    // use a default qname "*" if none is provided
532
27.4M
    if (l_qname == 0) {
533
23.9M
        l_qname = 1;
534
23.9M
        qname = "*";
535
23.9M
    }
536
537
    // note: the qname is stored nul terminated and padded as described in the
538
    // documentation for the bam1_t struct.
539
27.4M
    size_t qname_nuls = 4 - l_qname % 4;
540
541
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
542
    // can't use bam_endpos() directly as some fields not yet set up.
543
27.4M
    hts_pos_t rlen = 0, qlen = 0;
544
27.4M
    if (!(flag & BAM_FUNMAP)) {
545
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
546
0
    }
547
27.4M
    if (rlen == 0) {
548
27.4M
        rlen = 1;
549
27.4M
    }
550
551
    // validate parameters
552
27.4M
    if (l_qname > 254) {
553
93
        hts_log_error("Query name too long");
554
93
        errno = EINVAL;
555
93
        return -1;
556
93
    }
557
27.4M
    if (HTS_POS_MAX - rlen <= pos) {
558
0
        hts_log_error("Read ends beyond highest supported position");
559
0
        errno = EINVAL;
560
0
        return -1;
561
0
    }
562
27.4M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
563
0
        hts_log_error("Mapped query must have a CIGAR");
564
0
        errno = EINVAL;
565
0
        return -1;
566
0
    }
567
27.4M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
568
0
        hts_log_error("CIGAR and query sequence are of different length");
569
0
        errno = EINVAL;
570
0
        return -1;
571
0
    }
572
573
27.4M
    size_t limit = INT32_MAX;
574
27.4M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
575
27.4M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
576
27.4M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
577
27.4M
    u    += subtract_check_underflow(l_seq, &limit);
578
27.4M
    u    += subtract_check_underflow(l_aux, &limit);
579
27.4M
    if (u != 0) {
580
0
        hts_log_error("Size overflow");
581
0
        errno = EINVAL;
582
0
        return -1;
583
0
    }
584
585
    // re-allocate the data buffer as needed.
586
27.4M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
587
27.4M
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
588
0
        return -1;
589
0
    }
590
591
27.4M
    bam->l_data = (int)data_len;
592
27.4M
    bam->core.pos = pos;
593
27.4M
    bam->core.tid = tid;
594
27.4M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
595
27.4M
    bam->core.qual = mapq;
596
27.4M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
597
27.4M
    bam->core.flag = flag;
598
27.4M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
599
27.4M
    bam->core.n_cigar = (uint32_t)n_cigar;
600
27.4M
    bam->core.l_qseq = (int32_t)l_seq;
601
27.4M
    bam->core.mtid = mtid;
602
27.4M
    bam->core.mpos = mpos;
603
27.4M
    bam->core.isize = isize;
604
605
27.4M
    uint8_t *cp = bam->data;
606
27.4M
    strncpy((char *)cp, qname, l_qname);
607
27.4M
    int i;
608
108M
    for (i = 0; i < qname_nuls; i++) {
609
80.8M
        cp[l_qname + i] = '\0';
610
80.8M
    }
611
27.4M
    cp += l_qname + qname_nuls;
612
613
27.4M
    if (n_cigar > 0) {
614
0
        memcpy(cp, cigar, n_cigar * 4);
615
0
    }
616
27.4M
    cp += n_cigar * 4;
617
618
1.48G
#define NN 16
619
27.4M
    const uint8_t *useq = (uint8_t *)seq;
620
148M
    for (i = 0; i + NN < l_seq; i += NN) {
621
121M
        int j;
622
121M
        const uint8_t *u2 = useq+i;
623
1.09G
        for (j = 0; j < NN/2; j++)
624
970M
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
625
121M
        cp += NN/2;
626
121M
    }
627
34.3M
    for (; i + 1 < l_seq; i += 2) {
628
6.89M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
629
6.89M
    }
630
631
28.4M
    for (; i < l_seq; i++) {
632
991k
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
633
991k
    }
634
635
27.4M
    if (qual) {
636
1.21k
        memcpy(cp, qual, l_seq);
637
1.21k
    }
638
27.4M
    else {
639
27.4M
        memset(cp, '\xff', l_seq);
640
27.4M
    }
641
642
27.4M
    return (int)data_len;
643
27.4M
}
644
645
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
646
8.84M
{
647
8.84M
    int k;
648
8.84M
    hts_pos_t l;
649
9.36M
    for (k = l = 0; k < n_cigar; ++k)
650
519k
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
651
415k
            l += bam_cigar_oplen(cigar[k]);
652
8.84M
    return l;
653
8.84M
}
654
655
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
656
157k
{
657
157k
    int k;
658
157k
    hts_pos_t l;
659
356k
    for (k = l = 0; k < n_cigar; ++k)
660
198k
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
661
36.2k
            l += bam_cigar_oplen(cigar[k]);
662
157k
    return l;
663
157k
}
664
665
hts_pos_t bam_endpos(const bam1_t *b)
666
1.09k
{
667
1.09k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
668
1.09k
    if (rlen == 0) rlen = 1;
669
1.09k
    return b->core.pos + rlen;
670
1.09k
}
671
672
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
673
312k
{
674
312k
    bam1_core_t *c = &b->core;
675
312k
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
676
312k
    uint8_t *CG;
677
678
    // test where there is a real CIGAR in the CG tag to move
679
312k
    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
680
156k
    cigar0 = bam_get_cigar(b);
681
156k
    if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
682
149k
    fake_bytes = c->n_cigar * 4;
683
149k
    int saved_errno = errno;
684
149k
    CG = bam_aux_get(b, "CG");
685
149k
    if (!CG) {
686
145k
        if (errno != ENOENT) return -1;  // Bad aux data
687
145k
        errno = saved_errno; // restore errno on expected no-CG-tag case
688
145k
        return 0;
689
145k
    }
690
4.10k
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
691
3.01k
        return 0; // not of type B,I
692
1.09k
    CG_len = le_to_u32(CG + 2);
693
1.09k
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
694
695
    // move from the CG tag to the right position
696
1.09k
    cigar_st = (uint8_t*)cigar0 - b->data;
697
1.09k
    c->n_cigar = CG_len;
698
1.09k
    n_cigar4 = c->n_cigar * 4;
699
1.09k
    CG_st = CG - b->data - 2;
700
1.09k
    CG_en = CG_st + 8 + n_cigar4;
701
1.09k
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
702
1.09k
    b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
703
1.09k
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
704
1.09k
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
705
1.09k
    if (ori_len > CG_en) // move data after the CG tag
706
2
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
707
1.09k
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
708
1.09k
    if (recal_bin)
709
1.09k
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
710
1.09k
    if (give_warning)
711
1.09k
        hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
712
1.09k
    return 1;
713
1.09k
}
714
715
static inline int aux_type2size(uint8_t type)
716
1.72M
{
717
1.72M
    switch (type) {
718
1.22M
    case 'A': case 'c': case 'C':
719
1.22M
        return 1;
720
146k
    case 's': case 'S':
721
146k
        return 2;
722
232k
    case 'i': case 'I': case 'f':
723
232k
        return 4;
724
30.6k
    case 'd':
725
30.6k
        return 8;
726
93.3k
    case 'Z': case 'H': case 'B':
727
93.3k
        return type;
728
166
    default:
729
166
        return 0;
730
1.72M
    }
731
1.72M
}
732
733
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
734
0
{
735
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
736
0
    uint32_t i;
737
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
738
0
}
739
740
// Fix bad records where qname is not terminated correctly.
741
1.89k
static int fixup_missing_qname_nul(bam1_t *b) {
742
1.89k
    bam1_core_t *c = &b->core;
743
744
    // Note this is called before c->l_extranul is added to c->l_qname
745
1.89k
    if (c->l_extranul > 0) {
746
1.77k
        b->data[c->l_qname++] = '\0';
747
1.77k
        c->l_extranul--;
748
1.77k
    } else {
749
114
        if (b->l_data > INT_MAX - 4) return -1;
750
114
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
751
114
        b->l_data += 4;
752
114
        b->data[c->l_qname++] = '\0';
753
114
        c->l_extranul = 3;
754
114
    }
755
1.89k
    return 0;
756
1.89k
}
757
758
/*
759
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
760
 * in multi-threaded handling.  This may be worth considering for htslib2.
761
 */
762
int bam_read1(BGZF *fp, bam1_t *b)
763
3.67k
{
764
3.67k
    bam1_core_t *c = &b->core;
765
3.67k
    int32_t block_len, ret, i;
766
3.67k
    uint32_t x[8], new_l_data;
767
768
3.67k
    b->l_data = 0;
769
770
3.67k
    if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
771
69
        if (ret == 0) return -1; // normal end-of-file
772
40
        else return -2; // truncated
773
69
    }
774
3.60k
    if (fp->is_be)
775
0
        ed_swap_4p(&block_len);
776
3.60k
    if (block_len < 32) return -4;  // block_len includes core data
777
3.53k
    if (bgzf_read(fp, x, 32) != 32) return -3;
778
3.47k
    if (fp->is_be) {
779
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
780
0
    }
781
3.47k
    c->tid = x[0]; c->pos = (int32_t)x[1];
782
3.47k
    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
783
3.47k
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
784
3.47k
    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
785
3.47k
    c->l_qseq = x[4];
786
3.47k
    c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
787
788
3.47k
    new_l_data = block_len - 32 + c->l_extranul;
789
3.47k
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
790
3.43k
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
791
3.43k
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
792
54
        return -4;
793
3.37k
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
794
3.36k
    b->l_data = new_l_data;
795
796
3.36k
    if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
797
3.36k
    if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
798
1.89k
        if (fixup_missing_qname_nul(b) < 0) return -4;
799
1.89k
    }
800
5.94k
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
801
3.36k
    c->l_qname += c->l_extranul;
802
3.36k
    if (b->l_data < c->l_qname ||
803
3.36k
        bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
804
35
        return -4;
805
3.32k
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
806
3.32k
    if (bam_tag2cigar(b, 0, 0) < 0)
807
0
        return -4;
808
809
3.32k
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
810
3.28k
        hts_pos_t rlen, qlen;
811
3.28k
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
812
3.28k
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
813
3.28k
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
814
        // Sanity check for broken CIGAR alignments
815
3.28k
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
816
20
            hts_log_error("CIGAR and query sequence lengths differ for %s",
817
20
                    bam_get_qname(b));
818
20
            return -4;
819
20
        }
820
3.28k
    }
821
822
3.30k
    return 4 + block_len;
823
3.32k
}
824
825
int bam_write1(BGZF *fp, const bam1_t *b)
826
9.24M
{
827
9.24M
    const bam1_core_t *c = &b->core;
828
9.24M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
829
9.24M
    int i, ok;
830
9.24M
    if (c->l_qname - c->l_extranul > 255) {
831
1
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
832
1
        errno = EOVERFLOW;
833
1
        return -1;
834
1
    }
835
9.24M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
836
9.24M
    if (c->pos > INT_MAX ||
837
9.24M
        c->mpos > INT_MAX ||
838
9.24M
        c->isize < INT_MIN || c->isize > INT_MAX) {
839
256
        hts_log_error("Positional data is too large for BAM format");
840
256
        return -1;
841
256
    }
842
9.24M
    x[0] = c->tid;
843
9.24M
    x[1] = c->pos;
844
9.24M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
845
9.24M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
846
9.24M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
847
9.24M
    x[4] = c->l_qseq;
848
9.24M
    x[5] = c->mtid;
849
9.24M
    x[6] = c->mpos;
850
9.24M
    x[7] = c->isize;
851
9.24M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
852
9.24M
    if (fp->is_be) {
853
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
854
0
        y = block_len;
855
0
        if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
856
0
        swap_data(c, b->l_data, b->data, 1);
857
9.24M
    } else {
858
9.24M
        if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
859
9.24M
    }
860
9.24M
    if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
861
9.24M
    if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
862
9.24M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
863
9.24M
        if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
864
9.24M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
865
0
        uint8_t buf[8];
866
0
        uint32_t cigar_st, cigar_en, cigar[2];
867
0
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
868
0
        if (cigreflen >= (1<<28)) {
869
            // Length of reference covered is greater than the biggest
870
            // CIGAR operation currently allowed.
871
0
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
872
0
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
873
0
                          bam_get_qname(b), c->n_cigar, cigreflen);
874
0
            return -1;
875
0
        }
876
0
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
877
0
        cigar_en = cigar_st + c->n_cigar * 4;
878
0
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
879
0
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
880
0
        u32_to_le(cigar[0], buf);
881
0
        u32_to_le(cigar[1], buf + 4);
882
0
        if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
883
0
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
884
0
        if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
885
0
        u32_to_le(c->n_cigar, buf);
886
0
        if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
887
0
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
888
0
    }
889
9.24M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
890
9.24M
    return ok? 4 + block_len : -1;
891
9.24M
}
892
893
/*
894
 * Write a BAM file and append to the in-memory index simultaneously.
895
 */
896
9.24M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
897
9.24M
    BGZF *bfp = fp->fp.bgzf;
898
899
9.24M
    if (!fp->idx)
900
9.24M
        return bam_write1(bfp, b);
901
902
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
903
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
904
0
        return -1;
905
0
    if (!bfp->mt)
906
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
907
908
0
    int ret = bam_write1(bfp, b);
909
0
    if (ret < 0)
910
0
        return -1;
911
912
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
913
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
914
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
915
0
        ret = -1;
916
0
    }
917
918
0
    return ret;
919
0
}
920
921
/*
922
 * Set the qname in a BAM record
923
 */
924
int bam_set_qname(bam1_t *rec, const char *qname)
925
0
{
926
0
    if (!rec) return -1;
927
0
    if (!qname || !*qname) return -1;
928
929
0
    size_t old_len = rec->core.l_qname;
930
0
    size_t new_len = strlen(qname) + 1;
931
0
    if (new_len < 1 || new_len > 255) return -1;
932
933
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
934
935
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
936
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
937
938
    // Make room
939
0
    if (new_len + extranul != rec->core.l_qname)
940
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
941
    // Copy in new name and pad if needed
942
0
    memcpy(rec->data, qname, new_len);
943
0
    int n;
944
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
945
946
0
    rec->l_data = new_data_len;
947
0
    rec->core.l_qname = new_len + extranul;
948
0
    rec->core.l_extranul = extranul;
949
950
0
    return 0;
951
0
}
952
953
/********************
954
 *** BAM indexing ***
955
 ********************/
956
957
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
958
0
{
959
0
    int n_lvls, i, fmt, ret;
960
0
    bam1_t *b;
961
0
    hts_idx_t *idx;
962
0
    sam_hdr_t *h;
963
0
    h = sam_hdr_read(fp);
964
0
    if (h == NULL) return NULL;
965
0
    if (min_shift > 0) {
966
0
        hts_pos_t max_len = 0, s;
967
0
        for (i = 0; i < h->n_targets; ++i) {
968
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
969
0
            if (max_len < len) max_len = len;
970
0
        }
971
0
        max_len += 256;
972
0
        for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
973
0
        fmt = HTS_FMT_CSI;
974
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
975
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
976
0
    b = bam_init1();
977
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
978
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
979
0
        if (ret < 0) { // unsorted or doesn't fit
980
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
981
0
            goto err;
982
0
        }
983
0
    }
984
0
    if (ret < -1) goto err; // corrupted BAM file
985
986
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
987
0
    sam_hdr_destroy(h);
988
0
    bam_destroy1(b);
989
0
    return idx;
990
991
0
err:
992
0
    bam_destroy1(b);
993
0
    hts_idx_destroy(idx);
994
0
    return NULL;
995
0
}
996
997
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
998
0
{
999
0
    hts_idx_t *idx;
1000
0
    htsFile *fp;
1001
0
    int ret = 0;
1002
1003
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1004
0
    if (nthreads)
1005
0
        hts_set_threads(fp, nthreads);
1006
1007
0
    switch (fp->format.format) {
1008
0
    case cram:
1009
1010
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1011
0
        break;
1012
1013
0
    case bam:
1014
0
    case sam:
1015
0
        if (fp->format.compression != bgzf) {
1016
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1017
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1018
0
            ret = -1;
1019
0
            break;
1020
0
        }
1021
0
        idx = sam_index(fp, min_shift);
1022
0
        if (idx) {
1023
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1024
0
            if (ret < 0) ret = -4;
1025
0
            hts_idx_destroy(idx);
1026
0
        }
1027
0
        else ret = -1;
1028
0
        break;
1029
1030
0
    default:
1031
0
        ret = -3;
1032
0
        break;
1033
0
    }
1034
0
    hts_close(fp);
1035
1036
0
    return ret;
1037
0
}
1038
1039
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1040
0
{
1041
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1042
0
}
1043
1044
int sam_index_build(const char *fn, int min_shift)
1045
0
{
1046
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1047
0
}
1048
1049
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1050
#undef bam_index_build
1051
int bam_index_build(const char *fn, int min_shift)
1052
0
{
1053
0
    return sam_index_build2(fn, NULL, min_shift);
1054
0
}
1055
1056
// Initialise fp->idx for the current format type.
1057
// This must be called after the header has been written but no other data.
1058
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1059
0
    fp->fnidx = fnidx;
1060
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1061
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1062
0
        int n_lvls, fmt = HTS_FMT_CSI;
1063
0
        if (min_shift > 0) {
1064
0
            int64_t max_len = 0, s;
1065
0
            int i;
1066
0
            for (i = 0; i < h->n_targets; ++i)
1067
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1068
0
            max_len += 256;
1069
0
            for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1070
1071
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1072
1073
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1074
0
        return fp->idx ? 0 : -1;
1075
0
    }
1076
1077
0
    if (fp->format.format == cram) {
1078
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1079
0
        return fp->fp.cram->idxfp ? 0 : -1;
1080
0
    }
1081
1082
0
    return -1;
1083
0
}
1084
1085
// Finishes an index. Call after the last record has been written.
1086
// Returns 0 on success, <0 on failure.
1087
0
int sam_idx_save(htsFile *fp) {
1088
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1089
0
        fp->format.format == vcf || fp->format.format == sam) {
1090
0
        int ret;
1091
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1092
0
            errno = -ret;
1093
0
            return -1;
1094
0
        }
1095
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1096
0
            return -1;
1097
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1098
1099
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1100
0
            return -1;
1101
1102
0
        return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
1103
1104
0
    } else if (fp->format.format == cram) {
1105
        // flushed and closed by cram_close
1106
0
    }
1107
1108
0
    return 0;
1109
0
}
1110
1111
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1112
0
{
1113
0
    htsFile *fp = (htsFile *)fpv;
1114
0
    bam1_t *b = bv;
1115
0
    fp->line.l = 0;
1116
0
    int ret = sam_read1(fp, fp->bam_header, b);
1117
0
    if (ret >= 0) {
1118
0
        *tid = b->core.tid;
1119
0
        *beg = b->core.pos;
1120
0
        *end = bam_endpos(b);
1121
0
    }
1122
0
    return ret;
1123
0
}
1124
1125
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1126
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1127
0
{
1128
0
    htsFile *fp = (htsFile *)fpv;
1129
0
    bam1_t *b = bv;
1130
0
    fp->line.l = 0;
1131
0
    int ret = sam_read1(fp, fp->bam_header, b);
1132
0
    return ret;
1133
0
}
1134
1135
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1136
// samtools/bam.c.
1137
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1138
0
{
1139
0
    const char *rg;
1140
0
    kstring_t lib = { 0, 0, NULL };
1141
0
    rg = (char *)bam_aux_get(b, "RG");
1142
1143
0
    if (!rg)
1144
0
        return NULL;
1145
0
    else
1146
0
        rg++;
1147
1148
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1149
0
        return NULL;
1150
1151
0
    static char LB_text[1024];
1152
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1153
1154
0
    memcpy(LB_text, lib.s, len);
1155
0
    LB_text[len] = 0;
1156
1157
0
    free(lib.s);
1158
1159
0
    return LB_text;
1160
0
}
1161
1162
1163
// Bam record pointer and SAM header combined
1164
typedef struct {
1165
    const sam_hdr_t *h;
1166
    const bam1_t *b;
1167
} hb_pair;
1168
1169
// Looks up variable names in str and replaces them with their value.
1170
// Also supports aux tags.
1171
//
1172
// Note the expression parser deliberately overallocates str size so it
1173
// is safe to use memcmp over strcmp.
1174
static int bam_sym_lookup(void *data, char *str, char **end,
1175
0
                          hts_expr_val_t *res) {
1176
0
    hb_pair *hb = (hb_pair *)data;
1177
0
    const bam1_t *b = hb->b;
1178
1179
0
    res->is_str = 0;
1180
0
    switch(*str) {
1181
0
    case 'c':
1182
0
        if (memcmp(str, "cigar", 5) == 0) {
1183
0
            *end = str+5;
1184
0
            res->is_str = 1;
1185
0
            ks_clear(&res->s);
1186
0
            uint32_t *cigar = bam_get_cigar(b);
1187
0
            int i, n = b->core.n_cigar, r = 0;
1188
0
            if (n) {
1189
0
                for (i = 0; i < n; i++) {
1190
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1191
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1192
0
                }
1193
0
                r |= kputs("", &res->s) < 0;
1194
0
            } else {
1195
0
                r |= kputs("*", &res->s) < 0;
1196
0
            }
1197
0
            return r ? -1 : 0;
1198
0
        }
1199
0
        break;
1200
1201
0
    case 'e':
1202
0
        if (memcmp(str, "endpos", 6) == 0) {
1203
0
            *end = str+6;
1204
0
            res->d = bam_endpos(b);
1205
0
            return 0;
1206
0
        }
1207
0
        break;
1208
1209
0
    case 'f':
1210
0
        if (memcmp(str, "flag", 4) == 0) {
1211
0
            str = *end = str+4;
1212
0
            if (*str != '.') {
1213
0
                res->d = b->core.flag;
1214
0
                return 0;
1215
0
            } else {
1216
0
                str++;
1217
0
                if (!memcmp(str, "paired", 6)) {
1218
0
                    *end = str+6;
1219
0
                    res->d = b->core.flag & BAM_FPAIRED;
1220
0
                    return 0;
1221
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1222
0
                    *end = str+11;
1223
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1224
0
                    return 0;
1225
0
                } else if (!memcmp(str, "unmap", 5)) {
1226
0
                    *end = str+5;
1227
0
                    res->d = b->core.flag & BAM_FUNMAP;
1228
0
                    return 0;
1229
0
                } else if (!memcmp(str, "munmap", 6)) {
1230
0
                    *end = str+6;
1231
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1232
0
                    return 0;
1233
0
                } else if (!memcmp(str, "reverse", 7)) {
1234
0
                    *end = str+7;
1235
0
                    res->d = b->core.flag & BAM_FREVERSE;
1236
0
                    return 0;
1237
0
                } else if (!memcmp(str, "mreverse", 8)) {
1238
0
                    *end = str+8;
1239
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1240
0
                    return 0;
1241
0
                } else if (!memcmp(str, "read1", 5)) {
1242
0
                    *end = str+5;
1243
0
                    res->d = b->core.flag & BAM_FREAD1;
1244
0
                    return 0;
1245
0
                } else if (!memcmp(str, "read2", 5)) {
1246
0
                    *end = str+5;
1247
0
                    res->d = b->core.flag & BAM_FREAD2;
1248
0
                    return 0;
1249
0
                } else if (!memcmp(str, "secondary", 9)) {
1250
0
                    *end = str+9;
1251
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1252
0
                    return 0;
1253
0
                } else if (!memcmp(str, "qcfail", 6)) {
1254
0
                    *end = str+6;
1255
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1256
0
                    return 0;
1257
0
                } else if (!memcmp(str, "dup", 3)) {
1258
0
                    *end = str+3;
1259
0
                    res->d = b->core.flag & BAM_FDUP;
1260
0
                    return 0;
1261
0
                } else if (!memcmp(str, "supplementary", 13)) {
1262
0
                    *end = str+13;
1263
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1264
0
                    return 0;
1265
0
                } else {
1266
0
                    hts_log_error("Unrecognised flag string");
1267
0
                    return -1;
1268
0
                }
1269
0
            }
1270
0
        }
1271
0
        break;
1272
1273
0
    case 'h':
1274
0
        if (memcmp(str, "hclen", 5) == 0) {
1275
0
            int hclen = 0;
1276
0
            uint32_t *cigar = bam_get_cigar(b);
1277
0
            uint32_t ncigar = b->core.n_cigar;
1278
1279
            // left
1280
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1281
0
                hclen = bam_cigar_oplen(cigar[0]);
1282
1283
            // right
1284
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1285
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1286
1287
0
            *end = str+5;
1288
0
            res->d = hclen;
1289
0
            return 0;
1290
0
        }
1291
0
        break;
1292
1293
0
    case 'l':
1294
0
        if (memcmp(str, "library", 7) == 0) {
1295
0
            *end = str+7;
1296
0
            res->is_str = 1;
1297
0
            const char *lib = bam_get_library(hb->h, b);
1298
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1299
0
            return 0;
1300
0
        }
1301
0
        break;
1302
1303
0
    case 'm':
1304
0
        if (memcmp(str, "mapq", 4) == 0) {
1305
0
            *end = str+4;
1306
0
            res->d = b->core.qual;
1307
0
            return 0;
1308
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1309
0
            *end = str+4;
1310
0
            res->d = b->core.mpos+1;
1311
0
            return 0;
1312
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1313
0
            *end = str+6;
1314
0
            res->is_str = 1;
1315
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1316
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1317
0
            return 0;
1318
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1319
0
            *end = str+6;
1320
0
            res->d = b->core.mtid;
1321
0
            return 0;
1322
0
        }
1323
0
        break;
1324
1325
0
    case 'n':
1326
0
        if (memcmp(str, "ncigar", 6) == 0) {
1327
0
            *end = str+6;
1328
0
            res->d = b->core.n_cigar;
1329
0
            return 0;
1330
0
        }
1331
0
        break;
1332
1333
0
    case 'p':
1334
0
        if (memcmp(str, "pos", 3) == 0) {
1335
0
            *end = str+3;
1336
0
            res->d = b->core.pos+1;
1337
0
            return 0;
1338
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1339
0
            *end = str+5;
1340
0
            res->d = b->core.mpos+1;
1341
0
            return 0;
1342
0
        }
1343
0
        break;
1344
1345
0
    case 'q':
1346
0
        if (memcmp(str, "qlen", 4) == 0) {
1347
0
            *end = str+4;
1348
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1349
0
            return 0;
1350
0
        } else if (memcmp(str, "qname", 5) == 0) {
1351
0
            *end = str+5;
1352
0
            res->is_str = 1;
1353
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1354
0
            return 0;
1355
0
        } else if (memcmp(str, "qual", 4) == 0) {
1356
0
            *end = str+4;
1357
0
            ks_clear(&res->s);
1358
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1359
0
                return -1;
1360
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1361
0
            res->s.l = b->core.l_qseq;
1362
0
            res->is_str = 1;
1363
0
            return 0;
1364
0
        }
1365
0
        break;
1366
1367
0
    case 'r':
1368
0
        if (memcmp(str, "rlen", 4) == 0) {
1369
0
            *end = str+4;
1370
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1371
0
            return 0;
1372
0
        } else if (memcmp(str, "rname", 5) == 0) {
1373
0
            *end = str+5;
1374
0
            res->is_str = 1;
1375
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1376
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1377
0
            return 0;
1378
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1379
0
            *end = str+5;
1380
0
            res->is_str = 1;
1381
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1382
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1383
0
            return 0;
1384
0
        } else if (memcmp(str, "refid", 5) == 0) {
1385
0
            *end = str+5;
1386
0
            res->d = b->core.tid;
1387
0
            return 0;
1388
0
        }
1389
0
        break;
1390
1391
0
    case 's':
1392
0
        if (memcmp(str, "seq", 3) == 0) {
1393
0
            *end = str+3;
1394
0
            ks_clear(&res->s);
1395
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1396
0
                return -1;
1397
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1398
0
            res->s.s[b->core.l_qseq] = 0;
1399
0
            res->s.l = b->core.l_qseq;
1400
0
            res->is_str = 1;
1401
0
            return 0;
1402
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1403
0
            int sclen = 0;
1404
0
            uint32_t *cigar = bam_get_cigar(b);
1405
0
            int ncigar = b->core.n_cigar;
1406
0
            int left = 0;
1407
1408
            // left
1409
0
            if (ncigar > 0
1410
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1411
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1412
0
            else if (ncigar > 1
1413
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1414
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1415
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1416
1417
            // right
1418
0
            if (ncigar-1 > left
1419
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1420
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1421
0
            else if (ncigar-2 > left
1422
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1423
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1424
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1425
1426
0
            *end = str+5;
1427
0
            res->d = sclen;
1428
0
            return 0;
1429
0
        }
1430
0
        break;
1431
1432
0
    case 't':
1433
0
        if (memcmp(str, "tlen", 4) == 0) {
1434
0
            *end = str+4;
1435
0
            res->d = b->core.isize;
1436
0
            return 0;
1437
0
        }
1438
0
        break;
1439
1440
0
    case '[':
1441
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1442
            /* aux tags */
1443
0
            *end = str+4;
1444
1445
0
            uint8_t *aux = bam_aux_get(b, str+1);
1446
0
            if (aux) {
1447
                // we define the truth of a tag to be its presence, even if 0.
1448
0
                res->is_true = 1;
1449
0
                switch (*aux) {
1450
0
                case 'Z':
1451
0
                case 'H':
1452
0
                    res->is_str = 1;
1453
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1454
0
                    break;
1455
1456
0
                case 'A':
1457
0
                    res->is_str = 1;
1458
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1459
0
                    break;
1460
1461
0
                case 'i': case 'I':
1462
0
                case 's': case 'S':
1463
0
                case 'c': case 'C':
1464
0
                    res->is_str = 0;
1465
0
                    res->d = bam_aux2i(aux);
1466
0
                    break;
1467
1468
0
                case 'f':
1469
0
                case 'd':
1470
0
                    res->is_str = 0;
1471
0
                    res->d = bam_aux2f(aux);
1472
0
                    break;
1473
1474
0
                default:
1475
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1476
0
                                  *aux);
1477
0
                    return -1;
1478
0
                }
1479
0
                return 0;
1480
1481
0
            } else {
1482
                // hence absent tags are always false (and strings)
1483
0
                res->is_str = 1;
1484
0
                res->s.l = 0;
1485
0
                res->d = 0;
1486
0
                res->is_true = 0;
1487
0
                return 0;
1488
0
            }
1489
0
        }
1490
0
        break;
1491
0
    }
1492
1493
    // All successful matches in switch should return 0.
1494
    // So if we didn't match, it's a parse error.
1495
0
    return -1;
1496
0
}
1497
1498
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1499
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1500
0
{
1501
0
    hb_pair hb = {h, b};
1502
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1503
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1504
0
        hts_log_error("Couldn't process filter expression");
1505
0
        hts_expr_val_free(&res);
1506
0
        return -1;
1507
0
    }
1508
1509
0
    int t = res.is_true;
1510
0
    hts_expr_val_free(&res);
1511
1512
0
    return t;
1513
0
}
1514
1515
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1516
0
{
1517
0
    htsFile *fp = fpv;
1518
0
    bam1_t *b = bv;
1519
0
    int pass_filter, ret;
1520
1521
0
    do {
1522
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1523
0
        if (ret < 0)
1524
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1525
1526
0
        if (bam_tag2cigar(b, 1, 1) < 0)
1527
0
            return -2;
1528
1529
0
        *tid = b->core.tid;
1530
0
        *beg = b->core.pos;
1531
0
        *end = bam_endpos(b);
1532
1533
0
        if (fp->filter) {
1534
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1535
0
            if (pass_filter < 0)
1536
0
                return -2;
1537
0
        } else {
1538
0
            pass_filter = 1;
1539
0
        }
1540
0
    } while (pass_filter == 0);
1541
1542
0
    return ret;
1543
0
}
1544
1545
static int cram_pseek(void *fp, int64_t offset, int whence)
1546
0
{
1547
0
    cram_fd *fd =  (cram_fd *)fp;
1548
1549
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1550
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1551
0
        return -1;
1552
1553
0
    fd->curr_position = offset;
1554
1555
0
    if (fd->ctr) {
1556
0
        cram_free_container(fd->ctr);
1557
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1558
0
            cram_free_container(fd->ctr_mt);
1559
1560
0
        fd->ctr = NULL;
1561
0
        fd->ctr_mt = NULL;
1562
0
        fd->ooc = 0;
1563
0
    }
1564
1565
0
    return 0;
1566
0
}
1567
1568
/*
1569
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1570
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1571
 *   container previously fetched. It was designed like this to integrate with the functionality
1572
 *   of the iterator stepping logic.
1573
 */
1574
1575
static int64_t cram_ptell(void *fp)
1576
0
{
1577
0
    cram_fd *fd = (cram_fd *)fp;
1578
0
    cram_container *c;
1579
0
    cram_slice *s;
1580
0
    int64_t ret = -1L;
1581
1582
0
    if (fd) {
1583
0
        if ((c = fd->ctr) != NULL) {
1584
0
            if ((s = c->slice) != NULL && s->max_rec) {
1585
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1586
0
                    fd->curr_position += c->offset + c->length;
1587
0
            }
1588
0
        }
1589
0
        ret = fd->curr_position;
1590
0
    }
1591
1592
0
    return ret;
1593
0
}
1594
1595
static int bam_pseek(void *fp, int64_t offset, int whence)
1596
0
{
1597
0
    BGZF *fd = (BGZF *)fp;
1598
1599
0
    return bgzf_seek(fd, offset, whence);
1600
0
}
1601
1602
static int64_t bam_ptell(void *fp)
1603
0
{
1604
0
    BGZF *fd = (BGZF *)fp;
1605
0
    if (!fd)
1606
0
        return -1L;
1607
1608
0
    return bgzf_tell(fd);
1609
0
}
1610
1611
1612
1613
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1614
0
{
1615
0
    switch (fp->format.format) {
1616
0
    case bam:
1617
0
    case sam:
1618
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1619
1620
0
    case cram: {
1621
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1622
1623
        // Cons up a fake "index" just pointing at the associated cram_fd:
1624
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1625
0
        if (idx == NULL) return NULL;
1626
0
        idx->fmt = HTS_FMT_CRAI;
1627
0
        idx->cram = fp->fp.cram;
1628
0
        return (hts_idx_t *) idx;
1629
0
        }
1630
1631
0
    default:
1632
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1633
0
    }
1634
0
}
1635
1636
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1637
0
{
1638
0
    return index_load(fp, fn, fnidx, flags);
1639
0
}
1640
1641
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1642
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1643
0
}
1644
1645
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1646
0
{
1647
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1648
0
}
1649
1650
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1651
0
{
1652
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1653
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1654
0
    if (iter == NULL) return NULL;
1655
1656
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1657
    // the readrec function:
1658
0
    iter->is_cram = 1;
1659
0
    iter->read_rest = 1;
1660
0
    iter->off = NULL;
1661
0
    iter->bins.a = NULL;
1662
0
    iter->readrec = readrec;
1663
1664
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1665
0
        cram_range r = { tid, beg+1, end };
1666
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1667
1668
0
        iter->curr_off = 0;
1669
        // The following fields are not required by hts_itr_next(), but are
1670
        // filled in in case user code wants to look at them.
1671
0
        iter->tid = tid;
1672
0
        iter->beg = beg;
1673
0
        iter->end = end;
1674
1675
0
        switch (ret) {
1676
0
        case 0:
1677
0
            break;
1678
1679
0
        case -2:
1680
            // No data vs this ref, so mark iterator as completed.
1681
            // Same as HTS_IDX_NONE.
1682
0
            iter->finished = 1;
1683
0
            break;
1684
1685
0
        default:
1686
0
            free(iter);
1687
0
            return NULL;
1688
0
        }
1689
0
    }
1690
0
    else switch (tid) {
1691
0
    case HTS_IDX_REST:
1692
0
        iter->curr_off = 0;
1693
0
        break;
1694
0
    case HTS_IDX_NONE:
1695
0
        iter->curr_off = 0;
1696
0
        iter->finished = 1;
1697
0
        break;
1698
0
    default:
1699
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1700
0
        abort();
1701
0
        break;
1702
0
    }
1703
1704
0
    return iter;
1705
0
}
1706
1707
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1708
0
{
1709
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1710
0
    if (idx == NULL)
1711
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1712
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1713
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1714
0
    else
1715
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1716
0
}
1717
1718
static int cram_name2id(void *fdv, const char *ref)
1719
0
{
1720
0
    cram_fd *fd = (cram_fd *) fdv;
1721
0
    return sam_hdr_name2tid(fd->header, ref);
1722
0
}
1723
1724
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1725
0
{
1726
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1727
0
    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1728
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1729
0
                          sam_readrec);
1730
0
}
1731
1732
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1733
0
{
1734
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1735
0
    hts_reglist_t *r_list = NULL;
1736
0
    int r_count = 0;
1737
1738
0
    if (!cidx || !hdr)
1739
0
        return NULL;
1740
1741
0
    hts_itr_t *itr = NULL;
1742
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1743
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1744
0
        if (!r_list)
1745
0
            return NULL;
1746
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1747
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1748
0
    } else {
1749
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1750
0
        if (!r_list)
1751
0
            return NULL;
1752
0
        itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1753
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1754
0
    }
1755
1756
0
    if (!itr)
1757
0
        hts_reglist_free(r_list, r_count);
1758
1759
0
    return itr;
1760
0
}
1761
1762
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1763
0
{
1764
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1765
1766
0
    if(!cidx || !hdr || !reglist)
1767
0
        return NULL;
1768
1769
0
    if (cidx->fmt == HTS_FMT_CRAI)
1770
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1771
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1772
0
    else
1773
0
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1774
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1775
0
}
1776
1777
/**********************
1778
 *** SAM header I/O ***
1779
 **********************/
1780
1781
#include "htslib/kseq.h"
1782
#include "htslib/kstring.h"
1783
1784
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1785
0
{
1786
0
    sam_hdr_t *bh = sam_hdr_init();
1787
0
    if (!bh) return NULL;
1788
1789
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1790
0
        sam_hdr_destroy(bh);
1791
0
        return NULL;
1792
0
    }
1793
1794
0
    return bh;
1795
0
}
1796
1797
1.42M
static int valid_sam_header_type(const char *s) {
1798
1.42M
    if (s[0] != '@') return 0;
1799
1.42M
    switch (s[1]) {
1800
1.68k
    case 'H':
1801
1.68k
        return s[2] == 'D' && s[3] == '\t';
1802
21
    case 'S':
1803
21
        return s[2] == 'Q' && s[3] == '\t';
1804
1.40M
    case 'R':
1805
1.41M
    case 'P':
1806
1.41M
        return s[2] == 'G' && s[3] == '\t';
1807
7.18k
    case 'C':
1808
7.18k
        return s[2] == 'O';
1809
1.42M
    }
1810
51
    return 0;
1811
1.42M
}
1812
1813
// Minimal sanitisation of a header to ensure.
1814
// - null terminated string.
1815
// - all lines start with @ (also implies no blank lines).
1816
//
1817
// Much more could be done, but currently is not, including:
1818
// - checking header types are known (HD, SQ, etc).
1819
// - syntax (eg checking tab separated fields).
1820
// - validating n_targets matches @SQ records.
1821
// - validating target lengths against @SQ records.
1822
11.5k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1823
11.5k
    if (!h)
1824
108
        return NULL;
1825
1826
    // Special case for empty headers.
1827
11.4k
    if (h->l_text == 0)
1828
4.65k
        return h;
1829
1830
6.75k
    size_t i;
1831
6.75k
    unsigned int lnum = 0;
1832
6.75k
    char *cp = h->text, last = '\n';
1833
672M
    for (i = 0; i < h->l_text; i++) {
1834
        // NB: l_text excludes terminating nul.  This finds early ones.
1835
672M
        if (cp[i] == 0)
1836
3.13k
            break;
1837
1838
        // Error on \n[^@], including duplicate newlines
1839
672M
        if (last == '\n') {
1840
991k
            lnum++;
1841
991k
            if (cp[i] != '@') {
1842
0
                hts_log_error("Malformed SAM header at line %u", lnum);
1843
0
                sam_hdr_destroy(h);
1844
0
                return NULL;
1845
0
            }
1846
991k
        }
1847
1848
672M
        last = cp[i];
1849
672M
    }
1850
1851
6.75k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1852
3.13k
        size_t j = i;
1853
17.7k
        while (j < h->l_text && cp[j] == '\0') j++;
1854
3.13k
        if (j < h->l_text)
1855
2.97k
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1856
3.13k
    }
1857
1858
    // Add trailing newline and/or trailing nul if required.
1859
6.75k
    if (last != '\n') {
1860
2.97k
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1861
1862
2.97k
        if (h->l_text < 2 || i >= h->l_text - 2) {
1863
366
            if (h->l_text >= SIZE_MAX - 2) {
1864
0
                hts_log_error("No room for extra newline");
1865
0
                sam_hdr_destroy(h);
1866
0
                return NULL;
1867
0
            }
1868
1869
366
            cp = realloc(h->text, (size_t) h->l_text+2);
1870
366
            if (!cp) {
1871
0
                sam_hdr_destroy(h);
1872
0
                return NULL;
1873
0
            }
1874
366
            h->text = cp;
1875
366
        }
1876
2.97k
        cp[i++] = '\n';
1877
1878
        // l_text may be larger already due to multiple nul padding
1879
2.97k
        if (h->l_text < i)
1880
3
            h->l_text = i;
1881
2.97k
        cp[h->l_text] = '\0';
1882
2.97k
    }
1883
1884
6.75k
    return h;
1885
6.75k
}
1886
1887
4.42k
static void known_stderr(const char *tool, const char *advice) {
1888
4.42k
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
1889
4.42k
    hts_log_warning("%s", advice);
1890
4.42k
}
1891
1892
54.8k
static void warn_if_known_stderr(const char *line) {
1893
54.8k
    if (strstr(line, "M::bwa_idx_load_from_disk") != NULL)
1894
1.86k
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`");
1895
53.0k
    else if (strstr(line, "M::mem_pestat") != NULL)
1896
1.40k
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`");
1897
51.6k
    else if (strstr(line, "loaded/built the index") != NULL)
1898
1.14k
        known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`");
1899
54.8k
}
1900
1901
6.60k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1902
6.60k
    kstring_t str = { 0, 0, NULL };
1903
6.60k
    khint_t k;
1904
6.60k
    sam_hdr_t* h = sam_hdr_init();
1905
6.60k
    const char *q, *r;
1906
6.60k
    char* sn = NULL;
1907
6.60k
    khash_t(s2i) *d = kh_init(s2i);
1908
6.60k
    khash_t(s2i) *long_refs = NULL;
1909
6.60k
    if (!h || !d)
1910
0
        goto error;
1911
1912
6.60k
    int ret, has_SQ = 0;
1913
6.60k
    int next_c = '@';
1914
1.59M
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1915
1.58M
        if (fp->line.s[0] != '@')
1916
3
            break;
1917
1918
1.58M
        if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
1919
168k
            has_SQ = 1;
1920
168k
            hts_pos_t ln = -1;
1921
420k
            for (q = fp->line.s + 4;; ++q) {
1922
420k
                if (strncmp(q, "SN:", 3) == 0) {
1923
159k
                    q += 3;
1924
1.00G
                    for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);
1925
1926
159k
                    if (sn) {
1927
34.8k
                        hts_log_warning("SQ header line has more than one SN: tag");
1928
34.8k
                        free(sn);
1929
34.8k
                    }
1930
159k
                    sn = (char*)calloc(r - q + 1, 1);
1931
159k
                    if (!sn)
1932
0
                        goto error;
1933
1934
159k
                    strncpy(sn, q, r - q);
1935
159k
                    q = r;
1936
260k
                } else {
1937
260k
                    if (strncmp(q, "LN:", 3) == 0)
1938
136k
                        ln = strtoll(q + 3, (char**)&q, 10);
1939
260k
                }
1940
1941
25.1M
                while (*q != '\t' && *q != '\n' && *q != '\0')
1942
24.7M
                    ++q;
1943
420k
                if (*q == '\0' || *q == '\n')
1944
168k
                    break;
1945
420k
            }
1946
168k
            if (sn) {
1947
124k
                if (ln >= 0) {
1948
113k
                    int absent;
1949
113k
                    k = kh_put(s2i, d, sn, &absent);
1950
113k
                    if (absent < 0)
1951
0
                        goto error;
1952
1953
113k
                    if (!absent) {
1954
72.0k
                        hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
1955
72.0k
                        free(sn);
1956
72.0k
                    } else {
1957
41.2k
                        sn = NULL;
1958
41.2k
                        if (ln >= UINT32_MAX) {
1959
                            // Stash away ref length that
1960
                            // doesn't fit in target_len array
1961
17.9k
                            int k2;
1962
17.9k
                            if (!long_refs) {
1963
1.06k
                                long_refs = kh_init(s2i);
1964
1.06k
                                if (!long_refs)
1965
0
                                    goto error;
1966
1.06k
                            }
1967
17.9k
                            k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
1968
17.9k
                            if (absent < 0)
1969
0
                                goto error;
1970
17.9k
                            kh_val(long_refs, k2) = ln;
1971
17.9k
                            kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1972
17.9k
                                            | UINT32_MAX);
1973
23.3k
                        } else {
1974
23.3k
                            kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1975
23.3k
                        }
1976
41.2k
                    }
1977
113k
                } else {
1978
11.7k
                    hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
1979
11.7k
                    warn_if_known_stderr(fp->line.s);
1980
11.7k
                    free(sn);
1981
11.7k
                }
1982
124k
            } else {
1983
43.0k
                hts_log_warning("Ignored @SQ line with missing SN: tag");
1984
43.0k
                warn_if_known_stderr(fp->line.s);
1985
43.0k
            }
1986
168k
            sn = NULL;
1987
168k
        }
1988
1.42M
        else if (!valid_sam_header_type(fp->line.s)) {
1989
81
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
1990
81
            warn_if_known_stderr(fp->line.s);
1991
81
            goto error;
1992
81
        }
1993
1994
1.58M
        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1995
0
            goto error;
1996
1997
1.58M
        if (kputc('\n', &str) < 0)
1998
0
            goto error;
1999
2000
1.58M
        if (fp->is_bgzf) {
2001
1.48M
            next_c = bgzf_peek(fp->fp.bgzf);
2002
1.48M
        } else {
2003
99.0k
            unsigned char nc;
2004
99.0k
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
2005
99.0k
            next_c = pret > 0 ? nc : pret - 1;
2006
99.0k
        }
2007
1.58M
        if (next_c < -1)
2008
3
            goto error;
2009
1.58M
    }
2010
6.52k
    if (next_c != '@')
2011
6.47k
        fp->line.l = 0;
2012
2013
6.52k
    if (ret < -1)
2014
48
        goto error;
2015
2016
6.47k
    if (!has_SQ && fp->fn_aux) {
2017
0
        kstring_t line = { 0, 0, NULL };
2018
2019
        /* The reference index (.fai) is actually needed here */
2020
0
        char *fai_fn = fp->fn_aux;
2021
0
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
2022
0
        if (fn_delim)
2023
0
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
2024
2025
0
        hFILE* f = hopen(fai_fn, "r");
2026
0
        int e = 0, absent;
2027
0
        if (f == NULL)
2028
0
            goto error;
2029
2030
0
        while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
2031
0
            char* tab = strchr(line.s, '\t');
2032
0
            hts_pos_t ln;
2033
2034
0
            if (tab == NULL)
2035
0
                continue;
2036
2037
0
            sn = (char*)calloc(tab-line.s+1, 1);
2038
0
            if (!sn) {
2039
0
                e = 1;
2040
0
                break;
2041
0
            }
2042
0
            memcpy(sn, line.s, tab-line.s);
2043
0
            k = kh_put(s2i, d, sn, &absent);
2044
0
            if (absent < 0) {
2045
0
                e = 1;
2046
0
                break;
2047
0
            }
2048
2049
0
            ln = strtoll(tab, NULL, 10);
2050
2051
0
            if (!absent) {
2052
0
                hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
2053
0
                free(sn);
2054
0
                sn = NULL;
2055
0
            } else {
2056
0
                sn = NULL;
2057
0
                if (ln >= UINT32_MAX) {
2058
                    // Stash away ref length that
2059
                    // doesn't fit in target_len array
2060
0
                    khint_t k2;
2061
0
                    int absent = -1;
2062
0
                    if (!long_refs) {
2063
0
                        long_refs = kh_init(s2i);
2064
0
                        if (!long_refs) {
2065
0
                            e = 1;
2066
0
                            break;
2067
0
                        }
2068
0
                    }
2069
0
                    k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
2070
0
                    if (absent < 0) {
2071
0
                         e = 1;
2072
0
                         break;
2073
0
                    }
2074
0
                    kh_val(long_refs, k2) = ln;
2075
0
                    kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
2076
0
                                    | UINT32_MAX);
2077
0
                } else {
2078
0
                    kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
2079
0
                }
2080
0
                has_SQ = 1;
2081
0
            }
2082
2083
0
            e |= kputs("@SQ\tSN:", &str) < 0;
2084
0
            e |= kputsn(line.s, tab - line.s, &str) < 0;
2085
0
            e |= kputs("\tLN:", &str) < 0;
2086
0
            e |= kputll(ln, &str) < 0;
2087
0
            e |= kputc('\n', &str) < 0;
2088
0
            if (e)
2089
0
                break;
2090
0
        }
2091
2092
0
        ks_free(&line);
2093
0
        if (hclose(f) != 0) {
2094
0
            hts_log_error("Error on closing %s", fai_fn);
2095
0
            e = 1;
2096
0
        }
2097
0
        if (e)
2098
0
            goto error;
2099
0
    }
2100
2101
6.47k
    if (has_SQ) {
2102
        // Populate the targets array
2103
4.98k
        h->n_targets = kh_size(d);
2104
2105
4.98k
        h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
2106
4.98k
        if (!h->target_name) {
2107
0
            h->n_targets = 0;
2108
0
            goto error;
2109
0
        }
2110
2111
4.98k
        h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
2112
4.98k
        if (!h->target_len) {
2113
0
            h->n_targets = 0;
2114
0
            goto error;
2115
0
        }
2116
2117
79.3k
        for (k = kh_begin(d); k != kh_end(d); ++k) {
2118
74.3k
            if (!kh_exist(d, k))
2119
37.2k
                continue;
2120
2121
37.0k
            h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
2122
37.0k
            h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
2123
37.0k
            kh_val(d, k) >>= 32;
2124
37.0k
        }
2125
4.98k
    }
2126
2127
    // Repurpose sdict to hold any references longer than UINT32_MAX
2128
6.47k
    h->sdict = long_refs;
2129
2130
6.47k
    kh_destroy(s2i, d);
2131
2132
6.47k
    if (str.l == 0)
2133
3
        kputsn("", 0, &str);
2134
6.47k
    h->l_text = str.l;
2135
6.47k
    h->text = ks_release(&str);
2136
6.47k
    fp->bam_header = sam_hdr_sanitise(h);
2137
6.47k
    fp->bam_header->ref_count = 1;
2138
2139
6.47k
    return fp->bam_header;
2140
2141
132
 error:
2142
132
    if (h && d && (!h->target_name || !h->target_len)) {
2143
8.40k
        for (k = kh_begin(d); k != kh_end(d); ++k)
2144
8.26k
            if (kh_exist(d, k)) free((void *)kh_key(d, k));
2145
132
    }
2146
132
    sam_hdr_destroy(h);
2147
132
    ks_free(&str);
2148
132
    kh_destroy(s2i, d);
2149
132
    kh_destroy(s2i, long_refs);
2150
132
    if (sn) free(sn);
2151
132
    return NULL;
2152
6.47k
}
2153
2154
sam_hdr_t *sam_hdr_read(htsFile *fp)
2155
13.1k
{
2156
13.1k
    if (!fp) {
2157
0
        errno = EINVAL;
2158
0
        return NULL;
2159
0
    }
2160
2161
13.1k
    switch (fp->format.format) {
2162
801
    case bam:
2163
801
        return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
2164
2165
4.24k
    case cram:
2166
4.24k
        return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
2167
2168
6.60k
    case sam:
2169
6.60k
        return sam_hdr_create(fp);
2170
2171
117
    case fastq_format:
2172
1.53k
    case fasta_format:
2173
1.53k
        return sam_hdr_init();
2174
2175
0
    case empty_format:
2176
0
        errno = EPIPE;
2177
0
        return NULL;
2178
2179
0
    default:
2180
0
        errno = EFTYPE;
2181
0
        return NULL;
2182
13.1k
    }
2183
13.1k
}
2184
2185
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
2186
12.9k
{
2187
12.9k
    if (!fp || !h) {
2188
0
        errno = EINVAL;
2189
0
        return -1;
2190
0
    }
2191
2192
12.9k
    switch (fp->format.format) {
2193
4.31k
    case binary_format:
2194
4.31k
        fp->format.category = sequence_data;
2195
4.31k
        fp->format.format = bam;
2196
        /* fall-through */
2197
4.31k
    case bam:
2198
4.31k
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
2199
4.31k
        break;
2200
2201
4.31k
    case cram: {
2202
4.31k
        cram_fd *fd = fp->fp.cram;
2203
4.31k
        if (cram_set_header2(fd, h) < 0) return -1;
2204
3.98k
        if (fp->fn_aux)
2205
0
            cram_load_reference(fd, fp->fn_aux);
2206
3.98k
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
2207
3.98k
        }
2208
3.98k
        break;
2209
2210
4.31k
    case text_format:
2211
4.31k
        fp->format.category = sequence_data;
2212
4.31k
        fp->format.format = sam;
2213
        /* fall-through */
2214
4.31k
    case sam: {
2215
4.31k
        if (!h->hrecs && !h->text)
2216
0
            return 0;
2217
4.31k
        char *text;
2218
4.31k
        kstring_t hdr_ks = { 0, 0, NULL };
2219
4.31k
        size_t l_text;
2220
4.31k
        ssize_t bytes;
2221
4.31k
        int r = 0, no_sq = 0;
2222
2223
4.31k
        if (h->hrecs) {
2224
3.98k
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2225
0
                return -1;
2226
3.98k
            text = hdr_ks.s;
2227
3.98k
            l_text = hdr_ks.l;
2228
3.98k
        } else {
2229
332
            const char *p = NULL;
2230
793
            do {
2231
793
                const char *q = p == NULL ? h->text : p + 4;
2232
793
                p = strstr(q, "@SQ\t");
2233
793
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2234
332
            no_sq = p == NULL;
2235
332
            text = h->text;
2236
332
            l_text = h->l_text;
2237
332
        }
2238
2239
4.31k
        if (fp->is_bgzf) {
2240
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2241
4.31k
        } else {
2242
4.31k
            bytes = hwrite(fp->fp.hfile, text, l_text);
2243
4.31k
        }
2244
4.31k
        free(hdr_ks.s);
2245
4.31k
        if (bytes != l_text)
2246
0
            return -1;
2247
2248
4.31k
        if (no_sq) {
2249
206
            int i;
2250
2.90k
            for (i = 0; i < h->n_targets; ++i) {
2251
2.69k
                fp->line.l = 0;
2252
2.69k
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2253
2.69k
                r |= kputs(h->target_name[i], &fp->line) < 0;
2254
2.69k
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2255
2.69k
                r |= kputw(h->target_len[i], &fp->line) < 0;
2256
2.69k
                r |= kputc('\n', &fp->line) < 0;
2257
2.69k
                if (r != 0)
2258
0
                    return -1;
2259
2260
2.69k
                if (fp->is_bgzf) {
2261
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2262
2.69k
                } else {
2263
2.69k
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2264
2.69k
                }
2265
2.69k
                if (bytes != fp->line.l)
2266
0
                    return -1;
2267
2.69k
            }
2268
206
        }
2269
4.31k
        if (fp->is_bgzf) {
2270
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2271
4.31k
        } else {
2272
4.31k
            if (hflush(fp->fp.hfile) != 0) return -1;
2273
4.31k
        }
2274
4.31k
        }
2275
4.31k
        break;
2276
2277
4.31k
    case fastq_format:
2278
0
    case fasta_format:
2279
        // Nothing to output; FASTQ has no file headers.
2280
0
        break;
2281
2282
0
    default:
2283
0
        errno = EBADF;
2284
0
        return -1;
2285
12.9k
    }
2286
12.6k
    return 0;
2287
12.9k
}
2288
2289
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2290
0
{
2291
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2292
0
    size_t new_l_text;
2293
0
    if (!h || !key)
2294
0
        return -1;
2295
2296
0
    if (h->l_text > 3) {
2297
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2298
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2299
0
            *p = '\0'; // for strstr call
2300
2301
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2302
2303
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2304
0
                *p = '\n'; // change back
2305
2306
                // mark the key:val
2307
0
                beg = q;
2308
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2309
0
                end = q;
2310
2311
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2312
0
                    && strlen(val) == end - beg - 4)
2313
0
                     return 0; // val is the same, no need to change
2314
2315
0
            } else {
2316
0
                beg = end = p;
2317
0
                *p = '\n';
2318
0
            }
2319
0
        }
2320
0
    }
2321
0
    if (beg == NULL) { // no @HD
2322
0
        new_l_text = h->l_text;
2323
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2324
0
            return -1;
2325
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2326
0
        if (val) {
2327
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2328
0
                return -1;
2329
0
            new_l_text += strlen(val) + 4;
2330
0
        }
2331
0
        newtext = (char*)malloc(new_l_text + 1);
2332
0
        if (!newtext) return -1;
2333
2334
0
        if (val)
2335
0
            snprintf(newtext, new_l_text + 1,
2336
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2337
0
        else
2338
0
            snprintf(newtext, new_l_text + 1,
2339
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2340
0
    } else { // has @HD but different or no key
2341
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2342
0
        if (val) {
2343
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2344
0
                return -1;
2345
0
            new_l_text += strlen(val) + 4;
2346
0
        }
2347
0
        newtext = (char*)malloc(new_l_text + 1);
2348
0
        if (!newtext) return -1;
2349
2350
0
        if (val) {
2351
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2352
0
                    (int) (beg - h->text), h->text, key, val, end);
2353
0
        } else { //delete key
2354
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2355
0
                    (int) (beg - h->text), h->text, end);
2356
0
        }
2357
0
    }
2358
0
    free(h->text);
2359
0
    h->text = newtext;
2360
0
    h->l_text = new_l_text;
2361
0
    return 0;
2362
0
}
2363
2364
2365
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2366
0
{
2367
0
    if (!h || !key)
2368
0
        return -1;
2369
2370
0
    if (!h->hrecs)
2371
0
        return old_sam_hdr_change_HD(h, key, val);
2372
2373
0
    if (val) {
2374
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2375
0
            return -1;
2376
0
    } else {
2377
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2378
0
            return -1;
2379
0
    }
2380
0
    return sam_hdr_rebuild(h);
2381
0
}
2382
/**********************
2383
 *** SAM record I/O ***
2384
 **********************/
2385
2386
// The speed of this code can vary considerably depending on minor code
2387
// changes elsewhere as some of the tight loops are particularly prone to
2388
// speed changes when the instruction blocks are split over a 32-byte
2389
// boundary.  To protect against this, we explicitly specify an alignment
2390
// for this function.  If this is insufficient, we may also wish to
2391
// consider alignment of blocks within this function via
2392
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2393
// However it's not very portable.
2394
// Instead we break into separate functions so we can explicitly specify
2395
// use __attribute__((aligned(32))) instead and force consistent loop
2396
// alignment.
2397
2.55k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2398
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2399
2.55k
    if (*n > INT32_MAX*0.666) {
2400
0
        errno = ENOMEM;
2401
0
        return -1;
2402
0
    }
2403
2404
2.55k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2405
2.55k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2406
0
        hts_log_error("Out of memory");
2407
0
        return -1;
2408
0
    }
2409
2410
2.55k
    (*n)+=*n>>1;
2411
2.55k
    return 0;
2412
2.55k
}
2413
2414
2415
// This ensures that q always ends up at the next comma after
2416
// reading a number even if it's followed by junk.  It
2417
// prevents the possibility of trying to read more than n items.
2418
4.13k
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2419
2420
HTS_ALIGN32
2421
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2422
197
                               uint32_t *nalloc, int *overflow) {
2423
757
    while (*q == ',') {
2424
560
        if ((*nused)++ >= (*nalloc)) {
2425
43
            if (grow_B_array(b, nalloc, 1) < 0)
2426
0
                return NULL;
2427
43
        }
2428
560
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2429
560
        b->l_data++;
2430
560
    }
2431
197
    return q;
2432
197
}
2433
2434
HTS_ALIGN32
2435
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2436
145
                               uint32_t *nalloc, int *overflow) {
2437
1.64k
    while (*q == ',') {
2438
1.49k
        if ((*nused)++ >= (*nalloc)) {
2439
147
            if (grow_B_array(b, nalloc, 1) < 0)
2440
0
                return NULL;
2441
147
        }
2442
1.49k
        if (q[1] != '-') {
2443
1.48k
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2444
1.48k
            b->l_data++;
2445
1.48k
        } else {
2446
16
            *overflow = 1;
2447
16
            q++;
2448
16
            skip_to_comma_(q);
2449
16
        }
2450
1.49k
    }
2451
145
    return q;
2452
145
}
2453
2454
HTS_ALIGN32
2455
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2456
1.03k
                               uint32_t *nalloc, int *overflow) {
2457
1.66k
    while (*q == ',') {
2458
627
        if ((*nused)++ >= (*nalloc)) {
2459
70
            if (grow_B_array(b, nalloc, 2) < 0)
2460
0
                return NULL;
2461
70
        }
2462
627
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2463
627
                  b->data + b->l_data);
2464
627
        b->l_data += 2;
2465
627
    }
2466
1.03k
    return q;
2467
1.03k
}
2468
2469
HTS_ALIGN32
2470
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2471
1.76k
                               uint32_t *nalloc, int *overflow) {
2472
3.91k
    while (*q == ',') {
2473
2.14k
        if ((*nused)++ >= (*nalloc)) {
2474
37
            if (grow_B_array(b, nalloc, 2) < 0)
2475
0
                return NULL;
2476
37
        }
2477
2.14k
        if (q[1] != '-') {
2478
2.08k
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2479
2.08k
                      b->data + b->l_data);
2480
2.08k
            b->l_data += 2;
2481
2.08k
        } else {
2482
61
            *overflow = 1;
2483
61
            q++;
2484
61
            skip_to_comma_(q);
2485
61
        }
2486
2.14k
    }
2487
1.76k
    return q;
2488
1.76k
}
2489
2490
HTS_ALIGN32
2491
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2492
637
                               uint32_t *nalloc, int *overflow) {
2493
33.0k
    while (*q == ',') {
2494
32.4k
        if ((*nused)++ >= (*nalloc)) {
2495
2.22k
            if (grow_B_array(b, nalloc, 4) < 0)
2496
0
                return NULL;
2497
2.22k
        }
2498
32.4k
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2499
32.4k
                  b->data + b->l_data);
2500
32.4k
        b->l_data += 4;
2501
32.4k
    }
2502
637
    return q;
2503
637
}
2504
2505
HTS_ALIGN32
2506
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2507
7.92k
                               uint32_t *nalloc, int *overflow) {
2508
9.62k
    while (*q == ',') {
2509
1.70k
        if ((*nused)++ >= (*nalloc)) {
2510
0
            if (grow_B_array(b, nalloc, 4) < 0)
2511
0
                return NULL;
2512
0
        }
2513
1.70k
        if (q[1] != '-') {
2514
1.69k
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2515
1.69k
                      b->data + b->l_data);
2516
1.69k
            b->l_data += 4;
2517
1.69k
        } else {
2518
15
            *overflow = 1;
2519
15
            q++;
2520
15
            skip_to_comma_(q);
2521
15
        }
2522
1.70k
    }
2523
7.92k
    return q;
2524
7.92k
}
2525
2526
HTS_ALIGN32
2527
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2528
699
                               uint32_t *nalloc, int *overflow) {
2529
1.29k
    while (*q == ',') {
2530
600
        if ((*nused)++ >= (*nalloc)) {
2531
33
            if (grow_B_array(b, nalloc, 4) < 0)
2532
0
                return NULL;
2533
33
        }
2534
600
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2535
600
        b->l_data += 4;
2536
600
    }
2537
699
    return q;
2538
699
}
2539
2540
HTS_ALIGN32
2541
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2542
                              char **end, bam1_t *b,
2543
12.4k
                              int *ctr) {
2544
    // Protect against infinite recursion when dealing with invalid input.
2545
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2546
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2547
    //
2548
    // Loop detection is the safest solution incase there are other
2549
    // strange corner cases with malformed inputs.
2550
12.4k
    if (++(*ctr) > 2) {
2551
0
        hts_log_error("Malformed data in B:%c array", type);
2552
0
        return -1;
2553
0
    }
2554
2555
12.4k
    int orig_l = b->l_data;
2556
12.4k
    char *q = in;
2557
12.4k
    int32_t size;
2558
12.4k
    size_t bytes;
2559
12.4k
    int overflow = 0;
2560
2561
12.4k
    size = aux_type2size(type);
2562
12.4k
    if (size <= 0 || size > 4) {
2563
3
        hts_log_error("Unrecognized type B:%c", type);
2564
3
        return -1;
2565
3
    }
2566
2567
    // Ensure space for type + values.
2568
    // The first pass through here we don't know the number of entries and
2569
    // nalloc == 0.  We start with a small working set and then parse the
2570
    // data, growing as needed.
2571
    //
2572
    // If we have a second pass through we do know the number of entries
2573
    // and nalloc is already known.  We have no need to expand the bam data.
2574
12.4k
    if (!nalloc)
2575
10.9k
         nalloc=7;
2576
2577
    // Ensure allocated memory is big enough (for current nalloc estimate)
2578
12.4k
    bytes = (size_t) nalloc * (size_t) size;
2579
12.4k
    if (bytes / size != nalloc
2580
12.4k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2581
0
        hts_log_error("Out of memory");
2582
0
        return -1;
2583
0
    }
2584
2585
12.4k
    uint32_t nused = 0;
2586
2587
12.4k
    b->data[b->l_data++] = 'B';
2588
12.4k
    b->data[b->l_data++] = type;
2589
    // 32-bit B-array length is inserted later once we know it.
2590
12.4k
    int b_len_idx = b->l_data;
2591
12.4k
    b->l_data += sizeof(uint32_t);
2592
2593
12.4k
    if (type == 'c') {
2594
197
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2595
0
            return -1;
2596
12.2k
    } else if (type == 'C') {
2597
145
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2598
0
            return -1;
2599
12.0k
    } else if (type == 's') {
2600
1.03k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2601
0
            return -1;
2602
11.0k
    } else if (type == 'S') {
2603
1.76k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2604
0
            return -1;
2605
9.25k
    } else if (type == 'i') {
2606
637
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2607
0
            return -1;
2608
8.62k
    } else if (type == 'I') {
2609
7.92k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2610
0
            return -1;
2611
7.92k
    } else if (type == 'f') {
2612
699
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2613
0
            return -1;
2614
699
    }
2615
12.4k
    if (*q != '\t' && *q != '\0') {
2616
        // Unknown B array type or junk in the numbers
2617
67
        hts_log_error("Malformed B:%c", type);
2618
67
        return -1;
2619
67
    }
2620
12.3k
    i32_to_le(nused, b->data + b_len_idx);
2621
2622
12.3k
    if (!overflow) {
2623
10.8k
        *end = q;
2624
10.8k
        return 0;
2625
10.8k
    } else {
2626
1.50k
        int64_t max = 0, min = 0, val;
2627
        // Given type was incorrect.  Try to rescue the situation.
2628
1.50k
        char *r = q;
2629
1.50k
        q = in;
2630
1.50k
        overflow = 0;
2631
1.50k
        b->l_data = orig_l;
2632
        // Find out what range of values is present
2633
4.69k
        while (q < r) {
2634
3.19k
            val = hts_str2int(q + 1, &q, 64, &overflow);
2635
3.19k
            if (max < val) max = val;
2636
3.19k
            if (min > val) min = val;
2637
3.19k
            skip_to_comma_(q);
2638
3.19k
        }
2639
        // Retry with appropriate type
2640
1.50k
        if (!overflow) {
2641
1.46k
            if (min < 0) {
2642
55
                if (min >= INT8_MIN && max <= INT8_MAX) {
2643
4
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2644
51
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2645
6
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2646
45
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2647
40
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2648
40
                }
2649
1.41k
            } else {
2650
1.41k
                if (max < UINT8_MAX) {
2651
69
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2652
1.34k
                } else if (max <= UINT16_MAX) {
2653
24
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2654
1.32k
                } else if (max <= UINT32_MAX) {
2655
1.30k
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2656
1.30k
                }
2657
1.41k
            }
2658
1.46k
        }
2659
        // If here then at least one of the values is too big to store
2660
58
        hts_log_error("Numeric value in B array out of allowed range");
2661
58
        return -1;
2662
1.50k
    }
2663
12.3k
#undef skip_to_comma_
2664
12.3k
}
2665
2666
HTS_ALIGN32
2667
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2668
10.9k
{
2669
10.9k
    int ctr = 0;
2670
10.9k
    uint32_t nalloc = 0;
2671
10.9k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2672
10.9k
}
2673
2674
312k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2675
312k
    if (*v >= '1' && *v <= '9') {
2676
60.3k
        return hts_str2uint(v, rv, 16, overflow);
2677
60.3k
    }
2678
252k
    else if (*v == '0') {
2679
        // handle single-digit "0" directly; otherwise it's hex or octal
2680
6.86k
        if (v[1] == '\t') { *rv = v+1; return 0; }
2681
39
        else {
2682
39
            unsigned long val = strtoul(v, rv, 0);
2683
39
            if (val > 65535) { *overflow = 1; return 65535; }
2684
37
            return val;
2685
39
        }
2686
6.86k
    }
2687
245k
    else {
2688
        // TODO implement symbolic flag letters
2689
245k
        *rv = v;
2690
245k
        return 0;
2691
245k
    }
2692
312k
}
2693
2694
// Parse tag line and append to bam object b.
2695
// Shared by both SAM and FASTQ parsers.
2696
//
2697
// The difference between the two is how lenient we are to recognising
2698
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2699
// non-SAM looking strings.
2700
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2701
310k
                            khash_t(tag) *tag_whitelist) {
2702
310k
    int overflow = 0;
2703
310k
    int checkpoint;
2704
310k
    char logbuf[40];
2705
310k
    char *q = start, *p = end;
2706
2707
310k
#define _parse_err(cond, ...)                   \
2708
8.64M
    do {                                        \
2709
17.7M
        if (cond) {                             \
2710
423
            if (lenient) {                      \
2711
0
                while (q < p && !isspace_c(*q))   \
2712
0
                    q++;                        \
2713
0
                while (q < p && isspace_c(*q))    \
2714
0
                    q++;                        \
2715
0
                b->l_data = checkpoint;         \
2716
0
                goto loop;                      \
2717
423
            } else {                            \
2718
423
                hts_log_error(__VA_ARGS__);     \
2719
423
                goto err_ret;                   \
2720
423
            }                                   \
2721
423
        }                                       \
2722
8.64M
    } while (0)
2723
2724
8.32M
    while (q < p) loop: {
2725
8.32M
        char type;
2726
8.32M
        checkpoint = b->l_data;
2727
8.32M
        if (p - q < 5) {
2728
63
            if (lenient) {
2729
0
                break;
2730
63
            } else {
2731
63
                hts_log_error("Incomplete aux field");
2732
63
                goto err_ret;
2733
63
            }
2734
63
        }
2735
4.16M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2736
2737
4.16M
        if (lenient && (q[2] | q[4]) != ':') {
2738
0
            while (q < p && !isspace_c(*q))
2739
0
                q++;
2740
0
            while (q < p && isspace_c(*q))
2741
0
                q++;
2742
0
            continue;
2743
0
        }
2744
2745
4.16M
        if (tag_whitelist) {
2746
0
            int tt = q[0]*256 + q[1];
2747
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2748
0
                while (q < p && *q != '\t')
2749
0
                    q++;
2750
0
                continue;
2751
0
            }
2752
0
        }
2753
2754
        // Copy over id
2755
4.16M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2756
4.16M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2757
4.16M
        q += 3; type = *q++; ++q; // q points to value
2758
4.16M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2759
4.08M
            _parse_err(*q <= '\t', "incomplete aux field");
2760
2761
        // Ensure enough space for a double + type allocated.
2762
4.16M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2763
2764
4.16M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2765
1.71M
            b->data[b->l_data++] = 'A';
2766
1.71M
            b->data[b->l_data++] = *q++;
2767
2.45M
        } else if (type == 'i' || type == 'I') {
2768
2.27M
            if (*q == '-') {
2769
1.81M
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2770
1.81M
                if (x >= INT8_MIN) {
2771
878k
                    b->data[b->l_data++] = 'c';
2772
878k
                    b->data[b->l_data++] = x;
2773
938k
                } else if (x >= INT16_MIN) {
2774
296k
                    b->data[b->l_data++] = 's';
2775
296k
                    i16_to_le(x, b->data + b->l_data);
2776
296k
                    b->l_data += 2;
2777
642k
                } else {
2778
642k
                    b->data[b->l_data++] = 'i';
2779
642k
                    i32_to_le(x, b->data + b->l_data);
2780
642k
                    b->l_data += 4;
2781
642k
                }
2782
1.81M
            } else {
2783
453k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2784
453k
                if (x <= UINT8_MAX) {
2785
334k
                    b->data[b->l_data++] = 'C';
2786
334k
                    b->data[b->l_data++] = x;
2787
334k
                } else if (x <= UINT16_MAX) {
2788
117k
                    b->data[b->l_data++] = 'S';
2789
117k
                    u16_to_le(x, b->data + b->l_data);
2790
117k
                    b->l_data += 2;
2791
117k
                } else {
2792
1.32k
                    b->data[b->l_data++] = 'I';
2793
1.32k
                    u32_to_le(x, b->data + b->l_data);
2794
1.32k
                    b->l_data += 4;
2795
1.32k
                }
2796
453k
            }
2797
2.27M
        } else if (type == 'f') {
2798
1.29k
            b->data[b->l_data++] = 'f';
2799
1.29k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2800
1.29k
            b->l_data += sizeof(float);
2801
181k
        } else if (type == 'd') {
2802
90.5k
            b->data[b->l_data++] = 'd';
2803
90.5k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2804
90.5k
            b->l_data += sizeof(double);
2805
90.6k
        } else if (type == 'Z' || type == 'H') {
2806
79.5k
            char *end = strchr(q, '\t');
2807
79.5k
            if (!end) end = q + strlen(q);
2808
79.5k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2809
79.5k
                       "hex field does not have an even number of digits");
2810
79.5k
            b->data[b->l_data++] = type;
2811
79.5k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2812
79.5k
            memcpy(b->data + b->l_data, q, end - q);
2813
79.5k
            b->l_data += end - q;
2814
79.5k
            b->data[b->l_data++] = '\0';
2815
79.5k
            q = end;
2816
79.5k
        } else if (type == 'B') {
2817
10.9k
            type = *q++; // q points to the first ',' following the typing byte
2818
10.9k
            _parse_err(*q && *q != ',' && *q != '\t',
2819
10.9k
                       "B aux field type not followed by ','");
2820
2821
10.9k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2822
128
                goto err_ret;
2823
10.9k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2824
2825
24.9M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2826
4.16M
        q++;
2827
4.16M
    }
2828
2829
309k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2830
309k
#undef _parse_err
2831
2832
309k
    return 0;
2833
2834
614
err_ret:
2835
614
    return -2;
2836
309k
}
2837
2838
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2839
313k
{
2840
1.26M
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2841
2842
313k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2843
2844
// Macro that operates on 64-bits at a time.
2845
313k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2846
313k
    do {                                                        \
2847
305k
        uint64_u *from8 = (uint64_u *)(from);                   \
2848
305k
        uint64_u *to8 = (uint64_u *)(to);                       \
2849
305k
        uint64_t uflow = 0;                                     \
2850
305k
        size_t l8 = (l)>>3, i;                                  \
2851
305k
        for (i = 0; i < l8; i++) {                              \
2852
82
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2853
82
            uflow |= to8[i];                                    \
2854
82
        }                                                       \
2855
306k
        for (i<<=3; i < (l); ++i) {                             \
2856
994
            to[i] = from[i] - (n);                              \
2857
994
            uflow |= to[i];                                     \
2858
994
        }                                                       \
2859
305k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2860
305k
    } while (0)
2861
2862
#else
2863
2864
// Basic version which operates a byte at a time
2865
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2866
        uint8_t uflow = 0;                                   \
2867
        for (i = 0; i < (l); ++i) {                          \
2868
            (to)[i] = (from)[i] - (n);                       \
2869
            uflow |= (uint8_t) (to)[i];                      \
2870
        }                                                    \
2871
        failed = (uflow & 0x80) > 0;                         \
2872
    } while (0)
2873
2874
#endif
2875
2876
604k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2877
4.25M
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2878
1.22M
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2879
2880
313k
    uint8_t *t;
2881
2882
313k
    char *p = s->s, *q;
2883
313k
    int i, overflow = 0;
2884
313k
    char logbuf[40];
2885
313k
    hts_pos_t cigreflen;
2886
313k
    bam1_core_t *c = &b->core;
2887
2888
313k
    b->l_data = 0;
2889
313k
    memset(c, 0, 32);
2890
2891
    // qname
2892
313k
    q = _read_token(p);
2893
2894
312k
    _parse_warn(p - q <= 1, "empty query name");
2895
312k
    _parse_err(p - q > 255, "query name too long");
2896
    // resize large enough for name + extranul
2897
312k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2898
312k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2899
2900
312k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2901
312k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2902
312k
    b->l_data += c->l_extranul;
2903
2904
312k
    c->l_qname = p - q + c->l_extranul;
2905
2906
    // flag
2907
312k
    c->flag = parse_sam_flag(p, &p, &overflow);
2908
312k
    if (*p++ != '\t') goto err_ret; // malformated flag
2909
2910
    // chr
2911
312k
    q = _read_token(p);
2912
311k
    if (strcmp(q, "*")) {
2913
301k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2914
301k
        c->tid = bam_name2id(h, q);
2915
301k
        _parse_err(c->tid < -1, "failed to parse header");
2916
301k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2917
301k
    } else c->tid = -1;
2918
2919
    // pos
2920
311k
    c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
2921
311k
    if (*p++ != '\t') goto err_ret;
2922
311k
    if (c->pos < 0 && c->tid >= 0) {
2923
7.91k
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2924
7.91k
        c->tid = -1;
2925
7.91k
    }
2926
311k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2927
2928
    // mapq
2929
311k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2930
311k
    if (*p++ != '\t') goto err_ret;
2931
    // cigar
2932
311k
    if (*p != '*') {
2933
289k
        uint32_t *cigar = NULL;
2934
289k
        int old_l_data = b->l_data;
2935
289k
        int n_cigar = bam_parse_cigar(p, &p, b);
2936
289k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2937
288k
        cigar = (uint32_t *)(b->data + old_l_data);
2938
2939
        // can't use bam_endpos() directly as some fields not yet set up
2940
288k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2941
288k
        if (cigreflen == 0) cigreflen = 1;
2942
288k
    } else {
2943
22.3k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2944
22.3k
        c->flag |= BAM_FUNMAP;
2945
22.3k
        q = _read_token(p);
2946
22.2k
        cigreflen = 1;
2947
22.2k
    }
2948
310k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2949
310k
               "read ends beyond highest supported position");
2950
310k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2951
    // mate chr
2952
310k
    q = _read_token(p);
2953
310k
    if (strcmp(q, "=") == 0) {
2954
135
        c->mtid = c->tid;
2955
310k
    } else if (strcmp(q, "*") == 0) {
2956
296
        c->mtid = -1;
2957
310k
    } else {
2958
310k
        c->mtid = bam_name2id(h, q);
2959
310k
        _parse_err(c->mtid < -1, "failed to parse header");
2960
310k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2961
310k
    }
2962
    // mpos
2963
310k
    c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
2964
310k
    if (*p++ != '\t') goto err_ret;
2965
310k
    if (c->mpos < 0 && c->mtid >= 0) {
2966
269k
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2967
269k
        c->mtid = -1;
2968
269k
    }
2969
    // tlen
2970
310k
    c->isize = hts_str2int(p, &p, 64, &overflow);
2971
310k
    if (*p++ != '\t') goto err_ret;
2972
    // seq
2973
310k
    q = _read_token(p);
2974
310k
    if (strcmp(q, "*")) {
2975
294k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2976
294k
        c->l_qseq = p - q - 1;
2977
294k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2978
294k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2979
294k
        i = (c->l_qseq + 1) >> 1;
2980
294k
        _get_mem(uint8_t, &t, b, i);
2981
2982
294k
        unsigned int lqs2 = c->l_qseq&~1, i;
2983
295k
        for (i = 0; i < lqs2; i+=2)
2984
864
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2985
299k
        for (; i < c->l_qseq; ++i)
2986
5.36k
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2987
294k
    } else c->l_qseq = 0;
2988
    // qual
2989
620k
    _get_mem(uint8_t, &t, b, c->l_qseq);
2990
620k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2991
4.90k
        memset(t, 0xff, c->l_qseq);
2992
4.90k
        p += 2;
2993
305k
    } else {
2994
305k
        int failed = 0;
2995
305k
        _parse_err(s->l - (p - s->s) < c->l_qseq
2996
305k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2997
305k
                   "SEQ and QUAL are of different length");
2998
305k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2999
305k
        _parse_err(failed, "invalid QUAL character");
3000
305k
        p += c->l_qseq + 1;
3001
305k
    }
3002
3003
    // aux
3004
310k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
3005
614
        goto err_ret;
3006
3007
309k
    if (bam_tag2cigar(b, 1, 1) < 0)
3008
0
        return -2;
3009
309k
    return 0;
3010
3011
0
#undef _parse_warn
3012
0
#undef _parse_err
3013
0
#undef _get_mem
3014
0
#undef _read_token
3015
3.52k
err_ret:
3016
3.52k
    return -2;
3017
309k
}
3018
3019
289k
static uint32_t read_ncigar(const char *q) {
3020
289k
    uint32_t n_cigar = 0;
3021
8.41M
    for (; *q && *q != '\t'; ++q)
3022
8.12M
        if (!isdigit_c(*q)) ++n_cigar;
3023
289k
    if (!n_cigar) {
3024
165
        hts_log_error("No CIGAR operations");
3025
165
        return 0;
3026
165
    }
3027
288k
    if (n_cigar >= 2147483647) {
3028
0
        hts_log_error("Too many CIGAR operations");
3029
0
        return 0;
3030
0
    }
3031
3032
288k
    return n_cigar;
3033
288k
}
3034
3035
/*! @function
3036
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
3037
 @param  in      [in]  pointer to the source string
3038
 @param  a_cigar [out]  address of the destination uint32_t buffer
3039
 @return         number of processed input characters; 0 on error
3040
 */
3041
288k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
3042
288k
    int i, overflow = 0;
3043
288k
    const char *p = in;
3044
624k
    for (i = 0; i < n_cigar; i++) {
3045
335k
        uint32_t len;
3046
335k
        int op;
3047
335k
        char *q;
3048
335k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
3049
335k
        if (q == p) {
3050
194
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
3051
194
            return 0;
3052
194
        }
3053
335k
        if (overflow) {
3054
21
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
3055
21
            return 0;
3056
21
        }
3057
335k
        p = q;
3058
335k
        op = bam_cigar_table[(unsigned char)*p++];
3059
335k
        if (op < 0) {
3060
124
            hts_log_error("Unrecognized CIGAR operator");
3061
124
            return 0;
3062
124
        }
3063
335k
        a_cigar[i] = len;
3064
335k
        a_cigar[i] |= op;
3065
335k
    }
3066
3067
288k
    return p-in;
3068
288k
}
3069
3070
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
3071
0
    size_t n_cigar = 0;
3072
0
    int diff;
3073
3074
0
    if (!in || !a_cigar || !a_mem) {
3075
0
        hts_log_error("NULL pointer arguments");
3076
0
        return -1;
3077
0
    }
3078
0
    if (end) *end = (char *)in;
3079
3080
0
    if (*in == '*') {
3081
0
        if (end) (*end)++;
3082
0
        return 0;
3083
0
    }
3084
0
    n_cigar = read_ncigar(in);
3085
0
    if (!n_cigar) return 0;
3086
0
    if (n_cigar > *a_mem) {
3087
0
        uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
3088
0
        if (a_tmp) {
3089
0
            *a_cigar = a_tmp;
3090
0
            *a_mem = n_cigar;
3091
0
        } else {
3092
0
            hts_log_error("Memory allocation error");
3093
0
            return -1;
3094
0
        }
3095
0
    }
3096
3097
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
3098
0
    if (end) *end = (char *)in+diff;
3099
3100
0
    return n_cigar;
3101
0
}
3102
3103
289k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
3104
289k
    size_t n_cigar = 0;
3105
289k
    int diff;
3106
3107
289k
    if (!in || !b) {
3108
0
        hts_log_error("NULL pointer arguments");
3109
0
        return -1;
3110
0
    }
3111
289k
    if (end) *end = (char *)in;
3112
3113
289k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
3114
289k
    if (!n_cigar && b->core.n_cigar == 0) {
3115
165
        if (end) *end = (char *)in+1;
3116
165
        return 0;
3117
165
    }
3118
3119
288k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
3120
288k
    if (cig_diff > 0 &&
3121
288k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
3122
0
        hts_log_error("Memory allocation error");
3123
0
        return -1;
3124
0
    }
3125
3126
288k
    uint32_t *cig = bam_get_cigar(b);
3127
288k
    if ((uint8_t *)cig != b->data + b->l_data) {
3128
        // Modifying an BAM existing BAM record
3129
0
        uint8_t  *seq = bam_get_seq(b);
3130
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
3131
0
    }
3132
3133
288k
    if (n_cigar) {
3134
288k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
3135
339
            return -1;
3136
288k
    } else {
3137
0
        diff = 1; // handle "*"
3138
0
    }
3139
3140
288k
    b->l_data += cig_diff * sizeof(uint32_t);
3141
288k
    b->core.n_cigar = n_cigar;
3142
288k
    if (end) *end = (char *)in + diff;
3143
3144
288k
    return n_cigar;
3145
288k
}
3146
3147
/*
3148
 * -----------------------------------------------------------------------------
3149
 * SAM threading
3150
 */
3151
// Size of SAM text block (reading)
3152
0
#define SAM_NBYTES 240000
3153
3154
// Number of BAM records (writing, up to NB_mem in size)
3155
0
#define SAM_NBAM 1000
3156
3157
struct SAM_state;
3158
3159
// Output job - a block of BAM records
3160
typedef struct sp_bams {
3161
    struct sp_bams *next;
3162
    int serial;
3163
3164
    bam1_t *bams;
3165
    int nbams, abams; // used and alloc for bams[] array
3166
    size_t bam_mem;   // very approximate total size
3167
3168
    struct SAM_state *fd;
3169
} sp_bams;
3170
3171
// Input job - a block of SAM text
3172
typedef struct sp_lines {
3173
    struct sp_lines *next;
3174
    int serial;
3175
3176
    char *data;
3177
    int data_size;
3178
    int alloc;
3179
3180
    struct SAM_state *fd;
3181
    sp_bams *bams;
3182
} sp_lines;
3183
3184
enum sam_cmd {
3185
    SAM_NONE = 0,
3186
    SAM_CLOSE,
3187
    SAM_CLOSE_DONE,
3188
};
3189
3190
typedef struct SAM_state {
3191
    sam_hdr_t *h;
3192
3193
    hts_tpool *p;
3194
    int own_pool;
3195
    pthread_mutex_t lines_m;
3196
    hts_tpool_process *q;
3197
    pthread_t dispatcher;
3198
    int dispatcher_set;
3199
3200
    sp_lines *lines;
3201
    sp_bams *bams;
3202
3203
    sp_bams *curr_bam;
3204
    int curr_idx;
3205
    int serial;
3206
3207
    // Be warned: moving these mutexes around in this struct can reduce
3208
    // threading performance by up to 70%!
3209
    pthread_mutex_t command_m;
3210
    pthread_cond_t command_c;
3211
    enum sam_cmd command;
3212
3213
    // One of the E* errno codes
3214
    int errcode;
3215
3216
    htsFile *fp;
3217
} SAM_state;
3218
3219
// Returns a SAM_state struct from a generic hFILE.
3220
//
3221
// Returns NULL on failure.
3222
0
static SAM_state *sam_state_create(htsFile *fp) {
3223
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3224
    // be a redirect call with an additional 'S' mode.  This in turn would
3225
    // correctly set the designed format to sam instead of a generic
3226
    // text_format.
3227
0
    if (fp->format.format != sam && fp->format.format != text_format)
3228
0
        return NULL;
3229
3230
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3231
0
    if (!fd)
3232
0
        return NULL;
3233
3234
0
    fp->state = fd;
3235
0
    fd->fp = fp;
3236
3237
0
    return fd;
3238
0
}
3239
3240
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3241
static void *sam_format_worker(void *arg);
3242
3243
0
static void sam_state_err(SAM_state *fd, int errcode) {
3244
0
    pthread_mutex_lock(&fd->command_m);
3245
0
    if (!fd->errcode)
3246
0
        fd->errcode = errcode;
3247
0
    pthread_mutex_unlock(&fd->command_m);
3248
0
}
3249
3250
0
static void sam_free_sp_bams(sp_bams *b) {
3251
0
    if (!b)
3252
0
        return;
3253
3254
0
    if (b->bams) {
3255
0
        int i;
3256
0
        for (i = 0; i < b->abams; i++) {
3257
0
            if (b->bams[i].data)
3258
0
                free(b->bams[i].data);
3259
0
        }
3260
0
        free(b->bams);
3261
0
    }
3262
0
    free(b);
3263
0
}
3264
3265
// Destroys the state produce by sam_state_create.
3266
13.1k
int sam_state_destroy(htsFile *fp) {
3267
13.1k
    int ret = 0;
3268
3269
13.1k
    if (!fp->state)
3270
13.1k
        return 0;
3271
3272
0
    SAM_state *fd = fp->state;
3273
0
    if (fd->p) {
3274
0
        if (fd->h) {
3275
            // Notify sam_dispatcher we're closing
3276
0
            pthread_mutex_lock(&fd->command_m);
3277
0
            if (fd->command != SAM_CLOSE_DONE)
3278
0
                fd->command = SAM_CLOSE;
3279
0
            pthread_cond_signal(&fd->command_c);
3280
0
            ret = -fd->errcode;
3281
0
            if (fd->q)
3282
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3283
3284
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3285
0
                for (;;) {
3286
                    // Avoid deadlocks with dispatcher
3287
0
                    if (fd->command == SAM_CLOSE_DONE)
3288
0
                        break;
3289
0
                    hts_tpool_wake_dispatch(fd->q);
3290
0
                    pthread_mutex_unlock(&fd->command_m);
3291
0
                    usleep(10000);
3292
0
                    pthread_mutex_lock(&fd->command_m);
3293
0
                }
3294
0
            }
3295
0
            pthread_mutex_unlock(&fd->command_m);
3296
3297
0
            if (fp->is_write) {
3298
                // Dispatch the last partial block.
3299
0
                sp_bams *gb = fd->curr_bam;
3300
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3301
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3302
3303
                // Flush and drain output
3304
0
                if (fd->q)
3305
0
                    hts_tpool_process_flush(fd->q);
3306
0
                pthread_mutex_lock(&fd->command_m);
3307
0
                if (!ret) ret = -fd->errcode;
3308
0
                pthread_mutex_unlock(&fd->command_m);
3309
3310
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3311
0
                    usleep(10000);
3312
0
                    pthread_mutex_lock(&fd->command_m);
3313
0
                    ret = -fd->errcode;
3314
                    // not empty but shutdown implies error
3315
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3316
0
                        ret = EIO;
3317
0
                    pthread_mutex_unlock(&fd->command_m);
3318
0
                }
3319
0
                if (fd->q)
3320
0
                    hts_tpool_process_shutdown(fd->q);
3321
0
            }
3322
3323
            // Wait for it to acknowledge
3324
0
            if (fd->dispatcher_set)
3325
0
                pthread_join(fd->dispatcher, NULL);
3326
0
            if (!ret) ret = -fd->errcode;
3327
0
        }
3328
3329
        // Tidy up memory
3330
0
        if (fd->q)
3331
0
            hts_tpool_process_destroy(fd->q);
3332
3333
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3334
0
            hts_tpool_destroy(fd->p);
3335
0
            fd->p = NULL;
3336
0
        }
3337
0
        pthread_mutex_destroy(&fd->lines_m);
3338
0
        pthread_mutex_destroy(&fd->command_m);
3339
0
        pthread_cond_destroy(&fd->command_c);
3340
3341
0
        sp_lines *l = fd->lines;
3342
0
        while (l) {
3343
0
            sp_lines *n = l->next;
3344
0
            free(l->data);
3345
0
            free(l);
3346
0
            l = n;
3347
0
        }
3348
3349
0
        sp_bams *b = fd->bams;
3350
0
        while (b) {
3351
0
            if (fd->curr_bam == b)
3352
0
                fd->curr_bam = NULL;
3353
0
            sp_bams *n = b->next;
3354
0
            sam_free_sp_bams(b);
3355
0
            b = n;
3356
0
        }
3357
3358
0
        if (fd->curr_bam)
3359
0
            sam_free_sp_bams(fd->curr_bam);
3360
3361
        // Decrement counter by one, maybe destroying too.
3362
        // This is to permit the caller using bam_hdr_destroy
3363
        // before sam_close without triggering decode errors
3364
        // in the background threads.
3365
0
        bam_hdr_destroy(fd->h);
3366
0
    }
3367
3368
0
    free(fp->state);
3369
0
    fp->state = NULL;
3370
0
    return ret;
3371
13.1k
}
3372
3373
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3374
0
static void cleanup_sp_lines(void *arg) {
3375
0
    sp_lines *gl = (sp_lines *)arg;
3376
0
    if (!gl) return;
3377
3378
    // Should always be true for lines passed to / from thread workers.
3379
0
    assert(gl->next == NULL);
3380
3381
0
    free(gl->data);
3382
0
    sam_free_sp_bams(gl->bams);
3383
0
    free(gl);
3384
0
}
3385
3386
// Run from one of the worker threads.
3387
// Convert a passed in array of lines to array of BAMs, returning
3388
// the result back to the thread queue.
3389
0
static void *sam_parse_worker(void *arg) {
3390
0
    sp_lines *gl = (sp_lines *)arg;
3391
0
    sp_bams *gb = NULL;
3392
0
    char *lines = gl->data;
3393
0
    int i;
3394
0
    bam1_t *b;
3395
0
    SAM_state *fd = gl->fd;
3396
3397
    // Use a block of BAM structs we had earlier if available.
3398
0
    pthread_mutex_lock(&fd->lines_m);
3399
0
    if (fd->bams) {
3400
0
        gb = fd->bams;
3401
0
        fd->bams = gb->next;
3402
0
    }
3403
0
    pthread_mutex_unlock(&fd->lines_m);
3404
3405
0
    if (gb == NULL) {
3406
0
        gb = calloc(1, sizeof(*gb));
3407
0
        if (!gb) {
3408
0
            return NULL;
3409
0
        }
3410
0
        gb->abams = 100;
3411
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3412
0
        if (!gb->bams) {
3413
0
            sam_state_err(fd, ENOMEM);
3414
0
            goto err;
3415
0
        }
3416
0
        gb->nbams = 0;
3417
0
        gb->bam_mem = 0;
3418
0
    }
3419
0
    gb->serial = gl->serial;
3420
0
    gb->next = NULL;
3421
3422
0
    b = (bam1_t *)gb->bams;
3423
0
    if (!b) {
3424
0
        sam_state_err(fd, ENOMEM);
3425
0
        goto err;
3426
0
    }
3427
3428
0
    i = 0;
3429
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3430
0
    while (cp < cp_end) {
3431
0
        if (i >= gb->abams) {
3432
0
            int old_abams = gb->abams;
3433
0
            gb->abams *= 2;
3434
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3435
0
            if (!b) {
3436
0
                gb->abams /= 2;
3437
0
                sam_state_err(fd, ENOMEM);
3438
0
                goto err;
3439
0
            }
3440
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3441
0
            gb->bams = b;
3442
0
        }
3443
3444
        // Ideally we'd get sam_parse1 to return the number of
3445
        // bytes decoded and to be able to stop on newline as
3446
        // well as \0.
3447
        //
3448
        // We can then avoid the additional strchr loop.
3449
        // It's around 6% of our CPU cost, albeit threadable.
3450
        //
3451
        // However this is an API change so for now we copy.
3452
3453
0
        char *nl = strchr(cp, '\n');
3454
0
        char *line_end;
3455
0
        if (nl) {
3456
0
            line_end = nl;
3457
0
            if (line_end > cp && *(line_end - 1) == '\r')
3458
0
                line_end--;
3459
0
            nl++;
3460
0
        } else {
3461
0
            nl = line_end = cp_end;
3462
0
        }
3463
0
        *line_end = '\0';
3464
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3465
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3466
0
            sam_state_err(fd, errno ? errno : EIO);
3467
0
            cleanup_sp_lines(gl);
3468
0
            goto err;
3469
0
        }
3470
3471
0
        cp = nl;
3472
0
        i++;
3473
0
    }
3474
0
    gb->nbams = i;
3475
3476
0
    pthread_mutex_lock(&fd->lines_m);
3477
0
    gl->next = fd->lines;
3478
0
    fd->lines = gl;
3479
0
    pthread_mutex_unlock(&fd->lines_m);
3480
0
    return gb;
3481
3482
0
 err:
3483
0
    sam_free_sp_bams(gb);
3484
0
    return NULL;
3485
0
}
3486
3487
0
static void *sam_parse_eof(void *arg) {
3488
0
    return NULL;
3489
0
}
3490
3491
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3492
0
static void cleanup_sp_bams(void *arg) {
3493
0
    sam_free_sp_bams((sp_bams *) arg);
3494
0
}
3495
3496
// Runs in its own thread.
3497
// Reads a block of text (SAM) and sends a new job to the thread queue to
3498
// translate this to BAM.
3499
0
static void *sam_dispatcher_read(void *vp) {
3500
0
    htsFile *fp = vp;
3501
0
    kstring_t line = {0};
3502
0
    int line_frag = 0;
3503
0
    SAM_state *fd = fp->state;
3504
0
    sp_lines *l = NULL;
3505
3506
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3507
    // matter as it will grow if necessary).
3508
0
    if (ks_resize(&line, 1000) < 0)
3509
0
        goto err;
3510
3511
0
    for (;;) {
3512
        // Check for command
3513
0
        pthread_mutex_lock(&fd->command_m);
3514
0
        switch (fd->command) {
3515
3516
0
        case SAM_CLOSE:
3517
0
            pthread_cond_signal(&fd->command_c);
3518
0
            pthread_mutex_unlock(&fd->command_m);
3519
0
            hts_tpool_process_shutdown(fd->q);
3520
0
            goto tidyup;
3521
3522
0
        default:
3523
0
            break;
3524
0
        }
3525
0
        pthread_mutex_unlock(&fd->command_m);
3526
3527
0
        pthread_mutex_lock(&fd->lines_m);
3528
0
        if (fd->lines) {
3529
            // reuse existing line buffer
3530
0
            l = fd->lines;
3531
0
            fd->lines = l->next;
3532
0
        }
3533
0
        pthread_mutex_unlock(&fd->lines_m);
3534
3535
0
        if (l == NULL) {
3536
            // none to reuse, to create a new one
3537
0
            l = calloc(1, sizeof(*l));
3538
0
            if (!l)
3539
0
                goto err;
3540
0
            l->alloc = SAM_NBYTES;
3541
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3542
0
            if (!l->data) {
3543
0
                free(l);
3544
0
                l = NULL;
3545
0
                goto err;
3546
0
            }
3547
0
            l->fd = fd;
3548
0
        }
3549
0
        l->next = NULL;
3550
3551
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3552
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3553
0
            if (!rp)
3554
0
                goto err;
3555
0
            l->alloc = line_frag+SAM_NBYTES/2;
3556
0
            l->data = rp;
3557
0
        }
3558
0
        memcpy(l->data, line.s, line_frag);
3559
3560
0
        l->data_size = line_frag;
3561
0
        ssize_t nbytes;
3562
0
    longer_line:
3563
0
        if (fp->is_bgzf)
3564
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3565
0
        else
3566
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3567
0
        if (nbytes < 0) {
3568
0
            sam_state_err(fd, errno ? errno : EIO);
3569
0
            goto err;
3570
0
        } else if (nbytes == 0)
3571
0
            break; // EOF
3572
0
        l->data_size += nbytes;
3573
3574
        // trim to last \n. Maybe \r\n, but that's still fine
3575
0
        if (nbytes == l->alloc - line_frag) {
3576
0
            char *cp_end = l->data + l->data_size;
3577
0
            char *cp = cp_end-1;
3578
3579
0
            while (cp > (char *)l->data && *cp != '\n')
3580
0
                cp--;
3581
3582
            // entire buffer is part of a single line
3583
0
            if (cp == l->data) {
3584
0
                line_frag = l->data_size;
3585
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3586
0
                if (!rp)
3587
0
                    goto err;
3588
0
                l->alloc *= 2;
3589
0
                l->data = rp;
3590
0
                assert(l->alloc >= l->data_size);
3591
0
                assert(l->alloc >= line_frag);
3592
0
                assert(l->alloc >= l->alloc - line_frag);
3593
0
                goto longer_line;
3594
0
            }
3595
0
            cp++;
3596
3597
            // line holds the remainder of our line.
3598
0
            if (ks_resize(&line, cp_end - cp) < 0)
3599
0
                goto err;
3600
0
            memcpy(line.s, cp, cp_end - cp);
3601
0
            line_frag = cp_end - cp;
3602
0
            l->data_size = l->alloc - line_frag;
3603
0
        } else {
3604
            // out of buffer
3605
0
            line_frag = 0;
3606
0
        }
3607
3608
0
        l->serial = fd->serial++;
3609
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3610
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3611
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3612
0
            goto err;
3613
0
        pthread_mutex_lock(&fd->command_m);
3614
0
        if (fd->command == SAM_CLOSE) {
3615
0
            pthread_mutex_unlock(&fd->command_m);
3616
0
            l = NULL;
3617
0
            goto tidyup;
3618
0
        }
3619
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3620
0
        pthread_mutex_unlock(&fd->command_m);
3621
0
    }
3622
3623
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3624
0
        goto err;
3625
3626
    // At EOF, wait for close request.
3627
    // (In future if we add support for seek, this is where we need to catch it.)
3628
0
    for (;;) {
3629
0
        pthread_mutex_lock(&fd->command_m);
3630
0
        if (fd->command == SAM_NONE)
3631
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3632
0
        switch (fd->command) {
3633
0
        case SAM_CLOSE:
3634
0
            pthread_cond_signal(&fd->command_c);
3635
0
            pthread_mutex_unlock(&fd->command_m);
3636
0
            hts_tpool_process_shutdown(fd->q);
3637
0
            goto tidyup;
3638
3639
0
        default:
3640
0
            pthread_mutex_unlock(&fd->command_m);
3641
0
            break;
3642
0
        }
3643
0
    }
3644
3645
0
 tidyup:
3646
0
    pthread_mutex_lock(&fd->command_m);
3647
0
    fd->command = SAM_CLOSE_DONE;
3648
0
    pthread_cond_signal(&fd->command_c);
3649
0
    pthread_mutex_unlock(&fd->command_m);
3650
3651
0
    if (l) {
3652
0
        pthread_mutex_lock(&fd->lines_m);
3653
0
        l->next = fd->lines;
3654
0
        fd->lines = l;
3655
0
        pthread_mutex_unlock(&fd->lines_m);
3656
0
    }
3657
0
    free(line.s);
3658
3659
0
    return NULL;
3660
3661
0
 err:
3662
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3663
0
    hts_tpool_process_shutdown(fd->q);
3664
0
    goto tidyup;
3665
0
}
3666
3667
// Runs in its own thread.
3668
// Takes encoded blocks of SAM off the thread results queue and writes them
3669
// to our output stream.
3670
0
static void *sam_dispatcher_write(void *vp) {
3671
0
    htsFile *fp = vp;
3672
0
    SAM_state *fd = fp->state;
3673
0
    hts_tpool_result *r;
3674
3675
    // Iterates until result queue is shutdown, where it returns NULL.
3676
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3677
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3678
0
        if (!gl) {
3679
0
            sam_state_err(fd, ENOMEM);
3680
0
            goto err;
3681
0
        }
3682
3683
0
        if (fp->idx) {
3684
0
            sp_bams *gb = gl->bams;
3685
0
            int i = 0, count = 0;
3686
0
            while (i < gl->data_size) {
3687
0
                int j = i;
3688
0
                while (i < gl->data_size && gl->data[i] != '\n')
3689
0
                    i++;
3690
0
                if (i < gl->data_size)
3691
0
                    i++;
3692
3693
0
                if (fp->is_bgzf) {
3694
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3695
0
                        goto err;
3696
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3697
0
                        goto err;
3698
0
                } else {
3699
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3700
0
                        goto err;
3701
0
                }
3702
3703
0
                bam1_t *b = &gb->bams[count++];
3704
0
                if (fp->format.compression == bgzf) {
3705
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3706
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3707
0
                                      bgzf_tell(fp->fp.bgzf),
3708
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3709
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3710
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3711
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3712
0
                        goto err;
3713
0
                    }
3714
0
                } else {
3715
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3716
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3717
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3718
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3719
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3720
0
                        goto err;
3721
0
                    }
3722
0
                }
3723
0
            }
3724
3725
0
            assert(count == gb->nbams);
3726
3727
            // Add bam array to free-list
3728
0
            pthread_mutex_lock(&fd->lines_m);
3729
0
            gb->next = fd->bams;
3730
0
            fd->bams = gl->bams;
3731
0
            gl->bams = NULL;
3732
0
            pthread_mutex_unlock(&fd->lines_m);
3733
0
        } else {
3734
0
            if (fp->is_bgzf) {
3735
                // We keep track of how much in the current block we have
3736
                // remaining => R.  We look for the last newline in input
3737
                // [i] to [i+R], backwards => position N.
3738
                //
3739
                // If we find a newline, we write out bytes i to N.
3740
                // We know we cannot fit the next record in this bgzf block,
3741
                // so we flush what we have and copy input N to i+R into
3742
                // the start of a new block, and recompute a new R for that.
3743
                //
3744
                // If we don't find a newline (i==N) then we cannot extend
3745
                // the current block at all, so flush whatever is in it now
3746
                // if it ends on a newline.
3747
                // We still copy i(==N) to i+R to the next block and
3748
                // continue as before with a new R.
3749
                //
3750
                // The only exception on the flush is when we run out of
3751
                // data in the input.  In that case we skip it as we don't
3752
                // yet know if the next record will fit.
3753
                //
3754
                // Both conditions share the same code here:
3755
                // - Look for newline (pos N)
3756
                // - Write i to N (which maybe 0)
3757
                // - Flush if block ends on newline and not end of input
3758
                // - write N to i+R
3759
3760
0
                int i = 0;
3761
0
                BGZF *fb = fp->fp.bgzf;
3762
0
                while (i < gl->data_size) {
3763
                    // remaining space in block
3764
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3765
0
                    int eod = 0;
3766
0
                    if (R > gl->data_size-i)
3767
0
                        R = gl->data_size-i, eod = 1;
3768
3769
                    // Find last newline in input data
3770
0
                    int N = i + R;
3771
0
                    while (--N > i) {
3772
0
                        if (gl->data[N] == '\n')
3773
0
                            break;
3774
0
                    }
3775
3776
0
                    if (N != i) {
3777
                        // Found a newline
3778
0
                        N++;
3779
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3780
0
                            goto err;
3781
0
                    }
3782
3783
                    // Flush bgzf block
3784
0
                    int b_off = fb->block_offset;
3785
0
                    if (!eod && b_off &&
3786
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3787
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3788
0
                            goto err;
3789
3790
                    // Copy from N onwards into next block
3791
0
                    if (i+R > N)
3792
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3793
0
                            != i+R - N)
3794
0
                            goto err;
3795
3796
0
                    i = i+R;
3797
0
                }
3798
0
            } else {
3799
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3800
0
                    goto err;
3801
0
            }
3802
0
        }
3803
3804
0
        hts_tpool_delete_result(r, 0);
3805
3806
        // Also updated by main thread
3807
0
        pthread_mutex_lock(&fd->lines_m);
3808
0
        gl->next = fd->lines;
3809
0
        fd->lines = gl;
3810
0
        pthread_mutex_unlock(&fd->lines_m);
3811
0
    }
3812
3813
0
    sam_state_err(fd, 0); // success
3814
0
    hts_tpool_process_shutdown(fd->q);
3815
0
    return NULL;
3816
3817
0
 err:
3818
0
    sam_state_err(fd, errno ? errno : EIO);
3819
0
    return (void *)-1;
3820
0
}
3821
3822
// Run from one of the worker threads.
3823
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3824
// of text SAM records (sp_lines).
3825
0
static void *sam_format_worker(void *arg) {
3826
0
    sp_bams *gb = (sp_bams *)arg;
3827
0
    sp_lines *gl = NULL;
3828
0
    int i;
3829
0
    SAM_state *fd = gb->fd;
3830
0
    htsFile *fp = fd->fp;
3831
3832
    // Use a block of SAM strings we had earlier if available.
3833
0
    pthread_mutex_lock(&fd->lines_m);
3834
0
    if (fd->lines) {
3835
0
        gl = fd->lines;
3836
0
        fd->lines = gl->next;
3837
0
    }
3838
0
    pthread_mutex_unlock(&fd->lines_m);
3839
3840
0
    if (gl == NULL) {
3841
0
        gl = calloc(1, sizeof(*gl));
3842
0
        if (!gl) {
3843
0
            sam_state_err(fd, ENOMEM);
3844
0
            return NULL;
3845
0
        }
3846
0
        gl->alloc = gl->data_size = 0;
3847
0
        gl->data = NULL;
3848
0
    }
3849
0
    gl->serial = gb->serial;
3850
0
    gl->next = NULL;
3851
3852
0
    kstring_t ks = {0, gl->alloc, gl->data};
3853
3854
0
    for (i = 0; i < gb->nbams; i++) {
3855
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3856
0
            sam_state_err(fd, errno ? errno : EIO);
3857
0
            goto err;
3858
0
        }
3859
0
        kputc('\n', &ks);
3860
0
    }
3861
3862
0
    pthread_mutex_lock(&fd->lines_m);
3863
0
    gl->data_size = ks.l;
3864
0
    gl->alloc = ks.m;
3865
0
    gl->data = ks.s;
3866
3867
0
    if (fp->idx) {
3868
        // Keep hold of the bam array a little longer as
3869
        // sam_dispatcher_write needs to use them for building the index.
3870
0
        gl->bams = gb;
3871
0
    } else {
3872
        // Add bam array to free-list
3873
0
        gb->next = fd->bams;
3874
0
        fd->bams = gb;
3875
0
    }
3876
0
    pthread_mutex_unlock(&fd->lines_m);
3877
3878
0
    return gl;
3879
3880
0
 err:
3881
    // Possible race between this and fd->curr_bam.
3882
    // Easier to not free and leave it on the input list so it
3883
    // gets freed there instead?
3884
    // sam_free_sp_bams(gb);
3885
0
    if (gl) {
3886
0
        free(gl->data);
3887
0
        free(gl);
3888
0
    }
3889
0
    return NULL;
3890
0
}
3891
3892
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3893
0
    if (fp->state)
3894
0
        return 0;
3895
3896
0
    if (!(fp->state = sam_state_create(fp)))
3897
0
        return -1;
3898
0
    SAM_state *fd = (SAM_state *)fp->state;
3899
3900
0
    pthread_mutex_init(&fd->lines_m, NULL);
3901
0
    pthread_mutex_init(&fd->command_m, NULL);
3902
0
    pthread_cond_init(&fd->command_c, NULL);
3903
0
    fd->p = p->pool;
3904
0
    int qsize = p->qsize;
3905
0
    if (!qsize)
3906
0
        qsize = 2*hts_tpool_size(fd->p);
3907
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3908
0
    if (!fd->q) {
3909
0
        sam_state_destroy(fp);
3910
0
        return -1;
3911
0
    }
3912
3913
0
    if (fp->format.compression == bgzf)
3914
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3915
3916
0
    return 0;
3917
0
}
3918
3919
0
int sam_set_threads(htsFile *fp, int nthreads) {
3920
0
    if (nthreads <= 0)
3921
0
        return 0;
3922
3923
0
    htsThreadPool p;
3924
0
    p.pool = hts_tpool_init(nthreads);
3925
0
    p.qsize = nthreads*2;
3926
3927
0
    int ret = sam_set_thread_pool(fp, &p);
3928
0
    if (ret < 0)
3929
0
        return ret;
3930
3931
0
    SAM_state *fd = (SAM_state *)fp->state;
3932
0
    fd->own_pool = 1;
3933
3934
0
    return 0;
3935
0
}
3936
3937
typedef struct {
3938
    kstring_t name;
3939
    kstring_t comment; // NB: pointer into name, do not free
3940
    kstring_t seq;
3941
    kstring_t qual;
3942
    int casava;
3943
    int aux;
3944
    int rnum;
3945
    char BC[3];         // aux tag ID for barcode
3946
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3947
    char nprefix;
3948
    int sra_names;
3949
} fastq_state;
3950
3951
// Initialise fastq state.
3952
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3953
1.53k
static fastq_state *fastq_state_init(int name_char) {
3954
1.53k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3955
1.53k
    if (!x)
3956
0
        return NULL;
3957
1.53k
    strcpy(x->BC, "BC");
3958
1.53k
    x->nprefix = name_char;
3959
3960
1.53k
    return x;
3961
1.53k
}
3962
3963
2.04k
void fastq_state_destroy(htsFile *fp) {
3964
2.04k
    if (fp->state) {
3965
1.53k
        fastq_state *x = (fastq_state *)fp->state;
3966
1.53k
        if (x->tags)
3967
1.53k
            kh_destroy(tag, x->tags);
3968
1.53k
        ks_free(&x->name);
3969
1.53k
        ks_free(&x->seq);
3970
1.53k
        ks_free(&x->qual);
3971
1.53k
        free(fp->state);
3972
1.53k
    }
3973
2.04k
}
3974
3975
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3976
0
    va_list args;
3977
3978
0
    if (!fp)
3979
0
        return -1;
3980
0
    if (!fp->state)
3981
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3982
0
                                           ? '@' : '>')))
3983
0
            return -1;
3984
3985
0
    fastq_state *x = (fastq_state *)fp->state;
3986
3987
0
    switch (opt) {
3988
0
    case FASTQ_OPT_CASAVA:
3989
0
        x->casava = 1;
3990
0
        break;
3991
3992
0
    case FASTQ_OPT_NAME2:
3993
0
        x->sra_names = 1;
3994
0
        break;
3995
3996
0
    case FASTQ_OPT_AUX: {
3997
0
        va_start(args, opt);
3998
0
        x->aux = 1;
3999
0
        char *tag = va_arg(args, char *);
4000
0
        va_end(args);
4001
0
        if (tag && strcmp(tag, "1") != 0) {
4002
0
            if (!x->tags)
4003
0
                if (!(x->tags = kh_init(tag)))
4004
0
                    return -1;
4005
4006
0
            size_t i, tlen = strlen(tag);
4007
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
4008
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
4009
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
4010
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
4011
0
                    break;
4012
0
                }
4013
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
4014
0
                kh_put(tag, x->tags, tcode, &ret);
4015
0
                if (ret < 0)
4016
0
                    return -1;
4017
0
            }
4018
0
        }
4019
0
        break;
4020
0
    }
4021
4022
0
    case FASTQ_OPT_BARCODE: {
4023
0
        va_start(args, opt);
4024
0
        char *bc = va_arg(args, char *);
4025
0
        va_end(args);
4026
0
        strncpy(x->BC, bc, 2);
4027
0
        x->BC[2] = 0;
4028
0
        break;
4029
0
    }
4030
4031
0
    case FASTQ_OPT_RNUM:
4032
0
        x->rnum = 1;
4033
0
        break;
4034
4035
0
    default:
4036
0
        break;
4037
0
    }
4038
0
    return 0;
4039
0
}
4040
4041
27.4M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
4042
27.4M
    fastq_state *x = (fastq_state *)fp->state;
4043
27.4M
    size_t i, l;
4044
27.4M
    int ret = 0;
4045
4046
27.4M
    if (fp->format.format == fasta_format && fp->line.s) {
4047
        // For FASTA we've already read the >name line; steal it
4048
        // Not the most efficient, but we don't optimise for fasta reading.
4049
27.4M
        if (fp->line.l == 0)
4050
471
            return -1; // EOF
4051
4052
27.4M
        free(x->name.s);
4053
27.4M
        x->name = fp->line;
4054
27.4M
        fp->line.l = fp->line.m = 0;
4055
27.4M
        fp->line.s = NULL;
4056
27.4M
    } else {
4057
        // Read a FASTQ format entry.
4058
2.74k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
4059
2.74k
        if (ret == -1)
4060
6
            return -1;  // EOF
4061
2.74k
        else if (ret < -1)
4062
9
            return ret; // ERR
4063
2.74k
    }
4064
4065
    // Name
4066
27.4M
    if (*x->name.s != x->nprefix)
4067
24
        return -2;
4068
4069
    // Reverse the SRA strangeness of putting the run_name.number before
4070
    // the read name.
4071
27.4M
    i = 0;
4072
27.4M
    char *name = x->name.s+1;
4073
27.4M
    if (x->sra_names) {
4074
0
        char *cp = strpbrk(x->name.s, " \t");
4075
0
        if (cp) {
4076
0
            while (*cp == ' ' || *cp == '\t')
4077
0
                cp++;
4078
0
            *--cp = '@';
4079
0
            i = cp - x->name.s;
4080
0
            name = cp+1;
4081
0
        }
4082
0
    }
4083
4084
27.4M
    l = x->name.l;
4085
27.4M
    char *s = x->name.s;
4086
100M
    while (i < l && !isspace_c(s[i]))
4087
73.0M
        i++;
4088
27.4M
    if (i < l) {
4089
181k
        s[i] = 0;
4090
181k
        x->name.l = i++;
4091
181k
    }
4092
4093
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
4094
28.1M
    while (i < l && isspace_c(s[i]))
4095
728k
        i++;
4096
27.4M
    x->comment.s = s+i;
4097
27.4M
    x->comment.l = l - i;
4098
4099
    // Seq
4100
27.4M
    x->seq.l = 0;
4101
177M
    for (;;) {
4102
177M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
4103
1.36k
            if (fp->format.format == fastq_format || ret < -1)
4104
891
                return -2;
4105
177M
        if (ret == -1 ||
4106
177M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
4107
27.4M
            break;
4108
149M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
4109
0
            return -2;
4110
149M
    }
4111
4112
    // Qual
4113
27.4M
    if (fp->format.format == fastq_format) {
4114
1.25k
        size_t remainder = x->seq.l;
4115
1.25k
        x->qual.l = 0;
4116
6.58k
        do {
4117
6.58k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
4118
18
                return -2;
4119
6.57k
            if (fp->line.l > remainder)
4120
21
                return -2;
4121
6.54k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
4122
0
                return -2;
4123
6.54k
            remainder -= fp->line.l;
4124
6.54k
        } while (remainder > 0);
4125
4126
        // Decr qual
4127
16.8k
        for (i = 0; i < x->qual.l; i++)
4128
15.6k
            x->qual.s[i] -= '!';
4129
1.21k
    }
4130
4131
27.4M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4132
27.4M
    if (x->name.l > 2 &&
4133
27.4M
        x->name.s[x->name.l-2] == '/' &&
4134
27.4M
        isdigit_c(x->name.s[x->name.l-1])) {
4135
10.0k
        switch(x->name.s[x->name.l-1]) {
4136
4.31k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4137
3.22k
        case '2': flag |= BAM_FREAD2 | pflag; break;
4138
2.53k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4139
10.0k
        }
4140
10.0k
        x->name.s[x->name.l-=2] = 0;
4141
10.0k
    }
4142
4143
    // Convert to BAM
4144
27.4M
    ret = bam_set1(b,
4145
27.4M
                   x->name.s + x->name.l - name, name,
4146
27.4M
                   flag,
4147
27.4M
                   -1, -1, 0, // ref '*', pos, mapq,
4148
27.4M
                   0, NULL,     // no cigar,
4149
27.4M
                   -1, -1, 0,    // mate
4150
27.4M
                   x->seq.l, x->seq.s, x->qual.s,
4151
27.4M
                   0);
4152
4153
    // Identify Illumina CASAVA strings.
4154
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4155
27.4M
    char *barcode = NULL;
4156
27.4M
    int barcode_len = 0;
4157
27.4M
    kstring_t *kc = &x->comment;
4158
27.4M
    char *endptr;
4159
27.4M
    if (x->casava &&
4160
        // \d:[YN]:\d+:[ACGTN]+
4161
27.4M
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4162
27.4M
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4163
27.4M
        && *endptr == ':') {
4164
4165
        // read num
4166
0
        switch(kc->s[0]) {
4167
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4168
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4169
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4170
0
        }
4171
4172
0
        if (kc->s[2] == 'Y')
4173
0
            b->core.flag |= BAM_FQCFAIL;
4174
4175
        // Barcode, maybe numeric in which case we skip it
4176
0
        if (!isdigit_c(endptr[1])) {
4177
0
            barcode = endptr+1;
4178
0
            for (i = barcode - kc->s; i < kc->l; i++)
4179
0
                if (isspace_c(kc->s[i]))
4180
0
                    break;
4181
4182
0
            kc->s[i] = 0;
4183
0
            barcode_len = i+1-(barcode - kc->s);
4184
0
        }
4185
0
    }
4186
4187
27.4M
    if (ret >= 0 && barcode_len)
4188
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4189
0
            ret = -2;
4190
4191
27.4M
    if (!x->aux)
4192
27.4M
        return ret;
4193
4194
    // Identify any SAM style aux tags in comments too.
4195
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4196
0
        ret = -2;
4197
4198
0
    return ret;
4199
27.4M
}
4200
4201
// Internal component of sam_read1 below
4202
3.67k
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4203
3.67k
    int ret = bam_read1(fp->fp.bgzf, b);
4204
3.67k
    if (h && ret >= 0) {
4205
3.30k
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4206
3.30k
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4207
58
            errno = ERANGE;
4208
58
            return -3;
4209
58
        }
4210
3.30k
    }
4211
3.61k
    return ret;
4212
3.67k
}
4213
4214
// Internal component of sam_read1 below
4215
4.24k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4216
4.24k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4217
4.24k
    if (ret < 0)
4218
4.24k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4219
4220
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4221
0
        return -2;
4222
4223
0
    return ret;
4224
0
}
4225
4226
// Internal component of sam_read1 below
4227
315k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4228
315k
    int ret;
4229
4230
    // Consume 1st line after header parsing as it wasn't using peek
4231
315k
    if (fp->line.l != 0) {
4232
3
        ret = sam_parse1(&fp->line, h, b);
4233
3
        fp->line.l = 0;
4234
3
        return ret;
4235
3
    }
4236
4237
315k
    if (fp->state) {
4238
0
        SAM_state *fd = (SAM_state *)fp->state;
4239
4240
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4241
            // We don't support multi-threaded SAM parsing with seeks yet.
4242
0
            int ret;
4243
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4244
0
                errno = -ret;
4245
0
                return -2;
4246
0
            }
4247
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4248
0
                return -1;
4249
0
            fp->fp.bgzf->seeked = 0;
4250
0
            goto err_recover;
4251
0
        }
4252
4253
0
        if (!fd->h) {
4254
0
            fd->h = h;
4255
0
            fd->h->ref_count++;
4256
            // Ensure hrecs is initialised now as we don't want multiple
4257
            // threads trying to do this simultaneously.
4258
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4259
0
                return -2;
4260
4261
            // We can only do this once we've got a header
4262
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4263
0
                               fp) != 0)
4264
0
                return -2;
4265
0
            fd->dispatcher_set = 1;
4266
0
        }
4267
4268
0
        if (fd->h != h) {
4269
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4270
0
            return -1;
4271
0
        }
4272
4273
0
        sp_bams *gb = fd->curr_bam;
4274
0
        if (!gb) {
4275
0
            if (fd->errcode) {
4276
                // In case reader failed
4277
0
                errno = fd->errcode;
4278
0
                return -2;
4279
0
            }
4280
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4281
0
            if (!r)
4282
0
                return -2;
4283
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4284
0
            hts_tpool_delete_result(r, 0);
4285
0
        }
4286
0
        if (!gb)
4287
0
            return fd->errcode ? -2 : -1;
4288
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4289
0
        if (fd->curr_idx < gb->nbams)
4290
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4291
0
                return -2;
4292
0
        if (fd->curr_idx == gb->nbams) {
4293
0
            pthread_mutex_lock(&fd->lines_m);
4294
0
            gb->next = fd->bams;
4295
0
            fd->bams = gb;
4296
0
            pthread_mutex_unlock(&fd->lines_m);
4297
4298
0
            fd->curr_bam = NULL;
4299
0
            fd->curr_idx = 0;
4300
0
        }
4301
4302
0
        ret = 0;
4303
4304
315k
    } else  {
4305
315k
    err_recover:
4306
315k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4307
315k
        if (ret < 0) return ret;
4308
4309
313k
        ret = sam_parse1(&fp->line, h, b);
4310
313k
        fp->line.l = 0;
4311
313k
        if (ret < 0) {
4312
3.52k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4313
3.52k
            if (h && h->ignore_sam_err) goto err_recover;
4314
3.52k
        }
4315
313k
    }
4316
4317
313k
    return ret;
4318
315k
}
4319
4320
// Returns 0 on success,
4321
//        -1 on EOF,
4322
//       <-1 on error
4323
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4324
27.7M
{
4325
27.7M
    int ret, pass_filter;
4326
4327
27.7M
    do {
4328
27.7M
        switch (fp->format.format) {
4329
3.67k
        case bam:
4330
3.67k
            ret = sam_read1_bam(fp, h, b);
4331
3.67k
            break;
4332
4333
4.24k
        case cram:
4334
4.24k
            ret = sam_read1_cram(fp, h, &b);
4335
4.24k
            break;
4336
4337
315k
        case sam:
4338
315k
            ret = sam_read1_sam(fp, h, b);
4339
315k
            break;
4340
4341
27.4M
        case fasta_format:
4342
27.4M
        case fastq_format: {
4343
27.4M
            fastq_state *x = (fastq_state *)fp->state;
4344
27.4M
            if (!x) {
4345
1.53k
                if (!(fp->state = fastq_state_init(fp->format.format
4346
1.53k
                                                   == fastq_format ? '@' : '>')))
4347
0
                    return -2;
4348
1.53k
            }
4349
4350
27.4M
            return fastq_parse1(fp, b);
4351
27.4M
        }
4352
4353
0
        case empty_format:
4354
0
            errno = EPIPE;
4355
0
            return -3;
4356
4357
0
        default:
4358
0
            errno = EFTYPE;
4359
0
            return -3;
4360
27.7M
        }
4361
4362
323k
        pass_filter = (ret >= 0 && fp->filter)
4363
323k
            ? sam_passes_filter(h, b, fp->filter)
4364
323k
            : 1;
4365
323k
    } while (pass_filter == 0);
4366
4367
323k
    return pass_filter < 0 ? -2 : ret;
4368
27.7M
}
4369
4370
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4371
// this code isn't vectorised and runs far slower than is necessary (even
4372
// with the restrict keyword being used).
4373
static inline void HTS_OPT3
4374
818
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4375
818
    uint32_t i;
4376
7.14k
    for (i = 0; i < len; i++)
4377
6.33k
        a[i] = b[i]+33;
4378
818
}
4379
4380
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4381
9.24M
{
4382
9.24M
    int i, r = 0;
4383
9.24M
    uint8_t *s, *end;
4384
9.24M
    const bam1_core_t *c = &b->core;
4385
4386
9.24M
    if (c->l_qname == 0)
4387
0
        return -1;
4388
9.24M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4389
9.24M
    r |= kputc_('\t', str); // query name
4390
9.24M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4391
9.24M
    if (c->tid >= 0) { // chr
4392
54.4k
        r |= kputs(h->target_name[c->tid] , str);
4393
54.4k
        r |= kputc_('\t', str);
4394
9.19M
    } else r |= kputsn_("*\t", 2, str);
4395
9.24M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4396
9.24M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4397
9.24M
    if (c->n_cigar) { // cigar
4398
97.7k
        uint32_t *cigar = bam_get_cigar(b);
4399
333k
        for (i = 0; i < c->n_cigar; ++i) {
4400
236k
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4401
236k
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4402
236k
        }
4403
9.14M
    } else r |= kputc_('*', str);
4404
9.24M
    r |= kputc_('\t', str);
4405
9.24M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4406
5.92k
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4407
4.20k
    else {
4408
4.20k
        r |= kputs(h->target_name[c->mtid], str);
4409
4.20k
        r |= kputc_('\t', str);
4410
4.20k
    }
4411
9.24M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4412
9.24M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4413
9.24M
    if (c->l_qseq) { // seq and qual
4414
689k
        uint8_t *s = bam_get_seq(b);
4415
689k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4416
689k
        char *cp = str->s + str->l;
4417
4418
        // Sequence, 2 bases at a time
4419
689k
        nibble2base(s, cp, c->l_qseq);
4420
689k
        cp[c->l_qseq] = '\t';
4421
689k
        cp += c->l_qseq+1;
4422
4423
        // Quality
4424
689k
        s = bam_get_qual(b);
4425
689k
        i = 0;
4426
689k
        if (s[0] == 0xff) {
4427
688k
            cp[i++] = '*';
4428
688k
        } else {
4429
818
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4430
818
            i = c->l_qseq;
4431
818
        }
4432
689k
        cp[i] = 0;
4433
689k
        cp += i;
4434
689k
        str->l = cp - str->s;
4435
8.55M
    } else r |= kputsn_("*\t*", 3, str);
4436
4437
9.24M
    s = bam_get_aux(b); // aux
4438
9.24M
    end = b->data + b->l_data;
4439
4440
10.6M
    while (end - s >= 4) {
4441
1.36M
        r |= kputc_('\t', str);
4442
1.36M
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4443
169
            goto bad_aux;
4444
1.36M
    }
4445
9.24M
    r |= kputsn("", 0, str); // nul terminate
4446
9.24M
    if (r < 0) goto mem_err;
4447
4448
9.24M
    return str->l;
4449
4450
169
 bad_aux:
4451
169
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4452
169
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4453
169
    errno = EINVAL;
4454
169
    return -1;
4455
4456
0
 mem_err:
4457
0
    hts_log_error("Out of memory");
4458
0
    errno = ENOMEM;
4459
0
    return -1;
4460
9.24M
}
4461
4462
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4463
9.24M
{
4464
9.24M
    str->l = 0;
4465
9.24M
    return sam_format1_append(h, b, str);
4466
9.24M
}
4467
4468
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4469
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4470
0
{
4471
0
    unsigned flag = b->core.flag;
4472
0
    int i, e = 0, len = b->core.l_qseq;
4473
0
    uint8_t *seq, *qual;
4474
4475
0
    str->l = 0;
4476
4477
    // Name
4478
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4479
0
        return -1;
4480
4481
    // /1 or /2 suffix
4482
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4483
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4484
0
        if (r12 == BAM_FREAD1) {
4485
0
            if (kputs("/1", str) == EOF)
4486
0
                return -1;
4487
0
        } else if (r12 == BAM_FREAD2) {
4488
0
            if (kputs("/2", str) == EOF)
4489
0
                return -1;
4490
0
        }
4491
0
    }
4492
4493
    // Illumina CASAVA tag.
4494
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4495
0
    if (x && x->casava) {
4496
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4497
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4498
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4499
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4500
0
                     bc ? (char *)bc+1 : "0") < 0)
4501
0
            return -1;
4502
4503
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4504
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4505
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4506
0
            str->s[str->l-1] = '0';
4507
0
            str->s[str->l] = 0;
4508
0
            bc = NULL;
4509
0
        }
4510
4511
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4512
0
        if (bc) {
4513
0
            int l = strlen((char *)bc+1);
4514
0
            char *c = (char *)str->s + str->l - l;
4515
0
            for (i = 0; i < l; i++) {
4516
0
                if (!isalpha_c(c[i]))
4517
0
                    c[i] = '+';
4518
0
                else if (islower_c(c[i]))
4519
0
                    c[i] = toupper_c(c[i]);
4520
0
            }
4521
0
        }
4522
0
    }
4523
4524
    // Aux tags
4525
0
    if (x && x->aux) {
4526
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4527
0
        while (s && end - s >= 4) {
4528
0
            int tt = s[0]*256 + s[1];
4529
0
            if (x->tags == NULL ||
4530
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4531
0
                e |= kputc_('\t', str) < 0;
4532
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4533
0
                    return -1;
4534
0
            } else {
4535
0
                s = skip_aux(s+2, end);
4536
0
            }
4537
0
        }
4538
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4539
0
    }
4540
4541
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4542
0
    e |= kputc_('\n', str) < 0;
4543
4544
    // Seq line
4545
0
    seq = bam_get_seq(b);
4546
0
    if (flag & BAM_FREVERSE)
4547
0
        for (i = len-1; i >= 0; i--)
4548
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4549
0
    else
4550
0
        for (i = 0; i < len; i++)
4551
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4552
4553
4554
    // Qual line
4555
0
    if (x->nprefix == '@') {
4556
0
        kputsn("\n+\n", 3, str);
4557
0
        qual = bam_get_qual(b);
4558
0
        if (qual[0] == 0xff)
4559
0
            for (i = 0; i < len; i++)
4560
0
                e |= kputc_('B', str) < 0;
4561
0
        else if (flag & BAM_FREVERSE)
4562
0
            for (i = len-1; i >= 0; i--)
4563
0
                e |= kputc_(33 + qual[i], str) < 0;
4564
0
        else
4565
0
            for (i = 0; i < len; i++)
4566
0
                e |= kputc_(33 + qual[i], str) < 0;
4567
4568
0
    }
4569
0
    e |= kputc('\n', str) < 0;
4570
4571
0
    return e ? -1 : str->l;
4572
0
}
4573
4574
// Sadly we need to be able to modify the bam_hdr here so we can
4575
// reference count the structure.
4576
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4577
27.7M
{
4578
27.7M
    switch (fp->format.format) {
4579
0
    case binary_format:
4580
0
        fp->format.category = sequence_data;
4581
0
        fp->format.format = bam;
4582
        /* fall-through */
4583
9.24M
    case bam:
4584
9.24M
        return bam_write_idx1(fp, h, b);
4585
4586
9.24M
    case cram:
4587
9.24M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4588
4589
0
    case text_format:
4590
0
        fp->format.category = sequence_data;
4591
0
        fp->format.format = sam;
4592
        /* fall-through */
4593
9.24M
    case sam:
4594
9.24M
        if (fp->state) {
4595
0
            SAM_state *fd = (SAM_state *)fp->state;
4596
4597
            // Threaded output
4598
0
            if (!fd->h) {
4599
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4600
                // just data pointed to by it (which is a bit weasely still),
4601
                // but out cached pointer must be non-const as we want to
4602
                // destroy it later on and sam_hdr_destroy takes non-const.
4603
                //
4604
                // We do this because some tools do sam_hdr_destroy; sam_close
4605
                // while others do sam_close; sam_hdr_destroy.  The former is
4606
                // an issue as we need the header still when flushing.
4607
0
                fd->h = (sam_hdr_t *)h;
4608
0
                fd->h->ref_count++;
4609
4610
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4611
0
                                   fp) != 0)
4612
0
                    return -2;
4613
0
                fd->dispatcher_set = 1;
4614
0
            }
4615
4616
0
            if (fd->h != h) {
4617
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4618
0
                return -2;
4619
0
            }
4620
4621
            // Find a suitable BAM array to copy to
4622
0
            sp_bams *gb = fd->curr_bam;
4623
0
            if (!gb) {
4624
0
                pthread_mutex_lock(&fd->lines_m);
4625
0
                if (fd->bams) {
4626
0
                    fd->curr_bam = gb = fd->bams;
4627
0
                    fd->bams = gb->next;
4628
0
                    gb->next = NULL;
4629
0
                    gb->nbams = 0;
4630
0
                    gb->bam_mem = 0;
4631
0
                    pthread_mutex_unlock(&fd->lines_m);
4632
0
                } else {
4633
0
                    pthread_mutex_unlock(&fd->lines_m);
4634
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4635
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4636
0
                        free(gb);
4637
0
                        return -1;
4638
0
                    }
4639
0
                    gb->nbams = 0;
4640
0
                    gb->abams = SAM_NBAM;
4641
0
                    gb->bam_mem = 0;
4642
0
                    gb->fd = fd;
4643
0
                    fd->curr_idx = 0;
4644
0
                    fd->curr_bam = gb;
4645
0
                }
4646
0
            }
4647
4648
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4649
0
                return -2;
4650
0
            gb->bam_mem += b->l_data + sizeof(*b);
4651
4652
            // Dispatch if full
4653
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4654
0
                gb->serial = fd->serial++;
4655
0
                pthread_mutex_lock(&fd->command_m);
4656
0
                if (fd->errcode != 0) {
4657
0
                    pthread_mutex_unlock(&fd->command_m);
4658
0
                    return -fd->errcode;
4659
0
                }
4660
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4661
0
                                        cleanup_sp_bams,
4662
0
                                        cleanup_sp_lines, 0) < 0) {
4663
0
                    pthread_mutex_unlock(&fd->command_m);
4664
0
                    return -1;
4665
0
                }
4666
0
                pthread_mutex_unlock(&fd->command_m);
4667
0
                fd->curr_bam = NULL;
4668
0
            }
4669
4670
            // Dummy value as we don't know how long it really is.
4671
            // We could track file sizes via a SAM_state field, but I don't think
4672
            // it is necessary.
4673
0
            return 1;
4674
9.24M
        } else {
4675
9.24M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4676
9.24M
            kputc('\n', &fp->line);
4677
9.24M
            if (fp->is_bgzf) {
4678
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4679
0
                    return -1;
4680
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4681
9.24M
            } else {
4682
9.24M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4683
9.24M
            }
4684
4685
9.24M
            if (fp->idx) {
4686
0
                if (fp->format.compression == bgzf) {
4687
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4688
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4689
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4690
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4691
0
                        return -1;
4692
0
                    }
4693
0
                } else {
4694
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4695
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4696
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4697
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4698
0
                        return -1;
4699
0
                    }
4700
0
                }
4701
0
            }
4702
4703
9.24M
            return fp->line.l;
4704
9.24M
        }
4705
4706
4707
0
    case fasta_format:
4708
0
    case fastq_format: {
4709
0
        fastq_state *x = (fastq_state *)fp->state;
4710
0
        if (!x) {
4711
0
            if (!(fp->state = fastq_state_init(fp->format.format
4712
0
                                               == fastq_format ? '@' : '>')))
4713
0
                return -2;
4714
0
        }
4715
4716
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4717
0
            return -1;
4718
0
        if (fp->is_bgzf) {
4719
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4720
0
                return -1;
4721
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4722
0
                return -1;
4723
0
        } else {
4724
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4725
0
                return -1;
4726
0
        }
4727
0
        return fp->line.l;
4728
0
    }
4729
4730
0
    default:
4731
0
        errno = EBADF;
4732
0
        return -1;
4733
27.7M
    }
4734
27.7M
}
4735
4736
/************************
4737
 *** Auxiliary fields ***
4738
 ************************/
4739
#ifndef HTS_LITTLE_ENDIAN
4740
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4741
    int tsz = aux_type2size(type);
4742
4743
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4744
4745
    switch (tsz) {
4746
        case 'H': case 'Z': case 1:  // Trivial
4747
            memcpy(out, in, len);
4748
            break;
4749
4750
#define aux_val_to_le(type_t, store_le) do {                            \
4751
        type_t v;                                                       \
4752
        size_t i;                                                       \
4753
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4754
            memcpy(&v, in + i, sizeof(type_t));                         \
4755
            store_le(v, out);                                           \
4756
        }                                                               \
4757
    } while (0)
4758
4759
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4760
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4761
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4762
4763
#undef aux_val_to_le
4764
4765
        case 'B': { // Recurse!
4766
            uint32_t n;
4767
            if (len < 5) return -1;
4768
            memcpy(&n, in + 1, 4);
4769
            out[0] = in[0];
4770
            u32_to_le(n, out + 1);
4771
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4772
        }
4773
4774
        default: // Unknown type code
4775
            return -1;
4776
    }
4777
4778
4779
4780
    return 0;
4781
}
4782
#endif
4783
4784
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4785
0
{
4786
0
    uint32_t new_len;
4787
4788
0
    assert(b->l_data >= 0);
4789
0
    new_len = b->l_data + 3 + len;
4790
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4791
4792
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4793
4794
0
    b->data[b->l_data] = tag[0];
4795
0
    b->data[b->l_data + 1] = tag[1];
4796
0
    b->data[b->l_data + 2] = type;
4797
4798
0
#ifdef HTS_LITTLE_ENDIAN
4799
0
    memcpy(b->data + b->l_data + 3, data, len);
4800
#else
4801
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4802
        errno = EINVAL;
4803
        return -1;
4804
    }
4805
#endif
4806
4807
0
    b->l_data = new_len;
4808
4809
0
    return 0;
4810
4811
0
 nomem:
4812
0
    errno = ENOMEM;
4813
0
    return -1;
4814
0
}
4815
4816
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4817
1.70M
{
4818
1.70M
    int size;
4819
1.70M
    uint32_t n;
4820
1.70M
    if (s >= end) return end;
4821
1.70M
    size = aux_type2size(*s); ++s; // skip type
4822
1.70M
    switch (size) {
4823
34.5k
    case 'Z':
4824
80.8k
    case 'H':
4825
2.65M
        while (s < end && *s) ++s;
4826
80.8k
        return s < end ? s + 1 : end;
4827
12.5k
    case 'B':
4828
12.5k
        if (end - s < 5) return NULL;
4829
12.5k
        size = aux_type2size(*s); ++s;
4830
12.5k
        n = le_to_u32(s);
4831
12.5k
        s += 4;