Coverage Report

Created: 2024-02-11 06:32

/src/htslib/sam.c
Line
Count
Source (jump to first uncovered line)
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2023 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
// Suppress deprecation message for cigar_tab, which we initialise
45
#include "htslib/hts_defs.h"
46
#undef HTS_DEPRECATED
47
#define HTS_DEPRECATED(message)
48
49
#include "htslib/sam.h"
50
#include "htslib/bgzf.h"
51
#include "cram/cram.h"
52
#include "hts_internal.h"
53
#include "sam_internal.h"
54
#include "htslib/hfile.h"
55
#include "htslib/hts_endian.h"
56
#include "htslib/hts_expr.h"
57
#include "header.h"
58
59
#include "htslib/khash.h"
60
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
61
KHASH_SET_INIT_INT(tag)
62
63
#ifndef EFTYPE
64
0
#define EFTYPE ENOEXEC
65
#endif
66
#ifndef EOVERFLOW
67
#define EOVERFLOW ERANGE
68
#endif
69
70
/**********************
71
 *** BAM header I/O ***
72
 **********************/
73
74
HTSLIB_EXPORT
75
const int8_t bam_cigar_table[256] = {
76
    // 0 .. 47
77
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
81
    // 48 .. 63  (including =)
82
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
83
84
    // 64 .. 79  (including MIDNHB)
85
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
86
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
87
88
    // 80 .. 95  (including SPX)
89
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
90
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
91
92
    // 96 .. 127
93
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
96
    // 128 .. 255
97
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
105
};
106
107
sam_hdr_t *sam_hdr_init()
108
23.5k
{
109
23.5k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
110
23.5k
    if (bh == NULL) return NULL;
111
112
23.5k
    bh->cigar_tab = bam_cigar_table;
113
23.5k
    return bh;
114
23.5k
}
115
116
/* Release one reference to a header.
 *
 * NULL is accepted and ignored.  While ref_count is above zero the call
 * only decrements it; once it is already zero the target arrays, the
 * header text, the parsed records (hrecs) and the sdict hash are freed,
 * followed by the structure itself.
 */
void sam_hdr_destroy(sam_hdr_t *bh)
{
    int32_t i;

    if (bh == NULL) return;

    if (bh->ref_count > 0) {
        --bh->ref_count;
        return;
    }

    if (bh->target_name) {
        for (i = 0; i < bh->n_targets; ++i)
            free(bh->target_name[i]);
        free(bh->target_name);
    }
    // Freed unconditionally: target_len can be allocated while target_name
    // is still NULL (e.g. when a later allocation failed), and free(NULL)
    // is a no-op.
    free(bh->target_len);
    free(bh->text);
    if (bh->hrecs)
        sam_hrecs_free(bh->hrecs);
    if (bh->sdict)
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
    free(bh);
}
140
141
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
142
// references before sam_hdr_t::hrecs is populated
143
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
144
116
{
145
116
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
146
116
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
147
116
    int i;
148
116
    if (!dest_long_refs) return -1;
149
150
3.94k
    for (i = 0; i < h->n_targets; i++) {
151
3.83k
        int ret;
152
3.83k
        khiter_t ksrc, kdest;
153
3.83k
        if (h->target_len[i] < UINT32_MAX) continue;
154
2.04k
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
155
2.04k
        if (ksrc == kh_end(src_long_refs)) continue;
156
2.04k
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
157
2.04k
        if (ret < 0) {
158
0
            kh_destroy(s2i, dest_long_refs);
159
0
            return -1;
160
0
        }
161
2.04k
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
162
2.04k
    }
163
164
116
    h->sdict = dest_long_refs;
165
116
    return 0;
166
116
}
167
168
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
169
8.50k
{
170
8.50k
    if (h0 == NULL) return NULL;
171
8.50k
    sam_hdr_t *h;
172
8.50k
    if ((h = sam_hdr_init()) == NULL) return NULL;
173
    // copy the simple data
174
8.50k
    h->n_targets = 0;
175
8.50k
    h->ignore_sam_err = h0->ignore_sam_err;
176
8.50k
    h->l_text = 0;
177
178
    // Then the pointery stuff
179
180
8.50k
    if (!h0->hrecs) {
181
253
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
182
253
        if (!h->target_len) goto fail;
183
253
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
184
253
        if (!h->target_name) goto fail;
185
186
253
        int i;
187
4.38k
        for (i = 0; i < h0->n_targets; ++i) {
188
4.13k
            h->target_len[i] = h0->target_len[i];
189
4.13k
            h->target_name[i] = strdup(h0->target_name[i]);
190
4.13k
            if (!h->target_name[i]) break;
191
4.13k
        }
192
253
        h->n_targets = i;
193
253
        if (i < h0->n_targets) goto fail;
194
195
253
        if (h0->sdict) {
196
116
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
197
116
        }
198
253
    }
199
200
8.50k
    if (h0->hrecs) {
201
8.25k
        kstring_t tmp = { 0, 0, NULL };
202
8.25k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
203
0
            free(ks_release(&tmp));
204
0
            goto fail;
205
0
        }
206
207
8.25k
        h->l_text = tmp.l;
208
8.25k
        h->text   = ks_release(&tmp);
209
210
8.25k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
211
0
            goto fail;
212
8.25k
    } else {
213
253
        h->l_text = h0->l_text;
214
253
        h->text = malloc(h->l_text + 1);
215
253
        if (!h->text) goto fail;
216
253
        memcpy(h->text, h0->text, h->l_text);
217
253
        h->text[h->l_text] = '\0';
218
253
    }
219
220
8.50k
    return h;
221
222
0
 fail:
223
0
    sam_hdr_destroy(h);
224
0
    return NULL;
225
8.50k
}
226
227
/* Read a binary BAM header from fp: the "BAM\1" magic, the length-prefixed
 * header text, the number of references, then one name/length pair per
 * reference.
 *
 * Returns a newly allocated header on success.  On failure an error is
 * logged, anything allocated so far is released, and NULL is returned.
 */
sam_hdr_t *bam_hdr_read(BGZF *fp)
{
    sam_hdr_t *hdr;
    uint8_t word[4];
    int got_magic, eof_state;
    int32_t ref, name_len, names_allocated = 0;
    size_t text_alloc;
    ssize_t bytes;

    // A failed or negative EOF check is reported but is not fatal
    eof_state = bgzf_check_EOF(fp);
    if (eof_state < 0) {
        perror("[W::bam_hdr_read] bgzf_check_EOF");
    } else if (eof_state == 0) {
        hts_log_warning("EOF marker is absent. The input is probably truncated");
    }

    // Magic number "BAM\1"
    got_magic = bgzf_read(fp, word, 4);
    if (got_magic != 4 || memcmp(word, "BAM\1", 4) != 0) {
        hts_log_error("Invalid BAM binary header");
        return NULL;
    }

    hdr = sam_hdr_init();
    if (hdr == NULL) goto nomem;

    // Length of the plain text, followed by the text itself
    bytes = bgzf_read(fp, word, 4);
    if (bytes != 4) goto read_err;
    hdr->l_text = le_to_u32(word);

    text_alloc = hdr->l_text + 1;
    if (text_alloc < hdr->l_text) goto nomem; // the +1 wrapped around
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (text_alloc > FUZZ_ALLOC_LIMIT) goto nomem;
#endif
    hdr->text = (char*)malloc(text_alloc);
    if (hdr->text == NULL) goto nomem;
    hdr->text[hdr->l_text] = 0; // always NUL terminated
    bytes = bgzf_read(fp, hdr->text, hdr->l_text);
    if (bytes != hdr->l_text) goto read_err;

    // Number of reference sequences
    bytes = bgzf_read(fp, &hdr->n_targets, 4);
    if (bytes != 4) goto read_err;
    if (fp->is_be) ed_swap_4p(&hdr->n_targets);

    if (hdr->n_targets < 0) goto invalid;

    // Reference names and lengths
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (hdr->n_targets > (FUZZ_ALLOC_LIMIT - text_alloc)/(sizeof(char*)+sizeof(uint32_t)))
        goto nomem;
#endif
    if (hdr->n_targets > 0) {
        hdr->target_name = (char**)calloc(hdr->n_targets, sizeof(char*));
        if (hdr->target_name == NULL) goto nomem;
        hdr->target_len = (uint32_t*)calloc(hdr->n_targets, sizeof(uint32_t));
        if (hdr->target_len == NULL) goto nomem;
    } else {
        hdr->target_name = NULL;
        hdr->target_len = NULL;
    }

    for (ref = 0; ref != hdr->n_targets; ++ref) {
        bytes = bgzf_read(fp, &name_len, 4);
        if (bytes != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&name_len);
        if (name_len <= 0) goto invalid;

        hdr->target_name[ref] = (char*)malloc(name_len);
        if (hdr->target_name[ref] == NULL) goto nomem;
        names_allocated++;

        bytes = bgzf_read(fp, hdr->target_name[ref], name_len);
        if (bytes != name_len) goto read_err;

        if (hdr->target_name[ref][name_len - 1] != '\0') {
            /* The stored name lacks its terminating NUL; grow the buffer
               by one byte and add it rather than rejecting the file. */
            char *longer;
            if (name_len == INT32_MAX) goto invalid;
            longer = realloc(hdr->target_name[ref], name_len + 1);
            if (longer == NULL) goto nomem;
            hdr->target_name[ref] = longer;
            hdr->target_name[ref][name_len] = '\0';
        }

        bytes = bgzf_read(fp, &hdr->target_len[ref], 4);
        if (bytes != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&hdr->target_len[ref]);
    }
    return hdr;

 nomem:
    hts_log_error("Out of memory");
    goto clean;

 read_err:
    if (bytes < 0) {
        hts_log_error("Error reading BGZF stream");
    } else {
        hts_log_error("Truncated BAM header");
    }
    goto clean;

 invalid:
    hts_log_error("Invalid BAM binary header");

 clean:
    if (hdr != NULL) {
        // Only the names allocated so far may be freed by the destructor
        hdr->n_targets = names_allocated;
        sam_hdr_destroy(hdr);
    }
    return NULL;
}
341
342
/* Write a header to fp in binary BAM form: the "BAM\1" magic, the
 * length-prefixed header text, the number of references, then each
 * reference's name length, NUL-terminated name and length, finishing with
 * a bgzf_flush().
 *
 * When the header has parsed records (hrecs) the text is regenerated from
 * them; otherwise h->text / h->l_text are written as they are.  32-bit
 * values are byte-swapped first when fp->is_be is set.
 *
 * Returns 0 on success, -1 on failure (including h == NULL).
 */
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
{
    kstring_t hdr_ks = { 0, 0, NULL };
    const char *text;
    uint32_t l_text, word;
    int32_t i, name_len;

    if (!h) return -1;

    if (h->hrecs) {
        // On failure the kstring may already own a partial buffer, so it
        // has to go through the common cleanup rather than a bare return.
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) goto fail;
        if (hdr_ks.l > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            goto fail;
        } else if (hdr_ks.l > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = hdr_ks.s;
        l_text = hdr_ks.l;
    } else {
        if (h->l_text > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            return -1;
        } else if (h->l_text > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = h->text;
        l_text = h->l_text;
    }

    // write "BAM1"
    if (bgzf_write(fp, "BAM\1", 4) < 0) goto fail;

    // write plain text and the number of reference sequences
    word = fp->is_be ? ed_swap_4(l_text) : l_text;
    if (bgzf_write(fp, &word, 4) < 0) goto fail;
    if (l_text) {
        if (bgzf_write(fp, text, l_text) < 0) goto fail;
    }
    word = fp->is_be ? ed_swap_4(h->n_targets) : (uint32_t) h->n_targets;
    if (bgzf_write(fp, &word, 4) < 0) goto fail;

    // The regenerated text is no longer needed; clear the pointer so the
    // shared failure path below cannot free it a second time.
    free(hdr_ks.s);
    hdr_ks.s = NULL;

    // write sequence names and lengths
    for (i = 0; i != h->n_targets; ++i) {
        const char *name = h->target_name[i];
        name_len = strlen(name) + 1;   // stored length includes the NUL

        word = fp->is_be ? ed_swap_4(name_len) : (uint32_t) name_len;
        if (bgzf_write(fp, &word, 4) < 0) goto fail;
        if (bgzf_write(fp, name, name_len) < 0) goto fail;

        word = fp->is_be ? ed_swap_4(h->target_len[i]) : h->target_len[i];
        if (bgzf_write(fp, &word, 4) < 0) goto fail;
    }
    if (bgzf_flush(fp) < 0) return -1;
    return 0;

 fail:
    free(hdr_ks.s);
    return -1;
}
414
415
/* Parse the region string s by handing it to hts_parse_region(), with
   bam_name2id as the name-to-id callback and h as its header argument.
   The result of hts_parse_region() is returned unchanged. */
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
    hts_name2id_f name_lookup = (hts_name2id_f)bam_name2id;

    return hts_parse_region(s, tid, beg, end, name_lookup, h, flags);
}
419
420
/*************************
421
 *** BAM alignment I/O ***
422
 *************************/
423
424
bam1_t *bam_init1()
425
1.06M
{
426
1.06M
    return (bam1_t*)calloc(1, sizeof(bam1_t));
427
1.06M
}
428
429
/* Resize the data buffer of b so that it can hold `desired` bytes.
 *
 * The request is rounded up with kroundup32(); if the rounded size no
 * longer covers `desired`, errno is set to ENOMEM and -1 is returned.
 * When the BAM_USER_OWNS_DATA policy bit is set, a fresh buffer is
 * malloc()ed, the existing contents copied into it and the bit cleared;
 * otherwise the current buffer is realloc()ed.
 *
 * Returns 0 on success (b->data and b->m_data updated), -1 on failure.
 */
int sam_realloc_bam_data(bam1_t *b, size_t desired)
{
    uint32_t capacity = desired;
    uint8_t *buffer;

    kroundup32(capacity);
    if (capacity < desired) {
        errno = ENOMEM; // Not strictly true but we can't store the size
        return -1;
    }
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (capacity > FUZZ_ALLOC_LIMIT) {
        errno = ENOMEM;
        return -1;
    }
#endif

    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) != 0) {
        // The current buffer is not ours to realloc(); move to a new one
        buffer = malloc(capacity);
        if (buffer != NULL) {
            if (b->l_data > 0)
                memcpy(buffer, b->data,
                       b->l_data < b->m_data ? b->l_data : b->m_data);
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
        }
    } else {
        buffer = realloc(b->data, capacity);
    }

    if (buffer == NULL)
        return -1;

    b->data = buffer;
    b->m_data = capacity;
    return 0;
}
460
461
/* Free an alignment record according to its memory policy.
 *
 * The data buffer is freed unless BAM_USER_OWNS_DATA is set; the structure
 * itself is freed unless BAM_USER_OWNS_STRUCT is set.  When the data is
 * freed but the structure is kept, its data fields are reset so that it
 * can be reused.  A NULL pointer is ignored.
 */
void bam_destroy1(bam1_t *b)
{
    int keep_struct;

    if (b == 0) return;

    keep_struct = (bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0;

    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
        free(b->data);
        if (keep_struct) {
            // In case of reuse
            b->data = NULL;
            b->m_data = 0;
            b->l_data = 0;
        }
    }

    if (!keep_struct)
        free(b);
}
477
478
/* Copy the alignment record bsrc into bdst, growing bdst's data buffer as
 * needed.  The variable-length data, the core fields, l_data and id are
 * copied.
 *
 * Returns bdst on success, NULL if the buffer could not be enlarged.
 */
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
{
    // Copying a record onto itself would hand memcpy() overlapping
    // regions, which is undefined; there is nothing to do in that case.
    if (bdst == bsrc) return bdst;

    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;

    // The data pointers may be NULL while a record is empty, and memcpy()
    // must not be given null pointers even when the length is zero.
    if (bsrc->l_data > 0)
        memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
    bdst->l_data = bsrc->l_data;
    bdst->id = bsrc->id;
    return bdst;
}
487
488
/* Create a newly allocated copy of bsrc using bam_init1() and bam_copy1().
   Returns NULL if bsrc is NULL or if either step fails. */
bam1_t *bam_dup1(const bam1_t *bsrc)
{
    bam1_t *copy;

    if (bsrc == NULL)
        return NULL;

    copy = bam_init1();
    if (copy != NULL && bam_copy1(copy, bsrc) == NULL) {
        bam_destroy1(copy);
        copy = NULL;
    }
    return copy;
}
499
500
/* Walk a CIGAR array once, summing operation lengths into *qlen for
   operations whose bam_cigar_type() has bit 1 set and into *rlen for those
   with bit 2 set.  Both outputs are reset to zero first. */
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
                             hts_pos_t *rlen, hts_pos_t *qlen)
{
    int idx = 0;

    *rlen = 0;
    *qlen = 0;
    while (idx < n_cigar) {
        const uint32_t elem = cigar[idx++];
        const int kind = bam_cigar_type(bam_cigar_op(elem));
        const int span = bam_cigar_oplen(elem);

        if (kind & 2) *rlen += span;
        if (kind & 1) *qlen += span;
    }
}
512
513
/* Take `length` away from the remaining budget in *limit.
   Returns 0 after subtracting, or -1 (leaving *limit untouched) when
   `length` is larger than what remains. */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
522
523
/* Fill in an alignment record from its individual fields.
 *
 * The query name is stored NUL-terminated and padded with further NULs to
 * a multiple of four bytes, followed by the CIGAR array, the sequence
 * packed two bases per byte via seq_nt16_table, and the qualities (0xff
 * bytes when qual is NULL).  Room for l_aux further bytes is reserved in
 * the buffer but is not counted in l_data.
 *
 * Returns the number of bytes stored (the new l_data) on success.  On a
 * validation failure an error is logged, errno is set to EINVAL and -1 is
 * returned; -1 is also returned if the buffer cannot be resized.
 */
int bam_set1(bam1_t *bam,
             size_t l_qname, const char *qname,
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
             size_t n_cigar, const uint32_t *cigar,
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
             size_t l_seq, const char *seq, const char *qual,
             size_t l_aux)
{
    const int is_mapped = !(flag & BAM_FUNMAP);
    hts_pos_t ref_span = 0, query_span = 0;
    size_t nul_count, budget, stored_len, i;
    const uint8_t *bases;
    uint8_t *out;

    // use a default qname "*" if none is provided
    if (l_qname == 0) {
        l_qname = 1;
        qname = "*";
    }

    // note: the qname is stored nul terminated and padded as described in the
    // documentation for the bam1_t struct.
    nul_count = 4 - l_qname % 4;

    // the alignment length, needed for bam_reg2bin(), is calculated as in
    // bam_endpos(); that function can't be called directly because some
    // fields are not yet set up.
    if (is_mapped)
        bam_cigar2rqlens((int)n_cigar, cigar, &ref_span, &query_span);
    if (ref_span == 0)
        ref_span = 1;

    // validate parameters
    if (l_qname > 254) {
        hts_log_error("Query name too long");
        errno = EINVAL;
        return -1;
    }
    if (HTS_POS_MAX - ref_span <= pos) {
        hts_log_error("Read ends beyond highest supported position");
        errno = EINVAL;
        return -1;
    }
    if (is_mapped && l_seq > 0 && n_cigar == 0) {
        hts_log_error("Mapped query must have a CIGAR");
        errno = EINVAL;
        return -1;
    }
    if (is_mapped && l_seq > 0 && l_seq != query_span) {
        hts_log_error("CIGAR and query sequence are of different length");
        errno = EINVAL;
        return -1;
    }

    // every component has to fit, in turn, into what is left of INT32_MAX
    budget = INT32_MAX;
    if (subtract_check_underflow(l_qname + nul_count, &budget) != 0
        || subtract_check_underflow(n_cigar * 4, &budget) != 0
        || subtract_check_underflow((l_seq + 1) / 2, &budget) != 0
        || subtract_check_underflow(l_seq, &budget) != 0
        || subtract_check_underflow(l_aux, &budget) != 0) {
        hts_log_error("Size overflow");
        errno = EINVAL;
        return -1;
    }

    // re-allocate the data buffer as needed.
    stored_len = l_qname + nul_count + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
    if (realloc_bam_data(bam, stored_len + l_aux) < 0)
        return -1;

    bam->l_data = (int)stored_len;
    bam->core.pos = pos;
    bam->core.tid = tid;
    bam->core.bin = bam_reg2bin(pos, pos + ref_span);
    bam->core.qual = mapq;
    bam->core.l_extranul = (uint8_t)(nul_count - 1);
    bam->core.flag = flag;
    bam->core.l_qname = (uint16_t)(l_qname + nul_count);
    bam->core.n_cigar = (uint32_t)n_cigar;
    bam->core.l_qseq = (int32_t)l_seq;
    bam->core.mtid = mtid;
    bam->core.mpos = mpos;
    bam->core.isize = isize;

    // query name followed by its NUL padding
    out = bam->data;
    strncpy((char *)out, qname, l_qname);
    memset(out + l_qname, '\0', nul_count);
    out += l_qname + nul_count;

    // CIGAR
    if (n_cigar > 0)
        memcpy(out, cigar, n_cigar * 4);
    out += n_cigar * 4;

    // sequence: NN bases per pass first, then pairs, then a last odd base
#define NN 16
    bases = (const uint8_t *)seq;
    for (i = 0; i + NN < l_seq; i += NN) {
        const uint8_t *chunk = bases + i;
        int j;
        for (j = 0; j < NN/2; j++)
            out[j] = (seq_nt16_table[chunk[j*2]]<<4) | seq_nt16_table[chunk[j*2+1]];
        out += NN/2;
    }
    for (; i + 1 < l_seq; i += 2)
        *out++ = (seq_nt16_table[bases[i]] << 4) | seq_nt16_table[bases[i + 1]];
    for (; i < l_seq; i++)
        *out++ = seq_nt16_table[bases[i]] << 4;

    // qualities
    if (qual)
        memcpy(out, qual, l_seq);
    else
        memset(out, '\xff', l_seq);

    return (int)stored_len;
}
644
645
/* Sum the lengths of the CIGAR operations whose bam_cigar_type() has
   bit 1 set, i.e. the query length implied by the CIGAR. */
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int idx;

    for (idx = 0; idx < n_cigar; ++idx) {
        if (bam_cigar_type(bam_cigar_op(cigar[idx])) & 1)
            total += bam_cigar_oplen(cigar[idx]);
    }
    return total;
}
654
655
/* Sum the lengths of the CIGAR operations whose bam_cigar_type() has
   bit 2 set, i.e. the reference length implied by the CIGAR. */
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int idx;

    for (idx = 0; idx < n_cigar; ++idx) {
        if (bam_cigar_type(bam_cigar_op(cigar[idx])) & 2)
            total += bam_cigar_oplen(cigar[idx]);
    }
    return total;
}
664
665
hts_pos_t bam_endpos(const bam1_t *b)
666
68.3k
{
667
68.3k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
668
68.3k
    if (rlen == 0) rlen = 1;
669
68.3k
    return b->core.pos + rlen;
670
68.3k
}
671
672
/* Replace a placeholder CIGAR with the real one held in a CG:B,I aux tag.
 *
 * The move is attempted only when the record has a CIGAR, non-negative
 * tid and pos, a first CIGAR element that is a soft clip of exactly
 * l_qseq bases, and a "CG" tag of type B with sub-type 'I' or 'i' whose
 * element count is at least n_cigar and below 1<<29.  The tag's values
 * then replace the CIGAR in place and the tag is removed from the aux
 * data.
 *
 * recal_bin:    when non-zero, core.bin is recomputed afterwards.
 * give_warning: when non-zero, a message naming the read is logged.
 *
 * Returns 0 if the CIGAR is untouched, 1 if it was updated from CG,
 * -1 on bad aux data or if the buffer could not be enlarged.
 */
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning)
{
    bam1_core_t *core = &b->core;
    uint32_t *old_cigar;
    uint32_t old_len = b->l_data;
    uint32_t cigar_off, old_bytes, new_bytes, new_count, tag_off, tag_end;
    uint8_t *cg;
    int errno_before;

    // test where there is a real CIGAR in the CG tag to move
    if (core->n_cigar == 0 || core->tid < 0 || core->pos < 0) return 0;
    old_cigar = bam_get_cigar(b);
    if (bam_cigar_op(old_cigar[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(old_cigar[0]) != core->l_qseq) return 0;
    old_bytes = core->n_cigar * 4;

    errno_before = errno;
    cg = bam_aux_get(b, "CG");
    if (!cg) {
        if (errno != ENOENT) return -1;  // Bad aux data
        errno = errno_before; // restore errno on expected no-CG-tag case
        return 0;
    }
    if (cg[0] != 'B' || !(cg[1] == 'I' || cg[1] == 'i'))
        return 0; // not of type B,I
    new_count = le_to_u32(cg + 2);
    // don't move if the real CIGAR is shorter than the placeholder one
    if (new_count < core->n_cigar || new_count >= 1U<<29) return 0;

    // move from the CG tag to the right position
    cigar_off = (uint8_t*)old_cigar - b->data;
    core->n_cigar = new_count;
    new_bytes = core->n_cigar * 4;
    tag_off = cg - b->data - 2;           // offset of the two-letter tag name
    tag_end = tag_off + 8 + new_bytes;    // 8: tag, type, sub-type and count
    if (possibly_expand_bam_data(b, new_bytes - old_bytes) < 0) return -1;

    // grow the record by the difference between the two CIGAR sizes
    b->l_data = b->l_data - old_bytes + new_bytes;
    // shift everything after the old CIGAR up to open the gap
    memmove(b->data + cigar_off + new_bytes, b->data + cigar_off + old_bytes, old_len - (cigar_off + old_bytes));
    // copy the real CIGAR out of the (now shifted) tag into the gap
    memcpy(b->data + cigar_off, b->data + (new_bytes - old_bytes) + tag_off + 8, new_bytes);
    if (old_len > tag_end) // close up over the tag by moving what follows it
        memmove(b->data + tag_off + new_bytes - old_bytes, b->data + tag_end + new_bytes - old_bytes, old_len - tag_end);
    b->l_data -= new_bytes + 8; // 8: CGBI (4 bytes) and CGBI length (4)

    if (recal_bin)
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
    if (give_warning)
        hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), core->n_cigar);
    return 1;
}
714
715
/* Map an aux type code to a size: 1 for A/c/C, 2 for s/S, 4 for i/I/f and
   8 for d.  The codes Z, H and B are returned as themselves, and any
   other code gives 0. */
static inline int aux_type2size(uint8_t type)
{
    if (type == 'A' || type == 'c' || type == 'C')
        return 1;
    if (type == 's' || type == 'S')
        return 2;
    if (type == 'i' || type == 'I' || type == 'f')
        return 4;
    if (type == 'd')
        return 8;
    if (type == 'Z' || type == 'H' || type == 'B')
        return type;
    return 0;
}
732
733
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
734
0
{
735
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
736
0
    uint32_t i;
737
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
738
0
}
739
740
// Fix bad records where qname is not terminated correctly.
741
2.09k
static int fixup_missing_qname_nul(bam1_t *b) {
742
2.09k
    bam1_core_t *c = &b->core;
743
744
    // Note this is called before c->l_extranul is added to c->l_qname
745
2.09k
    if (c->l_extranul > 0) {
746
1.96k
        b->data[c->l_qname++] = '\0';
747
1.96k
        c->l_extranul--;
748
1.96k
    } else {
749
129
        if (b->l_data > INT_MAX - 4) return -1;
750
129
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
751
129
        b->l_data += 4;
752
129
        b->data[c->l_qname++] = '\0';
753
129
        c->l_extranul = 3;
754
129
    }
755
2.09k
    return 0;
756
2.09k
}
757
758
/*
759
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
760
 * in multi-threaded handling.  This may be worth considering for htslib2.
761
 */
762
/* Read one alignment record from a BAM stream into b.
 *
 * Returns the number of bytes the record occupied in the stream
 * (4 + block length) on success, and otherwise:
 *   -1  nothing could be read for the block length (normal end of file)
 *   -2  the block length itself was truncated
 *   -3  the 32 bytes of fixed-size fields could not be read
 *   -4  the record is invalid, truncated, or could not be stored
 */
int bam_read1(BGZF *fp, bam1_t *b)
{
    bam1_core_t *core = &b->core;
    int32_t block_len, got, k;
    uint32_t fields[8], data_len;
    int rest;

    b->l_data = 0;

    got = bgzf_read(fp, &block_len, 4);
    if (got != 4)
        return got == 0 ? -1 : -2;  // clean EOF versus truncation
    if (fp->is_be)
        ed_swap_4p(&block_len);
    if (block_len < 32) return -4;  // block_len includes core data

    if (bgzf_read(fp, fields, 32) != 32) return -3;
    if (fp->is_be) {
        for (k = 0; k < 8; ++k) ed_swap_4p(fields + k);
    }

    // Unpack the eight 32-bit words of fixed-size fields
    core->tid = fields[0];
    core->pos = (int32_t)fields[1];
    core->bin = fields[2]>>16;
    core->qual = fields[2]>>8&0xff;
    core->l_qname = fields[2]&0xff;
    core->l_extranul = (core->l_qname%4 != 0)? (4 - core->l_qname%4) : 0;
    core->flag = fields[3]>>16;
    core->n_cigar = fields[3]&0xffff;
    core->l_qseq = fields[4];
    core->mtid = fields[5];
    core->mpos = (int32_t)fields[6];
    core->isize = (int32_t)fields[7];

    // In-memory size: the stored payload plus the name padding added here
    data_len = block_len - 32 + core->l_extranul;
    if (data_len > INT_MAX || core->l_qseq < 0 || core->l_qname < 1) return -4;
    // The name, CIGAR, packed sequence and qualities must all fit
    if (((uint64_t) core->n_cigar << 2) + core->l_qname + core->l_extranul
        + (((uint64_t) core->l_qseq + 1) >> 1) + core->l_qseq > (uint64_t) data_len)
        return -4;
    if (realloc_bam_data(b, data_len) < 0) return -4;
    b->l_data = data_len;

    // Query name, repaired if its terminator is missing, then padded
    if (bgzf_read(fp, b->data, core->l_qname) != core->l_qname) return -4;
    if (b->data[core->l_qname - 1] != '\0') { // Try to fix missing NUL termination
        if (fixup_missing_qname_nul(b) < 0) return -4;
    }
    memset(b->data + core->l_qname, '\0', core->l_extranul);
    core->l_qname += core->l_extranul;

    // Everything after the name
    if (b->l_data < core->l_qname) return -4;
    rest = b->l_data - core->l_qname;
    if (bgzf_read(fp, b->data + core->l_qname, rest) != rest)
        return -4;
    if (fp->is_be) swap_data(core, b->l_data, b->data, 0);
    if (bam_tag2cigar(b, 0, 0) < 0)
        return -4;

    if (core->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
        hts_pos_t rlen, qlen;
        bam_cigar2rqlens(core->n_cigar, bam_get_cigar(b), &rlen, &qlen);
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
        // Sanity check for broken CIGAR alignments
        if (core->l_qseq > 0 && !(core->flag & BAM_FUNMAP) && qlen != core->l_qseq) {
            hts_log_error("CIGAR and query sequence lengths differ for %s",
                    bam_get_qname(b));
            return -4;
        }
    }

    return 4 + block_len;
}
824
825
/*
 * Write a single alignment record to a BAM stream.
 *
 * The 4-byte block length and the fixed 32-byte core (built from b->core)
 * are written first, followed by the variable-length data.  The padding
 * NULs counted by l_extranul are not written.  A record with more than
 * 65535 CIGAR operations is written with a two-operation placeholder CIGAR
 * (<l_qseq>S<ref_len>N) and its real CIGAR appended as a CG:B,I tag.
 *
 * All validation is done before anything is written and before the record
 * is byte-swapped for big-endian hosts, so a rejected record leaves both
 * the stream and b->data untouched.
 *
 * Returns 4 + block_len on success, -1 on failure.  errno is set to
 * EOVERFLOW when the query name is too long.
 */
int bam_write1(BGZF *fp, const bam1_t *b)
{
    const bam1_core_t *c = &b->core;
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
    int i, ok;
    const int long_cigar = c->n_cigar > 0xffff;
    hts_pos_t cigreflen = 0;

    if (c->l_qname - c->l_extranul > 255) {
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
        errno = EOVERFLOW;
        return -1;
    }
    if (long_cigar) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
    if (c->pos > INT_MAX ||
        c->mpos > INT_MAX ||
        c->isize < INT_MIN || c->isize > INT_MAX) {
        hts_log_error("Positional data is too large for BAM format");
        return -1;
    }
    if (long_cigar) {
        // Must be computed here: once swap_data() below has run on a
        // big-endian host the in-memory CIGAR is no longer in host order.
        cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
        if (cigreflen >= (1<<28)) {
            // Length of reference covered is greater than the biggest
            // CIGAR operation currently allowed.
            hts_log_error("Record %s with %"PRIu32" CIGAR ops and ref length %"PRIhts_pos
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
                          bam_get_qname(b), c->n_cigar, cigreflen);
            return -1;
        }
    }

    // Fixed-size core, in on-disk field order.
    x[0] = c->tid;
    x[1] = c->pos;
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
    if (long_cigar) x[3] = (uint32_t)c->flag << 16 | 2; // placeholder CIGAR has 2 ops
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
    x[4] = c->l_qseq;
    x[5] = c->mtid;
    x[6] = c->mpos;
    x[7] = c->isize;

    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
    if (fp->is_be) {
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
        y = block_len;
        if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
        swap_data(c, b->l_data, b->data, 1);
    } else {
        if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
    }
    if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
    if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
    if (!long_cigar) { // no long CIGAR; write normally
        if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
        uint8_t buf[8];
        uint32_t cigar_st, cigar_en, cigar[2];
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
        cigar_en = cigar_st + c->n_cigar * 4;
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
        u32_to_le(cigar[0], buf);
        u32_to_le(cigar[1], buf + 4);
        if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
        if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
        u32_to_le(c->n_cigar, buf);
        if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
    }
    // Restore the caller's record to host byte order.
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
    return ok? 4 + block_len : -1;
}
892
893
/*
894
 * Write a BAM file and append to the in-memory index simultaneously.
895
 */
896
10.0M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
897
10.0M
    BGZF *bfp = fp->fp.bgzf;
898
899
10.0M
    if (!fp->idx)
900
10.0M
        return bam_write1(bfp, b);
901
902
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
903
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
904
0
        return -1;
905
0
    if (!bfp->mt)
906
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
907
908
0
    int ret = bam_write1(bfp, b);
909
0
    if (ret < 0)
910
0
        return -1;
911
912
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
913
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
914
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
915
0
        ret = -1;
916
0
    }
917
918
0
    return ret;
919
0
}
920
921
/*
922
 * Set the qname in a BAM record
923
 */
924
int bam_set_qname(bam1_t *rec, const char *qname)
925
0
{
926
0
    if (!rec) return -1;
927
0
    if (!qname || !*qname) return -1;
928
929
0
    size_t old_len = rec->core.l_qname;
930
0
    size_t new_len = strlen(qname) + 1;
931
0
    if (new_len < 1 || new_len > 255) return -1;
932
933
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
934
935
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
936
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
937
938
    // Make room
939
0
    if (new_len + extranul != rec->core.l_qname)
940
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
941
    // Copy in new name and pad if needed
942
0
    memcpy(rec->data, qname, new_len);
943
0
    int n;
944
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
945
946
0
    rec->l_data = new_data_len;
947
0
    rec->core.l_qname = new_len + extranul;
948
0
    rec->core.l_extranul = extranul;
949
950
0
    return 0;
951
0
}
952
953
/********************
954
 *** BAM indexing ***
955
 ********************/
956
957
/*
 * Build an in-memory index for an open alignment file by reading its
 * header and then every record in turn.
 *
 * min_shift > 0 selects a CSI index, with the number of levels chosen so
 * that the longest reference (plus 256) is covered; otherwise a BAI index
 * (min_shift 14, 5 levels) is built.
 *
 * Returns the new index, which the caller must release with
 * hts_idx_destroy(), or NULL on failure.
 */
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
{
    int n_lvls, i, fmt, ret;
    bam1_t *b = NULL;
    hts_idx_t *idx = NULL;
    sam_hdr_t *h;

    h = sam_hdr_read(fp);
    if (h == NULL) return NULL;

    if (min_shift > 0) {
        hts_pos_t max_len = 0, s;
        for (i = 0; i < h->n_targets; ++i) {
            hts_pos_t len = sam_hdr_tid2len(h, i);
            if (max_len < len) max_len = len;
        }
        max_len += 256;
        // Shift in hts_pos_t so a large min_shift cannot overflow int.
        for (n_lvls = 0, s = (hts_pos_t) 1 << min_shift; max_len > s; ++n_lvls, s <<= 3);
        fmt = HTS_FMT_CSI;
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;

    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
    if (idx == NULL) goto err;
    b = bam_init1();
    if (b == NULL) goto err;

    while ((ret = sam_read1(fp, h, b)) >= 0) {
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
        if (ret < 0) { // unsorted or doesn't fit
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
            goto err;
        }
    }
    if (ret < -1) goto err; // corrupted BAM file

    if (hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)) < 0) goto err;
    sam_hdr_destroy(h);
    bam_destroy1(b);
    return idx;

err:
    // Release the header here too; the success path above does the same.
    sam_hdr_destroy(h);
    bam_destroy1(b);
    hts_idx_destroy(idx);
    return NULL;
}
996
997
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
998
0
{
999
0
    hts_idx_t *idx;
1000
0
    htsFile *fp;
1001
0
    int ret = 0;
1002
1003
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1004
0
    if (nthreads)
1005
0
        hts_set_threads(fp, nthreads);
1006
1007
0
    switch (fp->format.format) {
1008
0
    case cram:
1009
1010
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1011
0
        break;
1012
1013
0
    case bam:
1014
0
    case sam:
1015
0
        if (fp->format.compression != bgzf) {
1016
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1017
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1018
0
            ret = -1;
1019
0
            break;
1020
0
        }
1021
0
        idx = sam_index(fp, min_shift);
1022
0
        if (idx) {
1023
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1024
0
            if (ret < 0) ret = -4;
1025
0
            hts_idx_destroy(idx);
1026
0
        }
1027
0
        else ret = -1;
1028
0
        break;
1029
1030
0
    default:
1031
0
        ret = -3;
1032
0
        break;
1033
0
    }
1034
0
    hts_close(fp);
1035
1036
0
    return ret;
1037
0
}
1038
1039
/* Same as sam_index_build3() with a thread count of zero. */
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int no_threads = 0;

    return sam_index_build3(fn, fnidx, min_shift, no_threads);
}
1043
1044
/* Same as sam_index_build3() with no index file name and zero threads. */
int sam_index_build(const char *fn, int min_shift)
{
    const char *no_index_name = NULL;
    const int no_threads = 0;

    return sam_index_build3(fn, no_index_name, min_shift, no_threads);
}
1048
1049
// Real bam_index_build() function, kept so that binaries linked against
// earlier HTSlib releases still find the symbol.  Any macro of the same
// name is removed first so that a function can be defined here.
#undef bam_index_build
int bam_index_build(const char *fn, int min_shift)
{
    const char *no_index_name = NULL;

    return sam_index_build2(fn, no_index_name, min_shift);
}
1055
1056
// Initialise fp->idx for the current format type.
1057
// This must be called after the header has been written but no other data.
1058
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1059
0
    fp->fnidx = fnidx;
1060
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1061
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1062
0
        int n_lvls, fmt = HTS_FMT_CSI;
1063
0
        if (min_shift > 0) {
1064
0
            int64_t max_len = 0, s;
1065
0
            int i;
1066
0
            for (i = 0; i < h->n_targets; ++i)
1067
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1068
0
            max_len += 256;
1069
0
            for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1070
1071
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1072
1073
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1074
0
        return fp->idx ? 0 : -1;
1075
0
    }
1076
1077
0
    if (fp->format.format == cram) {
1078
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1079
0
        return fp->fp.cram->idxfp ? 0 : -1;
1080
0
    }
1081
1082
0
    return -1;
1083
0
}
1084
1085
// Finishes an index. Call after the last record has been written.
1086
// Returns 0 on success, <0 on failure.
1087
0
int sam_idx_save(htsFile *fp) {
1088
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1089
0
        fp->format.format == vcf || fp->format.format == sam) {
1090
0
        int ret;
1091
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1092
0
            errno = -ret;
1093
0
            return -1;
1094
0
        }
1095
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1096
0
            return -1;
1097
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1098
1099
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1100
0
            return -1;
1101
1102
0
        return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
1103
1104
0
    } else if (fp->format.format == cram) {
1105
        // flushed and closed by cram_close
1106
0
    }
1107
1108
0
    return 0;
1109
0
}
1110
1111
// Record-reading callback: reads the next record from the htsFile passed
// as fpv into the bam1_t passed as bv.  On success the record's reference
// id, start and end position are stored in *tid, *beg and *end.
// Returns the value of sam_read1().
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *file = fpv;
    bam1_t *rec = bv;

    file->line.l = 0;

    int status = sam_read1(file, file->bam_header, rec);
    if (status < 0)
        return status;

    *tid = rec->core.tid;
    *beg = rec->core.pos;
    *end = bam_endpos(rec);
    return status;
}
1124
1125
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1126
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1127
0
{
1128
0
    htsFile *fp = (htsFile *)fpv;
1129
0
    bam1_t *b = bv;
1130
0
    fp->line.l = 0;
1131
0
    int ret = sam_read1(fp, fp->bam_header, b);
1132
0
    return ret;
1133
0
}
1134
1135
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1136
// samtools/bam.c.
1137
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1138
0
{
1139
0
    const char *rg;
1140
0
    kstring_t lib = { 0, 0, NULL };
1141
0
    rg = (char *)bam_aux_get(b, "RG");
1142
1143
0
    if (!rg)
1144
0
        return NULL;
1145
0
    else
1146
0
        rg++;
1147
1148
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1149
0
        return NULL;
1150
1151
0
    static char LB_text[1024];
1152
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1153
1154
0
    memcpy(LB_text, lib.s, len);
1155
0
    LB_text[len] = 0;
1156
1157
0
    free(lib.s);
1158
1159
0
    return LB_text;
1160
0
}
1161
1162
1163
// Bam record pointer and SAM header combined
1164
typedef struct {
1165
    const sam_hdr_t *h;
1166
    const bam1_t *b;
1167
} hb_pair;
1168
1169
// Looks up variable names in str and replaces them with their value.
1170
// Also supports aux tags.
1171
//
1172
// Note the expression parser deliberately overallocates str size so it
1173
// is safe to use memcmp over strcmp.
1174
static int bam_sym_lookup(void *data, char *str, char **end,
1175
0
                          hts_expr_val_t *res) {
1176
0
    hb_pair *hb = (hb_pair *)data;
1177
0
    const bam1_t *b = hb->b;
1178
1179
0
    res->is_str = 0;
1180
0
    switch(*str) {
1181
0
    case 'c':
1182
0
        if (memcmp(str, "cigar", 5) == 0) {
1183
0
            *end = str+5;
1184
0
            res->is_str = 1;
1185
0
            ks_clear(&res->s);
1186
0
            uint32_t *cigar = bam_get_cigar(b);
1187
0
            int i, n = b->core.n_cigar, r = 0;
1188
0
            if (n) {
1189
0
                for (i = 0; i < n; i++) {
1190
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1191
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1192
0
                }
1193
0
                r |= kputs("", &res->s) < 0;
1194
0
            } else {
1195
0
                r |= kputs("*", &res->s) < 0;
1196
0
            }
1197
0
            return r ? -1 : 0;
1198
0
        }
1199
0
        break;
1200
1201
0
    case 'e':
1202
0
        if (memcmp(str, "endpos", 6) == 0) {
1203
0
            *end = str+6;
1204
0
            res->d = bam_endpos(b);
1205
0
            return 0;
1206
0
        }
1207
0
        break;
1208
1209
0
    case 'f':
1210
0
        if (memcmp(str, "flag", 4) == 0) {
1211
0
            str = *end = str+4;
1212
0
            if (*str != '.') {
1213
0
                res->d = b->core.flag;
1214
0
                return 0;
1215
0
            } else {
1216
0
                str++;
1217
0
                if (!memcmp(str, "paired", 6)) {
1218
0
                    *end = str+6;
1219
0
                    res->d = b->core.flag & BAM_FPAIRED;
1220
0
                    return 0;
1221
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1222
0
                    *end = str+11;
1223
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1224
0
                    return 0;
1225
0
                } else if (!memcmp(str, "unmap", 5)) {
1226
0
                    *end = str+5;
1227
0
                    res->d = b->core.flag & BAM_FUNMAP;
1228
0
                    return 0;
1229
0
                } else if (!memcmp(str, "munmap", 6)) {
1230
0
                    *end = str+6;
1231
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1232
0
                    return 0;
1233
0
                } else if (!memcmp(str, "reverse", 7)) {
1234
0
                    *end = str+7;
1235
0
                    res->d = b->core.flag & BAM_FREVERSE;
1236
0
                    return 0;
1237
0
                } else if (!memcmp(str, "mreverse", 8)) {
1238
0
                    *end = str+8;
1239
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1240
0
                    return 0;
1241
0
                } else if (!memcmp(str, "read1", 5)) {
1242
0
                    *end = str+5;
1243
0
                    res->d = b->core.flag & BAM_FREAD1;
1244
0
                    return 0;
1245
0
                } else if (!memcmp(str, "read2", 5)) {
1246
0
                    *end = str+5;
1247
0
                    res->d = b->core.flag & BAM_FREAD2;
1248
0
                    return 0;
1249
0
                } else if (!memcmp(str, "secondary", 9)) {
1250
0
                    *end = str+9;
1251
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1252
0
                    return 0;
1253
0
                } else if (!memcmp(str, "qcfail", 6)) {
1254
0
                    *end = str+6;
1255
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1256
0
                    return 0;
1257
0
                } else if (!memcmp(str, "dup", 3)) {
1258
0
                    *end = str+3;
1259
0
                    res->d = b->core.flag & BAM_FDUP;
1260
0
                    return 0;
1261
0
                } else if (!memcmp(str, "supplementary", 13)) {
1262
0
                    *end = str+13;
1263
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1264
0
                    return 0;
1265
0
                } else {
1266
0
                    hts_log_error("Unrecognised flag string");
1267
0
                    return -1;
1268
0
                }
1269
0
            }
1270
0
        }
1271
0
        break;
1272
1273
0
    case 'h':
1274
0
        if (memcmp(str, "hclen", 5) == 0) {
1275
0
            int hclen = 0;
1276
0
            uint32_t *cigar = bam_get_cigar(b);
1277
0
            uint32_t ncigar = b->core.n_cigar;
1278
1279
            // left
1280
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1281
0
                hclen = bam_cigar_oplen(cigar[0]);
1282
1283
            // right
1284
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1285
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1286
1287
0
            *end = str+5;
1288
0
            res->d = hclen;
1289
0
            return 0;
1290
0
        }
1291
0
        break;
1292
1293
0
    case 'l':
1294
0
        if (memcmp(str, "library", 7) == 0) {
1295
0
            *end = str+7;
1296
0
            res->is_str = 1;
1297
0
            const char *lib = bam_get_library(hb->h, b);
1298
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1299
0
            return 0;
1300
0
        }
1301
0
        break;
1302
1303
0
    case 'm':
1304
0
        if (memcmp(str, "mapq", 4) == 0) {
1305
0
            *end = str+4;
1306
0
            res->d = b->core.qual;
1307
0
            return 0;
1308
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1309
0
            *end = str+4;
1310
0
            res->d = b->core.mpos+1;
1311
0
            return 0;
1312
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1313
0
            *end = str+6;
1314
0
            res->is_str = 1;
1315
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1316
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1317
0
            return 0;
1318
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1319
0
            *end = str+6;
1320
0
            res->d = b->core.mtid;
1321
0
            return 0;
1322
0
        }
1323
0
        break;
1324
1325
0
    case 'n':
1326
0
        if (memcmp(str, "ncigar", 6) == 0) {
1327
0
            *end = str+6;
1328
0
            res->d = b->core.n_cigar;
1329
0
            return 0;
1330
0
        }
1331
0
        break;
1332
1333
0
    case 'p':
1334
0
        if (memcmp(str, "pos", 3) == 0) {
1335
0
            *end = str+3;
1336
0
            res->d = b->core.pos+1;
1337
0
            return 0;
1338
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1339
0
            *end = str+5;
1340
0
            res->d = b->core.mpos+1;
1341
0
            return 0;
1342
0
        }
1343
0
        break;
1344
1345
0
    case 'q':
1346
0
        if (memcmp(str, "qlen", 4) == 0) {
1347
0
            *end = str+4;
1348
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1349
0
            return 0;
1350
0
        } else if (memcmp(str, "qname", 5) == 0) {
1351
0
            *end = str+5;
1352
0
            res->is_str = 1;
1353
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1354
0
            return 0;
1355
0
        } else if (memcmp(str, "qual", 4) == 0) {
1356
0
            *end = str+4;
1357
0
            ks_clear(&res->s);
1358
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1359
0
                return -1;
1360
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1361
0
            res->s.l = b->core.l_qseq;
1362
0
            res->is_str = 1;
1363
0
            return 0;
1364
0
        }
1365
0
        break;
1366
1367
0
    case 'r':
1368
0
        if (memcmp(str, "rlen", 4) == 0) {
1369
0
            *end = str+4;
1370
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1371
0
            return 0;
1372
0
        } else if (memcmp(str, "rname", 5) == 0) {
1373
0
            *end = str+5;
1374
0
            res->is_str = 1;
1375
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1376
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1377
0
            return 0;
1378
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1379
0
            *end = str+5;
1380
0
            res->is_str = 1;
1381
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1382
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1383
0
            return 0;
1384
0
        } else if (memcmp(str, "refid", 5) == 0) {
1385
0
            *end = str+5;
1386
0
            res->d = b->core.tid;
1387
0
            return 0;
1388
0
        }
1389
0
        break;
1390
1391
0
    case 's':
1392
0
        if (memcmp(str, "seq", 3) == 0) {
1393
0
            *end = str+3;
1394
0
            ks_clear(&res->s);
1395
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1396
0
                return -1;
1397
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1398
0
            res->s.s[b->core.l_qseq] = 0;
1399
0
            res->s.l = b->core.l_qseq;
1400
0
            res->is_str = 1;
1401
0
            return 0;
1402
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1403
0
            int sclen = 0;
1404
0
            uint32_t *cigar = bam_get_cigar(b);
1405
0
            int ncigar = b->core.n_cigar;
1406
0
            int left = 0;
1407
1408
            // left
1409
0
            if (ncigar > 0
1410
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1411
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1412
0
            else if (ncigar > 1
1413
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1414
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1415
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1416
1417
            // right
1418
0
            if (ncigar-1 > left
1419
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1420
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1421
0
            else if (ncigar-2 > left
1422
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1423
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1424
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1425
1426
0
            *end = str+5;
1427
0
            res->d = sclen;
1428
0
            return 0;
1429
0
        }
1430
0
        break;
1431
1432
0
    case 't':
1433
0
        if (memcmp(str, "tlen", 4) == 0) {
1434
0
            *end = str+4;
1435
0
            res->d = b->core.isize;
1436
0
            return 0;
1437
0
        }
1438
0
        break;
1439
1440
0
    case '[':
1441
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1442
            /* aux tags */
1443
0
            *end = str+4;
1444
1445
0
            uint8_t *aux = bam_aux_get(b, str+1);
1446
0
            if (aux) {
1447
                // we define the truth of a tag to be its presence, even if 0.
1448
0
                res->is_true = 1;
1449
0
                switch (*aux) {
1450
0
                case 'Z':
1451
0
                case 'H':
1452
0
                    res->is_str = 1;
1453
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1454
0
                    break;
1455
1456
0
                case 'A':
1457
0
                    res->is_str = 1;
1458
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1459
0
                    break;
1460
1461
0
                case 'i': case 'I':
1462
0
                case 's': case 'S':
1463
0
                case 'c': case 'C':
1464
0
                    res->is_str = 0;
1465
0
                    res->d = bam_aux2i(aux);
1466
0
                    break;
1467
1468
0
                case 'f':
1469
0
                case 'd':
1470
0
                    res->is_str = 0;
1471
0
                    res->d = bam_aux2f(aux);
1472
0
                    break;
1473
1474
0
                default:
1475
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1476
0
                                  *aux);
1477
0
                    return -1;
1478
0
                }
1479
0
                return 0;
1480
1481
0
            } else {
1482
                // hence absent tags are always false (and strings)
1483
0
                res->is_str = 1;
1484
0
                res->s.l = 0;
1485
0
                res->d = 0;
1486
0
                res->is_true = 0;
1487
0
                return 0;
1488
0
            }
1489
0
        }
1490
0
        break;
1491
0
    }
1492
1493
    // All successful matches in switch should return 0.
1494
    // So if we didn't match, it's a parse error.
1495
0
    return -1;
1496
0
}
1497
1498
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1499
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1500
0
{
1501
0
    hb_pair hb = {h, b};
1502
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1503
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1504
0
        hts_log_error("Couldn't process filter expression");
1505
0
        hts_expr_val_free(&res);
1506
0
        return -1;
1507
0
    }
1508
1509
0
    int t = res.is_true;
1510
0
    hts_expr_val_free(&res);
1511
1512
0
    return t;
1513
0
}
1514
1515
// Record-reading callback for CRAM: fetches records until one passes the
// file's filter (if any), storing its reference id, start and end in
// *tid, *beg and *end.
// Returns the value of cram_get_bam_seq() for the accepted record, -1 at
// end of file, or -2 on any other failure.
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *file = fpv;
    bam1_t *rec = bv;

    for (;;) {
        int status = cram_get_bam_seq(file->fp.cram, &rec);
        if (status < 0)
            return cram_eof(file->fp.cram) ? -1 : -2;

        if (bam_tag2cigar(rec, 1, 1) < 0)
            return -2;

        *tid = rec->core.tid;
        *beg = rec->core.pos;
        *end = bam_endpos(rec);

        if (!file->filter)
            return status;

        int keep = sam_passes_filter(file->bam_header, rec, file->filter);
        if (keep < 0)
            return -2;
        if (keep != 0)
            return status;
        // Rejected by the filter: try the next record.
    }
}
1544
1545
static int cram_pseek(void *fp, int64_t offset, int whence)
1546
0
{
1547
0
    cram_fd *fd =  (cram_fd *)fp;
1548
1549
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1550
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1551
0
        return -1;
1552
1553
0
    fd->curr_position = offset;
1554
1555
0
    if (fd->ctr) {
1556
0
        cram_free_container(fd->ctr);
1557
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1558
0
            cram_free_container(fd->ctr_mt);
1559
1560
0
        fd->ctr = NULL;
1561
0
        fd->ctr_mt = NULL;
1562
0
        fd->ooc = 0;
1563
0
    }
1564
1565
0
    return 0;
1566
0
}
1567
1568
/*
1569
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1570
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1571
 *   container previously fetched. It was designed like this to integrate with the functionality
1572
 *   of the iterator stepping logic.
1573
 */
1574
1575
static int64_t cram_ptell(void *fp)
1576
0
{
1577
0
    cram_fd *fd = (cram_fd *)fp;
1578
0
    cram_container *c;
1579
0
    cram_slice *s;
1580
0
    int64_t ret = -1L;
1581
1582
0
    if (fd) {
1583
0
        if ((c = fd->ctr) != NULL) {
1584
0
            if ((s = c->slice) != NULL && s->max_rec) {
1585
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1586
0
                    fd->curr_position += c->offset + c->length;
1587
0
            }
1588
0
        }
1589
0
        ret = fd->curr_position;
1590
0
    }
1591
1592
0
    return ret;
1593
0
}
1594
1595
static int bam_pseek(void *fp, int64_t offset, int whence)
1596
0
{
1597
0
    BGZF *fd = (BGZF *)fp;
1598
1599
0
    return bgzf_seek(fd, offset, whence);
1600
0
}
1601
1602
static int64_t bam_ptell(void *fp)
1603
0
{
1604
0
    BGZF *fd = (BGZF *)fp;
1605
0
    if (!fd)
1606
0
        return -1L;
1607
1608
0
    return bgzf_tell(fd);
1609
0
}
1610
1611
1612
1613
// Loads the index for an open file.  BAM and SAM use hts_idx_load3().
// For CRAM the index is loaded into the cram_fd and a small stand-in
// object pointing at that cram_fd is returned instead.
// Returns NULL on failure or for any other format.
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    const int file_format = fp->format.format;

    if (file_format == bam || file_format == sam)
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);

    if (file_format != cram)
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t

    if (cram_index_load(fp->fp.cram, fn, fnidx) < 0)
        return NULL;

    // Cons up a fake "index" just pointing at the associated cram_fd:
    hts_cram_idx_t *fake = malloc(sizeof *fake);
    if (fake == NULL)
        return NULL;

    fake->fmt = HTS_FMT_CRAI;
    fake->cram = fp->fp.cram;
    return (hts_idx_t *) fake;
}
1635
1636
/* Public entry point for index_load(); all arguments are passed through. */
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    hts_idx_t *loaded = index_load(fp, fn, fnidx, flags);

    return loaded;
}
1640
1641
0
/* As sam_index_load3(), with flags fixed to HTS_IDX_SAVE_REMOTE. */
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
    const int flags = HTS_IDX_SAVE_REMOTE;

    return index_load(fp, fn, fnidx, flags);
}
1644
1645
/* As sam_index_load2(), with no explicit index file name. */
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
{
    const char *no_index_name = NULL;
    const int flags = HTS_IDX_SAVE_REMOTE;

    return index_load(fp, fn, no_index_name, flags);
}
1649
1650
// Build an iterator for a CRAM file.
//
// The iterator is a dummy: it is flagged is_cram/read_rest so that
// hts_itr_next() simply invokes readrec.  For a real reference id, or for
// HTS_IDX_NOCOOR / HTS_IDX_START, the requested range is installed on the
// cram_fd via CRAM_OPT_RANGE.  HTS_IDX_REST and HTS_IDX_NONE need no range.
// Any other negative tid is not implemented: an error is logged and the
// process aborts.
//
// Returns the new iterator, or NULL on allocation failure or when
// cram_set_option() reports an error other than -2.
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    hts_itr_t *itr = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
    int wants_range;

    if (!itr)
        return NULL;

    itr->is_cram = 1;
    itr->read_rest = 1;
    itr->off = NULL;
    itr->bins.a = NULL;
    itr->readrec = readrec;

    wants_range = (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START);

    if (!wants_range) {
        if (tid == HTS_IDX_REST) {
            itr->curr_off = 0;
        } else if (tid == HTS_IDX_NONE) {
            itr->curr_off = 0;
            itr->finished = 1;
        } else {
            hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
            abort();
        }
        return itr;
    }

    // The range handed to the CRAM layer starts at beg+1.
    cram_range range = { tid, beg+1, end };
    int rc = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &range);

    itr->curr_off = 0;
    // Not needed by hts_itr_next(), but recorded in case user code
    // wants to inspect them.
    itr->tid = tid;
    itr->beg = beg;
    itr->end = end;

    if (rc == -2) {
        // No data for this reference: mark the iterator as already
        // completed, the same outcome as HTS_IDX_NONE.
        itr->finished = 1;
    } else if (rc != 0) {
        free(itr);
        return NULL;
    }

    return itr;
}
1706
1707
// Create an iterator over [beg,end) on reference tid.
//   - idx == NULL: hts_itr_query() is called without an index, using
//     sam_readrec_rest as the record reader.
//   - CRAM index:  delegated to cram_itr_query().
//   - otherwise:   hts_itr_query() on the supplied index.
// Returns whatever the selected query function returns.
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
{
    if (!idx)
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);

    if (((const hts_cram_idx_t *) idx)->fmt == HTS_FMT_CRAI)
        return cram_itr_query(idx, tid, beg, end, sam_readrec);

    return hts_itr_query(idx, tid, beg, end, sam_readrec);
}
1717
1718
static int cram_name2id(void *fdv, const char *ref)
1719
0
{
1720
0
    cram_fd *fd = (cram_fd *) fdv;
1721
0
    return sam_hdr_name2tid(fd->header, ref);
1722
0
}
1723
1724
// Create an iterator from a textual region specification.
//   idx     index of the file being queried (must not be NULL)
//   hdr     header used by bam_name2id() to resolve reference names
//   region  region string handed to hts_itr_querys() (must not be NULL)
// The query function is cram_itr_query for CRAM indexes and hts_itr_query
// otherwise; records are read with sam_readrec.
// Returns the iterator, or NULL on failure or when idx or region is NULL.
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;

    // The index format must be inspected to choose the query function, so a
    // NULL index cannot be serviced here.  Fail cleanly, as
    // sam_itr_regarray() and sam_itr_regions() do, rather than
    // dereferencing NULL.
    if (!cidx || !region)
        return NULL;

    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
                          sam_readrec);
}
1731
1732
// Create a multi-region iterator from an array of region strings.
//   idx       index of the file being queried
//   hdr       header used to resolve names for non-CRAM input
//   regarray  array of regcount region strings
// The strings are converted with hts_reglist_create() and the resulting
// list is passed to hts_itr_regions() with the CRAM or BAM callbacks,
// depending on the index format.  If no iterator is produced, the region
// list is freed here.
// Returns the iterator, or NULL on failure or when idx or hdr is NULL.
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    hts_reglist_t *regions = NULL;
    hts_itr_t *iter = NULL;
    int n_regions = 0;
    int is_cram;

    if (cidx == NULL || hdr == NULL)
        return NULL;

    is_cram = (cidx->fmt == HTS_FMT_CRAI);

    if (is_cram)
        regions = hts_reglist_create(regarray, regcount, &n_regions, cidx->cram, cram_name2id);
    else
        regions = hts_reglist_create(regarray, regcount, &n_regions, hdr, (hts_name2id_f)(bam_name2id));

    if (regions == NULL)
        return NULL;

    if (is_cram)
        iter = hts_itr_regions(idx, regions, n_regions, cram_name2id, cidx->cram,
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
    else
        iter = hts_itr_regions(idx, regions, n_regions, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);

    if (iter == NULL)
        hts_reglist_free(regions, n_regions);

    return iter;
}
1761
1762
// Create a multi-region iterator from an already built region list.
// The list is passed to hts_itr_regions() together with the CRAM or BAM
// name-lookup, read, seek and tell callbacks, chosen by the index format.
// Returns the iterator, or NULL when idx, hdr or reglist is NULL or when
// hts_itr_regions() fails.
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
{
    const hts_cram_idx_t *cidx;

    if (idx == NULL || hdr == NULL || reglist == NULL)
        return NULL;

    cidx = (const hts_cram_idx_t *) idx;

    if (cidx->fmt != HTS_FMT_CRAI)
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);

    return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
               hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
}
1776
1777
/**********************
1778
 *** SAM header I/O ***
1779
 **********************/
1780
1781
#include "htslib/kseq.h"
1782
#include "htslib/kstring.h"
1783
1784
// Build a header object from l_text bytes of header text.
// A fresh header is created with sam_hdr_init() and the text is added with
// sam_hdr_add_lines().  Returns the new header, or NULL if either step
// fails (the partially built header is destroyed in that case).
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
{
    sam_hdr_t *hdr = sam_hdr_init();

    if (hdr != NULL && sam_hdr_add_lines(hdr, text, l_text) != 0) {
        sam_hdr_destroy(hdr);
        hdr = NULL;
    }

    return hdr;
}
1796
1797
799k
// Check that a header line starts with a recognised record type.
// Accepts "@HD", "@SQ", "@RG" and "@PG" when followed by a tab, and "@CO"
// with no requirement on the following character.
// Returns 1 if the line start is accepted, 0 otherwise.
static int valid_sam_header_type(const char *s) {
    char second;        // second letter the record type must have
    int tab_required = 1;

    if (s[0] != '@')
        return 0;

    if (s[1] == 'H') {
        second = 'D';
    } else if (s[1] == 'S') {
        second = 'Q';
    } else if (s[1] == 'R' || s[1] == 'P') {
        second = 'G';
    } else if (s[1] == 'C') {
        second = 'O';
        tab_required = 0;
    } else {
        return 0;
    }

    // Only look at s[3] once s[2] has matched, so a short line is never
    // read beyond its terminator.
    if (s[2] != second)
        return 0;

    return tab_required ? s[3] == '\t' : 1;
}
1812
1813
// Minimal sanitisation of a header to ensure.
1814
// - null terminated string.
1815
// - all lines start with @ (also implies no blank lines).
1816
//
1817
// Much more could be done, but currently is not, including:
1818
// - checking header types are known (HD, SQ, etc).
1819
// - syntax (eg checking tab separated fields).
1820
// - validating n_targets matches @SQ records.
1821
// - validating target lengths against @SQ records.
1822
12.1k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1823
12.1k
    if (!h)
1824
135
        return NULL;
1825
1826
    // Special case for empty headers.
1827
12.0k
    if (h->l_text == 0)
1828
4.50k
        return h;
1829
1830
7.50k
    size_t i;
1831
7.50k
    unsigned int lnum = 0;
1832
7.50k
    char *cp = h->text, last = '\n';
1833
254M
    for (i = 0; i < h->l_text; i++) {
1834
        // NB: l_text excludes terminating nul.  This finds early ones.
1835
254M
        if (cp[i] == 0)
1836
3.17k
            break;
1837
1838
        // Error on \n[^@], including duplicate newlines
1839
254M
        if (last == '\n') {
1840
414k
            lnum++;
1841
414k
            if (cp[i] != '@') {
1842
0
                hts_log_error("Malformed SAM header at line %u", lnum);
1843
0
                sam_hdr_destroy(h);
1844
0
                return NULL;
1845
0
            }
1846
414k
        }
1847
1848
254M
        last = cp[i];
1849
254M
    }
1850
1851
7.50k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1852
3.17k
        size_t j = i;
1853
21.5k
        while (j < h->l_text && cp[j] == '\0') j++;
1854
3.17k
        if (j < h->l_text)
1855
3.09k
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1856
3.17k
    }
1857
1858
    // Add trailing newline and/or trailing nul if required.
1859
7.50k
    if (last != '\n') {
1860
3.08k
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1861
1862
3.08k
        if (h->l_text < 2 || i >= h->l_text - 2) {
1863
300
            if (h->l_text >= SIZE_MAX - 2) {
1864
0
                hts_log_error("No room for extra newline");
1865
0
                sam_hdr_destroy(h);
1866
0
                return NULL;
1867
0
            }
1868
1869
300
            cp = realloc(h->text, (size_t) h->l_text+2);
1870
300
            if (!cp) {
1871
0
                sam_hdr_destroy(h);
1872
0
                return NULL;
1873
0
            }
1874
300
            h->text = cp;
1875
300
        }
1876
3.08k
        cp[i++] = '\n';
1877
1878
        // l_text may be larger already due to multiple nul padding
1879
3.08k
        if (h->l_text < i)
1880
0
            h->l_text = i;
1881
3.08k
        cp[h->l_text] = '\0';
1882
3.08k
    }
1883
1884
7.50k
    return h;
1885
7.50k
}
1886
1887
3.83k
// Emit the pair of warnings used when a SAM file appears to contain log
// output from a known tool: first a line naming the tool, then the advice
// on how to invoke it so that this does not happen.
static void known_stderr(const char *tool, const char *advice)
{
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);

    // Always printed through "%s": the advice is data, not a format string.
    hts_log_warning("%s", advice);
}
1891
1892
50.8k
// Look for tell-tale fragments of bwa / minimap2 log output in a header
// line and, for the first fragment found, warn via known_stderr().
// Lines containing none of the fragments produce no output.
static void warn_if_known_stderr(const char *line) {
    static const struct {
        const char *marker;   // substring searched for in the line
        const char *tool;     // tool name reported in the warning
        const char *advice;   // suggested alternative invocation
    } known[] = {
        { "M::bwa_idx_load_from_disk", "bwa",
          "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`" },
        { "M::mem_pestat", "bwa",
          "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`" },
        { "loaded/built the index", "minimap2",
          "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`" },
    };
    size_t i;

    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
        if (strstr(line, known[i].marker) != NULL) {
            known_stderr(known[i].tool, known[i].advice);
            return;
        }
    }
}
1900
1901
7.45k
// Read the header of a SAM text file and build a sam_hdr_t from it.
//
// Header lines (those starting with '@') are read from fp and accumulated
// as text.  For each @SQ line the SN: and LN: tags are extracted and the
// reference is recorded in a temporary name -> (index << 32 | length) hash.
// Lengths that do not fit the 32-bit target_len array are stored as
// UINT32_MAX there and their true value is stashed in a second hash, which
// ends up in h->sdict.  If the file has no @SQ lines and fp->fn_aux is set,
// references are taken from the named .fai file instead and matching @SQ
// lines are synthesised.
//
// On success the header is stored in fp->bam_header with ref_count 1 and
// returned.  On any failure NULL is returned.
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
    kstring_t str = { 0, 0, NULL };
    khint_t k;
    sam_hdr_t* h = sam_hdr_init();
    const char *q, *r;
    char* sn = NULL;
    khash_t(s2i) *d = kh_init(s2i);
    khash_t(s2i) *long_refs = NULL;
    if (!h || !d)
        goto error;

    int ret, has_SQ = 0;
    int next_c = '@';
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
        if (fp->line.s[0] != '@')
            break;

        if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
            has_SQ = 1;
            hts_pos_t ln = -1;
            // Scan the tab separated fields looking for SN: and LN:.
            for (q = fp->line.s + 4;; ++q) {
                if (strncmp(q, "SN:", 3) == 0) {
                    q += 3;
                    for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);

                    if (sn) {
                        hts_log_warning("SQ header line has more than one SN: tag");
                        free(sn);
                    }
                    // calloc zero-fills, so the copy below is terminated.
                    sn = (char*)calloc(r - q + 1, 1);
                    if (!sn)
                        goto error;

                    memcpy(sn, q, r - q);
                    q = r;
                } else {
                    if (strncmp(q, "LN:", 3) == 0)
                        ln = strtoll(q + 3, (char**)&q, 10);
                }

                while (*q != '\t' && *q != '\n' && *q != '\0')
                    ++q;
                if (*q == '\0' || *q == '\n')
                    break;
            }
            if (sn) {
                if (ln >= 0) {
                    int absent;
                    k = kh_put(s2i, d, sn, &absent);
                    if (absent < 0)
                        goto error;

                    if (!absent) {
                        hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
                        free(sn);
                    } else {
                        // The hash now owns the name.
                        sn = NULL;
                        if (ln >= UINT32_MAX) {
                            // Stash away ref length that
                            // doesn't fit in target_len array
                            khint_t k2;
                            if (!long_refs) {
                                long_refs = kh_init(s2i);
                                if (!long_refs)
                                    goto error;
                            }
                            k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
                            if (absent < 0)
                                goto error;
                            kh_val(long_refs, k2) = ln;
                            kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
                                            | UINT32_MAX);
                        } else {
                            kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
                        }
                    }
                } else {
                    hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
                    warn_if_known_stderr(fp->line.s);
                    free(sn);
                }
            } else {
                hts_log_warning("Ignored @SQ line with missing SN: tag");
                warn_if_known_stderr(fp->line.s);
            }
            sn = NULL;
        }
        else if (!valid_sam_header_type(fp->line.s)) {
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
            warn_if_known_stderr(fp->line.s);
            goto error;
        }

        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
            goto error;

        if (kputc('\n', &str) < 0)
            goto error;

        // Peek at the next byte to see whether another header line follows.
        if (fp->is_bgzf) {
            next_c = bgzf_peek(fp->fp.bgzf);
        } else {
            unsigned char nc;
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
            next_c = pret > 0 ? nc : pret - 1;
        }
        if (next_c < -1)
            goto error;
    }
    if (next_c != '@')
        fp->line.l = 0;

    if (ret < -1)
        goto error;

    if (!has_SQ && fp->fn_aux) {
        kstring_t line = { 0, 0, NULL };

        /* The reference index (.fai) is actually needed here */
        char *fai_fn = fp->fn_aux;
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
        if (fn_delim)
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);

        hFILE* f = hopen(fai_fn, "r");
        int e = 0, absent;
        if (f == NULL)
            goto error;

        while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
            char* tab = strchr(line.s, '\t');
            hts_pos_t ln;

            if (tab == NULL)
                continue;

            sn = (char*)calloc(tab-line.s+1, 1);
            if (!sn) {
                e = 1;
                break;
            }
            memcpy(sn, line.s, tab-line.s);
            k = kh_put(s2i, d, sn, &absent);
            if (absent < 0) {
                e = 1;
                break;
            }

            ln = strtoll(tab, NULL, 10);

            if (!absent) {
                hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
                free(sn);
                sn = NULL;
            } else {
                sn = NULL;
                if (ln >= UINT32_MAX) {
                    // Stash away ref length that
                    // doesn't fit in target_len array
                    khint_t k2;
                    int absent = -1;
                    if (!long_refs) {
                        long_refs = kh_init(s2i);
                        if (!long_refs) {
                            e = 1;
                            break;
                        }
                    }
                    k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
                    if (absent < 0) {
                         e = 1;
                         break;
                    }
                    kh_val(long_refs, k2) = ln;
                    kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
                                    | UINT32_MAX);
                } else {
                    kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
                }
                has_SQ = 1;
            }

            e |= kputs("@SQ\tSN:", &str) < 0;
            e |= kputsn(line.s, tab - line.s, &str) < 0;
            e |= kputs("\tLN:", &str) < 0;
            e |= kputll(ln, &str) < 0;
            e |= kputc('\n', &str) < 0;
            if (e)
                break;
        }

        ks_free(&line);
        if (hclose(f) != 0) {
            hts_log_error("Error on closing %s", fai_fn);
            e = 1;
        }
        if (e)
            goto error;
    }

    if (has_SQ) {
        // Populate the targets array
        h->n_targets = kh_size(d);

        h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
        if (!h->target_name) {
            h->n_targets = 0;
            goto error;
        }

        h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
        if (!h->target_len) {
            h->n_targets = 0;
            goto error;
        }

        for (k = kh_begin(d); k != kh_end(d); ++k) {
            if (!kh_exist(d, k))
                continue;

            // High 32 bits: target index; low 32 bits: (clamped) length.
            h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
            h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
            kh_val(d, k) >>= 32;
        }
    }

    // Repurpose sdict to hold any references longer than UINT32_MAX
    h->sdict = long_refs;

    kh_destroy(s2i, d);

    if (str.l == 0)
        kputsn("", 0, &str);
    h->l_text = str.l;
    h->text = ks_release(&str);
    fp->bam_header = sam_hdr_sanitise(h);
    // sam_hdr_sanitise() destroys the header and returns NULL on failure
    // (for example when it cannot grow the text buffer), so there is
    // nothing left to clean up here and nothing to set a ref_count on.
    if (!fp->bam_header)
        return NULL;
    fp->bam_header->ref_count = 1;

    return fp->bam_header;

 error:
    // Until the names have been handed over to both target arrays they are
    // still owned by the hash, so free them from there.
    if (h && d && (!h->target_name || !h->target_len)) {
        for (k = kh_begin(d); k != kh_end(d); ++k)
            if (kh_exist(d, k)) free((void *)kh_key(d, k));
    }
    sam_hdr_destroy(h);
    ks_free(&str);
    kh_destroy(s2i, d);
    kh_destroy(s2i, long_refs);
    if (sn) free(sn);
    return NULL;
}
2153
2154
// Read the header from an open file, according to its format:
//   BAM    bam_hdr_read() followed by sam_hdr_sanitise()
//   CRAM   a sanitised duplicate of the cram_fd's header
//   SAM    sam_hdr_create()
//   FASTA/FASTQ  a fresh empty header from sam_hdr_init()
// Returns the header or NULL on failure.  errno is set to EINVAL for a
// NULL fp, EPIPE for an empty file and EFTYPE for any other format.
sam_hdr_t *sam_hdr_read(htsFile *fp)
{
    sam_hdr_t *hdr = NULL;

    if (fp == NULL) {
        errno = EINVAL;
        return NULL;
    }

    switch (fp->format.format) {
    case sam:
        hdr = sam_hdr_create(fp);
        break;

    case bam:
        hdr = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
        break;

    case cram:
        hdr = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
        break;

    case fasta_format:
    case fastq_format:
        hdr = sam_hdr_init();
        break;

    case empty_format:
        errno = EPIPE;
        break;

    default:
        errno = EFTYPE;
        break;
    }

    return hdr;
}
2184
2185
// Write len bytes of buf to fp, through BGZF when fp->is_bgzf is set and
// through the plain hFILE otherwise.  Returns the underlying write result.
static ssize_t sam_hdr_put_bytes_(htsFile *fp, const char *buf, size_t len)
{
    if (fp->is_bgzf)
        return bgzf_write(fp->fp.bgzf, buf, len);

    return hwrite(fp->fp.hfile, buf, len);
}

// Write header h to fp in the format fp was opened with.
//
// binary_format and text_format are first resolved to bam and sam
// respectively.  For SAM output the text is rebuilt from h->hrecs when
// present, otherwise h->text is written as is; in the latter case, if the
// text holds no @SQ line, one is generated per entry of the target arrays.
// FASTA/FASTQ output has no header, so nothing is written.
//
// Returns 0 on success and -1 on failure.  errno is set to EINVAL when fp
// or h is NULL and to EBADF for an unsupported format.
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
{
    if (fp == NULL || h == NULL) {
        errno = EINVAL;
        return -1;
    }

    switch (fp->format.format) {
    case binary_format:
        fp->format.category = sequence_data;
        fp->format.format = bam;
        /* fall-through */
    case bam:
        if (bam_hdr_write(fp->fp.bgzf, h) < 0)
            return -1;
        break;

    case cram: {
        cram_fd *cfd = fp->fp.cram;

        if (cram_set_header2(cfd, h) < 0)
            return -1;
        if (fp->fn_aux)
            cram_load_reference(cfd, fp->fn_aux);
        if (cram_write_SAM_hdr(cfd, cfd->header) < 0)
            return -1;
        }
        break;

    case text_format:
        fp->format.category = sequence_data;
        fp->format.format = sam;
        /* fall-through */
    case sam: {
        // A header with neither parsed records nor text has nothing to emit.
        if (!h->hrecs && !h->text)
            return 0;

        kstring_t rebuilt = { 0, 0, NULL };
        char *out_text;
        size_t out_len;
        ssize_t written;
        int failed = 0, missing_sq = 0;

        if (h->hrecs) {
            if (sam_hrecs_rebuild_text(h->hrecs, &rebuilt) != 0)
                return -1;
            out_text = rebuilt.s;
            out_len = rebuilt.l;
        } else {
            // Look for "@SQ\t" at the start of a line: either at the very
            // beginning of the text or directly after a newline.
            const char *hit = strstr(h->text, "@SQ\t");
            while (hit != NULL && hit != h->text && *(hit - 1) != '\n')
                hit = strstr(hit + 4, "@SQ\t");
            missing_sq = hit == NULL;
            out_text = h->text;
            out_len = h->l_text;
        }

        written = sam_hdr_put_bytes_(fp, out_text, out_len);
        free(rebuilt.s);
        if (written != out_len)
            return -1;

        if (missing_sq) {
            // The text carries no @SQ lines, so synthesise them from the
            // target arrays, one line at a time, via fp->line.
            int t;
            for (t = 0; t < h->n_targets; ++t) {
                fp->line.l = 0;
                failed |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
                failed |= kputs(h->target_name[t], &fp->line) < 0;
                failed |= kputsn("\tLN:", 4, &fp->line) < 0;
                failed |= kputw(h->target_len[t], &fp->line) < 0;
                failed |= kputc('\n', &fp->line) < 0;
                if (failed != 0)
                    return -1;

                written = sam_hdr_put_bytes_(fp, fp->line.s, fp->line.l);
                if (written != fp->line.l)
                    return -1;
            }
        }

        if (fp->is_bgzf) {
            if (bgzf_flush(fp->fp.bgzf) != 0)
                return -1;
        } else {
            if (hflush(fp->fp.hfile) != 0)
                return -1;
        }
        }
        break;

    case fastq_format:
    case fasta_format:
        // Nothing to output; FASTQ has no file headers.
        break;

    default:
        errno = EBADF;
        return -1;
    }

    return 0;
}
2288
2289
// Text-only implementation of sam_hdr_change_HD(), used when the header
// has no parsed records (h->hrecs is NULL).
//
// Edits h->text directly:
//   - no @HD line:  a new "@HD\tVN:<SAM_FORMAT_VERSION>" line is prepended,
//                   with "\t<key>:<val>" appended to it when val != NULL;
//   - @HD line with the key: the existing key:value is replaced, or removed
//                   when val == NULL (no change if the value is the same);
//   - @HD line without the key: "\t<key>:<val>" is appended to the line.
// The text is rebuilt in a freshly allocated buffer that replaces h->text.
//
// NOTE(review): the size calculations assume key is a two-character tag;
// verify that callers never pass anything else.
//
// Returns 0 on success, -1 on bad arguments, a malformed @HD line (no
// newline), size overflow or allocation failure.
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
    size_t new_l_text;
    if (!h || !key)
        return -1;

    // h->text may legitimately be NULL for a header with no text; only
    // look for an @HD line when there is text to look in.
    if (h->text && h->l_text > 3) {
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
            if ((p = strchr(h->text, '\n')) == NULL) return -1;
            *p = '\0'; // for strstr call

            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };

            if ((q = strstr(h->text, tmp)) != NULL) { // key exists
                *p = '\n'; // change back

                // mark the key:val
                beg = q;
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
                end = q;

                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
                    && strlen(val) == end - beg - 4)
                     return 0; // val is the same, no need to change

            } else {
                beg = end = p;
                *p = '\n';
            }
        }
    }
    if (beg == NULL) { // no @HD
        // Passing NULL for a %s conversion is undefined behaviour, so
        // substitute an empty string when the header has no text.
        const char *old_text = h->text ? h->text : "";

        new_l_text = h->l_text;
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
            return -1;
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val)
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, old_text);
        else
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, old_text);
    } else { // has @HD but different or no key
        // Everything before beg and everything from end onwards is kept.
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val) {
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
                    (int) (beg - h->text), h->text, key, val, end);
        } else { //delete key
            snprintf(newtext, new_l_text + 1, "%.*s%s",
                    (int) (beg - h->text), h->text, end);
        }
    }
    free(h->text);
    h->text = newtext;
    h->l_text = new_l_text;
    return 0;
}
2363
2364
2365
// Set, replace or remove a tag on the @HD header line.
//   key  tag to change
//   val  new value, or NULL to remove the tag
// Headers without parsed records are handled by old_sam_hdr_change_HD().
// Otherwise the @HD line is updated (or the tag removed) and the header is
// rebuilt with sam_hdr_rebuild(), whose result is returned.
// Returns -1 on NULL h or key, or when the update or removal fails.
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    int rc;

    if (h == NULL || key == NULL)
        return -1;

    if (h->hrecs == NULL)
        return old_sam_hdr_change_HD(h, key, val);

    if (val != NULL)
        rc = sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL);
    else
        rc = sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key);

    if (rc != 0)
        return -1;

    return sam_hdr_rebuild(h);
}
2382
/**********************
2383
 *** SAM record I/O ***
2384
 **********************/
2385
2386
// Parse the values of a 'B' (array) aux field and append them to b.
//   type  array element type: one of c, C, s, S, i, I, f
//   n     number of elements to reserve space for
//   in    start of the value text; each value is parsed starting one
//         character after the current position
//   end   on success, set to the position where parsing stopped
//   r     end of the value text; parsing continues while the current
//         position is before r
//   b     record whose data buffer receives 'B', the type byte, the
//         little-endian count and then the values
//
// If any value does not fit the requested integer type, the data appended
// so far is discarded, the values are scanned as 64-bit integers to find
// their range, and parsing is retried with the smallest integer type that
// holds that range.
//
// Returns 0 on success and -1 on an unrecognised type, allocation failure
// or a value outside the range of every supported type.
static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
                            char *r, bam1_t *b)
{
    int orig_l = b->l_data;
    char *q = in;
    int32_t size;
    size_t bytes;
    int overflow = 0;

    size = aux_type2size(type);
    if (size <= 0 || size > 4) {
        hts_log_error("Unrecognized type B:%c", type);
        return -1;
    }

    // Ensure space for type + values
    bytes = (size_t) n * (size_t) size;
    if (bytes / size != n
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
        hts_log_error("Out of memory");
        return -1;
    }

    b->data[b->l_data++] = 'B';
    b->data[b->l_data++] = type;
    i32_to_le(n, b->data + b->l_data);
    b->l_data += sizeof(uint32_t);
    // This ensures that q always ends up at the next comma after
    // reading a number even if it's followed by junk.  It
    // prevents the possibility of trying to read more than n items.
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
    // NOTE(review): in the unsigned branches below the '-' test looks at
    // the character *before* the number (q + 1 is what gets parsed).
    // Confirm whether negative values are meant to be rejected here.
    if (type == 'c') {
        while (q < r) {
            *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow);
            b->l_data++;
            skip_to_comma_(q);
        }
    } else if (type == 'C') {
        while (q < r) {
            if (*q != '-') {
                *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow);
                b->l_data++;
            } else {
                overflow = 1;
            }
            skip_to_comma_(q);
        }
    } else if (type == 's') {
        while (q < r) {
            i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data);
            b->l_data += 2;
            skip_to_comma_(q);
        }
    } else if (type == 'S') {
        while (q < r) {
            if (*q != '-') {
                u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data);
                b->l_data += 2;
            } else {
                overflow = 1;
            }
            skip_to_comma_(q);
        }
    } else if (type == 'i') {
        while (q < r) {
            i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data);
            b->l_data += 4;
            skip_to_comma_(q);
        }
    } else if (type == 'I') {
        while (q < r) {
            if (*q != '-') {
                u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data);
                b->l_data += 4;
            } else {
                overflow = 1;
            }
            skip_to_comma_(q);
        }
    } else if (type == 'f') {
        while (q < r) {
            float_to_le(strtod(q + 1, &q), b->data + b->l_data);
            b->l_data += 4;
            skip_to_comma_(q);
        }
    } else {
        hts_log_error("Unrecognized type B:%c", type);
        return -1;
    }

    if (!overflow) {
        *end = q;
        return 0;
    } else {
        int64_t max = 0, min = 0, val;
        // Given type was incorrect.  Try to rescue the situation.
        q = in;
        overflow = 0;
        b->l_data = orig_l;
        // Find out what range of values is present
        while (q < r) {
            val = hts_str2int(q + 1, &q, 64, &overflow);
            if (max < val) max = val;
            if (min > val) min = val;
            skip_to_comma_(q);
        }
        // Retry with appropriate type
        if (!overflow) {
            if (min < 0) {
                if (min >= INT8_MIN && max <= INT8_MAX) {
                    return sam_parse_B_vals('c', n, in, end, r, b);
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
                    return sam_parse_B_vals('s', n, in, end, r, b);
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
                    return sam_parse_B_vals('i', n, in, end, r, b);
                }
            } else {
                // All bounds are inclusive: a maximum of exactly
                // UINT8_MAX still fits the 8-bit unsigned type.
                if (max <= UINT8_MAX) {
                    return sam_parse_B_vals('C', n, in, end, r, b);
                } else if (max <= UINT16_MAX) {
                    return sam_parse_B_vals('S', n, in, end, r, b);
                } else if (max <= UINT32_MAX) {
                    return sam_parse_B_vals('I', n, in, end, r, b);
                }
            }
        }
        // If here then at least one of the values is too big to store
        hts_log_error("Numeric value in B array out of allowed range");
        return -1;
    }
#undef skip_to_comma_
}
2518
2519
284k
/*
 * Parse the FLAG column of a SAM line.
 *
 * v        points at the first character of the flag text
 * rv       receives a pointer to the first character not consumed
 * overflow set to 1 if the value does not fit in 16 bits
 *
 * A leading '1'..'9' is parsed as decimal.  A leading '0' is either the
 * single digit zero (when followed by a tab) or is handed to strtoul()
 * with base 0 so that hex and octal spellings are accepted.  Anything
 * else consumes nothing and yields 0.
 */
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char lead = *v;

    if (lead == '0') {
        // handle single-digit "0" directly; otherwise it's hex or octal
        if (v[1] == '\t') {
            *rv = v + 1;
            return 0;
        }

        unsigned long parsed = strtoul(v, rv, 0);
        if (parsed > 65535) {
            *overflow = 1;
            return 65535;
        }
        return parsed;
    }

    if (lead < '1' || lead > '9') {
        // TODO implement symbolic flag letters
        *rv = v;
        return 0;
    }

    return hts_str2uint(v, rv, 16, overflow);
}
2538
2539
// Parse the aux (optional tag) text in [start, end) and append the binary
2540
// encoding of each field to bam object b.  Shared by both SAM and FASTQ parsers.
2541
//
2542
// The difference between the two is how lenient we are to recognising
2543
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2544
// non-SAM looking strings.
//
// start, end     bounds of the text to parse; fields are "XX:T:value"
//                separated by tabs
// b              record whose data buffer is appended to
// lenient        non-zero: a malformed field is skipped (b->l_data is rewound
//                to where the field began) instead of failing the parse
// tag_whitelist  if non-NULL, fields whose two-character id is not a key
//                in this hash are skipped
//
// Returns 0 on success, -2 on failure.
2545
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2546
282k
                            khash_t(tag) *tag_whitelist) {
2547
282k
    int overflow = 0;
2548
282k
    int checkpoint;
2549
282k
    char logbuf[40];
2550
282k
    char *q = start, *p = end;
2551
2552
282k
// On failure of cond: in lenient mode advance q past the current
// whitespace-delimited token, discard anything already written for this
// field and restart the loop body; otherwise log the message and bail out.
#define _parse_err(cond, ...)                   \
2553
25.7M
    do {                                        \
2554
59.9M
        if (cond) {                             \
2555
663
            if (lenient) {                      \
2556
0
                while (q < p && !isspace_c(*q))   \
2557
0
                    q++;                        \
2558
0
                while (q < p && isspace_c(*q))    \
2559
0
                    q++;                        \
2560
0
                b->l_data = checkpoint;         \
2561
0
                goto loop;                      \
2562
663
            } else {                            \
2563
663
                hts_log_error(__VA_ARGS__);     \
2564
663
                goto err_ret;                   \
2565
663
            }                                   \
2566
663
        }                                       \
2567
25.7M
    } while (0)
2568
2569
22.5M
    // The "loop" label sits on the loop body so that _parse_err can jump
    // straight back to the start of a field without re-testing q < p.
    while (q < p) loop: {
2570
22.5M
        char type;
2571
22.5M
        // Remember where this field starts in b->data so it can be undone.
        checkpoint = b->l_data;
2572
22.5M
        // Shortest possible field is "XX:T:" (5 characters).
        if (p - q < 5) {
2573
90
            if (lenient) {
2574
0
                break;
2575
90
            } else {
2576
90
                hts_log_error("Incomplete aux field");
2577
90
                goto err_ret;
2578
90
            }
2579
90
        }
2580
11.2M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2581
2582
11.2M
        // Lenient mode only: quick test that both separator positions hold
        // ':'; if not, skip this token.
        if (lenient && (q[2] | q[4]) != ':') {
2583
0
            while (q < p && !isspace_c(*q))
2584
0
                q++;
2585
0
            while (q < p && isspace_c(*q))
2586
0
                q++;
2587
0
            continue;
2588
0
        }
2589
2590
11.2M
        if (tag_whitelist) {
2591
0
            // Hash key is the two id characters packed into one integer.
            int tt = q[0]*256 + q[1];
2592
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2593
0
                while (q < p && *q != '\t')
2594
0
                    q++;
2595
0
                continue;
2596
0
            }
2597
0
        }
2598
2599
        // Copy over id
2600
11.2M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2601
11.2M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2602
11.2M
        q += 3; type = *q++; ++q; // q points to value
2603
11.2M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2604
9.22M
            _parse_err(*q <= '\t', "incomplete aux field");
2605
2606
        // Ensure enough space for a double + type allocated.
2607
11.2M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2608
2609
11.2M
        // Single character: all four input type letters are stored as 'A'.
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2610
1.52M
            b->data[b->l_data++] = 'A';
2611
1.52M
            b->data[b->l_data++] = *q++;
2612
9.74M
        } else if (type == 'i' || type == 'I') {
            // Integer: stored using the smallest type that holds the value,
            // signed (c/s/i) when the text starts with '-', else unsigned (C/S/I).
2613
4.64M
            if (*q == '-') {
2614
3.19M
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2615
3.19M
                if (x >= INT8_MIN) {
2616
1.35M
                    b->data[b->l_data++] = 'c';
2617
1.35M
                    b->data[b->l_data++] = x;
2618
1.84M
                } else if (x >= INT16_MIN) {
2619
695k
                    b->data[b->l_data++] = 's';
2620
695k
                    i16_to_le(x, b->data + b->l_data);
2621
695k
                    b->l_data += 2;
2622
1.14M
                } else {
2623
1.14M
                    b->data[b->l_data++] = 'i';
2624
1.14M
                    i32_to_le(x, b->data + b->l_data);
2625
1.14M
                    b->l_data += 4;
2626
1.14M
                }
2627
3.19M
            } else {
2628
1.44M
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2629
1.44M
                if (x <= UINT8_MAX) {
2630
265k
                    b->data[b->l_data++] = 'C';
2631
265k
                    b->data[b->l_data++] = x;
2632
1.17M
                } else if (x <= UINT16_MAX) {
2633
718k
                    b->data[b->l_data++] = 'S';
2634
718k
                    u16_to_le(x, b->data + b->l_data);
2635
718k
                    b->l_data += 2;
2636
718k
                } else {
2637
460k
                    b->data[b->l_data++] = 'I';
2638
460k
                    u32_to_le(x, b->data + b->l_data);
2639
460k
                    b->l_data += 4;
2640
460k
                }
2641
1.44M
            }
2642
5.10M
        } else if (type == 'f') {
2643
7.89k
            b->data[b->l_data++] = 'f';
2644
7.89k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2645
7.89k
            b->l_data += sizeof(float);
2646
5.09M
        } else if (type == 'd') {
2647
77.9k
            b->data[b->l_data++] = 'd';
2648
77.9k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2649
77.9k
            b->l_data += sizeof(double);
2650
5.01M
        } else if (type == 'Z' || type == 'H') {
            // String / hex string: copied up to the next tab (or the end of
            // the NUL-terminated text) and stored with a trailing NUL.
            // NOTE: this local "end" shadows the function parameter "end".
2651
2.05M
            char *end = strchr(q, '\t');
2652
2.05M
            if (!end) end = q + strlen(q);
2653
2.05M
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2654
2.05M
                       "hex field does not have an even number of digits");
2655
2.05M
            b->data[b->l_data++] = type;
2656
2.05M
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2657
2.05M
            memcpy(b->data + b->l_data, q, end - q);
2658
2.05M
            b->l_data += end - q;
2659
2.05M
            b->data[b->l_data++] = '\0';
2660
2.05M
            q = end;
2661
2.96M
        } else if (type == 'B') {
            // Array: a subtype letter followed by comma-separated values.
2662
2.96M
            uint32_t n;
2663
2.96M
            char *r;
2664
2.96M
            type = *q++; // q points to the first ',' following the typing byte
2665
2.96M
            _parse_err(*q && *q != ',' && *q != '\t',
2666
2.96M
                       "B aux field type not followed by ','");
2667
2668
337M
            // Count the commas (one per element) up to the end of the field;
            // r is left pointing at the terminating character.
            for (r = q, n = 0; *r > '\t'; ++r)
2669
334M
                if (*r == ',') ++n;
2670
2671
2.96M
            if (sam_parse_B_vals(type, n, q, &q, r, b) < 0)
2672
250
                goto err_ret;
2673
2.96M
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2674
2675
105M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2676
11.2M
        q++;
2677
11.2M
    }
2678
2679
281k
    // Integer overflow seen in any 'i'/'I' value is reported once, here,
    // and only in strict mode.
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2680
281k
#undef _parse_err
2681
2682
281k
    return 0;
2683
2684
1.00k
err_ret:
2685
1.00k
    return -2;
2686
281k
}
2687
2688
/*
 * Parse one line of SAM text into a BAM record.
 *
 * s  the text of a single alignment line; it is modified in place (field
 *    separating tabs are overwritten with NULs while tokenising)
 * h  header used to translate reference names to target ids
 * b  record to fill in; any previous content of b->data is overwritten
 *
 * Returns 0 on success, -2 on failure.
 */
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
{
// Yields the current token start, then NUL-terminates the token at the next
// tab and advances _p past it.  Jumps to err_ret if no tab is found.
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)

#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff

// Macro that operates on 64-bits at a time.
#define COPY_MINUS_N(to,from,n,l,failed)                        \
    do {                                                        \
        uint64_u *from8 = (uint64_u *)(from);                   \
        uint64_u *to8 = (uint64_u *)(to);                       \
        uint64_t uflow = 0;                                     \
        size_t l8 = (l)>>3, i;                                  \
        for (i = 0; i < l8; i++) {                              \
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
            uflow |= to8[i];                                    \
        }                                                       \
        for (i<<=3; i < (l); ++i) {                             \
            to[i] = from[i] - (n);                              \
            uflow |= to[i];                                     \
        }                                                       \
        failed = (uflow & 0x8080808080808080UL) > 0;            \
    } while (0)

#else

// Basic version which operates a byte at a time
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
        uint8_t uflow = 0;                                   \
        for (i = 0; i < (l); ++i) {                          \
            (to)[i] = (from)[i] - (n);                       \
            uflow |= (uint8_t) (to)[i];                      \
        }                                                    \
        failed = (uflow & 0x80) > 0;                         \
    } while (0)

#endif

// Reserve l bytes at the end of b->data and point *x at them.
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)

    uint8_t *t;

    char *p = s->s, *q;
    int i, overflow = 0;
    char logbuf[40];
    hts_pos_t cigreflen;
    bam1_core_t *c = &b->core;

    b->l_data = 0;
    memset(c, 0, 32);

    // qname
    q = _read_token(p);

    _parse_warn(p - q <= 1, "empty query name");
    _parse_err(p - q > 255, "query name too long");
    // resize large enough for name + extranul
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;

    // Pad the name with NULs so that what follows is 4-byte aligned
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
    b->l_data += c->l_extranul;

    c->l_qname = p - q + c->l_extranul;

    // flag
    c->flag = parse_sam_flag(p, &p, &overflow);
    if (*p++ != '\t') goto err_ret; // malformated flag

    // chr
    q = _read_token(p);
    if (strcmp(q, "*")) {
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
        c->tid = bam_name2id(h, q);
        _parse_err(c->tid < -1, "failed to parse header");
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    } else c->tid = -1;

    // pos
    c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->pos < 0 && c->tid >= 0) {
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
        c->tid = -1;
    }
    if (c->tid < 0) c->flag |= BAM_FUNMAP;

    // mapq
    c->qual = hts_str2uint(p, &p, 8, &overflow);
    if (*p++ != '\t') goto err_ret;
    // cigar
    if (*p != '*') {
        uint32_t *cigar = NULL;
        int old_l_data = b->l_data;
        int n_cigar = bam_parse_cigar(p, &p, b);
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
        cigar = (uint32_t *)(b->data + old_l_data);

        // can't use bam_endpos() directly as some fields not yet set up
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
        if (cigreflen == 0) cigreflen = 1;
    } else {
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
        c->flag |= BAM_FUNMAP;
        q = _read_token(p);
        cigreflen = 1;
    }
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
               "read ends beyond highest supported position");
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
    // mate chr
    q = _read_token(p);
    if (strcmp(q, "=") == 0) {
        c->mtid = c->tid;
    } else if (strcmp(q, "*") == 0) {
        c->mtid = -1;
    } else {
        c->mtid = bam_name2id(h, q);
        _parse_err(c->mtid < -1, "failed to parse header");
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    }
    // mpos
    c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->mpos < 0 && c->mtid >= 0) {
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
        c->mtid = -1;
    }
    // tlen
    c->isize = hts_str2int(p, &p, 64, &overflow);
    if (*p++ != '\t') goto err_ret;
    // seq
    q = _read_token(p);
    if (strcmp(q, "*")) {
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
        c->l_qseq = p - q - 1;
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
        i = (c->l_qseq + 1) >> 1;
        _get_mem(uint8_t, &t, b, i);

        // Pack two bases per byte; a trailing odd base goes in the high
        // nibble.  (Separate index so the outer "i" is not shadowed.)
        unsigned int lqs2 = c->l_qseq&~1, k;
        for (k = 0; k < lqs2; k+=2)
            t[k>>1] = (seq_nt16_table[(unsigned char)q[k]] << 4) | seq_nt16_table[(unsigned char)q[k+1]];
        for (; k < c->l_qseq; ++k)
            t[k>>1] = seq_nt16_table[(unsigned char)q[k]] << ((~k&1)<<2);
    } else c->l_qseq = 0;
    // qual
    _get_mem(uint8_t, &t, b, c->l_qseq);
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
        memset(t, 0xff, c->l_qseq);
        p += 2;
    } else {
        int failed = 0;
        _parse_err(s->l - (p - s->s) < c->l_qseq
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
                   "SEQ and QUAL are of different length");
        // Convert from printable (value + 33) to raw quality values
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
        _parse_err(failed, "invalid QUAL character");
        p += c->l_qseq + 1;
    }

    // FLAG, POS, MAPQ, PNEXT and TLEN all record range overflow in the same
    // flag; without this check an out-of-range value would be silently
    // clamped and stored.
    _parse_err(overflow != 0, "numeric value out of allowed range");

    // aux
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
        goto err_ret;

    if (bam_tag2cigar(b, 1, 1) < 0)
        return -2;
    return 0;

#undef _parse_warn
#undef _parse_err
#undef _get_mem
#undef _read_token
err_ret:
    return -2;
}
2868
2869
244k
/*
 * Count the CIGAR operations in the text at q by counting every non-digit
 * character up to the first tab or NUL.
 *
 * Returns the count, or 0 (after logging an error) if there are no
 * operations or there are 2147483647 or more.
 */
static uint32_t read_ncigar(const char *q) {
    const char *cur = q;
    uint32_t n_ops = 0;

    while (*cur != '\0' && *cur != '\t') {
        if (!isdigit_c(*cur))
            n_ops++;
        cur++;
    }

    if (n_ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }

    if (n_ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return n_ops;
}
2884
2885
/*! @function
2886
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2887
 @param  in      [in]  pointer to the source string
2888
 @param  a_cigar [out]  address of the destination uint32_t buffer
2889
 @return         number of processed input characters; 0 on error
2890
 */
2891
244k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2892
244k
    int i, overflow = 0;
2893
244k
    const char *p = in;
2894
503k
    for (i = 0; i < n_cigar; i++) {
2895
258k
        uint32_t len;
2896
258k
        int op;
2897
258k
        char *q;
2898
258k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2899
258k
        if (q == p) {
2900
156
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2901
156
            return 0;
2902
156
        }
2903
258k
        if (overflow) {
2904
21
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2905
21
            return 0;
2906
21
        }
2907
258k
        p = q;
2908
258k
        op = bam_cigar_table[(unsigned char)*p++];
2909
258k
        if (op < 0) {
2910
46
            hts_log_error("Unrecognized CIGAR operator");
2911
46
            return 0;
2912
46
        }
2913
258k
        a_cigar[i] = len;
2914
258k
        a_cigar[i] |= op;
2915
258k
    }
2916
2917
244k
    return p-in;
2918
244k
}
2919
2920
0
/*
 * Parse the CIGAR text at "in" into the caller-owned array *a_cigar,
 * growing it with realloc() (and updating *a_mem, its capacity in entries)
 * when it is too small.
 *
 * If "end" is non-NULL it receives a pointer just past the text consumed:
 * in+1 for "*", "in" itself when nothing was parsed.
 *
 * Returns the number of operations stored, 0 for "*" or when no operations
 * were found, -1 on NULL arguments, allocation failure or a parse error.
 */
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    if (in == NULL || a_cigar == NULL || a_mem == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }

    if (end != NULL)
        *end = (char *)in;

    if (*in == '*') {
        if (end != NULL)
            (*end)++;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0)
        return 0;

    if (n_ops > *a_mem) {
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (grown == NULL) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    int consumed = parse_cigar(in, *a_cigar, n_ops);
    if (consumed == 0)
        return -1;

    if (end != NULL)
        *end = (char *)in+consumed;

    return n_ops;
}
2952
2953
244k
/*
 * Parse the CIGAR text at "in" and store it as the CIGAR of record b,
 * resizing the CIGAR region of b->data in place and shifting any data that
 * follows it.
 *
 * If "end" is non-NULL it receives a pointer just past the text consumed.
 *
 * Returns the number of operations stored (0 when the record ends up with
 * no CIGAR), or -1 on NULL arguments, allocation failure or a parse error.
 */
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
    if (in == NULL || b == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }
    if (end != NULL)
        *end = (char *)in;

    size_t n_ops = 0;
    if (*in != '*')
        n_ops = read_ncigar(in);

    // Nothing to store and nothing to remove.
    if (n_ops == 0 && b->core.n_cigar == 0) {
        if (end != NULL)
            *end = (char *)in+1;
        return 0;
    }

    // Change in the number of operations relative to the current record.
    ssize_t growth = n_ops - b->core.n_cigar;
    if (growth > 0) {
        if (possibly_expand_bam_data(b, growth * sizeof(uint32_t)) < 0) {
            hts_log_error("Memory allocation error");
            return -1;
        }
    }

    uint32_t *dest = bam_get_cigar(b);
    uint8_t *tail = b->data + b->l_data;
    if ((uint8_t *)dest != tail) {
        // Modifying an existing BAM record: move what follows the old
        // CIGAR to where it belongs after the new one.
        uint8_t *seq = bam_get_seq(b);
        memmove(dest + n_ops, seq, tail - seq);
    }

    int consumed = 1; // handle "*"
    if (n_ops != 0) {
        consumed = parse_cigar(in, dest, n_ops);
        if (consumed == 0)
            return -1;
    }

    b->l_data += growth * sizeof(uint32_t);
    b->core.n_cigar = n_ops;
    if (end != NULL)
        *end = (char *)in + consumed;

    return n_ops;
}
2996
2997
/*
2998
 * -----------------------------------------------------------------------------
2999
 * SAM threading
3000
 */
3001
// Size of SAM text block (reading)
3002
0
#define SAM_NBYTES 240000
3003
3004
// Number of BAM records (writing, up to NB_mem in size)
3005
0
#define SAM_NBAM 1000
3006
3007
struct SAM_state;
3008
3009
// Output job - a block of BAM records
3010
typedef struct sp_bams {
3011
    struct sp_bams *next;   // link used when the block sits on a list
3012
    int serial;             // sam_parse_worker copies this from its input job
3013
3014
    bam1_t *bams;           // array of records
3015
    int nbams, abams; // used and alloc for bams[] array
3016
    size_t bam_mem;   // very approximate total size
3017
3018
    struct SAM_state *fd;   // presumably the owning threading state - TODO confirm
3019
} sp_bams;
3020
3021
// Input job - a block of SAM text
3022
typedef struct sp_lines {
3023
    struct sp_lines *next;  // link used when the block sits on a list
3024
    int serial;             // passed on to the sp_bams produced from this block
3025
3026
    char *data;             // the text itself
3027
    int data_size;          // number of bytes of data[] to parse
3028
    int alloc;              // nominal size of data[]; the dispatcher allocates
                            // 8 bytes more than this
3029
3030
    struct SAM_state *fd;   // threading state the worker reports into
3031
    sp_bams *bams;          // freed together with this block by cleanup_sp_lines
3032
} sp_lines;
3033
3034
// Commands passed to the dispatcher thread through SAM_state.command.
enum sam_cmd {
3035
    SAM_NONE = 0,       // nothing requested
3036
    SAM_CLOSE,          // set by sam_state_destroy to ask the dispatcher to stop
3037
    SAM_CLOSE_DONE,     // sam_state_destroy waits for this on the read side;
                        // presumably set by the dispatcher on exit - TODO confirm
3038
};
3039
3040
// Per-file state for multi-threaded SAM reading / writing.
typedef struct SAM_state {
3041
    sam_hdr_t *h;               // header; released via bam_hdr_destroy on close
3042
3043
    hts_tpool *p;               // thread pool
3044
    int own_pool;               // non-zero if sam_state_destroy may destroy p
3045
    pthread_mutex_t lines_m;    // guards the lines and bams lists below
3046
    hts_tpool_process *q;       // job queue within the pool
3047
    pthread_t dispatcher;       // dispatcher thread
3048
    int dispatcher_set;         // non-zero if dispatcher must be joined
3049
3050
    sp_lines *lines;            // spare text blocks kept for reuse
3051
    sp_bams *bams;              // spare record blocks kept for reuse
3052
3053
    sp_bams *curr_bam;          // partially filled block; flushed on close when writing
3054
    int curr_idx;
3055
    int serial;
3056
3057
    // Be warned: moving these mutexes around in this struct can reduce
3058
    // threading performance by up to 70%!
3059
    pthread_mutex_t command_m;  // guards command and errcode
3060
    pthread_cond_t command_c;   // signalled when command changes
3061
    enum sam_cmd command;
3062
3063
    // One of the E* errno codes
3064
    int errcode;
3065
3066
    htsFile *fp;                // file this state belongs to
3067
} SAM_state;
3068
3069
// Returns a SAM_state struct from a generic hFILE.
3070
//
3071
// Returns NULL on failure.
3072
0
static SAM_state *sam_state_create(htsFile *fp) {
3073
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3074
    // be a redirect call with an additional 'S' mode.  This in turn would
3075
    // correctly set the designed format to sam instead of a generic
3076
    // text_format.
3077
0
    if (fp->format.format != sam && fp->format.format != text_format)
3078
0
        return NULL;
3079
3080
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3081
0
    if (!fd)
3082
0
        return NULL;
3083
3084
0
    fp->state = fd;
3085
0
    fd->fp = fp;
3086
3087
0
    return fd;
3088
0
}
3089
3090
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3091
static void *sam_format_worker(void *arg);
3092
3093
0
// Records errcode in fd->errcode under the command mutex.  Only the first
// error is kept; later calls leave an existing non-zero code untouched.
static void sam_state_err(SAM_state *fd, int errcode) {
    pthread_mutex_lock(&fd->command_m);
    if (fd->errcode == 0) {
        fd->errcode = errcode;
    }
    pthread_mutex_unlock(&fd->command_m);
}
3099
3100
0
// Frees a block of BAM records: the data buffer of every allocated slot,
// the record array, and the block itself.  A NULL block is ignored.
static void sam_free_sp_bams(sp_bams *b) {
    if (b == NULL)
        return;

    bam1_t *recs = b->bams;
    if (recs != NULL) {
        int idx = 0;
        // Walk all allocated slots, not just the used ones.
        while (idx < b->abams) {
            free(recs[idx].data);   // free(NULL) is a no-op
            idx++;
        }
        free(recs);
    }

    free(b);
}
3114
3115
// Destroys the state produce by sam_state_create.
3116
14.4k
int sam_state_destroy(htsFile *fp) {
3117
14.4k
    int ret = 0;
3118
3119
14.4k
    if (!fp->state)
3120
14.4k
        return 0;
3121
3122
0
    SAM_state *fd = fp->state;
3123
0
    if (fd->p) {
3124
0
        if (fd->h) {
3125
            // Notify sam_dispatcher we're closing
3126
0
            pthread_mutex_lock(&fd->command_m);
3127
0
            if (fd->command != SAM_CLOSE_DONE)
3128
0
                fd->command = SAM_CLOSE;
3129
0
            pthread_cond_signal(&fd->command_c);
3130
0
            ret = -fd->errcode;
3131
0
            if (fd->q)
3132
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3133
3134
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3135
0
                for (;;) {
3136
                    // Avoid deadlocks with dispatcher
3137
0
                    if (fd->command == SAM_CLOSE_DONE)
3138
0
                        break;
3139
0
                    hts_tpool_wake_dispatch(fd->q);
3140
0
                    pthread_mutex_unlock(&fd->command_m);
3141
0
                    usleep(10000);
3142
0
                    pthread_mutex_lock(&fd->command_m);
3143
0
                }
3144
0
            }
3145
0
            pthread_mutex_unlock(&fd->command_m);
3146
3147
0
            if (fp->is_write) {
3148
                // Dispatch the last partial block.
3149
0
                sp_bams *gb = fd->curr_bam;
3150
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3151
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3152
3153
                // Flush and drain output
3154
0
                if (fd->q)
3155
0
                    hts_tpool_process_flush(fd->q);
3156
0
                pthread_mutex_lock(&fd->command_m);
3157
0
                if (!ret) ret = -fd->errcode;
3158
0
                pthread_mutex_unlock(&fd->command_m);
3159
3160
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3161
0
                    usleep(10000);
3162
0
                    pthread_mutex_lock(&fd->command_m);
3163
0
                    ret = -fd->errcode;
3164
                    // not empty but shutdown implies error
3165
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3166
0
                        ret = EIO;
3167
0
                    pthread_mutex_unlock(&fd->command_m);
3168
0
                }
3169
0
                if (fd->q)
3170
0
                    hts_tpool_process_shutdown(fd->q);
3171
0
            }
3172
3173
            // Wait for it to acknowledge
3174
0
            if (fd->dispatcher_set)
3175
0
                pthread_join(fd->dispatcher, NULL);
3176
0
            if (!ret) ret = -fd->errcode;
3177
0
        }
3178
3179
        // Tidy up memory
3180
0
        if (fd->q)
3181
0
            hts_tpool_process_destroy(fd->q);
3182
3183
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3184
0
            hts_tpool_destroy(fd->p);
3185
0
            fd->p = NULL;
3186
0
        }
3187
0
        pthread_mutex_destroy(&fd->lines_m);
3188
0
        pthread_mutex_destroy(&fd->command_m);
3189
0
        pthread_cond_destroy(&fd->command_c);
3190
3191
0
        sp_lines *l = fd->lines;
3192
0
        while (l) {
3193
0
            sp_lines *n = l->next;
3194
0
            free(l->data);
3195
0
            free(l);
3196
0
            l = n;
3197
0
        }
3198
3199
0
        sp_bams *b = fd->bams;
3200
0
        while (b) {
3201
0
            if (fd->curr_bam == b)
3202
0
                fd->curr_bam = NULL;
3203
0
            sp_bams *n = b->next;
3204
0
            sam_free_sp_bams(b);
3205
0
            b = n;
3206
0
        }
3207
3208
0
        if (fd->curr_bam)
3209
0
            sam_free_sp_bams(fd->curr_bam);
3210
3211
        // Decrement counter by one, maybe destroying too.
3212
        // This is to permit the caller using bam_hdr_destroy
3213
        // before sam_close without triggering decode errors
3214
        // in the background threads.
3215
0
        bam_hdr_destroy(fd->h);
3216
0
    }
3217
3218
0
    free(fp->state);
3219
0
    fp->state = NULL;
3220
0
    return ret;
3221
14.4k
}
3222
3223
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3224
0
static void cleanup_sp_lines(void *arg) {
3225
0
    sp_lines *gl = (sp_lines *)arg;
3226
0
    if (!gl) return;
3227
3228
    // Should always be true for lines passed to / from thread workers.
3229
0
    assert(gl->next == NULL);
3230
3231
0
    free(gl->data);
3232
0
    sam_free_sp_bams(gl->bams);
3233
0
    free(gl);
3234
0
}
3235
3236
// Run from one of the worker threads.
3237
// Convert a passed in array of lines to array of BAMs, returning
3238
// the result back to the thread queue.
3239
0
static void *sam_parse_worker(void *arg) {
3240
0
    sp_lines *gl = (sp_lines *)arg;
3241
0
    sp_bams *gb = NULL;
3242
0
    char *lines = gl->data;
3243
0
    int i;
3244
0
    bam1_t *b;
3245
0
    SAM_state *fd = gl->fd;
3246
3247
    // Use a block of BAM structs we had earlier if available.
3248
0
    pthread_mutex_lock(&fd->lines_m);
3249
0
    if (fd->bams) {
3250
0
        gb = fd->bams;
3251
0
        fd->bams = gb->next;
3252
0
    }
3253
0
    pthread_mutex_unlock(&fd->lines_m);
3254
3255
0
    if (gb == NULL) {
3256
0
        gb = calloc(1, sizeof(*gb));
3257
0
        if (!gb) {
3258
0
            return NULL;
3259
0
        }
3260
0
        gb->abams = 100;
3261
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3262
0
        if (!gb->bams) {
3263
0
            sam_state_err(fd, ENOMEM);
3264
0
            goto err;
3265
0
        }
3266
0
        gb->nbams = 0;
3267
0
        gb->bam_mem = 0;
3268
0
    }
3269
0
    gb->serial = gl->serial;
3270
0
    gb->next = NULL;
3271
3272
0
    b = (bam1_t *)gb->bams;
3273
0
    if (!b) {
3274
0
        sam_state_err(fd, ENOMEM);
3275
0
        goto err;
3276
0
    }
3277
3278
0
    i = 0;
3279
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3280
0
    while (cp < cp_end) {
3281
0
        if (i >= gb->abams) {
3282
0
            int old_abams = gb->abams;
3283
0
            gb->abams *= 2;
3284
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3285
0
            if (!b) {
3286
0
                gb->abams /= 2;
3287
0
                sam_state_err(fd, ENOMEM);
3288
0
                goto err;
3289
0
            }
3290
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3291
0
            gb->bams = b;
3292
0
        }
3293
3294
        // Ideally we'd get sam_parse1 to return the number of
3295
        // bytes decoded and to be able to stop on newline as
3296
        // well as \0.
3297
        //
3298
        // We can then avoid the additional strchr loop.
3299
        // It's around 6% of our CPU cost, albeit threadable.
3300
        //
3301
        // However this is an API change so for now we copy.
3302
3303
0
        char *nl = strchr(cp, '\n');
3304
0
        char *line_end;
3305
0
        if (nl) {
3306
0
            line_end = nl;
3307
0
            if (line_end > cp && *(line_end - 1) == '\r')
3308
0
                line_end--;
3309
0
            nl++;
3310
0
        } else {
3311
0
            nl = line_end = cp_end;
3312
0
        }
3313
0
        *line_end = '\0';
3314
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3315
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3316
0
            sam_state_err(fd, errno ? errno : EIO);
3317
0
            cleanup_sp_lines(gl);
3318
0
            goto err;
3319
0
        }
3320
3321
0
        cp = nl;
3322
0
        i++;
3323
0
    }
3324
0
    gb->nbams = i;
3325
3326
0
    pthread_mutex_lock(&fd->lines_m);
3327
0
    gl->next = fd->lines;
3328
0
    fd->lines = gl;
3329
0
    pthread_mutex_unlock(&fd->lines_m);
3330
0
    return gb;
3331
3332
0
 err:
3333
0
    sam_free_sp_bams(gb);
3334
0
    return NULL;
3335
0
}
3336
3337
0
// Job function that does no work: ignores its argument and yields NULL.
static void *sam_parse_eof(void *arg) {
    (void) arg;
    return NULL;
}
3340
3341
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3342
0
static void cleanup_sp_bams(void *arg) {
3343
0
    sam_free_sp_bams((sp_bams *) arg);
3344
0
}
3345
3346
// Runs in its own thread.
3347
// Reads a block of text (SAM) and sends a new job to the thread queue to
3348
// translate this to BAM.
3349
0
static void *sam_dispatcher_read(void *vp) {
3350
0
    htsFile *fp = vp;
3351
0
    kstring_t line = {0};
3352
0
    int line_frag = 0;
3353
0
    SAM_state *fd = fp->state;
3354
0
    sp_lines *l = NULL;
3355
3356
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3357
    // matter as it will grow if necessary).
3358
0
    if (ks_resize(&line, 1000) < 0)
3359
0
        goto err;
3360
3361
0
    for (;;) {
3362
        // Check for command
3363
0
        pthread_mutex_lock(&fd->command_m);
3364
0
        switch (fd->command) {
3365
3366
0
        case SAM_CLOSE:
3367
0
            pthread_cond_signal(&fd->command_c);
3368
0
            pthread_mutex_unlock(&fd->command_m);
3369
0
            hts_tpool_process_shutdown(fd->q);
3370
0
            goto tidyup;
3371
3372
0
        default:
3373
0
            break;
3374
0
        }
3375
0
        pthread_mutex_unlock(&fd->command_m);
3376
3377
0
        pthread_mutex_lock(&fd->lines_m);
3378
0
        if (fd->lines) {
3379
            // reuse existing line buffer
3380
0
            l = fd->lines;
3381
0
            fd->lines = l->next;
3382
0
        }
3383
0
        pthread_mutex_unlock(&fd->lines_m);
3384
3385
0
        if (l == NULL) {
3386
            // none to reuse, to create a new one
3387
0
            l = calloc(1, sizeof(*l));
3388
0
            if (!l)
3389
0
                goto err;
3390
0
            l->alloc = SAM_NBYTES;
3391
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3392
0
            if (!l->data) {
3393
0
                free(l);
3394
0
                l = NULL;
3395
0
                goto err;
3396
0
            }
3397
0
            l->fd = fd;
3398
0
        }
3399
0
        l->next = NULL;
3400
3401
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3402
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3403
0
            if (!rp)
3404
0
                goto err;
3405
0
            l->alloc = line_frag+SAM_NBYTES/2;
3406
0
            l->data = rp;
3407
0
        }
3408
0
        memcpy(l->data, line.s, line_frag);
3409
3410
0
        l->data_size = line_frag;
3411
0
        ssize_t nbytes;
3412
0
    longer_line:
3413
0
        if (fp->is_bgzf)
3414
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3415
0
        else
3416
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3417
0
        if (nbytes < 0) {
3418
0
            sam_state_err(fd, errno ? errno : EIO);
3419
0
            goto err;
3420
0
        } else if (nbytes == 0)
3421
0
            break; // EOF
3422
0
        l->data_size += nbytes;
3423
3424
        // trim to last \n. Maybe \r\n, but that's still fine
3425
0
        if (nbytes == l->alloc - line_frag) {
3426
0
            char *cp_end = l->data + l->data_size;
3427
0
            char *cp = cp_end-1;
3428
3429
0
            while (cp > (char *)l->data && *cp != '\n')
3430
0
                cp--;
3431
3432
            // entire buffer is part of a single line
3433
0
            if (cp == l->data) {
3434
0
                line_frag = l->data_size;
3435
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3436
0
                if (!rp)
3437
0
                    goto err;
3438
0
                l->alloc *= 2;
3439
0
                l->data = rp;
3440
0
                assert(l->alloc >= l->data_size);
3441
0
                assert(l->alloc >= line_frag);
3442
0
                assert(l->alloc >= l->alloc - line_frag);
3443
0
                goto longer_line;
3444
0
            }
3445
0
            cp++;
3446
3447
            // line holds the remainder of our line.
3448
0
            if (ks_resize(&line, cp_end - cp) < 0)
3449
0
                goto err;
3450
0
            memcpy(line.s, cp, cp_end - cp);
3451
0
            line_frag = cp_end - cp;
3452
0
            l->data_size = l->alloc - line_frag;
3453
0
        } else {
3454
            // out of buffer
3455
0
            line_frag = 0;
3456
0
        }
3457
3458
0
        l->serial = fd->serial++;
3459
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3460
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3461
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3462
0
            goto err;
3463
0
        pthread_mutex_lock(&fd->command_m);
3464
0
        if (fd->command == SAM_CLOSE) {
3465
0
            pthread_mutex_unlock(&fd->command_m);
3466
0
            l = NULL;
3467
0
            goto tidyup;
3468
0
        }
3469
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3470
0
        pthread_mutex_unlock(&fd->command_m);
3471
0
    }
3472
3473
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3474
0
        goto err;
3475
3476
    // At EOF, wait for close request.
3477
    // (In future if we add support for seek, this is where we need to catch it.)
3478
0
    for (;;) {
3479
0
        pthread_mutex_lock(&fd->command_m);
3480
0
        if (fd->command == SAM_NONE)
3481
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3482
0
        switch (fd->command) {
3483
0
        case SAM_CLOSE:
3484
0
            pthread_cond_signal(&fd->command_c);
3485
0
            pthread_mutex_unlock(&fd->command_m);
3486
0
            hts_tpool_process_shutdown(fd->q);
3487
0
            goto tidyup;
3488
3489
0
        default:
3490
0
            pthread_mutex_unlock(&fd->command_m);
3491
0
            break;
3492
0
        }
3493
0
    }
3494
3495
0
 tidyup:
3496
0
    pthread_mutex_lock(&fd->command_m);
3497
0
    fd->command = SAM_CLOSE_DONE;
3498
0
    pthread_cond_signal(&fd->command_c);
3499
0
    pthread_mutex_unlock(&fd->command_m);
3500
3501
0
    if (l) {
3502
0
        pthread_mutex_lock(&fd->lines_m);
3503
0
        l->next = fd->lines;
3504
0
        fd->lines = l;
3505
0
        pthread_mutex_unlock(&fd->lines_m);
3506
0
    }
3507
0
    free(line.s);
3508
3509
0
    return NULL;
3510
3511
0
 err:
3512
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3513
0
    hts_tpool_process_shutdown(fd->q);
3514
0
    goto tidyup;
3515
0
}
3516
3517
// Runs in its own thread.
3518
// Takes encoded blocks of SAM off the thread results queue and writes them
3519
// to our output stream.
3520
0
static void *sam_dispatcher_write(void *vp) {
3521
0
    htsFile *fp = vp;
3522
0
    SAM_state *fd = fp->state;
3523
0
    hts_tpool_result *r;
3524
3525
    // Iterates until result queue is shutdown, where it returns NULL.
3526
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3527
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3528
0
        if (!gl) {
3529
0
            sam_state_err(fd, ENOMEM);
3530
0
            goto err;
3531
0
        }
3532
3533
0
        if (fp->idx) {
3534
0
            sp_bams *gb = gl->bams;
3535
0
            int i = 0, count = 0;
3536
0
            while (i < gl->data_size) {
3537
0
                int j = i;
3538
0
                while (i < gl->data_size && gl->data[i] != '\n')
3539
0
                    i++;
3540
0
                if (i < gl->data_size)
3541
0
                    i++;
3542
3543
0
                if (fp->is_bgzf) {
3544
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3545
0
                        goto err;
3546
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3547
0
                        goto err;
3548
0
                } else {
3549
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3550
0
                        goto err;
3551
0
                }
3552
3553
0
                bam1_t *b = &gb->bams[count++];
3554
0
                if (fp->format.compression == bgzf) {
3555
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3556
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3557
0
                                      bgzf_tell(fp->fp.bgzf),
3558
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3559
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3560
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3561
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3562
0
                        goto err;
3563
0
                    }
3564
0
                } else {
3565
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3566
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3567
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3568
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3569
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3570
0
                        goto err;
3571
0
                    }
3572
0
                }
3573
0
            }
3574
3575
0
            assert(count == gb->nbams);
3576
3577
            // Add bam array to free-list
3578
0
            pthread_mutex_lock(&fd->lines_m);
3579
0
            gb->next = fd->bams;
3580
0
            fd->bams = gl->bams;
3581
0
            gl->bams = NULL;
3582
0
            pthread_mutex_unlock(&fd->lines_m);
3583
0
        } else {
3584
0
            if (fp->is_bgzf) {
3585
                // We keep track of how much in the current block we have
3586
                // remaining => R.  We look for the last newline in input
3587
                // [i] to [i+R], backwards => position N.
3588
                //
3589
                // If we find a newline, we write out bytes i to N.
3590
                // We know we cannot fit the next record in this bgzf block,
3591
                // so we flush what we have and copy input N to i+R into
3592
                // the start of a new block, and recompute a new R for that.
3593
                //
3594
                // If we don't find a newline (i==N) then we cannot extend
3595
                // the current block at all, so flush whatever is in it now
3596
                // if it ends on a newline.
3597
                // We still copy i(==N) to i+R to the next block and
3598
                // continue as before with a new R.
3599
                //
3600
                // The only exception on the flush is when we run out of
3601
                // data in the input.  In that case we skip it as we don't
3602
                // yet know if the next record will fit.
3603
                //
3604
                // Both conditions share the same code here:
3605
                // - Look for newline (pos N)
3606
                // - Write i to N (which maybe 0)
3607
                // - Flush if block ends on newline and not end of input
3608
                // - write N to i+R
3609
3610
0
                int i = 0;
3611
0
                BGZF *fb = fp->fp.bgzf;
3612
0
                while (i < gl->data_size) {
3613
                    // remaining space in block
3614
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3615
0
                    int eod = 0;
3616
0
                    if (R > gl->data_size-i)
3617
0
                        R = gl->data_size-i, eod = 1;
3618
3619
                    // Find last newline in input data
3620
0
                    int N = i + R;
3621
0
                    while (--N > i) {
3622
0
                        if (gl->data[N] == '\n')
3623
0
                            break;
3624
0
                    }
3625
3626
0
                    if (N != i) {
3627
                        // Found a newline
3628
0
                        N++;
3629
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3630
0
                            goto err;
3631
0
                    }
3632
3633
                    // Flush bgzf block
3634
0
                    int b_off = fb->block_offset;
3635
0
                    if (!eod && b_off &&
3636
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3637
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3638
0
                            goto err;
3639
3640
                    // Copy from N onwards into next block
3641
0
                    if (i+R > N)
3642
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3643
0
                            != i+R - N)
3644
0
                            goto err;
3645
3646
0
                    i = i+R;
3647
0
                }
3648
0
            } else {
3649
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3650
0
                    goto err;
3651
0
            }
3652
0
        }
3653
3654
0
        hts_tpool_delete_result(r, 0);
3655
3656
        // Also updated by main thread
3657
0
        pthread_mutex_lock(&fd->lines_m);
3658
0
        gl->next = fd->lines;
3659
0
        fd->lines = gl;
3660
0
        pthread_mutex_unlock(&fd->lines_m);
3661
0
    }
3662
3663
0
    sam_state_err(fd, 0); // success
3664
0
    hts_tpool_process_shutdown(fd->q);
3665
0
    return NULL;
3666
3667
0
 err:
3668
0
    sam_state_err(fd, errno ? errno : EIO);
3669
0
    return (void *)-1;
3670
0
}
3671
3672
// Run from one of the worker threads.
3673
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3674
// of text SAM records (sp_lines).
3675
0
static void *sam_format_worker(void *arg) {
3676
0
    sp_bams *gb = (sp_bams *)arg;
3677
0
    sp_lines *gl = NULL;
3678
0
    int i;
3679
0
    SAM_state *fd = gb->fd;
3680
0
    htsFile *fp = fd->fp;
3681
3682
    // Use a block of SAM strings we had earlier if available.
3683
0
    pthread_mutex_lock(&fd->lines_m);
3684
0
    if (fd->lines) {
3685
0
        gl = fd->lines;
3686
0
        fd->lines = gl->next;
3687
0
    }
3688
0
    pthread_mutex_unlock(&fd->lines_m);
3689
3690
0
    if (gl == NULL) {
3691
0
        gl = calloc(1, sizeof(*gl));
3692
0
        if (!gl) {
3693
0
            sam_state_err(fd, ENOMEM);
3694
0
            return NULL;
3695
0
        }
3696
0
        gl->alloc = gl->data_size = 0;
3697
0
        gl->data = NULL;
3698
0
    }
3699
0
    gl->serial = gb->serial;
3700
0
    gl->next = NULL;
3701
3702
0
    kstring_t ks = {0, gl->alloc, gl->data};
3703
3704
0
    for (i = 0; i < gb->nbams; i++) {
3705
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3706
0
            sam_state_err(fd, errno ? errno : EIO);
3707
0
            goto err;
3708
0
        }
3709
0
        kputc('\n', &ks);
3710
0
    }
3711
3712
0
    pthread_mutex_lock(&fd->lines_m);
3713
0
    gl->data_size = ks.l;
3714
0
    gl->alloc = ks.m;
3715
0
    gl->data = ks.s;
3716
3717
0
    if (fp->idx) {
3718
        // Keep hold of the bam array a little longer as
3719
        // sam_dispatcher_write needs to use them for building the index.
3720
0
        gl->bams = gb;
3721
0
    } else {
3722
        // Add bam array to free-list
3723
0
        gb->next = fd->bams;
3724
0
        fd->bams = gb;
3725
0
    }
3726
0
    pthread_mutex_unlock(&fd->lines_m);
3727
3728
0
    return gl;
3729
3730
0
 err:
3731
    // Possible race between this and fd->curr_bam.
3732
    // Easier to not free and leave it on the input list so it
3733
    // gets freed there instead?
3734
    // sam_free_sp_bams(gb);
3735
0
    if (gl) {
3736
0
        free(gl->data);
3737
0
        free(gl);
3738
0
    }
3739
0
    return NULL;
3740
0
}
3741
3742
0
// Attaches the thread pool p to fp for multi-threaded SAM processing.
//
// Does nothing (returning 0) when fp already has a state.  Otherwise it
// creates the SAM_state, initialises its mutexes and condition variable,
// creates the process queue and, for bgzf compressed files, passes the
// pool on to the BGZF layer as well.
//
// Returns 0 on success, -1 if the state or queue could not be created, or
// the result of bgzf_thread_pool() for bgzf compressed files.
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
    if (fp->state != NULL)
        return 0;

    fp->state = sam_state_create(fp);
    if (fp->state == NULL)
        return -1;

    SAM_state *fd = (SAM_state *)fp->state;
    pthread_mutex_init(&fd->lines_m, NULL);
    pthread_mutex_init(&fd->command_m, NULL);
    pthread_cond_init(&fd->command_c, NULL);
    fd->p = p->pool;

    // A queue size of zero means "pick one": twice the pool size.
    int qsize = p->qsize;
    if (qsize == 0)
        qsize = 2*hts_tpool_size(fd->p);

    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
    if (fd->q == NULL) {
        sam_state_destroy(fp);
        return -1;
    }

    if (fp->format.compression != bgzf)
        return 0;

    return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
}
3768
3769
0
// Creates a private pool of nthreads threads and attaches it to fp via
// sam_set_thread_pool(), marking the resulting state as owning the pool.
//
// nthreads <= 0 is a no-op.  Returns 0 on success, -1 if the pool could
// not be created, or the negative result of sam_set_thread_pool().
int sam_set_threads(htsFile *fp, int nthreads) {
    if (nthreads <= 0)
        return 0;

    htsThreadPool p;
    p.pool = hts_tpool_init(nthreads);
    if (!p.pool)
        return -1;  // don't hand a NULL pool on to be dereferenced
    p.qsize = nthreads*2;

    int ret = sam_set_thread_pool(fp, &p);
    if (ret < 0)
        return ret;

    // NOTE(review): if fp already had a state, sam_set_thread_pool()
    // returned 0 without using p, so the pool created above is not attached
    // to anything - confirm whether callers can reach that case.
    SAM_state *fd = (SAM_state *)fp->state;
    fd->own_pool = 1;

    return 0;
}
3786
3787
typedef struct {
3788
    kstring_t name;
3789
    kstring_t comment; // NB: pointer into name, do not free
3790
    kstring_t seq;
3791
    kstring_t qual;
3792
    int casava;
3793
    int aux;
3794
    int rnum;
3795
    char BC[3];         // aux tag ID for barcode
3796
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3797
    char nprefix;
3798
    int sra_names;
3799
} fastq_state;
3800
3801
// Initialise fastq state.
3802
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3803
1.39k
static fastq_state *fastq_state_init(int name_char) {
3804
1.39k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3805
1.39k
    if (!x)
3806
0
        return NULL;
3807
1.39k
    strcpy(x->BC, "BC");
3808
1.39k
    x->nprefix = name_char;
3809
3810
1.39k
    return x;
3811
1.39k
}
3812
3813
1.85k
// Releases the fastq state attached to fp, if any: the tag hash, the name,
// seq and qual buffers and the state itself.  (The comment kstring is a
// pointer into name, so it is not freed separately.)
// fp->state is reset to NULL afterwards so that a repeated call, or a later
// read on the same handle, cannot touch the freed state.
void fastq_state_destroy(htsFile *fp) {
    if (!fp->state)
        return;

    fastq_state *x = (fastq_state *)fp->state;
    if (x->tags)
        kh_destroy(tag, x->tags);
    ks_free(&x->name);
    ks_free(&x->seq);
    ks_free(&x->qual);
    free(x);
    fp->state = NULL;
}
3824
3825
0
// Sets one FASTA / FASTQ option on fp, creating the fastq state first if
// fp does not have one yet.
//
//   FASTQ_OPT_CASAVA   no argument; enables CASAVA comment parsing.
//   FASTQ_OPT_NAME2    no argument; enables SRA style name handling.
//   FASTQ_OPT_RNUM     no argument; sets the rnum flag.
//   FASTQ_OPT_AUX      char *: NULL or "1" selects all aux tags, otherwise
//                      a comma separated list of two-character tag names.
//   FASTQ_OPT_BARCODE  char *: two-character aux tag used for the barcode.
//
// Other options are ignored.  Returns 0 on success, -1 on failure (NULL fp,
// allocation failure, or a NULL barcode tag).
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
    va_list args;

    if (!fp)
        return -1;
    if (!fp->state)
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
                                           ? '@' : '>')))
            return -1;

    fastq_state *x = (fastq_state *)fp->state;

    switch (opt) {
    case FASTQ_OPT_CASAVA:
        x->casava = 1;
        break;

    case FASTQ_OPT_NAME2:
        x->sra_names = 1;
        break;

    case FASTQ_OPT_AUX: {
        va_start(args, opt);
        x->aux = 1;
        char *tag = va_arg(args, char *);
        va_end(args);
        if (tag && strcmp(tag, "1") != 0) {
            if (!x->tags)
                if (!(x->tags = kh_init(tag)))
                    return -1;

            // Walk the list three characters at a time: two for the tag
            // name, then either a ',' separator or the terminating NUL.
            size_t i, tlen = strlen(tag);
            for (i = 0; i+3 <= tlen+1; i += 3) {
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
                    break;
                }
                int ret, tcode = tag[i+0]*256 + tag[i+1];
                kh_put(tag, x->tags, tcode, &ret);
                if (ret < 0)
                    return -1;
            }
        }
        break;
    }

    case FASTQ_OPT_BARCODE: {
        va_start(args, opt);
        char *bc = va_arg(args, char *);
        va_end(args);
        // A NULL tag cannot be copied; report it rather than crash.
        if (!bc)
            return -1;
        strncpy(x->BC, bc, 2);
        x->BC[2] = 0;
        break;
    }

    case FASTQ_OPT_RNUM:
        x->rnum = 1;
        break;

    default:
        break;
    }
    return 0;
}
3890
3891
29.9M
// Reads one FASTA or FASTQ record from fp and converts it to an unmapped
// BAM record in b.
//
// The name line is split into read name and comment; a trailing "/<digit>"
// on the name sets the read1/read2 flags.  When the casava option is on, a
// CASAVA style comment supplies the read number, QC-fail flag and barcode.
// When the aux option is on, the comment is also scanned for SAM style aux
// tags.
//
// Returns the result of bam_set1() on success, -1 on EOF, and a value
// below -1 on error.
static int fastq_parse1(htsFile *fp, bam1_t *b) {
    fastq_state *x = (fastq_state *)fp->state;
    const int is_fastq = (fp->format.format == fastq_format);
    const int is_fasta = (fp->format.format == fasta_format);
    size_t i, l;
    int ret = 0;

    if (is_fasta && fp->line.s) {
        // For FASTA we've already read the >name line; steal it
        // Not the most efficient, but we don't optimise for fasta reading.
        if (fp->line.l == 0)
            return -1; // EOF

        free(x->name.s);
        x->name = fp->line;
        fp->line.l = fp->line.m = 0;
        fp->line.s = NULL;
    } else {
        // Read the name line.  -1 is EOF; anything lower is an error and
        // is passed back unchanged.
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
        if (ret < 0)
            return ret;
    }

    // Name
    if (*x->name.s != x->nprefix)
        return -2;

    // Reverse the SRA strangeness of putting the run_name.number before
    // the read name.
    i = 0;
    char *name = x->name.s+1;
    if (x->sra_names) {
        char *cp = strpbrk(x->name.s, " \t");
        if (cp) {
            while (*cp == ' ' || *cp == '\t')
                cp++;
            // Overwrite the last blank with '@' so the real name is
            // prefixed just like an ordinary one.
            *--cp = '@';
            i = cp - x->name.s;
            name = cp+1;
        }
    }

    // Terminate the name at the first white-space character.
    l = x->name.l;
    char *s = x->name.s;
    for (; i < l; i++) {
        if (isspace_c(s[i]))
            break;
    }
    if (i < l) {
        s[i] = 0;
        x->name.l = i++;
    }

    // Comment; a kstring struct, but pointer into name line.  (Do not free)
    for (; i < l; i++) {
        if (!isspace_c(s[i]))
            break;
    }
    x->comment.s = s+i;
    x->comment.l = l - i;

    // Seq: gather lines up to the '+' line (fastq) or the next '>' line /
    // end of file (fasta).
    const char seq_end = is_fastq ? '+' : '>';
    x->seq.l = 0;
    for (;;) {
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
        if (ret < 0 && (is_fastq || ret < -1))
            return -2;
        if (ret == -1 || *fp->line.s == seq_end)
            break;
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
            return -2;
    }

    // Qual
    if (is_fastq) {
        // Keep reading lines until the quality is as long as the sequence;
        // a line that would make it longer is an error.
        size_t remainder = x->seq.l;
        x->qual.l = 0;
        do {
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
                return -2;
            if (fp->line.l > remainder)
                return -2;
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
                return -2;
            remainder -= fp->line.l;
        } while (remainder > 0);

        // Decr qual
        for (i = 0; i < x->qual.l; i++)
            x->qual.s[i] -= '!';
    }

    // A "/<digit>" name suffix gives the read number and is then removed.
    int flag = BAM_FUNMAP;
    int pflag = BAM_FMUNMAP | BAM_FPAIRED;
    if (x->name.l > 2 &&
        x->name.s[x->name.l-2] == '/' &&
        isdigit_c(x->name.s[x->name.l-1])) {
        char rnum = x->name.s[x->name.l-1];
        if (rnum == '1')
            flag |= BAM_FREAD1 | pflag;
        else if (rnum == '2')
            flag |= BAM_FREAD2 | pflag;
        else
            flag |= BAM_FREAD1 | BAM_FREAD2 | pflag;
        x->name.l -= 2;
        x->name.s[x->name.l] = 0;
    }

    // Convert to BAM
    ret = bam_set1(b,
                   x->name.s + x->name.l - name, name,
                   flag,
                   -1, -1, 0, // ref '*', pos, mapq,
                   0, NULL,     // no cigar,
                   -1, -1, 0,    // mate
                   x->seq.l, x->seq.s, x->qual.s,
                   0);

    // Identify Illumina CASAVA strings.
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
    char *barcode = NULL;
    int barcode_len = 0;
    kstring_t *kc = &x->comment;
    char *endptr;
    if (x->casava &&
        // \d:[YN]:\d+:[ACGTN]+
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
        && *endptr == ':') {

        // read num
        if (kc->s[0] == '1')
            b->core.flag |= BAM_FREAD1 | pflag;
        else if (kc->s[0] == '2')
            b->core.flag |= BAM_FREAD2 | pflag;
        else
            b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag;

        if (kc->s[2] == 'Y')
            b->core.flag |= BAM_FQCFAIL;

        // Barcode, maybe numeric in which case we skip it
        if (!isdigit_c(endptr[1])) {
            barcode = endptr+1;
            for (i = barcode - kc->s; i < kc->l; i++)
                if (isspace_c(kc->s[i]))
                    break;

            // Terminate the barcode; the length includes that NUL.
            kc->s[i] = 0;
            barcode_len = i+1-(barcode - kc->s);
        }
    }

    if (ret >= 0 && barcode_len)
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
            ret = -2;

    // Identify any SAM style aux tags in comments too.
    if (x->aux &&
        aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
        ret = -2;

    return ret;
}
4050
4051
// Internal component of sam_read1 below
4052
4.16k
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4053
4.16k
    int ret = bam_read1(fp->fp.bgzf, b);
4054
4.16k
    if (h && ret >= 0) {
4055
3.85k
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4056
3.85k
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4057
63
            errno = ERANGE;
4058
63
            return -3;
4059
63
        }
4060
3.85k
    }
4061
4.09k
    return ret;
4062
4.16k
}
4063
4064
// Internal component of sam_read1 below
4065
4.04k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4066
4.04k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4067
4.04k
    if (ret < 0)
4068
4.04k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4069
4070
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4071
0
        return -2;
4072
4073
0
    return ret;
4074
0
}
4075
4076
// Internal component of sam_read1 below
4077
288k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4078
288k
    int ret;
4079
4080
    // Consume 1st line after header parsing as it wasn't using peek
4081
288k
    if (fp->line.l != 0) {
4082
0
        ret = sam_parse1(&fp->line, h, b);
4083
0
        fp->line.l = 0;
4084
0
        return ret;
4085
0
    }
4086
4087
288k
    if (fp->state) {
4088
0
        SAM_state *fd = (SAM_state *)fp->state;
4089
4090
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4091
            // We don't support multi-threaded SAM parsing with seeks yet.
4092
0
            int ret;
4093
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4094
0
                errno = -ret;
4095
0
                return -2;
4096
0
            }
4097
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4098
0
                return -1;
4099
0
            fp->fp.bgzf->seeked = 0;
4100
0
            goto err_recover;
4101
0
        }
4102
4103
0
        if (!fd->h) {
4104
0
            fd->h = h;
4105
0
            fd->h->ref_count++;
4106
            // Ensure hrecs is initialised now as we don't want multiple
4107
            // threads trying to do this simultaneously.
4108
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4109
0
                return -2;
4110
4111
            // We can only do this once we've got a header
4112
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4113
0
                               fp) != 0)
4114
0
                return -2;
4115
0
            fd->dispatcher_set = 1;
4116
0
        }
4117
4118
0
        if (fd->h != h) {
4119
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4120
0
            return -1;
4121
0
        }
4122
4123
0
        sp_bams *gb = fd->curr_bam;
4124
0
        if (!gb) {
4125
0
            if (fd->errcode) {
4126
                // In case reader failed
4127
0
                errno = fd->errcode;
4128
0
                return -2;
4129
0
            }
4130
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4131
0
            if (!r)
4132
0
                return -2;
4133
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4134
0
            hts_tpool_delete_result(r, 0);
4135
0
        }
4136
0
        if (!gb)
4137
0
            return fd->errcode ? -2 : -1;
4138
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4139
0
        if (fd->curr_idx < gb->nbams)
4140
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4141
0
                return -2;
4142
0
        if (fd->curr_idx == gb->nbams) {
4143
0
            pthread_mutex_lock(&fd->lines_m);
4144
0
            gb->next = fd->bams;
4145
0
            fd->bams = gb;
4146
0
            pthread_mutex_unlock(&fd->lines_m);
4147
4148
0
            fd->curr_bam = NULL;
4149
0
            fd->curr_idx = 0;
4150
0
        }
4151
4152
0
        ret = 0;
4153
4154
288k
    } else  {
4155
288k
    err_recover:
4156
288k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4157
288k
        if (ret < 0) return ret;
4158
4159
285k
        ret = sam_parse1(&fp->line, h, b);
4160
285k
        fp->line.l = 0;
4161
285k
        if (ret < 0) {
4162
3.46k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4163
3.46k
            if (h && h->ignore_sam_err) goto err_recover;
4164
3.46k
        }
4165
285k
    }
4166
4167
285k
    return ret;
4168
288k
}
4169
4170
// Returns 0 on success,
4171
//        -1 on EOF,
4172
//       <-1 on error
4173
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4174
30.2M
{
4175
30.2M
    int ret, pass_filter;
4176
4177
30.2M
    do {
4178
30.2M
        switch (fp->format.format) {
4179
4.16k
        case bam:
4180
4.16k
            ret = sam_read1_bam(fp, h, b);
4181
4.16k
            break;
4182
4183
4.04k
        case cram:
4184
4.04k
            ret = sam_read1_cram(fp, h, &b);
4185
4.04k
            break;
4186
4187
288k
        case sam:
4188
288k
            ret = sam_read1_sam(fp, h, b);
4189
288k
            break;
4190
4191
29.9M
        case fasta_format:
4192
29.9M
        case fastq_format: {
4193
29.9M
            fastq_state *x = (fastq_state *)fp->state;
4194
29.9M
            if (!x) {
4195
1.39k
                if (!(fp->state = fastq_state_init(fp->format.format
4196
1.39k
                                                   == fastq_format ? '@' : '>')))
4197
0
                    return -2;
4198
1.39k
            }
4199
4200
29.9M
            return fastq_parse1(fp, b);
4201
29.9M
        }
4202
4203
0
        case empty_format:
4204
0
            errno = EPIPE;
4205
0
            return -3;
4206
4207
0
        default:
4208
0
            errno = EFTYPE;
4209
0
            return -3;
4210
30.2M
        }
4211
4212
296k
        pass_filter = (ret >= 0 && fp->filter)
4213
296k
            ? sam_passes_filter(h, b, fp->filter)
4214
296k
            : 1;
4215
296k
    } while (pass_filter == 0);
4216
4217
296k
    return pass_filter < 0 ? -2 : ret;
4218
30.2M
}
4219
4220
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4221
// this code isn't vectorised and runs far slower than is necessary (even
4222
// with the restrict keyword being used).
4223
static inline void HTS_OPT3
4224
1.57k
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4225
1.57k
    uint32_t i;
4226
7.17k
    for (i = 0; i < len; i++)
4227
5.60k
        a[i] = b[i]+33;
4228
1.57k
}
4229
4230
// Append the SAM text form of alignment b to str (no trailing newline).
//
// h supplies the reference names used for the RNAME and RNEXT columns.
// Returns the resulting str->l on success.
// Returns -1 on failure: errno is EINVAL for corrupted aux data and ENOMEM
// for allocation failure; a record with l_qname == 0 returns -1 without
// setting errno.
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    int i, r = 0;
    uint8_t *s, *end;
    const bam1_core_t *c = &b->core;

    if (c->l_qname == 0)
        return -1;
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
    r |= kputc_('\t', str); // query name
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
    if (c->tid >= 0) { // chr
        r |= kputs(h->target_name[c->tid] , str);
        r |= kputc_('\t', str);
    } else r |= kputsn_("*\t", 2, str);
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
    if (c->n_cigar) { // cigar
        uint32_t *cigar = bam_get_cigar(b);
        for (i = 0; i < c->n_cigar; ++i) {
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
        }
    } else r |= kputc_('*', str);
    r |= kputc_('\t', str);
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
    else {
        r |= kputs(h->target_name[c->mtid], str);
        r |= kputc_('\t', str);
    }
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
    if (c->l_qseq) { // seq and qual
        // Reserve room for seq + tab + qual + nul in one go.  The doubling
        // is done in size_t so that it cannot overflow int for very long
        // sequences.
        if (ks_resize(str, str->l+2+2*(size_t)c->l_qseq) < 0) goto mem_err;
        char *cp = str->s + str->l;

        // Sequence, 2 bases at a time
        s = bam_get_seq(b);
        nibble2base(s, cp, c->l_qseq);
        cp[c->l_qseq] = '\t';
        cp += c->l_qseq+1;

        // Quality
        s = bam_get_qual(b);
        i = 0;
        if (s[0] == 0xff) {
            // 0xff in the first byte marks absent qualities
            cp[i++] = '*';
        } else {
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
            i = c->l_qseq;
        }
        cp[i] = 0;
        cp += i;
        str->l = cp - str->s;
    } else r |= kputsn_("*\t*", 3, str);

    s = bam_get_aux(b); // aux
    end = b->data + b->l_data;

    // Each aux field needs at least tag(2) + type(1) + one byte of value.
    while (end - s >= 4) {
        r |= kputc_('\t', str);
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
            goto bad_aux;
    }
    r |= kputsn("", 0, str); // nul terminate
    if (r < 0) goto mem_err;

    return str->l;

 bad_aux:
    hts_log_error("Corrupted aux data for read %.*s",
                  b->core.l_qname, bam_get_qname(b));
    errno = EINVAL;
    return -1;

 mem_err:
    hts_log_error("Out of memory");
    errno = ENOMEM;
    return -1;
}
4311
4312
// Format alignment b as a SAM text line into str, replacing any previous
// contents.  Returns whatever sam_format1_append() returns: the new string
// length on success, or -1 on failure.
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    // Reset the length so the record is written from the start of the buffer.
    str->l = 0;

    return sam_format1_append(h, b, str);
}
4317
4318
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4319
// Format alignment b as a FASTA or FASTQ record in str, replacing any
// previous contents.
//
// x holds the output options: name prefix character ('@' selects FASTQ
// and therefore a quality line), /1 /2 suffixes, CASAVA tag, aux tags.
// Reverse-strand records are written reverse-complemented with their
// qualities reversed.
//
// Returns the record length (str->l) on success,
//         -1 on failure (errno is EINVAL if x is NULL).
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
{
    unsigned flag = b->core.flag;
    int i, e = 0, len = b->core.l_qseq;
    uint8_t *seq, *qual;

    str->l = 0;

    // The name prefix is taken from x, so a NULL state cannot be formatted.
    if (!x) {
        errno = EINVAL;
        return -1;
    }

    // Name
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
        return -1;

    // /1 or /2 suffix
    if (x->rnum && (flag & BAM_FPAIRED)) {
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
        if (r12 == BAM_FREAD1) {
            if (kputs("/1", str) == EOF)
                return -1;
        } else if (r12 == BAM_FREAD2) {
            if (kputs("/2", str) == EOF)
                return -1;
        }
    }

    // Illumina CASAVA tag.
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
    if (x->casava) {
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
        uint8_t *bc = bam_aux_get(b, x->BC);

        // Validate the tag before treating its value as a C string: only
        // a 'Z' tag is known to be NUL terminated, so printing or taking
        // strlen() of any other type could read past the value.
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
            bc = NULL;
        }

        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
                     bc ? (char *)bc+1 : "0") < 0)
            return -1;

        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
        if (bc) {
            int l = strlen((char *)bc+1);
            char *c = (char *)str->s + str->l - l;
            for (i = 0; i < l; i++) {
                if (!isalpha_c(c[i]))
                    c[i] = '+';
                else if (islower_c(c[i]))
                    c[i] = toupper_c(c[i]);
            }
        }
    }

    // Aux tags
    if (x->aux) {
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
        while (s && end - s >= 4) {
            int tt = s[0]*256 + s[1];
            // No tag list means emit everything; otherwise only listed tags.
            if (x->tags == NULL ||
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
                e |= kputc_('\t', str) < 0;
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
                    return -1;
            } else {
                s = skip_aux(s+2, end);
            }
        }
        e |= kputsn("", 0, str) < 0; // nul terminate
    }

    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
    e |= kputc_('\n', str) < 0;

    // Seq line
    seq = bam_get_seq(b);
    if (flag & BAM_FREVERSE)
        // Complemented lookup table, walked backwards
        for (i = len-1; i >= 0; i--)
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
    else
        for (i = 0; i < len; i++)
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;


    // Qual line
    if (x->nprefix == '@') {
        e |= kputsn("\n+\n", 3, str) < 0;
        qual = bam_get_qual(b);
        if (qual[0] == 0xff)
            // Qualities absent: emit a constant placeholder value
            for (i = 0; i < len; i++)
                e |= kputc_('B', str) < 0;
        else if (flag & BAM_FREVERSE)
            for (i = len-1; i >= 0; i--)
                e |= kputc_(33 + qual[i], str) < 0;
        else
            for (i = 0; i < len; i++)
                e |= kputc_(33 + qual[i], str) < 0;

    }
    e |= kputc('\n', str) < 0;

    return e ? -1 : str->l;
}
4423
4424
// Sadly we need to be able to modify the bam_hdr here so we can
4425
// reference count the structure.
4426
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4427
30.2M
{
4428
30.2M
    switch (fp->format.format) {
4429
0
    case binary_format:
4430
0
        fp->format.category = sequence_data;
4431
0
        fp->format.format = bam;
4432
        /* fall-through */
4433
10.0M
    case bam:
4434
10.0M
        return bam_write_idx1(fp, h, b);
4435
4436
10.0M
    case cram:
4437
10.0M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4438
4439
0
    case text_format:
4440
0
        fp->format.category = sequence_data;
4441
0
        fp->format.format = sam;
4442
        /* fall-through */
4443
10.0M
    case sam:
4444
10.0M
        if (fp->state) {
4445
0
            SAM_state *fd = (SAM_state *)fp->state;
4446
4447
            // Threaded output
4448
0
            if (!fd->h) {
4449
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4450
                // just data pointed to by it (which is a bit weasely still),
4451
                // but out cached pointer must be non-const as we want to
4452
                // destroy it later on and sam_hdr_destroy takes non-const.
4453
                //
4454
                // We do this because some tools do sam_hdr_destroy; sam_close
4455
                // while others do sam_close; sam_hdr_destroy.  The former is
4456
                // an issue as we need the header still when flushing.
4457
0
                fd->h = (sam_hdr_t *)h;
4458
0
                fd->h->ref_count++;
4459
4460
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4461
0
                                   fp) != 0)
4462
0
                    return -2;
4463
0
                fd->dispatcher_set = 1;
4464
0
            }
4465
4466
0
            if (fd->h != h) {
4467
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4468
0
                return -2;
4469
0
            }
4470
4471
            // Find a suitable BAM array to copy to
4472
0
            sp_bams *gb = fd->curr_bam;
4473
0
            if (!gb) {
4474
0
                pthread_mutex_lock(&fd->lines_m);
4475
0
                if (fd->bams) {
4476
0
                    fd->curr_bam = gb = fd->bams;
4477
0
                    fd->bams = gb->next;
4478
0
                    gb->next = NULL;
4479
0
                    gb->nbams = 0;
4480
0
                    gb->bam_mem = 0;
4481
0
                    pthread_mutex_unlock(&fd->lines_m);
4482
0
                } else {
4483
0
                    pthread_mutex_unlock(&fd->lines_m);
4484
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4485
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4486
0
                        free(gb);
4487
0
                        return -1;
4488
0
                    }
4489
0
                    gb->nbams = 0;
4490
0
                    gb->abams = SAM_NBAM;
4491
0
                    gb->bam_mem = 0;
4492
0
                    gb->fd = fd;
4493
0
                    fd->curr_idx = 0;
4494
0
                    fd->curr_bam = gb;
4495
0
                }
4496
0
            }
4497
4498
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4499
0
                return -2;
4500
0
            gb->bam_mem += b->l_data + sizeof(*b);
4501
4502
            // Dispatch if full
4503
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4504
0
                gb->serial = fd->serial++;
4505
0
                pthread_mutex_lock(&fd->command_m);
4506
0
                if (fd->errcode != 0) {
4507
0
                    pthread_mutex_unlock(&fd->command_m);
4508
0
                    return -fd->errcode;
4509
0
                }
4510
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4511
0
                                        cleanup_sp_bams,
4512
0
                                        cleanup_sp_lines, 0) < 0) {
4513
0
                    pthread_mutex_unlock(&fd->command_m);
4514
0
                    return -1;
4515
0
                }
4516
0
                pthread_mutex_unlock(&fd->command_m);
4517
0
                fd->curr_bam = NULL;
4518
0
            }
4519
4520
            // Dummy value as we don't know how long it really is.
4521
            // We could track file sizes via a SAM_state field, but I don't think
4522
            // it is necessary.
4523
0
            return 1;
4524
10.0M
        } else {
4525
10.0M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4526
10.0M
            kputc('\n', &fp->line);
4527
10.0M
            if (fp->is_bgzf) {
4528
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4529
0
                    return -1;
4530
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4531
10.0M
            } else {
4532
10.0M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4533
10.0M
            }
4534
4535
10.0M
            if (fp->idx) {
4536
0
                if (fp->format.compression == bgzf) {
4537
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4538
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4539
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4540
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4541
0
                        return -1;
4542
0
                    }
4543
0
                } else {
4544
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4545
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4546
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4547
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4548
0
                        return -1;
4549
0
                    }
4550
0
                }
4551
0
            }
4552
4553
10.0M
            return fp->line.l;
4554
10.0M
        }
4555
4556
4557
0
    case fasta_format:
4558
0
    case fastq_format: {
4559
0
        fastq_state *x = (fastq_state *)fp->state;
4560
0
        if (!x) {
4561
0
            if (!(fp->state = fastq_state_init(fp->format.format
4562
0
                                               == fastq_format ? '@' : '>')))
4563
0
                return -2;
4564
0
        }
4565
4566
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4567
0
            return -1;
4568
0
        if (fp->is_bgzf) {
4569
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4570
0
                return -1;
4571
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4572
0
                return -1;
4573
0
        } else {
4574
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4575
0
                return -1;
4576
0
        }
4577
0
        return fp->line.l;
4578
0
    }
4579
4580
0
    default:
4581
0
        errno = EBADF;
4582
0
        return -1;
4583
30.2M
    }
4584
30.2M
}
4585
4586
/************************
4587
 *** Auxiliary fields ***
4588
 ************************/
4589
#ifndef HTS_LITTLE_ENDIAN
// Copy len bytes of an aux value of the given type from in to out,
// converting multi-byte items from host byte order to little-endian.
// 'B' arrays are handled by converting the element count and then
// recursing with the element type.
// Returns 0 on success, -1 if the type is unknown or len does not fit
// the item size.
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    const int width = aux_type2size(type);
    size_t off;

    // Fixed-width items must fill the buffer exactly.
    if (width >= 2 && width <= 8 && (len & (width - 1)) != 0)
        return -1;

    switch (width) {
    case 'H':
    case 'Z':
    case 1:
        // Single-byte data needs no conversion
        memcpy(out, in, len);
        return 0;

    case 2:
        for (off = 0; off < len; off += sizeof(uint16_t)) {
            uint16_t item;
            memcpy(&item, in + off, sizeof(item));
            u16_to_le(item, out + off);
        }
        return 0;

    case 4:
        for (off = 0; off < len; off += sizeof(uint32_t)) {
            uint32_t item;
            memcpy(&item, in + off, sizeof(item));
            u32_to_le(item, out + off);
        }
        return 0;

    case 8:
        for (off = 0; off < len; off += sizeof(uint64_t)) {
            uint64_t item;
            memcpy(&item, in + off, sizeof(item));
            u64_to_le(item, out + off);
        }
        return 0;

    case 'B': {
        // Layout: element type (1 byte), count (4 bytes), then elements
        uint32_t count;
        if (len < 5)
            return -1;
        memcpy(&count, in + 1, 4);
        out[0] = in[0];
        u32_to_le(count, out + 1);
        return aux_to_le(in[0], out + 5, in + 5, len - 5);
    }

    default:
        // Unknown type code
        return -1;
    }
}
#endif
4633
4634
// Append a new aux field (tag, type, len bytes of value) to the end of
// record b.  The value is copied as-is on little-endian builds and
// converted to little-endian otherwise.
//
// Returns 0 on success,
//        -1 on failure, with errno set to EINVAL for a negative len (or an
//           unconvertible value on big-endian builds), or ENOMEM if the
//           record would exceed INT32_MAX bytes.
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
{
    uint64_t new_len;

    assert(b->l_data >= 0);

    // A negative length would otherwise slip past the size check below
    // and reach memcpy() as an enormous size_t.
    if (len < 0) {
        errno = EINVAL;
        return -1;
    }

    // Sum in 64 bits so the addition itself cannot overflow.
    new_len = (uint64_t)b->l_data + 3 + (uint64_t)len;
    if (new_len > INT32_MAX) goto nomem;

    if (realloc_bam_data(b, (size_t)new_len) < 0) return -1;

    b->data[b->l_data] = tag[0];
    b->data[b->l_data + 1] = tag[1];
    b->data[b->l_data + 2] = type;

#ifdef HTS_LITTLE_ENDIAN
    memcpy(b->data + b->l_data + 3, data, len);
#else
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
        errno = EINVAL;
        return -1;
    }
#endif

    b->l_data = (int)new_len;

    return 0;

 nomem:
    errno = ENOMEM;
    return -1;
}
4665
4666
// Step over one aux value.  s points at the type byte of the field and
// end is one past the last byte of the record data.
//
// Returns a pointer just past the value (clamped to end for an
// unterminated 'Z'/'H' string, or when s is already at or beyond end),
// or NULL if the type is unknown or the value does not fit before end.
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        while (s < end && *s) ++s;
        return s < end ? s + 1 : end;
    case 'B':
        // Needs element type (1 byte) plus element count (4 bytes)
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        // Compare by division rather than multiplying: size * n is
        // evaluated in 32 bits and can wrap for a large count, which would
        // let an oversized array pass the bounds check.
        if (size == 0 || (size_t)(end - s) / (size_t)size < n) return NULL;
        return s + (size_t)size * n;
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4691
4692
uint8_t *bam_aux_first(const bam1_t *b)
4693
10.3M
{
4694
10.3M
    uint8_t *s = bam_get_aux(b);
4695
10.3M
    uint8_t *end = b->data + b->l_data;
4696
10.3M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4697
318k
    return s+2;
4698
10.3M
}
4699
4700
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4701
16.5M
{
4702
16.5M
    uint8_t *end = b->data + b->l_data;
4703
16.5M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4704
16.5M
    if (next == NULL) goto bad_aux;
4705
16.5M
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4706
16.3M
    return next+2;
4707
4708
110
 bad_aux:
4709
110
    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4710
110
    errno = EINVAL;
4711
110
    return NULL;
4712
16.5M
}
4713
4714
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4715
10.3M
{
4716
10.3M
    uint8_t *s;
4717
26.8M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4718
16.6M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4719
            // Check the tag value is valid and complete
4720
97.7k
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4721
97.7k
            if (e == NULL) goto bad_aux;
4722
97.7k
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4723
4724
97.7k
            return s;
4725
97.7k
        }
4726
4727
    // errno now as set by bam_aux_first()/bam_aux_next()
4728
10.2M
    return NULL;
4729
4730
0
 bad_aux:
4731
0
    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4732
0
    errno = EINVAL;
4733
0
    return NULL;
4734
10.3M
}
4735
4736
// Delete the aux field whose type byte is at s from record b.
// Returns 0 on success and -1 on failure.  bam_aux_remove() returning
// NULL with errno ENOENT counts as success.
int bam_aux_del(bam1_t *b, uint8_t *s)
{
    if (bam_aux_remove(b, s) != NULL)
        return 0;

    return errno == ENOENT ? 0 : -1;
}
4741
4742
// Remove the aux field whose type byte is at s from record b, closing the
// gap by moving the following fields down.
// Returns s, which then addresses the type byte of the field that followed
// the removed one.
// Returns NULL with errno ENOENT if the removed field was the last, or
// NULL with errno EINVAL if the field is corrupt (b is then unchanged).
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
{
    uint8_t *limit = b->data + b->l_data;
    uint8_t *field_start = s - 2;   // the two-byte tag precedes the type
    uint8_t *after = skip_aux(s, limit);

    if (after == NULL) {
        hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
        errno = EINVAL;
        return NULL;
    }

    b->l_data -= after - field_start;

    if (after >= limit) {
        // Nothing follows, so there is no data to move.
        errno = ENOENT;
        return NULL;
    }

    memmove(field_start, after, limit - after);
    return s;
}
4759
4760
// Set the 'Z' (string) aux field named tag in b to data, replacing the
// existing value in place or appending a new field if the tag is absent.
//
// len is the number of bytes to copy from data; a negative len means
// "use strlen(data) + 1".  A terminating NUL is added if the copied bytes
// do not already end in one.
//
// Returns 0 on success,
//        -1 on failure (errno is EINVAL if the existing tag is not 'Z').
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
{
    // FIXME: This is not at all efficient!
    const int saved_errno = errno;
    size_t new_sz, prev_sz = 0, needed;
    int add_nul, hdr = 0;
    uint8_t *rec;

    if (len >= 0)
        new_sz = len;
    else
        new_sz = strlen(data) + 1;
    add_nul = (new_sz == 0 || data[new_sz - 1] != '\0');

    rec = bam_aux_get(b, tag);
    if (rec != NULL) {
        // Replacing existing tag
        char type = *rec;
        uint8_t *val, *data_end, *nul;

        if (type != 'Z') {
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
            errno = EINVAL;
            return -1;
        }

        val = rec + 1;
        data_end = b->data + b->l_data;
        nul = memchr(val, '\0', data_end - val);
        // Old value size including its terminator
        prev_sz = (nul ? nul - val : data_end - val) + 1;
        rec -= 2;   // back up to the start of the two-byte tag
    } else if (errno != ENOENT) {
        // Invalid aux data, give up
        return -1;
    } else {
        // Tag doesn't exist - put it on the end
        errno = saved_errno;
        rec = b->data + b->l_data;
        hdr = 3;    // tag (2 bytes) plus type (1 byte)
    }

    needed = new_sz + add_nul + hdr;
    if (prev_sz < needed) {
        // Growing may move b->data, so re-derive rec from its offset.
        ptrdiff_t rec_off = rec - b->data;
        if (possibly_expand_bam_data(b, needed - prev_sz) < 0)
            return -1;
        rec = b->data + rec_off;
    }

    if (!hdr) {
        // Shift the fields after the old value to follow the new one.
        memmove(rec + 3 + new_sz + add_nul,
                rec + 3 + prev_sz,
                b->l_data - (rec + 3 - b->data) - prev_sz);
    }
    b->l_data += needed - prev_sz;

    rec[0] = tag[0];
    rec[1] = tag[1];
    rec[2] = 'Z';
    memmove(rec + 3, data, new_sz);
    if (add_nul)
        rec[3 + new_sz] = '\0';

    return 0;
}
4811
4812
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4813
0
{
4814
0
    uint32_t sz, old_sz = 0, new = 0;
4815
0
    uint8_t *s, type;
4816
4817
0
    if (val < INT32_MIN || val > UINT32_MAX) {
4818
0
        errno = EOVERFLOW;
4819
0
        return -1;
4820
0
    }
4821
0
    if (val < INT16_MIN)       { type = 'i'; sz = 4; }
4822
0
    else if (val < INT8_MIN<