Coverage Report

Created: 2024-05-20 06:32

/src/htslib/sam.c
Line
Count
Source (jump to first uncovered line)
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2024 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
// Suppress deprecation message for cigar_tab, which we initialise
45
#include "htslib/hts_defs.h"
46
#undef HTS_DEPRECATED
47
#define HTS_DEPRECATED(message)
48
49
#include "htslib/sam.h"
50
#include "htslib/bgzf.h"
51
#include "cram/cram.h"
52
#include "hts_internal.h"
53
#include "sam_internal.h"
54
#include "htslib/hfile.h"
55
#include "htslib/hts_endian.h"
56
#include "htslib/hts_expr.h"
57
#include "header.h"
58
59
#include "htslib/khash.h"
60
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
61
KHASH_SET_INIT_INT(tag)
62
63
#ifndef EFTYPE
64
0
#define EFTYPE ENOEXEC
65
#endif
66
#ifndef EOVERFLOW
67
#define EOVERFLOW ERANGE
68
#endif
69
70
/**********************
71
 *** BAM header I/O ***
72
 **********************/
73
74
HTSLIB_EXPORT
75
const int8_t bam_cigar_table[256] = {
76
    // 0 .. 47
77
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
81
    // 48 .. 63  (including =)
82
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
83
84
    // 64 .. 79  (including MIDNHB)
85
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
86
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
87
88
    // 80 .. 95  (including SPX)
89
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
90
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
91
92
    // 96 .. 127
93
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
96
    // 128 .. 255
97
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
105
};
106
107
sam_hdr_t *sam_hdr_init(void)
108
59.1k
{
109
59.1k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
110
59.1k
    if (bh == NULL) return NULL;
111
112
59.1k
    bh->cigar_tab = bam_cigar_table;
113
59.1k
    return bh;
114
59.1k
}
115
116
void sam_hdr_destroy(sam_hdr_t *bh)
117
167k
{
118
167k
    int32_t i;
119
120
167k
    if (bh == NULL) return;
121
122
79.9k
    if (bh->ref_count > 0) {
123
20.8k
        --bh->ref_count;
124
20.8k
        return;
125
20.8k
    }
126
127
59.1k
    if (bh->target_name) {
128
82.7k
        for (i = 0; i < bh->n_targets; ++i)
129
63.5k
            free(bh->target_name[i]);
130
19.1k
        free(bh->target_name);
131
19.1k
        free(bh->target_len);
132
19.1k
    }
133
59.1k
    free(bh->text);
134
59.1k
    if (bh->hrecs)
135
53.7k
        sam_hrecs_free(bh->hrecs);
136
59.1k
    if (bh->sdict)
137
59.1k
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
138
59.1k
    free(bh);
139
59.1k
}
140
141
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
142
// references before sam_hdr_t::hrecs is populated
143
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
144
112
{
145
112
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
146
112
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
147
112
    int i;
148
112
    if (!dest_long_refs) return -1;
149
150
3.57k
    for (i = 0; i < h->n_targets; i++) {
151
3.45k
        int ret;
152
3.45k
        khiter_t ksrc, kdest;
153
3.45k
        if (h->target_len[i] < UINT32_MAX) continue;
154
1.94k
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
155
1.94k
        if (ksrc == kh_end(src_long_refs)) continue;
156
1.94k
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
157
1.94k
        if (ret < 0) {
158
0
            kh_destroy(s2i, dest_long_refs);
159
0
            return -1;
160
0
        }
161
1.94k
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
162
1.94k
    }
163
164
112
    h->sdict = dest_long_refs;
165
112
    return 0;
166
112
}
167
168
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
169
20.1k
{
170
20.1k
    if (h0 == NULL) return NULL;
171
20.1k
    sam_hdr_t *h;
172
20.1k
    if ((h = sam_hdr_init()) == NULL) return NULL;
173
    // copy the simple data
174
20.1k
    h->n_targets = 0;
175
20.1k
    h->ignore_sam_err = h0->ignore_sam_err;
176
20.1k
    h->l_text = 0;
177
178
    // Then the pointery stuff
179
180
20.1k
    if (!h0->hrecs) {
181
1.10k
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
182
1.10k
        if (!h->target_len) goto fail;
183
1.10k
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
184
1.10k
        if (!h->target_name) goto fail;
185
186
1.10k
        int i;
187
5.26k
        for (i = 0; i < h0->n_targets; ++i) {
188
4.15k
            h->target_len[i] = h0->target_len[i];
189
4.15k
            h->target_name[i] = strdup(h0->target_name[i]);
190
4.15k
            if (!h->target_name[i]) break;
191
4.15k
        }
192
1.10k
        h->n_targets = i;
193
1.10k
        if (i < h0->n_targets) goto fail;
194
195
1.10k
        if (h0->sdict) {
196
112
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
197
112
        }
198
1.10k
    }
199
200
20.1k
    if (h0->hrecs) {
201
19.0k
        kstring_t tmp = { 0, 0, NULL };
202
19.0k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
203
0
            free(ks_release(&tmp));
204
0
            goto fail;
205
0
        }
206
207
19.0k
        h->l_text = tmp.l;
208
19.0k
        h->text   = ks_release(&tmp);
209
210
19.0k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
211
0
            goto fail;
212
19.0k
    } else {
213
1.10k
        h->l_text = h0->l_text;
214
1.10k
        h->text = malloc(h->l_text + 1);
215
1.10k
        if (!h->text) goto fail;
216
1.10k
        memcpy(h->text, h0->text, h->l_text);
217
1.10k
        h->text[h->l_text] = '\0';
218
1.10k
    }
219
220
20.1k
    return h;
221
222
0
 fail:
223
0
    sam_hdr_destroy(h);
224
0
    return NULL;
225
20.1k
}
226
227
sam_hdr_t *bam_hdr_read(BGZF *fp)
228
2.91k
{
229
2.91k
    sam_hdr_t *h;
230
2.91k
    uint8_t buf[4];
231
2.91k
    int magic_len, has_EOF;
232
2.91k
    int32_t i, name_len, num_names = 0;
233
2.91k
    size_t bufsize;
234
2.91k
    ssize_t bytes;
235
    // check EOF
236
2.91k
    has_EOF = bgzf_check_EOF(fp);
237
2.91k
    if (has_EOF < 0) {
238
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
239
2.91k
    } else if (has_EOF == 0) {
240
2.91k
        hts_log_warning("EOF marker is absent. The input is probably truncated");
241
2.91k
    }
242
    // read "BAM1"
243
2.91k
    magic_len = bgzf_read(fp, buf, 4);
244
2.91k
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
245
3
        hts_log_error("Invalid BAM binary header");
246
3
        return 0;
247
3
    }
248
2.91k
    h = sam_hdr_init();
249
2.91k
    if (!h) goto nomem;
250
251
    // read plain text and the number of reference sequences
252
2.91k
    bytes = bgzf_read(fp, buf, 4);
253
2.91k
    if (bytes != 4) goto read_err;
254
2.90k
    h->l_text = le_to_u32(buf);
255
256
2.90k
    bufsize = h->l_text + 1;
257
2.90k
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
258
2.90k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
259
2.90k
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
260
2.89k
#endif
261
2.89k
    h->text = (char*)malloc(bufsize);
262
2.89k
    if (!h->text) goto nomem;
263
2.89k
    h->text[h->l_text] = 0; // make sure it is NULL terminated
264
2.89k
    bytes = bgzf_read(fp, h->text, h->l_text);
265
2.89k
    if (bytes != h->l_text) goto read_err;
266
267
2.67k
    bytes = bgzf_read(fp, &h->n_targets, 4);
268
2.67k
    if (bytes != 4) goto read_err;
269
2.66k
    if (fp->is_be) ed_swap_4p(&h->n_targets);
270
271
2.66k
    if (h->n_targets < 0) goto invalid;
272
273
    // read reference sequence names and lengths
274
2.58k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
275
2.58k
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
276
21
        goto nomem;
277
2.56k
#endif
278
2.56k
    if (h->n_targets > 0) {
279
1.12k
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
280
1.12k
        if (!h->target_name) goto nomem;
281
1.12k
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
282
1.12k
        if (!h->target_len) goto nomem;
283
1.12k
    }
284
1.44k
    else {
285
1.44k
        h->target_name = NULL;
286
1.44k
        h->target_len = NULL;
287
1.44k
    }
288
289
5.32k
    for (i = 0; i != h->n_targets; ++i) {
290
3.12k
        bytes = bgzf_read(fp, &name_len, 4);
291
3.12k
        if (bytes != 4) goto read_err;
292
3.07k
        if (fp->is_be) ed_swap_4p(&name_len);
293
3.07k
        if (name_len <= 0) goto invalid;
294
295
2.97k
        h->target_name[i] = (char*)malloc(name_len);
296
2.97k
        if (!h->target_name[i]) goto nomem;
297
2.97k
        num_names++;
298
299
2.97k
        bytes = bgzf_read(fp, h->target_name[i], name_len);
300
2.97k
        if (bytes != name_len) goto read_err;
301
302
2.79k
        if (h->target_name[i][name_len - 1] != '\0') {
303
            /* Fix missing NUL-termination.  Is this being too nice?
304
               We could alternatively bail out with an error. */
305
1.63k
            char *new_name;
306
1.63k
            if (name_len == INT32_MAX) goto invalid;
307
1.63k
            new_name = realloc(h->target_name[i], name_len + 1);
308
1.63k
            if (new_name == NULL) goto nomem;
309
1.63k
            h->target_name[i] = new_name;
310
1.63k
            h->target_name[i][name_len] = '\0';
311
1.63k
        }
312
313
2.79k
        bytes = bgzf_read(fp, &h->target_len[i], 4);
314
2.79k
        if (bytes != 4) goto read_err;
315
2.76k
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
316
2.76k
    }
317
2.19k
    return h;
318
319
30
 nomem:
320
30
    hts_log_error("Out of memory");
321
30
    goto clean;
322
323
501
 read_err:
324
501
    if (bytes < 0) {
325
48
        hts_log_error("Error reading BGZF stream");
326
453
    } else {
327
453
        hts_log_error("Truncated BAM header");
328
453
    }
329
501
    goto clean;
330
331
186
 invalid:
332
186
    hts_log_error("Invalid BAM binary header");
333
334
717
 clean:
335
717
    if (h != NULL) {
336
717
        h->n_targets = num_names; // ensure we free only allocated target_names
337
717
        sam_hdr_destroy(h);
338
717
    }
339
717
    return NULL;
340
186
}
341
342
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
343
11.6k
{
344
11.6k
    int32_t i, name_len, x;
345
11.6k
    kstring_t hdr_ks = { 0, 0, NULL };
346
11.6k
    char *text;
347
11.6k
    uint32_t l_text;
348
349
11.6k
    if (!h) return -1;
350
351
11.6k
    if (h->hrecs) {
352
10.5k
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
353
10.5k
        if (hdr_ks.l > UINT32_MAX) {
354
0
            hts_log_error("Header too long for BAM format");
355
0
            free(hdr_ks.s);
356
0
            return -1;
357
10.5k
        } else if (hdr_ks.l > INT32_MAX) {
358
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
359
0
            hts_log_warning("Output file may not be portable");
360
0
        }
361
10.5k
        text = hdr_ks.s;
362
10.5k
        l_text = hdr_ks.l;
363
10.5k
    } else {
364
1.10k
        if (h->l_text > UINT32_MAX) {
365
0
            hts_log_error("Header too long for BAM format");
366
0
            return -1;
367
1.10k
        } else if (h->l_text > INT32_MAX) {
368
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
369
0
            hts_log_warning("Output file may not be portable");
370
0
        }
371
1.10k
        text = h->text;
372
1.10k
        l_text = h->l_text;
373
1.10k
    }
374
    // write "BAM1"
375
11.6k
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
376
    // write plain text and the number of reference sequences
377
11.6k
    if (fp->is_be) {
378
0
        x = ed_swap_4(l_text);
379
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
380
0
        if (l_text) {
381
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
382
0
        }
383
0
        x = ed_swap_4(h->n_targets);
384
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
385
11.6k
    } else {
386
11.6k
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
387
11.6k
        if (l_text) {
388
7.25k
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
389
7.25k
        }
390
11.6k
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
391
11.6k
    }
392
11.6k
    free(hdr_ks.s);
393
    // write sequence names and lengths
394
26.8k
    for (i = 0; i != h->n_targets; ++i) {
395
15.1k
        char *p = h->target_name[i];
396
15.1k
        name_len = strlen(p) + 1;
397
15.1k
        if (fp->is_be) {
398
0
            x = ed_swap_4(name_len);
399
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
400
15.1k
        } else {
401
15.1k
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
402
15.1k
        }
403
15.1k
        if (bgzf_write(fp, p, name_len) < 0) return -1;
404
15.1k
        if (fp->is_be) {
405
0
            x = ed_swap_4(h->target_len[i]);
406
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
407
15.1k
        } else {
408
15.1k
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
409
15.1k
        }
410
15.1k
    }
411
11.6k
    if (bgzf_flush(fp) < 0) return -1;
412
11.6k
    return 0;
413
11.6k
}
414
415
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
416
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
417
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
418
0
}
419
420
/*************************
421
 *** BAM alignment I/O ***
422
 *************************/
423
424
bam1_t *bam_init1(void)
425
1.64M
{
426
1.64M
    return (bam1_t*)calloc(1, sizeof(bam1_t));
427
1.64M
}
428
429
int sam_realloc_bam_data(bam1_t *b, size_t desired)
430
2.14M
{
431
2.14M
    uint32_t new_m_data;
432
2.14M
    uint8_t *new_data;
433
2.14M
    new_m_data = desired;
434
2.14M
    kroundup32(new_m_data);
435
2.14M
    if (new_m_data < desired) {
436
0
        errno = ENOMEM; // Not strictly true but we can't store the size
437
0
        return -1;
438
0
    }
439
2.14M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
440
2.14M
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
441
45
        errno = ENOMEM;
442
45
        return -1;
443
45
    }
444
2.14M
#endif
445
2.14M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
446
2.14M
        new_data = realloc(b->data, new_m_data);
447
2.14M
    } else {
448
0
        if ((new_data = malloc(new_m_data)) != NULL) {
449
0
            if (b->l_data > 0)
450
0
                memcpy(new_data, b->data,
451
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
452
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
453
0
        }
454
0
    }
455
2.14M
    if (!new_data) return -1;
456
2.14M
    b->data = new_data;
457
2.14M
    b->m_data = new_m_data;
458
2.14M
    return 0;
459
2.14M
}
460
461
void bam_destroy1(bam1_t *b)
462
47.6M
{
463
47.6M
    if (b == 0) return;
464
1.64M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
465
1.64M
        free(b->data);
466
1.64M
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
467
            // In case of reuse
468
0
            b->data = NULL;
469
0
            b->m_data = 0;
470
0
            b->l_data = 0;
471
0
        }
472
1.64M
    }
473
474
1.64M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
475
1.64M
        free(b);
476
1.64M
}
477
478
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
479
18.7M
{
480
18.7M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
481
18.7M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
482
18.7M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
483
18.7M
    bdst->l_data = bsrc->l_data;
484
18.7M
    bdst->id = bsrc->id;
485
18.7M
    return bdst;
486
18.7M
}
487
488
bam1_t *bam_dup1(const bam1_t *bsrc)
489
1.61M
{
490
1.61M
    if (bsrc == NULL) return NULL;
491
1.61M
    bam1_t *bdst = bam_init1();
492
1.61M
    if (bdst == NULL) return NULL;
493
1.61M
    if (bam_copy1(bdst, bsrc) == NULL) {
494
0
        bam_destroy1(bdst);
495
0
        return NULL;
496
0
    }
497
1.61M
    return bdst;
498
1.61M
}
499
500
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
501
                             hts_pos_t *rlen, hts_pos_t *qlen)
502
2.75k
{
503
2.75k
    int k;
504
2.75k
    *rlen = *qlen = 0;
505
759k
    for (k = 0; k < n_cigar; ++k) {
506
756k
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
507
756k
        int len = bam_cigar_oplen(cigar[k]);
508
756k
        if (type & 1) *qlen += len;
509
756k
        if (type & 2) *rlen += len;
510
756k
    }
511
2.75k
}
512
513
static int subtract_check_underflow(size_t length, size_t *limit)
514
279M
{
515
279M
    if (length <= *limit) {
516
279M
        *limit -= length;
517
279M
        return 0;
518
279M
    }
519
520
0
    return -1;
521
279M
}
522
523
int bam_set1(bam1_t *bam,
524
             size_t l_qname, const char *qname,
525
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
526
             size_t n_cigar, const uint32_t *cigar,
527
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
528
             size_t l_seq, const char *seq, const char *qual,
529
             size_t l_aux)
530
55.8M
{
531
    // use a default qname "*" if none is provided
532
55.8M
    if (l_qname == 0) {
533
51.9M
        l_qname = 1;
534
51.9M
        qname = "*";
535
51.9M
    }
536
537
    // note: the qname is stored nul terminated and padded as described in the
538
    // documentation for the bam1_t struct.
539
55.8M
    size_t qname_nuls = 4 - l_qname % 4;
540
541
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
542
    // can't use bam_endpos() directly as some fields not yet set up.
543
55.8M
    hts_pos_t rlen = 0, qlen = 0;
544
55.8M
    if (!(flag & BAM_FUNMAP)) {
545
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
546
0
    }
547
55.8M
    if (rlen == 0) {
548
55.8M
        rlen = 1;
549
55.8M
    }
550
551
    // validate parameters
552
55.8M
    if (l_qname > 254) {
553
90
        hts_log_error("Query name too long");
554
90
        errno = EINVAL;
555
90
        return -1;
556
90
    }
557
55.8M
    if (HTS_POS_MAX - rlen <= pos) {
558
0
        hts_log_error("Read ends beyond highest supported position");
559
0
        errno = EINVAL;
560
0
        return -1;
561
0
    }
562
55.8M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
563
0
        hts_log_error("Mapped query must have a CIGAR");
564
0
        errno = EINVAL;
565
0
        return -1;
566
0
    }
567
55.8M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
568
0
        hts_log_error("CIGAR and query sequence are of different length");
569
0
        errno = EINVAL;
570
0
        return -1;
571
0
    }
572
573
55.8M
    size_t limit = INT32_MAX;
574
55.8M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
575
55.8M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
576
55.8M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
577
55.8M
    u    += subtract_check_underflow(l_seq, &limit);
578
55.8M
    u    += subtract_check_underflow(l_aux, &limit);
579
55.8M
    if (u != 0) {
580
0
        hts_log_error("Size overflow");
581
0
        errno = EINVAL;
582
0
        return -1;
583
0
    }
584
585
    // re-allocate the data buffer as needed.
586
55.8M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
587
55.8M
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
588
0
        return -1;
589
0
    }
590
591
55.8M
    bam->l_data = (int)data_len;
592
55.8M
    bam->core.pos = pos;
593
55.8M
    bam->core.tid = tid;
594
55.8M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
595
55.8M
    bam->core.qual = mapq;
596
55.8M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
597
55.8M
    bam->core.flag = flag;
598
55.8M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
599
55.8M
    bam->core.n_cigar = (uint32_t)n_cigar;
600
55.8M
    bam->core.l_qseq = (int32_t)l_seq;
601
55.8M
    bam->core.mtid = mtid;
602
55.8M
    bam->core.mpos = mpos;
603
55.8M
    bam->core.isize = isize;
604
605
55.8M
    uint8_t *cp = bam->data;
606
55.8M
    strncpy((char *)cp, qname, l_qname);
607
55.8M
    int i;
608
220M
    for (i = 0; i < qname_nuls; i++) {
609
164M
        cp[l_qname + i] = '\0';
610
164M
    }
611
55.8M
    cp += l_qname + qname_nuls;
612
613
55.8M
    if (n_cigar > 0) {
614
0
        memcpy(cp, cigar, n_cigar * 4);
615
0
    }
616
55.8M
    cp += n_cigar * 4;
617
618
1.87G
#define NN 16
619
55.8M
    const uint8_t *useq = (uint8_t *)seq;
620
207M
    for (i = 0; i + NN < l_seq; i += NN) {
621
151M
        int j;
622
151M
        const uint8_t *u2 = useq+i;
623
1.36G
        for (j = 0; j < NN/2; j++)
624
1.21G
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
625
151M
        cp += NN/2;
626
151M
    }
627
62.3M
    for (; i + 1 < l_seq; i += 2) {
628
6.42M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
629
6.42M
    }
630
631
57.0M
    for (; i < l_seq; i++) {
632
1.15M
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
633
1.15M
    }
634
635
55.8M
    if (qual) {
636
2.35k
        memcpy(cp, qual, l_seq);
637
2.35k
    }
638
55.8M
    else {
639
55.8M
        memset(cp, '\xff', l_seq);
640
55.8M
    }
641
642
55.8M
    return (int)data_len;
643
55.8M
}
644
645
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
646
18.3M
{
647
18.3M
    int k;
648
18.3M
    hts_pos_t l;
649
25.9M
    for (k = l = 0; k < n_cigar; ++k)
650
7.65M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
651
7.26M
            l += bam_cigar_oplen(cigar[k]);
652
18.3M
    return l;
653
18.3M
}
654
655
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
656
179k
{
657
179k
    int k;
658
179k
    hts_pos_t l;
659
27.6M
    for (k = l = 0; k < n_cigar; ++k)
660
27.5M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
661
25.7M
            l += bam_cigar_oplen(cigar[k]);
662
179k
    return l;
663
179k
}
664
665
hts_pos_t bam_endpos(const bam1_t *b)
666
5.96k
{
667
5.96k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
668
5.96k
    if (rlen == 0) rlen = 1;
669
5.96k
    return b->core.pos + rlen;
670
5.96k
}
671
672
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
673
338k
{
674
338k
    bam1_core_t *c = &b->core;
675
338k
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
676
338k
    uint8_t *CG;
677
678
    // test where there is a real CIGAR in the CG tag to move
679
338k
    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
680
173k
    cigar0 = bam_get_cigar(b);
681
173k
    if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
682
158k
    fake_bytes = c->n_cigar * 4;
683
158k
    int saved_errno = errno;
684
158k
    CG = bam_aux_get(b, "CG");
685
158k
    if (!CG) {
686
149k
        if (errno != ENOENT) return -1;  // Bad aux data
687
149k
        errno = saved_errno; // restore errno on expected no-CG-tag case
688
149k
        return 0;
689
149k
    }
690
8.98k
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
691
3.00k
        return 0; // not of type B,I
692
5.98k
    CG_len = le_to_u32(CG + 2);
693
5.98k
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
694
695
    // move from the CG tag to the right position
696
5.96k
    cigar_st = (uint8_t*)cigar0 - b->data;
697
5.96k
    c->n_cigar = CG_len;
698
5.96k
    n_cigar4 = c->n_cigar * 4;
699
5.96k
    CG_st = CG - b->data - 2;
700
5.96k
    CG_en = CG_st + 8 + n_cigar4;
701
5.96k
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
702
5.96k
    b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
703
5.96k
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
704
5.96k
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
705
5.96k
    if (ori_len > CG_en) // move data after the CG tag
706
1.38k
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
707
5.96k
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
708
5.96k
    if (recal_bin)
709
5.96k
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
710
5.96k
    if (give_warning)
711
5.96k
        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
712
5.96k
    return 1;
713
5.96k
}
714
715
static inline int aux_type2size(uint8_t type)
716
3.90M
{
717
3.90M
    switch (type) {
718
2.59M
    case 'A': case 'c': case 'C':
719
2.59M
        return 1;
720
395k
    case 's': case 'S':
721
395k
        return 2;
722
636k
    case 'i': case 'I': case 'f':
723
636k
        return 4;
724
70.5k
    case 'd':
725
70.5k
        return 8;
726
209k
    case 'Z': case 'H': case 'B':
727
209k
        return type;
728
367
    default:
729
367
        return 0;
730
3.90M
    }
731
3.90M
}
732
733
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
734
0
{
735
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
736
0
    uint32_t i;
737
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
738
0
}
739
740
// Fix bad records where qname is not terminated correctly.
741
2.04k
static int fixup_missing_qname_nul(bam1_t *b) {
742
2.04k
    bam1_core_t *c = &b->core;
743
744
    // Note this is called before c->l_extranul is added to c->l_qname
745
2.04k
    if (c->l_extranul > 0) {
746
1.69k
        b->data[c->l_qname++] = '\0';
747
1.69k
        c->l_extranul--;
748
1.69k
    } else {
749
349
        if (b->l_data > INT_MAX - 4) return -1;
750
349
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
751
349
        b->l_data += 4;
752
349
        b->data[c->l_qname++] = '\0';
753
349
        c->l_extranul = 3;
754
349
    }
755
2.04k
    return 0;
756
2.04k
}
757
758
/*
759
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
760
 * in multi-threaded handling.  This may be worth considering for htslib2.
761
 */
762
int bam_read1(BGZF *fp, bam1_t *b)
763
4.56k
{
764
4.56k
    bam1_core_t *c = &b->core;
765
4.56k
    int32_t block_len, ret, i;
766
4.56k
    uint32_t x[8], new_l_data;
767
768
4.56k
    b->l_data = 0;
769
770
4.56k
    if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
771
278
        if (ret == 0) return -1; // normal end-of-file
772
166
        else return -2; // truncated
773
278
    }
774
4.29k
    if (fp->is_be)
775
0
        ed_swap_4p(&block_len);
776
4.29k
    if (block_len < 32) return -4;  // block_len includes core data
777
4.02k
    if (bgzf_read(fp, x, 32) != 32) return -3;
778
3.82k
    if (fp->is_be) {
779
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
780
0
    }
781
3.82k
    c->tid = x[0]; c->pos = (int32_t)x[1];
782
3.82k
    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
783
3.82k
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
784
3.82k
    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
785
3.82k
    c->l_qseq = x[4];
786
3.82k
    c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
787
788
3.82k
    new_l_data = block_len - 32 + c->l_extranul;
789
3.82k
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
790
3.67k
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
791
3.67k
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
792
139
        return -4;
793
3.54k
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
794
3.49k
    b->l_data = new_l_data;
795
796
3.49k
    if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
797
3.42k
    if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
798
2.04k
        if (fixup_missing_qname_nul(b) < 0) return -4;
799
2.04k
    }
800
7.16k
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
801
3.42k
    c->l_qname += c->l_extranul;
802
3.42k
    if (b->l_data < c->l_qname ||
803
3.42k
        bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
804
206
        return -4;
805
3.22k
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
806
3.22k
    if (bam_tag2cigar(b, 0, 0) < 0)
807
8
        return -4;
808
809
3.21k
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
810
2.75k
        hts_pos_t rlen, qlen;
811
2.75k
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
812
2.75k
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
813
2.75k
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
814
        // Sanity check for broken CIGAR alignments
815
2.75k
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
816
93
            hts_log_error("CIGAR and query sequence lengths differ for %s",
817
93
                    bam_get_qname(b));
818
93
            return -4;
819
93
        }
820
2.75k
    }
821
822
3.12k
    return 4 + block_len;
823
3.21k
}
824
825
int bam_write1(BGZF *fp, const bam1_t *b)
826
18.7M
{
827
18.7M
    const bam1_core_t *c = &b->core;
828
18.7M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
829
18.7M
    int i, ok;
830
18.7M
    if (c->l_qname - c->l_extranul > 255) {
831
8
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
832
8
        errno = EOVERFLOW;
833
8
        return -1;
834
8
    }
835
18.7M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
836
18.7M
    if (c->pos > INT_MAX ||
837
18.7M
        c->mpos > INT_MAX ||
838
18.7M
        c->isize < INT_MIN || c->isize > INT_MAX) {
839
730
        hts_log_error("Positional data is too large for BAM format");
840
730
        return -1;
841
730
    }
842
18.7M
    x[0] = c->tid;
843
18.7M
    x[1] = c->pos;
844
18.7M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
845
18.7M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
846
18.7M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
847
18.7M
    x[4] = c->l_qseq;
848
18.7M
    x[5] = c->mtid;
849
18.7M
    x[6] = c->mpos;
850
18.7M
    x[7] = c->isize;
851
18.7M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
852
18.7M
    if (fp->is_be) {
853
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
854
0
        y = block_len;
855
0
        if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
856
0
        swap_data(c, b->l_data, b->data, 1);
857
18.7M
    } else {
858
18.7M
        if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
859
18.7M
    }
860
18.7M
    if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
861
18.7M
    if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
862
18.7M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
863
18.7M
        if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
864
18.7M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
865
35
        uint8_t buf[8];
866
35
        uint32_t cigar_st, cigar_en, cigar[2];
867
35
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
868
35
        if (cigreflen >= (1<<28)) {
869
            // Length of reference covered is greater than the biggest
870
            // CIGAR operation currently allowed.
871
3
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
872
3
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
873
3
                          bam_get_qname(b), c->n_cigar, cigreflen);
874
3
            return -1;
875
3
        }
876
32
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
877
32
        cigar_en = cigar_st + c->n_cigar * 4;
878
32
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
879
32
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
880
32
        u32_to_le(cigar[0], buf);
881
32
        u32_to_le(cigar[1], buf + 4);
882
32
        if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
883
32
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
884
32
        if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
885
32
        u32_to_le(c->n_cigar, buf);
886
32
        if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
887
32
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
888
32
    }
889
18.7M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
890
18.7M
    return ok? 4 + block_len : -1;
891
18.7M
}
892
893
/*
894
 * Write a BAM file and append to the in-memory index simultaneously.
895
 */
896
18.7M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
897
18.7M
    BGZF *bfp = fp->fp.bgzf;
898
899
18.7M
    if (!fp->idx)
900
18.7M
        return bam_write1(bfp, b);
901
902
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
903
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
904
0
        return -1;
905
0
    if (!bfp->mt)
906
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
907
908
0
    int ret = bam_write1(bfp, b);
909
0
    if (ret < 0)
910
0
        return -1;
911
912
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
913
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
914
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
915
0
        ret = -1;
916
0
    }
917
918
0
    return ret;
919
0
}
920
921
/*
922
 * Set the qname in a BAM record
923
 */
924
int bam_set_qname(bam1_t *rec, const char *qname)
925
0
{
926
0
    if (!rec) return -1;
927
0
    if (!qname || !*qname) return -1;
928
929
0
    size_t old_len = rec->core.l_qname;
930
0
    size_t new_len = strlen(qname) + 1;
931
0
    if (new_len < 1 || new_len > 255) return -1;
932
933
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
934
935
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
936
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
937
938
    // Make room
939
0
    if (new_len + extranul != rec->core.l_qname)
940
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
941
    // Copy in new name and pad if needed
942
0
    memcpy(rec->data, qname, new_len);
943
0
    int n;
944
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
945
946
0
    rec->l_data = new_data_len;
947
0
    rec->core.l_qname = new_len + extranul;
948
0
    rec->core.l_extranul = extranul;
949
950
0
    return 0;
951
0
}
952
953
/********************
954
 *** BAM indexing ***
955
 ********************/
956
957
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
958
0
{
959
0
    int n_lvls, i, fmt, ret;
960
0
    bam1_t *b;
961
0
    hts_idx_t *idx;
962
0
    sam_hdr_t *h;
963
0
    h = sam_hdr_read(fp);
964
0
    if (h == NULL) return NULL;
965
0
    if (min_shift > 0) {
966
0
        hts_pos_t max_len = 0, s;
967
0
        for (i = 0; i < h->n_targets; ++i) {
968
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
969
0
            if (max_len < len) max_len = len;
970
0
        }
971
0
        max_len += 256;
972
0
        for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
973
0
        fmt = HTS_FMT_CSI;
974
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
975
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
976
0
    b = bam_init1();
977
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
978
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
979
0
        if (ret < 0) { // unsorted or doesn't fit
980
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
981
0
            goto err;
982
0
        }
983
0
    }
984
0
    if (ret < -1) goto err; // corrupted BAM file
985
986
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
987
0
    sam_hdr_destroy(h);
988
0
    bam_destroy1(b);
989
0
    return idx;
990
991
0
err:
992
0
    bam_destroy1(b);
993
0
    hts_idx_destroy(idx);
994
0
    return NULL;
995
0
}
996
997
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
998
0
{
999
0
    hts_idx_t *idx;
1000
0
    htsFile *fp;
1001
0
    int ret = 0;
1002
1003
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1004
0
    if (nthreads)
1005
0
        hts_set_threads(fp, nthreads);
1006
1007
0
    switch (fp->format.format) {
1008
0
    case cram:
1009
1010
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1011
0
        break;
1012
1013
0
    case bam:
1014
0
    case sam:
1015
0
        if (fp->format.compression != bgzf) {
1016
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1017
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1018
0
            ret = -1;
1019
0
            break;
1020
0
        }
1021
0
        idx = sam_index(fp, min_shift);
1022
0
        if (idx) {
1023
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1024
0
            if (ret < 0) ret = -4;
1025
0
            hts_idx_destroy(idx);
1026
0
        }
1027
0
        else ret = -1;
1028
0
        break;
1029
1030
0
    default:
1031
0
        ret = -3;
1032
0
        break;
1033
0
    }
1034
0
    hts_close(fp);
1035
1036
0
    return ret;
1037
0
}
1038
1039
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1040
0
{
1041
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1042
0
}
1043
1044
int sam_index_build(const char *fn, int min_shift)
1045
0
{
1046
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1047
0
}
1048
1049
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1050
#undef bam_index_build
1051
int bam_index_build(const char *fn, int min_shift)
1052
0
{
1053
0
    return sam_index_build2(fn, NULL, min_shift);
1054
0
}
1055
1056
// Initialise fp->idx for the current format type.
1057
// This must be called after the header has been written but no other data.
1058
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1059
0
    fp->fnidx = fnidx;
1060
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1061
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1062
0
        int n_lvls, fmt = HTS_FMT_CSI;
1063
0
        if (min_shift > 0) {
1064
0
            int64_t max_len = 0, s;
1065
0
            int i;
1066
0
            for (i = 0; i < h->n_targets; ++i)
1067
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1068
0
            max_len += 256;
1069
0
            for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1070
1071
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1072
1073
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1074
0
        return fp->idx ? 0 : -1;
1075
0
    }
1076
1077
0
    if (fp->format.format == cram) {
1078
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1079
0
        return fp->fp.cram->idxfp ? 0 : -1;
1080
0
    }
1081
1082
0
    return -1;
1083
0
}
1084
1085
// Finishes an index. Call after the last record has been written.
1086
// Returns 0 on success, <0 on failure.
1087
0
int sam_idx_save(htsFile *fp) {
1088
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1089
0
        fp->format.format == vcf || fp->format.format == sam) {
1090
0
        int ret;
1091
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1092
0
            errno = -ret;
1093
0
            return -1;
1094
0
        }
1095
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1096
0
            return -1;
1097
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1098
1099
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1100
0
            return -1;
1101
1102
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1103
1104
0
    } else if (fp->format.format == cram) {
1105
        // flushed and closed by cram_close
1106
0
    }
1107
1108
0
    return 0;
1109
0
}
1110
1111
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1112
0
{
1113
0
    htsFile *fp = (htsFile *)fpv;
1114
0
    bam1_t *b = bv;
1115
0
    fp->line.l = 0;
1116
0
    int ret = sam_read1(fp, fp->bam_header, b);
1117
0
    if (ret >= 0) {
1118
0
        *tid = b->core.tid;
1119
0
        *beg = b->core.pos;
1120
0
        *end = bam_endpos(b);
1121
0
    }
1122
0
    return ret;
1123
0
}
1124
1125
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1126
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1127
0
{
1128
0
    htsFile *fp = (htsFile *)fpv;
1129
0
    bam1_t *b = bv;
1130
0
    fp->line.l = 0;
1131
0
    int ret = sam_read1(fp, fp->bam_header, b);
1132
0
    return ret;
1133
0
}
1134
1135
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1136
// samtools/bam.c.
1137
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1138
0
{
1139
0
    const char *rg;
1140
0
    kstring_t lib = { 0, 0, NULL };
1141
0
    rg = (char *)bam_aux_get(b, "RG");
1142
1143
0
    if (!rg)
1144
0
        return NULL;
1145
0
    else
1146
0
        rg++;
1147
1148
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1149
0
        return NULL;
1150
1151
0
    static char LB_text[1024];
1152
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1153
1154
0
    memcpy(LB_text, lib.s, len);
1155
0
    LB_text[len] = 0;
1156
1157
0
    free(lib.s);
1158
1159
0
    return LB_text;
1160
0
}
1161
1162
1163
// Bam record pointer and SAM header combined
1164
typedef struct {
1165
    const sam_hdr_t *h;
1166
    const bam1_t *b;
1167
} hb_pair;
1168
1169
// Looks up variable names in str and replaces them with their value.
1170
// Also supports aux tags.
1171
//
1172
// Note the expression parser deliberately overallocates str size so it
1173
// is safe to use memcmp over strcmp.
1174
static int bam_sym_lookup(void *data, char *str, char **end,
1175
0
                          hts_expr_val_t *res) {
1176
0
    hb_pair *hb = (hb_pair *)data;
1177
0
    const bam1_t *b = hb->b;
1178
1179
0
    res->is_str = 0;
1180
0
    switch(*str) {
1181
0
    case 'c':
1182
0
        if (memcmp(str, "cigar", 5) == 0) {
1183
0
            *end = str+5;
1184
0
            res->is_str = 1;
1185
0
            ks_clear(&res->s);
1186
0
            uint32_t *cigar = bam_get_cigar(b);
1187
0
            int i, n = b->core.n_cigar, r = 0;
1188
0
            if (n) {
1189
0
                for (i = 0; i < n; i++) {
1190
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1191
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1192
0
                }
1193
0
                r |= kputs("", &res->s) < 0;
1194
0
            } else {
1195
0
                r |= kputs("*", &res->s) < 0;
1196
0
            }
1197
0
            return r ? -1 : 0;
1198
0
        }
1199
0
        break;
1200
1201
0
    case 'e':
1202
0
        if (memcmp(str, "endpos", 6) == 0) {
1203
0
            *end = str+6;
1204
0
            res->d = bam_endpos(b);
1205
0
            return 0;
1206
0
        }
1207
0
        break;
1208
1209
0
    case 'f':
1210
0
        if (memcmp(str, "flag", 4) == 0) {
1211
0
            str = *end = str+4;
1212
0
            if (*str != '.') {
1213
0
                res->d = b->core.flag;
1214
0
                return 0;
1215
0
            } else {
1216
0
                str++;
1217
0
                if (!memcmp(str, "paired", 6)) {
1218
0
                    *end = str+6;
1219
0
                    res->d = b->core.flag & BAM_FPAIRED;
1220
0
                    return 0;
1221
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1222
0
                    *end = str+11;
1223
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1224
0
                    return 0;
1225
0
                } else if (!memcmp(str, "unmap", 5)) {
1226
0
                    *end = str+5;
1227
0
                    res->d = b->core.flag & BAM_FUNMAP;
1228
0
                    return 0;
1229
0
                } else if (!memcmp(str, "munmap", 6)) {
1230
0
                    *end = str+6;
1231
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1232
0
                    return 0;
1233
0
                } else if (!memcmp(str, "reverse", 7)) {
1234
0
                    *end = str+7;
1235
0
                    res->d = b->core.flag & BAM_FREVERSE;
1236
0
                    return 0;
1237
0
                } else if (!memcmp(str, "mreverse", 8)) {
1238
0
                    *end = str+8;
1239
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1240
0
                    return 0;
1241
0
                } else if (!memcmp(str, "read1", 5)) {
1242
0
                    *end = str+5;
1243
0
                    res->d = b->core.flag & BAM_FREAD1;
1244
0
                    return 0;
1245
0
                } else if (!memcmp(str, "read2", 5)) {
1246
0
                    *end = str+5;
1247
0
                    res->d = b->core.flag & BAM_FREAD2;
1248
0
                    return 0;
1249
0
                } else if (!memcmp(str, "secondary", 9)) {
1250
0
                    *end = str+9;
1251
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1252
0
                    return 0;
1253
0
                } else if (!memcmp(str, "qcfail", 6)) {
1254
0
                    *end = str+6;
1255
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1256
0
                    return 0;
1257
0
                } else if (!memcmp(str, "dup", 3)) {
1258
0
                    *end = str+3;
1259
0
                    res->d = b->core.flag & BAM_FDUP;
1260
0
                    return 0;
1261
0
                } else if (!memcmp(str, "supplementary", 13)) {
1262
0
                    *end = str+13;
1263
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1264
0
                    return 0;
1265
0
                } else {
1266
0
                    hts_log_error("Unrecognised flag string");
1267
0
                    return -1;
1268
0
                }
1269
0
            }
1270
0
        }
1271
0
        break;
1272
1273
0
    case 'h':
1274
0
        if (memcmp(str, "hclen", 5) == 0) {
1275
0
            int hclen = 0;
1276
0
            uint32_t *cigar = bam_get_cigar(b);
1277
0
            uint32_t ncigar = b->core.n_cigar;
1278
1279
            // left
1280
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1281
0
                hclen = bam_cigar_oplen(cigar[0]);
1282
1283
            // right
1284
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1285
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1286
1287
0
            *end = str+5;
1288
0
            res->d = hclen;
1289
0
            return 0;
1290
0
        }
1291
0
        break;
1292
1293
0
    case 'l':
1294
0
        if (memcmp(str, "library", 7) == 0) {
1295
0
            *end = str+7;
1296
0
            res->is_str = 1;
1297
0
            const char *lib = bam_get_library(hb->h, b);
1298
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1299
0
            return 0;
1300
0
        }
1301
0
        break;
1302
1303
0
    case 'm':
1304
0
        if (memcmp(str, "mapq", 4) == 0) {
1305
0
            *end = str+4;
1306
0
            res->d = b->core.qual;
1307
0
            return 0;
1308
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1309
0
            *end = str+4;
1310
0
            res->d = b->core.mpos+1;
1311
0
            return 0;
1312
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1313
0
            *end = str+6;
1314
0
            res->is_str = 1;
1315
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1316
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1317
0
            return 0;
1318
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1319
0
            *end = str+6;
1320
0
            res->d = b->core.mtid;
1321
0
            return 0;
1322
0
        }
1323
0
        break;
1324
1325
0
    case 'n':
1326
0
        if (memcmp(str, "ncigar", 6) == 0) {
1327
0
            *end = str+6;
1328
0
            res->d = b->core.n_cigar;
1329
0
            return 0;
1330
0
        }
1331
0
        break;
1332
1333
0
    case 'p':
1334
0
        if (memcmp(str, "pos", 3) == 0) {
1335
0
            *end = str+3;
1336
0
            res->d = b->core.pos+1;
1337
0
            return 0;
1338
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1339
0
            *end = str+5;
1340
0
            res->d = b->core.mpos+1;
1341
0
            return 0;
1342
0
        }
1343
0
        break;
1344
1345
0
    case 'q':
1346
0
        if (memcmp(str, "qlen", 4) == 0) {
1347
0
            *end = str+4;
1348
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1349
0
            return 0;
1350
0
        } else if (memcmp(str, "qname", 5) == 0) {
1351
0
            *end = str+5;
1352
0
            res->is_str = 1;
1353
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1354
0
            return 0;
1355
0
        } else if (memcmp(str, "qual", 4) == 0) {
1356
0
            *end = str+4;
1357
0
            ks_clear(&res->s);
1358
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1359
0
                return -1;
1360
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1361
0
            res->s.l = b->core.l_qseq;
1362
0
            res->is_str = 1;
1363
0
            return 0;
1364
0
        }
1365
0
        break;
1366
1367
0
    case 'r':
1368
0
        if (memcmp(str, "rlen", 4) == 0) {
1369
0
            *end = str+4;
1370
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1371
0
            return 0;
1372
0
        } else if (memcmp(str, "rname", 5) == 0) {
1373
0
            *end = str+5;
1374
0
            res->is_str = 1;
1375
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1376
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1377
0
            return 0;
1378
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1379
0
            *end = str+5;
1380
0
            res->is_str = 1;
1381
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1382
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1383
0
            return 0;
1384
0
        } else if (memcmp(str, "refid", 5) == 0) {
1385
0
            *end = str+5;
1386
0
            res->d = b->core.tid;
1387
0
            return 0;
1388
0
        }
1389
0
        break;
1390
1391
0
    case 's':
1392
0
        if (memcmp(str, "seq", 3) == 0) {
1393
0
            *end = str+3;
1394
0
            ks_clear(&res->s);
1395
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1396
0
                return -1;
1397
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1398
0
            res->s.s[b->core.l_qseq] = 0;
1399
0
            res->s.l = b->core.l_qseq;
1400
0
            res->is_str = 1;
1401
0
            return 0;
1402
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1403
0
            int sclen = 0;
1404
0
            uint32_t *cigar = bam_get_cigar(b);
1405
0
            int ncigar = b->core.n_cigar;
1406
0
            int left = 0;
1407
1408
            // left
1409
0
            if (ncigar > 0
1410
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1411
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1412
0
            else if (ncigar > 1
1413
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1414
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1415
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1416
1417
            // right
1418
0
            if (ncigar-1 > left
1419
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1420
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1421
0
            else if (ncigar-2 > left
1422
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1423
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1424
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1425
1426
0
            *end = str+5;
1427
0
            res->d = sclen;
1428
0
            return 0;
1429
0
        }
1430
0
        break;
1431
1432
0
    case 't':
1433
0
        if (memcmp(str, "tlen", 4) == 0) {
1434
0
            *end = str+4;
1435
0
            res->d = b->core.isize;
1436
0
            return 0;
1437
0
        }
1438
0
        break;
1439
1440
0
    case '[':
1441
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1442
            /* aux tags */
1443
0
            *end = str+4;
1444
1445
0
            uint8_t *aux = bam_aux_get(b, str+1);
1446
0
            if (aux) {
1447
                // we define the truth of a tag to be its presence, even if 0.
1448
0
                res->is_true = 1;
1449
0
                switch (*aux) {
1450
0
                case 'Z':
1451
0
                case 'H':
1452
0
                    res->is_str = 1;
1453
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1454
0
                    break;
1455
1456
0
                case 'A':
1457
0
                    res->is_str = 1;
1458
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1459
0
                    break;
1460
1461
0
                case 'i': case 'I':
1462
0
                case 's': case 'S':
1463
0
                case 'c': case 'C':
1464
0
                    res->is_str = 0;
1465
0
                    res->d = bam_aux2i(aux);
1466
0
                    break;
1467
1468
0
                case 'f':
1469
0
                case 'd':
1470
0
                    res->is_str = 0;
1471
0
                    res->d = bam_aux2f(aux);
1472
0
                    break;
1473
1474
0
                default:
1475
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1476
0
                                  *aux);
1477
0
                    return -1;
1478
0
                }
1479
0
                return 0;
1480
1481
0
            } else {
1482
                // hence absent tags are always false (and strings)
1483
0
                res->is_str = 1;
1484
0
                res->s.l = 0;
1485
0
                res->d = 0;
1486
0
                res->is_true = 0;
1487
0
                return 0;
1488
0
            }
1489
0
        }
1490
0
        break;
1491
0
    }
1492
1493
    // All successful matches in switch should return 0.
1494
    // So if we didn't match, it's a parse error.
1495
0
    return -1;
1496
0
}
1497
1498
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1499
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1500
0
{
1501
0
    hb_pair hb = {h, b};
1502
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1503
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1504
0
        hts_log_error("Couldn't process filter expression");
1505
0
        hts_expr_val_free(&res);
1506
0
        return -1;
1507
0
    }
1508
1509
0
    int t = res.is_true;
1510
0
    hts_expr_val_free(&res);
1511
1512
0
    return t;
1513
0
}
1514
1515
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1516
0
{
1517
0
    htsFile *fp = fpv;
1518
0
    bam1_t *b = bv;
1519
0
    int pass_filter, ret;
1520
1521
0
    do {
1522
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1523
0
        if (ret < 0)
1524
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1525
1526
0
        if (bam_tag2cigar(b, 1, 1) < 0)
1527
0
            return -2;
1528
1529
0
        *tid = b->core.tid;
1530
0
        *beg = b->core.pos;
1531
0
        *end = bam_endpos(b);
1532
1533
0
        if (fp->filter) {
1534
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1535
0
            if (pass_filter < 0)
1536
0
                return -2;
1537
0
        } else {
1538
0
            pass_filter = 1;
1539
0
        }
1540
0
    } while (pass_filter == 0);
1541
1542
0
    return ret;
1543
0
}
1544
1545
static int cram_pseek(void *fp, int64_t offset, int whence)
1546
0
{
1547
0
    cram_fd *fd =  (cram_fd *)fp;
1548
1549
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1550
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1551
0
        return -1;
1552
1553
0
    fd->curr_position = offset;
1554
1555
0
    if (fd->ctr) {
1556
0
        cram_free_container(fd->ctr);
1557
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1558
0
            cram_free_container(fd->ctr_mt);
1559
1560
0
        fd->ctr = NULL;
1561
0
        fd->ctr_mt = NULL;
1562
0
        fd->ooc = 0;
1563
0
    }
1564
1565
0
    return 0;
1566
0
}
1567
1568
/*
1569
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1570
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1571
 *   container previously fetched. It was designed like this to integrate with the functionality
1572
 *   of the iterator stepping logic.
1573
 */
1574
1575
static int64_t cram_ptell(void *fp)
1576
0
{
1577
0
    cram_fd *fd = (cram_fd *)fp;
1578
0
    cram_container *c;
1579
0
    cram_slice *s;
1580
0
    int64_t ret = -1L;
1581
1582
0
    if (fd) {
1583
0
        if ((c = fd->ctr) != NULL) {
1584
0
            if ((s = c->slice) != NULL && s->max_rec) {
1585
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1586
0
                    fd->curr_position += c->offset + c->length;
1587
0
            }
1588
0
        }
1589
0
        ret = fd->curr_position;
1590
0
    }
1591
1592
0
    return ret;
1593
0
}
1594
1595
static int bam_pseek(void *fp, int64_t offset, int whence)
1596
0
{
1597
0
    BGZF *fd = (BGZF *)fp;
1598
1599
0
    return bgzf_seek(fd, offset, whence);
1600
0
}
1601
1602
static int64_t bam_ptell(void *fp)
1603
0
{
1604
0
    BGZF *fd = (BGZF *)fp;
1605
0
    if (!fd)
1606
0
        return -1L;
1607
1608
0
    return bgzf_tell(fd);
1609
0
}
1610
1611
1612
1613
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1614
0
{
1615
0
    switch (fp->format.format) {
1616
0
    case bam:
1617
0
    case sam:
1618
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1619
1620
0
    case cram: {
1621
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1622
1623
        // Cons up a fake "index" just pointing at the associated cram_fd:
1624
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1625
0
        if (idx == NULL) return NULL;
1626
0
        idx->fmt = HTS_FMT_CRAI;
1627
0
        idx->cram = fp->fp.cram;
1628
0
        return (hts_idx_t *) idx;
1629
0
        }
1630
1631
0
    default:
1632
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1633
0
    }
1634
0
}
1635
1636
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1637
0
{
1638
0
    return index_load(fp, fn, fnidx, flags);
1639
0
}
1640
1641
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1642
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1643
0
}
1644
1645
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1646
0
{
1647
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1648
0
}
1649
1650
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1651
0
{
1652
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1653
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1654
0
    if (iter == NULL) return NULL;
1655
1656
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1657
    // the readrec function:
1658
0
    iter->is_cram = 1;
1659
0
    iter->read_rest = 1;
1660
0
    iter->off = NULL;
1661
0
    iter->bins.a = NULL;
1662
0
    iter->readrec = readrec;
1663
1664
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1665
0
        cram_range r = { tid, beg+1, end };
1666
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1667
1668
0
        iter->curr_off = 0;
1669
        // The following fields are not required by hts_itr_next(), but are
1670
        // filled in in case user code wants to look at them.
1671
0
        iter->tid = tid;
1672
0
        iter->beg = beg;
1673
0
        iter->end = end;
1674
1675
0
        switch (ret) {
1676
0
        case 0:
1677
0
            break;
1678
1679
0
        case -2:
1680
            // No data vs this ref, so mark iterator as completed.
1681
            // Same as HTS_IDX_NONE.
1682
0
            iter->finished = 1;
1683
0
            break;
1684
1685
0
        default:
1686
0
            free(iter);
1687
0
            return NULL;
1688
0
        }
1689
0
    }
1690
0
    else switch (tid) {
1691
0
    case HTS_IDX_REST:
1692
0
        iter->curr_off = 0;
1693
0
        break;
1694
0
    case HTS_IDX_NONE:
1695
0
        iter->curr_off = 0;
1696
0
        iter->finished = 1;
1697
0
        break;
1698
0
    default:
1699
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1700
0
        abort();
1701
0
        break;
1702
0
    }
1703
1704
0
    return iter;
1705
0
}
1706
1707
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1708
0
{
1709
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1710
0
    if (idx == NULL)
1711
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1712
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1713
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1714
0
    else
1715
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1716
0
}
1717
1718
static int cram_name2id(void *fdv, const char *ref)
1719
0
{
1720
0
    cram_fd *fd = (cram_fd *) fdv;
1721
0
    return sam_hdr_name2tid(fd->header, ref);
1722
0
}
1723
1724
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1725
0
{
1726
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1727
0
    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1728
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1729
0
                          sam_readrec);
1730
0
}
1731
1732
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1733
0
{
1734
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1735
0
    hts_reglist_t *r_list = NULL;
1736
0
    int r_count = 0;
1737
1738
0
    if (!cidx || !hdr)
1739
0
        return NULL;
1740
1741
0
    hts_itr_t *itr = NULL;
1742
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1743
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1744
0
        if (!r_list)
1745
0
            return NULL;
1746
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1747
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1748
0
    } else {
1749
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1750
0
        if (!r_list)
1751
0
            return NULL;
1752
0
        itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1753
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1754
0
    }
1755
1756
0
    if (!itr)
1757
0
        hts_reglist_free(r_list, r_count);
1758
1759
0
    return itr;
1760
0
}
1761
1762
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1763
0
{
1764
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1765
1766
0
    if(!cidx || !hdr || !reglist)
1767
0
        return NULL;
1768
1769
0
    if (cidx->fmt == HTS_FMT_CRAI)
1770
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1771
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1772
0
    else
1773
0
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1774
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1775
0
}
1776
1777
/**********************
1778
 *** SAM header I/O ***
1779
 **********************/
1780
1781
#include "htslib/kseq.h"
1782
#include "htslib/kstring.h"
1783
1784
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1785
0
{
1786
0
    sam_hdr_t *bh = sam_hdr_init();
1787
0
    if (!bh) return NULL;
1788
1789
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1790
0
        sam_hdr_destroy(bh);
1791
0
        return NULL;
1792
0
    }
1793
1794
0
    return bh;
1795
0
}
1796
1797
563k
static int valid_sam_header_type(const char *s) {
1798
563k
    if (s[0] != '@') return 0;
1799
563k
    switch (s[1]) {
1800
2.99k
    case 'H':
1801
2.99k
        return s[2] == 'D' && s[3] == '\t';
1802
48
    case 'S':
1803
48
        return s[2] == 'Q' && s[3] == '\t';
1804
545k
    case 'R':
1805
548k
    case 'P':
1806
548k
        return s[2] == 'G' && s[3] == '\t';
1807
11.5k
    case 'C':
1808
11.5k
        return s[2] == 'O';
1809
563k
    }
1810
81
    return 0;
1811
563k
}
1812
1813
// Minimal sanitisation of a header to ensure.
1814
// - null terminated string.
1815
// - all lines start with @ (also implies no blank lines).
1816
//
1817
// Much more could be done, but currently is not, including:
1818
// - checking header types are known (HD, SQ, etc).
1819
// - syntax (eg checking tab separated fields).
1820
// - validating n_targets matches @SQ records.
1821
// - validating target lengths against @SQ records.
1822
32.2k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1823
32.2k
    if (!h)
1824
720
        return NULL;
1825
1826
    // Special case for empty headers.
1827
31.5k
    if (h->l_text == 0)
1828
10.2k
        return h;
1829
1830
21.3k
    size_t i;
1831
21.3k
    unsigned int lnum = 0;
1832
21.3k
    char *cp = h->text, last = '\n';
1833
64.3M
    for (i = 0; i < h->l_text; i++) {
1834
        // NB: l_text excludes terminating nul.  This finds early ones.
1835
64.2M
        if (cp[i] == 0)
1836
6.26k
            break;
1837
1838
        // Error on \n[^@], including duplicate newlines
1839
64.2M
        if (last == '\n') {
1840
364k
            lnum++;
1841
364k
            if (cp[i] != '@') {
1842
3
                hts_log_error("Malformed SAM header at line %u", lnum);
1843
3
                sam_hdr_destroy(h);
1844
3
                return NULL;
1845
3
            }
1846
364k
        }
1847
1848
64.2M
        last = cp[i];
1849
64.2M
    }
1850
1851
21.3k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1852
6.26k
        size_t j = i;
1853
36.1k
        while (j < h->l_text && cp[j] == '\0') j++;
1854
6.26k
        if (j < h->l_text)
1855
5.98k
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1856
6.26k
    }
1857
1858
    // Add trailing newline and/or trailing nul if required.
1859
21.3k
    if (last != '\n') {
1860
6.00k
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1861
1862
6.00k
        if (h->l_text < 2 || i >= h->l_text - 2) {
1863
1.08k
            if (h->l_text >= SIZE_MAX - 2) {
1864
0
                hts_log_error("No room for extra newline");
1865
0
                sam_hdr_destroy(h);
1866
0
                return NULL;
1867
0
            }
1868
1869
1.08k
            cp = realloc(h->text, (size_t) h->l_text+2);
1870
1.08k
            if (!cp) {
1871
0
                sam_hdr_destroy(h);
1872
0
                return NULL;
1873
0
            }
1874
1.08k
            h->text = cp;
1875
1.08k
        }
1876
6.00k
        cp[i++] = '\n';
1877
1878
        // l_text may be larger already due to multiple nul padding
1879
6.00k
        if (h->l_text < i)
1880
39
            h->l_text = i;
1881
6.00k
        cp[h->l_text] = '\0';
1882
6.00k
    }
1883
1884
21.3k
    return h;
1885
21.3k
}
1886
1887
2.47k
static void known_stderr(const char *tool, const char *advice) {
1888
2.47k
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
1889
2.47k
    hts_log_warning("%s", advice);
1890
2.47k
}
1891
1892
33.2k
static void warn_if_known_stderr(const char *line) {
1893
33.2k
    if (strstr(line, "M::bwa_idx_load_from_disk") != NULL)
1894
759
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`");
1895
32.5k
    else if (strstr(line, "M::mem_pestat") != NULL)
1896
1.39k
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`");
1897
31.1k
    else if (strstr(line, "loaded/built the index") != NULL)
1898
321
        known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`");
1899
33.2k
}
1900
1901
21.0k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1902
21.0k
    kstring_t str = { 0, 0, NULL };
1903
21.0k
    khint_t k;
1904
21.0k
    sam_hdr_t* h = sam_hdr_init();
1905
21.0k
    const char *q, *r;
1906
21.0k
    char* sn = NULL;
1907
21.0k
    khash_t(s2i) *d = kh_init(s2i);
1908
21.0k
    khash_t(s2i) *long_refs = NULL;
1909
21.0k
    if (!h || !d)
1910
0
        goto error;
1911
1912
21.0k
    int ret, has_SQ = 0;
1913
21.0k
    int next_c = '@';
1914
699k
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1915
678k
        if (fp->line.s[0] != '@')
1916
39
            break;
1917
1918
678k
        if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
1919
114k
            has_SQ = 1;
1920
114k
            hts_pos_t ln = -1;
1921
309k
            for (q = fp->line.s + 4;; ++q) {
1922
309k
                if (strncmp(q, "SN:", 3) == 0) {
1923
111k
                    q += 3;
1924
1.06G
                    for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);
1925
1926
111k
                    if (sn) {
1927
22.1k
                        hts_log_warning("SQ header line has more than one SN: tag");
1928
22.1k
                        free(sn);
1929
22.1k
                    }
1930
111k
                    sn = (char*)calloc(r - q + 1, 1);
1931
111k
                    if (!sn)
1932
0
                        goto error;
1933
1934
111k
                    strncpy(sn, q, r - q);
1935
111k
                    q = r;
1936
198k
                } else {
1937
198k
                    if (strncmp(q, "LN:", 3) == 0)
1938
102k
                        ln = strtoll(q + 3, (char**)&q, 10);
1939
198k
                }
1940
1941
37.9M
                while (*q != '\t' && *q != '\n' && *q != '\0')
1942
37.6M
                    ++q;
1943
309k
                if (*q == '\0' || *q == '\n')
1944
114k
                    break;
1945
309k
            }
1946
114k
            if (sn) {
1947
88.8k
                if (ln >= 0) {
1948
81.6k
                    int absent;
1949
81.6k
                    k = kh_put(s2i, d, sn, &absent);
1950
81.6k
                    if (absent < 0)
1951
0
                        goto error;
1952
1953
81.6k
                    if (!absent) {
1954
36.5k
                        hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
1955
36.5k
                        free(sn);
1956
45.1k
                    } else {
1957
45.1k
                        sn = NULL;
1958
45.1k
                        if (ln >= UINT32_MAX) {
1959
                            // Stash away ref length that
1960
                            // doesn't fit in target_len array
1961
16.3k
                            int k2;
1962
16.3k
                            if (!long_refs) {
1963
963
                                long_refs = kh_init(s2i);
1964
963
                                if (!long_refs)
1965
0
                                    goto error;
1966
963
                            }
1967
16.3k
                            k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
1968
16.3k
                            if (absent < 0)
1969
0
                                goto error;
1970
16.3k
                            kh_val(long_refs, k2) = ln;
1971
16.3k
                            kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1972
16.3k
                                            | UINT32_MAX);
1973
28.7k
                        } else {
1974
28.7k
                            kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1975
28.7k
                        }
1976
45.1k
                    }
1977
81.6k
                } else {
1978
7.20k
                    hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
1979
7.20k
                    warn_if_known_stderr(fp->line.s);
1980
7.20k
                    free(sn);
1981
7.20k
                }
1982
88.8k
            } else {
1983
25.8k
                hts_log_warning("Ignored @SQ line with missing SN: tag");
1984
25.8k
                warn_if_known_stderr(fp->line.s);
1985
25.8k
            }
1986
114k
            sn = NULL;
1987
114k
        }
1988
563k
        else if (!valid_sam_header_type(fp->line.s)) {
1989
186
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
1990
186
            warn_if_known_stderr(fp->line.s);
1991
186
            goto error;
1992
186
        }
1993
1994
678k
        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1995
0
            goto error;
1996
1997
678k
        if (kputc('\n', &str) < 0)
1998
0
            goto error;
1999
2000
678k
        if (fp->is_bgzf) {
2001
539k
            next_c = bgzf_peek(fp->fp.bgzf);
2002
539k
        } else {
2003
138k
            unsigned char nc;
2004
138k
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
2005
138k
            next_c = pret > 0 ? nc : pret - 1;
2006
138k
        }
2007
678k
        if (next_c < -1)
2008
3
            goto error;
2009
678k
    }
2010
20.8k
    if (next_c != '@')
2011
20.8k
        fp->line.l = 0;
2012
2013
20.8k
    if (ret < -1)
2014
27
        goto error;
2015
2016
20.8k
    if (!has_SQ && fp->fn_aux) {
2017
0
        kstring_t line = { 0, 0, NULL };
2018
2019
        /* The reference index (.fai) is actually needed here */
2020
0
        char *fai_fn = fp->fn_aux;
2021
0
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
2022
0
        if (fn_delim)
2023
0
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
2024
2025
0
        hFILE* f = hopen(fai_fn, "r");
2026
0
        int e = 0, absent;
2027
0
        if (f == NULL)
2028
0
            goto error;
2029
2030
0
        while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
2031
0
            char* tab = strchr(line.s, '\t');
2032
0
            hts_pos_t ln;
2033
2034
0
            if (tab == NULL)
2035
0
                continue;
2036
2037
0
            sn = (char*)calloc(tab-line.s+1, 1);
2038
0
            if (!sn) {
2039
0
                e = 1;
2040
0
                break;
2041
0
            }
2042
0
            memcpy(sn, line.s, tab-line.s);
2043
0
            k = kh_put(s2i, d, sn, &absent);
2044
0
            if (absent < 0) {
2045
0
                e = 1;
2046
0
                break;
2047
0
            }
2048
2049
0
            ln = strtoll(tab, NULL, 10);
2050
2051
0
            if (!absent) {
2052
0
                hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
2053
0
                free(sn);
2054
0
                sn = NULL;
2055
0
            } else {
2056
0
                sn = NULL;
2057
0
                if (ln >= UINT32_MAX) {
2058
                    // Stash away ref length that
2059
                    // doesn't fit in target_len array
2060
0
                    khint_t k2;
2061
0
                    int absent = -1;
2062
0
                    if (!long_refs) {
2063
0
                        long_refs = kh_init(s2i);
2064
0
                        if (!long_refs) {
2065
0
                            e = 1;
2066
0
                            break;
2067
0
                        }
2068
0
                    }
2069
0
                    k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
2070
0
                    if (absent < 0) {
2071
0
                         e = 1;
2072
0
                         break;
2073
0
                    }
2074
0
                    kh_val(long_refs, k2) = ln;
2075
0
                    kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
2076
0
                                    | UINT32_MAX);
2077
0
                } else {
2078
0
                    kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
2079
0
                }
2080
0
                has_SQ = 1;
2081
0
            }
2082
2083
0
            e |= kputs("@SQ\tSN:", &str) < 0;
2084
0
            e |= kputsn(line.s, tab - line.s, &str) < 0;
2085
0
            e |= kputs("\tLN:", &str) < 0;
2086
0
            e |= kputll(ln, &str) < 0;
2087
0
            e |= kputc('\n', &str) < 0;
2088
0
            if (e)
2089
0
                break;
2090
0
        }
2091
2092
0
        ks_free(&line);
2093
0
        if (hclose(f) != 0) {
2094
0
            hts_log_error("Error on closing %s", fai_fn);
2095
0
            e = 1;
2096
0
        }
2097
0
        if (e)
2098
0
            goto error;
2099
0
    }
2100
2101
20.8k
    if (has_SQ) {
2102
        // Populate the targets array
2103
12.3k
        h->n_targets = kh_size(d);
2104
2105
12.3k
        h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
2106
12.3k
        if (!h->target_name) {
2107
0
            h->n_targets = 0;
2108
0
            goto error;
2109
0
        }
2110
2111
12.3k
        h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
2112
12.3k
        if (!h->target_len) {
2113
0
            h->n_targets = 0;
2114
0
            goto error;
2115
0
        }
2116
2117
117k
        for (k = kh_begin(d); k != kh_end(d); ++k) {
2118
104k
            if (!kh_exist(d, k))
2119
61.5k
                continue;
2120
2121
43.1k
            h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
2122
43.1k
            h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
2123
43.1k
            kh_val(d, k) >>= 32;
2124
43.1k
        }
2125
12.3k
    }
2126
2127
    // Repurpose sdict to hold any references longer than UINT32_MAX
2128
20.8k
    h->sdict = long_refs;
2129
2130
20.8k
    kh_destroy(s2i, d);
2131
2132
20.8k
    if (str.l == 0)
2133
39
        kputsn("", 0, &str);
2134
20.8k
    h->l_text = str.l;
2135
20.8k
    h->text = ks_release(&str);
2136
20.8k
    fp->bam_header = sam_hdr_sanitise(h);
2137
20.8k
    fp->bam_header->ref_count = 1;
2138
2139
20.8k
    return fp->bam_header;
2140
2141
216
 error:
2142
216
    if (h && d && (!h->target_name || !h->target_len)) {
2143
3.61k
        for (k = kh_begin(d); k != kh_end(d); ++k)
2144
3.39k
            if (kh_exist(d, k)) free((void *)kh_key(d, k));
2145
216
    }
2146
216
    sam_hdr_destroy(h);
2147
216
    ks_free(&str);
2148
216
    kh_destroy(s2i, d);
2149
216
    kh_destroy(s2i, long_refs);
2150
216
    if (sn) free(sn);
2151
216
    return NULL;
2152
20.8k
}
2153
2154
sam_hdr_t *sam_hdr_read(htsFile *fp)
2155
35.9k
{
2156
35.9k
    if (!fp) {
2157
0
        errno = EINVAL;
2158
0
        return NULL;
2159
0
    }
2160
2161
35.9k
    switch (fp->format.format) {
2162
2.91k
    case bam:
2163
2.91k
        return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
2164
2165
8.50k
    case cram:
2166
8.50k
        return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
2167
2168
21.0k
    case sam:
2169
21.0k
        return sam_hdr_create(fp);
2170
2171
396
    case fastq_format:
2172
3.49k
    case fasta_format:
2173
3.49k
        return sam_hdr_init();
2174
2175
0
    case empty_format:
2176
0
        errno = EPIPE;
2177
0
        return NULL;
2178
2179
0
    default:
2180
0
        errno = EFTYPE;
2181
0
        return NULL;
2182
35.9k
    }
2183
35.9k
}
2184
2185
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
2186
35.0k
{
2187
35.0k
    if (!fp || !h) {
2188
0
        errno = EINVAL;
2189
0
        return -1;
2190
0
    }
2191
2192
35.0k
    switch (fp->format.format) {
2193
11.6k
    case binary_format:
2194
11.6k
        fp->format.category = sequence_data;
2195
11.6k
        fp->format.format = bam;
2196
        /* fall-through */
2197
11.6k
    case bam:
2198
11.6k
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
2199
11.6k
        break;
2200
2201
11.6k
    case cram: {
2202
11.6k
        cram_fd *fd = fp->fp.cram;
2203
11.6k
        if (cram_set_header2(fd, h) < 0) return -1;
2204
10.5k
        if (fp->fn_aux)
2205
0
            cram_load_reference(fd, fp->fn_aux);
2206
10.5k
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
2207
10.5k
        }
2208
10.5k
        break;
2209
2210
11.6k
    case text_format:
2211
11.6k
        fp->format.category = sequence_data;
2212
11.6k
        fp->format.format = sam;
2213
        /* fall-through */
2214
11.6k
    case sam: {
2215
11.6k
        if (!h->hrecs && !h->text)
2216
0
            return 0;
2217
11.6k
        char *text;
2218
11.6k
        kstring_t hdr_ks = { 0, 0, NULL };
2219
11.6k
        size_t l_text;
2220
11.6k
        ssize_t bytes;
2221
11.6k
        int r = 0, no_sq = 0;
2222
2223
11.6k
        if (h->hrecs) {
2224
10.5k
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2225
0
                return -1;
2226
10.5k
            text = hdr_ks.s;
2227
10.5k
            l_text = hdr_ks.l;
2228
10.5k
        } else {
2229
1.10k
            const char *p = NULL;
2230
1.37k
            do {
2231
1.37k
                const char *q = p == NULL ? h->text : p + 4;
2232
1.37k
                p = strstr(q, "@SQ\t");
2233
1.37k
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2234
1.10k
            no_sq = p == NULL;
2235
1.10k
            text = h->text;
2236
1.10k
            l_text = h->l_text;
2237
1.10k
        }
2238
2239
11.6k
        if (fp->is_bgzf) {
2240
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2241
11.6k
        } else {
2242
11.6k
            bytes = hwrite(fp->fp.hfile, text, l_text);
2243
11.6k
        }
2244
11.6k
        free(hdr_ks.s);
2245
11.6k
        if (bytes != l_text)
2246
0
            return -1;
2247
2248
11.6k
        if (no_sq) {
2249
653
            int i;
2250
3.33k
            for (i = 0; i < h->n_targets; ++i) {
2251
2.68k
                fp->line.l = 0;
2252
2.68k
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2253
2.68k
                r |= kputs(h->target_name[i], &fp->line) < 0;
2254
2.68k
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2255
2.68k
                r |= kputw(h->target_len[i], &fp->line) < 0;
2256
2.68k
                r |= kputc('\n', &fp->line) < 0;
2257
2.68k
                if (r != 0)
2258
0
                    return -1;
2259
2260
2.68k
                if (fp->is_bgzf) {
2261
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2262
2.68k
                } else {
2263
2.68k
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2264
2.68k
                }
2265
2.68k
                if (bytes != fp->line.l)
2266
0
                    return -1;
2267
2.68k
            }
2268
653
        }
2269
11.6k
        if (fp->is_bgzf) {
2270
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2271
11.6k
        } else {
2272
11.6k
            if (hflush(fp->fp.hfile) != 0) return -1;
2273
11.6k
        }
2274
11.6k
        }
2275
11.6k
        break;
2276
2277
11.6k
    case fastq_format:
2278
0
    case fasta_format:
2279
        // Nothing to output; FASTQ has no file headers.
2280
0
        break;
2281
2282
0
    default:
2283
0
        errno = EBADF;
2284
0
        return -1;
2285
35.0k
    }
2286
33.9k
    return 0;
2287
35.0k
}
2288
2289
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2290
0
{
2291
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2292
0
    size_t new_l_text;
2293
0
    if (!h || !key)
2294
0
        return -1;
2295
2296
0
    if (h->l_text > 3) {
2297
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2298
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2299
0
            *p = '\0'; // for strstr call
2300
2301
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2302
2303
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2304
0
                *p = '\n'; // change back
2305
2306
                // mark the key:val
2307
0
                beg = q;
2308
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2309
0
                end = q;
2310
2311
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2312
0
                    && strlen(val) == end - beg - 4)
2313
0
                     return 0; // val is the same, no need to change
2314
2315
0
            } else {
2316
0
                beg = end = p;
2317
0
                *p = '\n';
2318
0
            }
2319
0
        }
2320
0
    }
2321
0
    if (beg == NULL) { // no @HD
2322
0
        new_l_text = h->l_text;
2323
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2324
0
            return -1;
2325
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2326
0
        if (val) {
2327
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2328
0
                return -1;
2329
0
            new_l_text += strlen(val) + 4;
2330
0
        }
2331
0
        newtext = (char*)malloc(new_l_text + 1);
2332
0
        if (!newtext) return -1;
2333
2334
0
        if (val)
2335
0
            snprintf(newtext, new_l_text + 1,
2336
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2337
0
        else
2338
0
            snprintf(newtext, new_l_text + 1,
2339
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2340
0
    } else { // has @HD but different or no key
2341
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2342
0
        if (val) {
2343
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2344
0
                return -1;
2345
0
            new_l_text += strlen(val) + 4;
2346
0
        }
2347
0
        newtext = (char*)malloc(new_l_text + 1);
2348
0
        if (!newtext) return -1;
2349
2350
0
        if (val) {
2351
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2352
0
                    (int) (beg - h->text), h->text, key, val, end);
2353
0
        } else { //delete key
2354
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2355
0
                    (int) (beg - h->text), h->text, end);
2356
0
        }
2357
0
    }
2358
0
    free(h->text);
2359
0
    h->text = newtext;
2360
0
    h->l_text = new_l_text;
2361
0
    return 0;
2362
0
}
2363
2364
2365
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2366
0
{
2367
0
    if (!h || !key)
2368
0
        return -1;
2369
2370
0
    if (!h->hrecs)
2371
0
        return old_sam_hdr_change_HD(h, key, val);
2372
2373
0
    if (val) {
2374
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2375
0
            return -1;
2376
0
    } else {
2377
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2378
0
            return -1;
2379
0
    }
2380
0
    return sam_hdr_rebuild(h);
2381
0
}
2382
/**********************
2383
 *** SAM record I/O ***
2384
 **********************/
2385
2386
// The speed of this code can vary considerably depending on minor code
2387
// changes elsewhere as some of the tight loops are particularly prone to
2388
// speed changes when the instruction blocks are split over a 32-byte
2389
// boundary.  To protect against this, we explicitly specify an alignment
2390
// for this function.  If this is insufficient, we may also wish to
2391
// consider alignment of blocks within this function via
2392
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2393
// However it's not very portable.
2394
// Instead we break into separate functions so we can explicitly specify
2395
// use __attribute__((aligned(32))) instead and force consistent loop
2396
// alignment.
2397
101k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2398
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2399
101k
    if (*n > INT32_MAX*0.666) {
2400
0
        errno = ENOMEM;
2401
0
        return -1;
2402
0
    }
2403
2404
101k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2405
101k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2406
0
        hts_log_error("Out of memory");
2407
0
        return -1;
2408
0
    }
2409
2410
101k
    (*n)+=*n>>1;
2411
101k
    return 0;
2412
101k
}
2413
2414
2415
// This ensures that q always ends up at the next comma after
2416
// reading a number even if it's followed by junk.  It
2417
// prevents the possibility of trying to read more than n items.
2418
26.0M
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2419
2420
HTS_ALIGN32
2421
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2422
4.29k
                               uint32_t *nalloc, int *overflow) {
2423
326k
    while (*q == ',') {
2424
321k
        if ((*nused)++ >= (*nalloc)) {
2425
8.53k
            if (grow_B_array(b, nalloc, 1) < 0)
2426
0
                return NULL;
2427
8.53k
        }
2428
321k
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2429
321k
        b->l_data++;
2430
321k
    }
2431
4.29k
    return q;
2432
4.29k
}
2433
2434
HTS_ALIGN32
2435
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2436
5.74k
                               uint32_t *nalloc, int *overflow) {
2437
2.53M
    while (*q == ',') {
2438
2.52M
        if ((*nused)++ >= (*nalloc)) {
2439
17.9k
            if (grow_B_array(b, nalloc, 1) < 0)
2440
0
                return NULL;
2441
17.9k
        }
2442
2.52M
        if (q[1] != '-') {
2443
2.51M
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2444
2.51M
            b->l_data++;
2445
2.51M
        } else {
2446
15.4k
            *overflow = 1;
2447
15.4k
            q++;
2448
15.4k
            skip_to_comma_(q);
2449
15.4k
        }
2450
2.52M
    }
2451
5.74k
    return q;
2452
5.74k
}
2453
2454
HTS_ALIGN32
2455
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2456
1.81k
                               uint32_t *nalloc, int *overflow) {
2457
81.6k
    while (*q == ',') {
2458
79.8k
        if ((*nused)++ >= (*nalloc)) {
2459
924
            if (grow_B_array(b, nalloc, 2) < 0)
2460
0
                return NULL;
2461
924
        }
2462
79.8k
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2463
79.8k
                  b->data + b->l_data);
2464
79.8k
        b->l_data += 2;
2465
79.8k
    }
2466
1.81k
    return q;
2467
1.81k
}
2468
2469
HTS_ALIGN32
2470
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2471
13.5k
                               uint32_t *nalloc, int *overflow) {
2472
21.4M
    while (*q == ',') {
2473
21.4M
        if ((*nused)++ >= (*nalloc)) {
2474
65.8k
            if (grow_B_array(b, nalloc, 2) < 0)
2475
0
                return NULL;
2476
65.8k
        }
2477
21.4M
        if (q[1] != '-') {
2478
21.1M
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2479
21.1M
                      b->data + b->l_data);
2480
21.1M
            b->l_data += 2;
2481
21.1M
        } else {
2482
325k
            *overflow = 1;
2483
325k
            q++;
2484
325k
            skip_to_comma_(q);
2485
325k
        }
2486
21.4M
    }
2487
13.5k
    return q;
2488
13.5k
}
2489
2490
HTS_ALIGN32
2491
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2492
10.2k
                               uint32_t *nalloc, int *overflow) {
2493
23.9M
    while (*q == ',') {
2494
23.9M
        if ((*nused)++ >= (*nalloc)) {
2495
2.96k
            if (grow_B_array(b, nalloc, 4) < 0)
2496
0
                return NULL;
2497
2.96k
        }
2498
23.9M
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2499
23.9M
                  b->data + b->l_data);
2500
23.9M
        b->l_data += 4;
2501
23.9M
    }
2502
10.2k
    return q;
2503
10.2k
}
2504
2505
HTS_ALIGN32
2506
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2507
22.8k
                               uint32_t *nalloc, int *overflow) {
2508
300k
    while (*q == ',') {
2509
277k
        if ((*nused)++ >= (*nalloc)) {
2510
571
            if (grow_B_array(b, nalloc, 4) < 0)
2511
0
                return NULL;
2512
571
        }
2513
277k
        if (q[1] != '-') {
2514
275k
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2515
275k
                      b->data + b->l_data);
2516
275k
            b->l_data += 4;
2517
275k
        } else {
2518
1.83k
            *overflow = 1;
2519
1.83k
            q++;
2520
1.83k
            skip_to_comma_(q);
2521
1.83k
        }
2522
277k
    }
2523
22.8k
    return q;
2524
22.8k
}
2525
2526
HTS_ALIGN32
2527
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2528
3.54k
                               uint32_t *nalloc, int *overflow) {
2529
46.7k
    while (*q == ',') {
2530
43.2k
        if ((*nused)++ >= (*nalloc)) {
2531
4.48k
            if (grow_B_array(b, nalloc, 4) < 0)
2532
0
                return NULL;
2533
4.48k
        }
2534
43.2k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2535
43.2k
        b->l_data += 4;
2536
43.2k
    }
2537
3.54k
    return q;
2538
3.54k
}
2539
2540
HTS_ALIGN32
2541
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2542
                              char **end, bam1_t *b,
2543
62.1k
                              int *ctr) {
2544
    // Protect against infinite recursion when dealing with invalid input.
2545
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2546
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2547
    //
2548
    // Loop detection is the safest solution incase there are other
2549
    // strange corner cases with malformed inputs.
2550
62.1k
    if (++(*ctr) > 2) {
2551
142
        hts_log_error("Malformed data in B:%c array", type);
2552
142
        return -1;
2553
142
    }
2554
2555
62.0k
    int orig_l = b->l_data;
2556
62.0k
    char *q = in;
2557
62.0k
    int32_t size;
2558
62.0k
    size_t bytes;
2559
62.0k
    int overflow = 0;
2560
2561
62.0k
    size = aux_type2size(type);
2562
62.0k
    if (size <= 0 || size > 4) {
2563
33
        hts_log_error("Unrecognized type B:%c", type);
2564
33
        return -1;
2565
33
    }
2566
2567
    // Ensure space for type + values.
2568
    // The first pass through here we don't know the number of entries and
2569
    // nalloc == 0.  We start with a small working set and then parse the
2570
    // data, growing as needed.
2571
    //
2572
    // If we have a second pass through we do know the number of entries
2573
    // and nalloc is already known.  We have no need to expand the bam data.
2574
62.0k
    if (!nalloc)
2575
45.5k
         nalloc=7;
2576
2577
    // Ensure allocated memory is big enough (for current nalloc estimate)
2578
62.0k
    bytes = (size_t) nalloc * (size_t) size;
2579
62.0k
    if (bytes / size != nalloc
2580
62.0k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2581
0
        hts_log_error("Out of memory");
2582
0
        return -1;
2583
0
    }
2584
2585
62.0k
    uint32_t nused = 0;
2586
2587
62.0k
    b->data[b->l_data++] = 'B';
2588
62.0k
    b->data[b->l_data++] = type;
2589
    // 32-bit B-array length is inserted later once we know it.
2590
62.0k
    int b_len_idx = b->l_data;
2591
62.0k
    b->l_data += sizeof(uint32_t);
2592
2593
62.0k
    if (type == 'c') {
2594
4.29k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2595
0
            return -1;
2596
57.7k
    } else if (type == 'C') {
2597
5.74k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2598
0
            return -1;
2599
51.9k
    } else if (type == 's') {
2600
1.81k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2601
0
            return -1;
2602
50.1k
    } else if (type == 'S') {
2603
13.5k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2604
0
            return -1;
2605
36.6k
    } else if (type == 'i') {
2606
10.2k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2607
0
            return -1;
2608
26.4k
    } else if (type == 'I') {
2609
22.8k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2610
0
            return -1;
2611
22.8k
    } else if (type == 'f') {
2612
3.54k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2613
0
            return -1;
2614
3.54k
    }
2615
62.0k
    if (*q != '\t' && *q != '\0') {
2616
        // Unknown B array type or junk in the numbers
2617
393
        hts_log_error("Malformed B:%c", type);
2618
393
        return -1;
2619
393
    }
2620
61.6k
    i32_to_le(nused, b->data + b_len_idx);
2621
2622
61.6k
    if (!overflow) {
2623
44.3k
        *end = q;
2624
44.3k
        return 0;
2625
44.3k
    } else {
2626
17.3k
        int64_t max = 0, min = 0, val;
2627
        // Given type was incorrect.  Try to rescue the situation.
2628
17.3k
        char *r = q;
2629
17.3k
        q = in;
2630
17.3k
        overflow = 0;
2631
17.3k
        b->l_data = orig_l;
2632
        // Find out what range of values is present
2633
24.2M
        while (q < r) {
2634
24.2M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2635
24.2M
            if (max < val) max = val;
2636
24.2M
            if (min > val) min = val;
2637
24.2M
            skip_to_comma_(q);
2638
24.2M
        }
2639
        // Retry with appropriate type
2640
17.3k
        if (!overflow) {
2641
17.1k
            if (min < 0) {
2642
12.1k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2643
1.95k
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2644
10.2k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2645
1.41k
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2646
8.82k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2647
8.49k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2648
8.49k
                }
2649
12.1k
            } else {
2650
4.92k
                if (max < UINT8_MAX) {
2651
192
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2652
4.73k
                } else if (max <= UINT16_MAX) {
2653
342
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2654
4.39k
                } else if (max <= UINT32_MAX) {
2655
4.21k
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2656
4.21k
                }
2657
4.92k
            }
2658
17.1k
        }
2659
        // If here then at least one of the values is too big to store
2660
690
        hts_log_error("Numeric value in B array out of allowed range");
2661
690
        return -1;
2662
17.3k
    }
2663
61.6k
#undef skip_to_comma_
2664
61.6k
}
2665
2666
HTS_ALIGN32
2667
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2668
45.5k
{
2669
45.5k
    int ctr = 0;
2670
45.5k
    uint32_t nalloc = 0;
2671
45.5k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2672
45.5k
}
2673
2674
342k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2675
342k
    if (*v >= '1' && *v <= '9') {
2676
36.1k
        return hts_str2uint(v, rv, 16, overflow);
2677
36.1k
    }
2678
306k
    else if (*v == '0') {
2679
        // handle single-digit "0" directly; otherwise it's hex or octal
2680
24.8k
        if (v[1] == '\t') { *rv = v+1; return 0; }
2681
336
        else {
2682
336
            unsigned long val = strtoul(v, rv, 0);
2683
336
            if (val > 65535) { *overflow = 1; return 65535; }
2684
186
            return val;
2685
336
        }
2686
24.8k
    }
2687
281k
    else {
2688
        // TODO implement symbolic flag letters
2689
281k
        *rv = v;
2690
281k
        return 0;
2691
281k
    }
2692
342k
}
2693
2694
// Parse tag line and append to bam object b.
2695
// Shared by both SAM and FASTQ parsers.
2696
//
2697
// The difference between the two is how lenient we are to recognising
2698
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2699
// non-SAM looking strings.
2700
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2701
337k
                            khash_t(tag) *tag_whitelist) {
2702
337k
    int overflow = 0;
2703
337k
    int checkpoint;
2704
337k
    char logbuf[40];
2705
337k
    char *q = start, *p = end;
2706
2707
337k
#define _parse_err(cond, ...)                   \
2708
20.0M
    do {                                        \
2709
40.7M
        if (cond) {                             \
2710
766
            if (lenient) {                      \
2711
0
                while (q < p && !isspace_c(*q))   \
2712
0
                    q++;                        \
2713
0
                while (q < p && isspace_c(*q))    \
2714
0
                    q++;                        \
2715
0
                b->l_data = checkpoint;         \
2716
0
                goto loop;                      \
2717
766
            } else {                            \
2718
766
                hts_log_error(__VA_ARGS__);     \
2719
766
                goto err_ret;                   \
2720
766
            }                                   \
2721
766
        }                                       \
2722
20.0M
    } while (0)
2723
2724
19.6M
    while (q < p) loop: {
2725
19.6M
        char type;
2726
19.6M
        checkpoint = b->l_data;
2727
19.6M
        if (p - q < 5) {
2728
143
            if (lenient) {
2729
0
                break;
2730
143
            } else {
2731
143
                hts_log_error("Incomplete aux field");
2732
143
                goto err_ret;
2733
143
            }
2734
143
        }
2735
9.84M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2736
2737
9.84M
        if (lenient && (q[2] | q[4]) != ':') {
2738
0
            while (q < p && !isspace_c(*q))
2739
0
                q++;
2740
0
            while (q < p && isspace_c(*q))
2741
0
                q++;
2742
0
            continue;
2743
0
        }
2744
2745
9.84M
        if (tag_whitelist) {
2746
0
            int tt = q[0]*256 + q[1];
2747
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2748
0
                while (q < p && *q != '\t')
2749
0
                    q++;
2750
0
                continue;
2751
0
            }
2752
0
        }
2753
2754
        // Copy over id
2755
9.84M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2756
9.84M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2757
9.84M
        q += 3; type = *q++; ++q; // q points to value
2758
9.84M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2759
9.64M
            _parse_err(*q <= '\t', "incomplete aux field");
2760
2761
        // Ensure enough space for a double + type allocated.
2762
9.84M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2763
2764
9.84M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2765
4.13M
            b->data[b->l_data++] = 'A';
2766
4.13M
            b->data[b->l_data++] = *q++;
2767
5.70M
        } else if (type == 'i' || type == 'I') {
2768
5.23M
            if (*q == '-') {
2769
4.38M
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2770
4.38M
                if (x >= INT8_MIN) {
2771
2.08M
                    b->data[b->l_data++] = 'c';
2772
2.08M
                    b->data[b->l_data++] = x;
2773
2.30M
                } else if (x >= INT16_MIN) {
2774
723k
                    b->data[b->l_data++] = 's';
2775
723k
                    i16_to_le(x, b->data + b->l_data);
2776
723k
                    b->l_data += 2;
2777
1.57M
                } else {
2778
1.57M
                    b->data[b->l_data++] = 'i';
2779
1.57M
                    i32_to_le(x, b->data + b->l_data);
2780
1.57M
                    b->l_data += 4;
2781
1.57M
                }
2782
4.38M
            } else {
2783
854k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2784
854k
                if (x <= UINT8_MAX) {
2785
536k
                    b->data[b->l_data++] = 'C';
2786
536k
                    b->data[b->l_data++] = x;
2787
536k
                } else if (x <= UINT16_MAX) {
2788
306k
                    b->data[b->l_data++] = 'S';
2789
306k
                    u16_to_le(x, b->data + b->l_data);
2790
306k
                    b->l_data += 2;
2791
306k
                } else {
2792
11.5k
                    b->data[b->l_data++] = 'I';
2793
11.5k
                    u32_to_le(x, b->data + b->l_data);
2794
11.5k
                    b->l_data += 4;
2795
11.5k
                }
2796
854k
            }
2797
5.23M
        } else if (type == 'f') {
2798
10.3k
            b->data[b->l_data++] = 'f';
2799
10.3k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2800
10.3k
            b->l_data += sizeof(float);
2801
452k
        } else if (type == 'd') {
2802
214k
            b->data[b->l_data++] = 'd';
2803
214k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2804
214k
            b->l_data += sizeof(double);
2805
237k
        } else if (type == 'Z' || type == 'H') {
2806
191k
            char *end = strchr(q, '\t');
2807
191k
            if (!end) end = q + strlen(q);
2808
191k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2809
191k
                       "hex field does not have an even number of digits");
2810
191k
            b->data[b->l_data++] = type;
2811
191k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2812
191k
            memcpy(b->data + b->l_data, q, end - q);
2813
191k
            b->l_data += end - q;
2814
191k
            b->data[b->l_data++] = '\0';
2815
191k
            q = end;
2816
191k
        } else if (type == 'B') {
2817
45.6k
            type = *q++; // q points to the first ',' following the typing byte
2818
45.6k
            _parse_err(*q && *q != ',' && *q != '\t',
2819
45.6k
                       "B aux field type not followed by ','");
2820
2821
45.5k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2822
1.25k
                goto err_ret;
2823
45.5k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2824
2825
60.5M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2826
9.83M
        q++;
2827
9.83M
    }
2828
2829
335k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2830
335k
#undef _parse_err
2831
2832
335k
    return 0;
2833
2834
2.16k
err_ret:
2835
2.16k
    return -2;
2836
335k
}
2837
2838
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2839
343k
{
2840
1.40M
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2841
2842
343k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2843
2844
// Macro that operates on 64-bits at a time.
2845
343k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2846
343k
    do {                                                        \
2847
331k
        uint64_u *from8 = (uint64_u *)(from);                   \
2848
331k
        uint64_u *to8 = (uint64_u *)(to);                       \
2849
331k
        uint64_t uflow = 0;                                     \
2850
331k
        size_t l8 = (l)>>3, i;                                  \
2851
332k
        for (i = 0; i < l8; i++) {                              \
2852
366
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2853
366
            uflow |= to8[i];                                    \
2854
366
        }                                                       \
2855
333k
        for (i<<=3; i < (l); ++i) {                             \
2856
1.40k
            to[i] = from[i] - (n);                              \
2857
1.40k
            uflow |= to[i];                                     \
2858
1.40k
        }                                                       \
2859
331k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2860
331k
    } while (0)
2861
2862
#else
2863
2864
// Basic version which operates a byte at a time
2865
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2866
        uint8_t uflow = 0;                                   \
2867
        for (i = 0; i < (l); ++i) {                          \
2868
            (to)[i] = (from)[i] - (n);                       \
2869
            uflow |= (uint8_t) (to)[i];                      \
2870
        }                                                    \
2871
        failed = (uflow & 0x80) > 0;                         \
2872
    } while (0)
2873
2874
#endif
2875
2876
635k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2877
4.50M
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2878
1.28M
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2879
2880
343k
    uint8_t *t;
2881
2882
343k
    char *p = s->s, *q;
2883
343k
    int i, overflow = 0;
2884
343k
    char logbuf[40];
2885
343k
    hts_pos_t cigreflen;
2886
343k
    bam1_core_t *c = &b->core;
2887
2888
343k
    b->l_data = 0;
2889
343k
    memset(c, 0, 32);
2890
2891
    // qname
2892
343k
    q = _read_token(p);
2893
2894
342k
    _parse_warn(p - q <= 1, "empty query name");
2895
342k
    _parse_err(p - q > 255, "query name too long");
2896
    // resize large enough for name + extranul
2897
342k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2898
342k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2899
2900
342k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2901
342k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2902
342k
    b->l_data += c->l_extranul;
2903
2904
342k
    c->l_qname = p - q + c->l_extranul;
2905
2906
    // flag
2907
342k
    c->flag = parse_sam_flag(p, &p, &overflow);
2908
342k
    if (*p++ != '\t') goto err_ret; // malformated flag
2909
2910
    // chr
2911
342k
    q = _read_token(p);
2912
341k
    if (strcmp(q, "*")) {
2913
312k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2914
312k
        c->tid = bam_name2id(h, q);
2915
312k
        _parse_err(c->tid < -1, "failed to parse header");
2916
312k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2917
312k
    } else c->tid = -1;
2918
2919
    // pos
2920
341k
    c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
2921
341k
    if (*p++ != '\t') goto err_ret;
2922
340k
    if (c->pos < 0 && c->tid >= 0) {
2923
7.35k
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2924
7.35k
        c->tid = -1;
2925
7.35k
    }
2926
340k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2927
2928
    // mapq
2929
340k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2930
340k
    if (*p++ != '\t') goto err_ret;
2931
    // cigar
2932
340k
    if (*p != '*') {
2933
302k
        uint32_t *cigar = NULL;
2934
302k
        int old_l_data = b->l_data;
2935
302k
        int n_cigar = bam_parse_cigar(p, &p, b);
2936
302k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2937
301k
        cigar = (uint32_t *)(b->data + old_l_data);
2938
2939
        // can't use bam_endpos() directly as some fields not yet set up
2940
301k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2941
301k
        if (cigreflen == 0) cigreflen = 1;
2942
301k
    } else {
2943
37.5k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2944
37.5k
        c->flag |= BAM_FUNMAP;
2945
37.5k
        q = _read_token(p);
2946
37.5k
        cigreflen = 1;
2947
37.5k
    }
2948
339k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2949
339k
               "read ends beyond highest supported position");
2950
339k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2951
    // mate chr
2952
339k
    q = _read_token(p);
2953
339k
    if (strcmp(q, "=") == 0) {
2954
799
        c->mtid = c->tid;
2955
338k
    } else if (strcmp(q, "*") == 0) {
2956
297
        c->mtid = -1;
2957
337k
    } else {
2958
337k
        c->mtid = bam_name2id(h, q);
2959
337k
        _parse_err(c->mtid < -1, "failed to parse header");
2960
337k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2961
337k
    }
2962
    // mpos
2963
339k
    c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
2964
339k
    if (*p++ != '\t') goto err_ret;
2965
338k
    if (c->mpos < 0 && c->mtid >= 0) {
2966
248k
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2967
248k
        c->mtid = -1;
2968
248k
    }
2969
    // tlen
2970
338k
    c->isize = hts_str2int(p, &p, 64, &overflow);
2971
338k
    if (*p++ != '\t') goto err_ret;
2972
    // seq
2973
338k
    q = _read_token(p);
2974
337k
    if (strcmp(q, "*")) {
2975
298k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2976
298k
        c->l_qseq = p - q - 1;
2977
298k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2978
298k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2979
298k
        i = (c->l_qseq + 1) >> 1;
2980
298k
        _get_mem(uint8_t, &t, b, i);
2981
2982
298k
        unsigned int lqs2 = c->l_qseq&~1, i;
2983
301k
        for (i = 0; i < lqs2; i+=2)
2984
3.40k
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2985
304k
        for (; i < c->l_qseq; ++i)
2986
6.84k
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2987
298k
    } else c->l_qseq = 0;
2988
    // qual
2989
675k
    _get_mem(uint8_t, &t, b, c->l_qseq);
2990
675k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2991
5.91k
        memset(t, 0xff, c->l_qseq);
2992
5.91k
        p += 2;
2993
331k
    } else {
2994
331k
        int failed = 0;
2995
331k
        _parse_err(s->l - (p - s->s) < c->l_qseq
2996
331k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2997
331k
                   "SEQ and QUAL are of different length");
2998
331k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2999
331k
        _parse_err(failed, "invalid QUAL character");
3000
331k
        p += c->l_qseq + 1;
3001
331k
    }
3002
3003
    // aux
3004
337k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
3005
2.16k
        goto err_ret;
3006
3007
335k
    if (bam_tag2cigar(b, 1, 1) < 0)
3008
0
        return -2;
3009
335k
    return 0;
3010
3011
0
#undef _parse_warn
3012
0
#undef _parse_err
3013
0
#undef _get_mem
3014
0
#undef _read_token
3015
8.31k
err_ret:
3016
8.31k
    return -2;
3017
335k
}
3018
3019
302k
static uint32_t read_ncigar(const char *q) {
3020
302k
    uint32_t n_cigar = 0;
3021
1.86M
    for (; *q && *q != '\t'; ++q)
3022
1.56M
        if (!isdigit_c(*q)) ++n_cigar;
3023
302k
    if (!n_cigar) {
3024
279
        hts_log_error("No CIGAR operations");
3025
279
        return 0;
3026
279
    }
3027
302k
    if (n_cigar >= 2147483647) {
3028
0
        hts_log_error("Too many CIGAR operations");
3029
0
        return 0;
3030
0
    }
3031
3032
302k
    return n_cigar;
3033
302k
}
3034
3035
/*! @function
3036
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
3037
 @param  in      [in]  pointer to the source string
3038
 @param  a_cigar [out]  address of the destination uint32_t buffer
3039
 @return         number of processed input characters; 0 on error
3040
 */
3041
302k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
3042
302k
    int i, overflow = 0;
3043
302k
    const char *p = in;
3044
741k
    for (i = 0; i < n_cigar; i++) {
3045
439k
        uint32_t len;
3046
439k
        int op;
3047
439k
        char *q;
3048
439k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
3049
439k
        if (q == p) {
3050
364
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
3051
364
            return 0;
3052
364
        }
3053
439k
        if (overflow) {
3054
81
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
3055
81
            return 0;
3056
81
        }
3057
439k
        p = q;
3058
439k
        op = bam_cigar_table[(unsigned char)*p++];
3059
439k
        if (op < 0) {
3060
240
            hts_log_error("Unrecognized CIGAR operator");
3061
240
            return 0;
3062
240
        }
3063
438k
        a_cigar[i] = len;
3064
438k
        a_cigar[i] |= op;
3065
438k
    }
3066
3067
301k
    return p-in;
3068
302k
}
3069
3070
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
3071
0
    size_t n_cigar = 0;
3072
0
    int diff;
3073
3074
0
    if (!in || !a_cigar || !a_mem) {
3075
0
        hts_log_error("NULL pointer arguments");
3076
0
        return -1;
3077
0
    }
3078
0
    if (end) *end = (char *)in;
3079
3080
0
    if (*in == '*') {
3081
0
        if (end) (*end)++;
3082
0
        return 0;
3083
0
    }
3084
0
    n_cigar = read_ncigar(in);
3085
0
    if (!n_cigar) return 0;
3086
0
    if (n_cigar > *a_mem) {
3087
0
        uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
3088
0
        if (a_tmp) {
3089
0
            *a_cigar = a_tmp;
3090
0
            *a_mem = n_cigar;
3091
0
        } else {
3092
0
            hts_log_error("Memory allocation error");
3093
0
            return -1;
3094
0
        }
3095
0
    }
3096
3097
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
3098
0
    if (end) *end = (char *)in+diff;
3099
3100
0
    return n_cigar;
3101
0
}
3102
3103
302k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
3104
302k
    size_t n_cigar = 0;
3105
302k
    int diff;
3106
3107
302k
    if (!in || !b) {
3108
0
        hts_log_error("NULL pointer arguments");
3109
0
        return -1;
3110
0
    }
3111
302k
    if (end) *end = (char *)in;
3112
3113
302k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
3114
302k
    if (!n_cigar && b->core.n_cigar == 0) {
3115
279
        if (end) *end = (char *)in+1;
3116
279
        return 0;
3117
279
    }
3118
3119
302k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
3120
302k
    if (cig_diff > 0 &&
3121
302k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
3122
0
        hts_log_error("Memory allocation error");
3123
0
        return -1;
3124
0
    }
3125
3126
302k
    uint32_t *cig = bam_get_cigar(b);
3127
302k
    if ((uint8_t *)cig != b->data + b->l_data) {
3128
        // Modifying an BAM existing BAM record
3129
0
        uint8_t  *seq = bam_get_seq(b);
3130
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
3131
0
    }
3132
3133
302k
    if (n_cigar) {
3134
302k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
3135
685
            return -1;
3136
302k
    } else {
3137
0
        diff = 1; // handle "*"
3138
0
    }
3139
3140
301k
    b->l_data += cig_diff * sizeof(uint32_t);
3141
301k
    b->core.n_cigar = n_cigar;
3142
301k
    if (end) *end = (char *)in + diff;
3143
3144
301k
    return n_cigar;
3145
302k
}
3146
3147
/*
3148
 * -----------------------------------------------------------------------------
3149
 * SAM threading
3150
 */
3151
// Size of SAM text block (reading)
3152
0
#define SAM_NBYTES 240000
3153
3154
// Number of BAM records (writing, up to NB_mem in size)
3155
0
#define SAM_NBAM 1000
3156
3157
struct SAM_state;
3158
3159
// Output job - a block of BAM records
3160
typedef struct sp_bams {
3161
    struct sp_bams *next;
3162
    int serial;
3163
3164
    bam1_t *bams;
3165
    int nbams, abams; // used and alloc for bams[] array
3166
    size_t bam_mem;   // very approximate total size
3167
3168
    struct SAM_state *fd;
3169
} sp_bams;
3170
3171
// Input job - a block of SAM text
3172
typedef struct sp_lines {
3173
    struct sp_lines *next;
3174
    int serial;
3175
3176
    char *data;
3177
    int data_size;
3178
    int alloc;
3179
3180
    struct SAM_state *fd;
3181
    sp_bams *bams;
3182
} sp_lines;
3183
3184
enum sam_cmd {
3185
    SAM_NONE = 0,
3186
    SAM_CLOSE,
3187
    SAM_CLOSE_DONE,
3188
};
3189
3190
typedef struct SAM_state {
3191
    sam_hdr_t *h;
3192
3193
    hts_tpool *p;
3194
    int own_pool;
3195
    pthread_mutex_t lines_m;
3196
    hts_tpool_process *q;
3197
    pthread_t dispatcher;
3198
    int dispatcher_set;
3199
3200
    sp_lines *lines;
3201
    sp_bams *bams;
3202
3203
    sp_bams *curr_bam;
3204
    int curr_idx;
3205
    int serial;
3206
3207
    // Be warned: moving these mutexes around in this struct can reduce
3208
    // threading performance by up to 70%!
3209
    pthread_mutex_t command_m;
3210
    pthread_cond_t command_c;
3211
    enum sam_cmd command;
3212
3213
    // One of the E* errno codes
3214
    int errcode;
3215
3216
    htsFile *fp;
3217
} SAM_state;
3218
3219
// Returns a SAM_state struct from a generic hFILE.
3220
//
3221
// Returns NULL on failure.
3222
0
static SAM_state *sam_state_create(htsFile *fp) {
3223
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3224
    // be a redirect call with an additional 'S' mode.  This in turn would
3225
    // correctly set the designed format to sam instead of a generic
3226
    // text_format.
3227
0
    if (fp->format.format != sam && fp->format.format != text_format)
3228
0
        return NULL;
3229
3230
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3231
0
    if (!fd)
3232
0
        return NULL;
3233
3234
0
    fp->state = fd;
3235
0
    fd->fp = fp;
3236
3237
0
    return fd;
3238
0
}
3239
3240
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3241
static void *sam_format_worker(void *arg);
3242
3243
0
static void sam_state_err(SAM_state *fd, int errcode) {
3244
0
    pthread_mutex_lock(&fd->command_m);
3245
0
    if (!fd->errcode)
3246
0
        fd->errcode = errcode;
3247
0
    pthread_mutex_unlock(&fd->command_m);
3248
0
}
3249
3250
0
static void sam_free_sp_bams(sp_bams *b) {
3251
0
    if (!b)
3252
0
        return;
3253
3254
0
    if (b->bams) {
3255
0
        int i;
3256
0
        for (i = 0; i < b->abams; i++) {
3257
0
            if (b->bams[i].data)
3258
0
                free(b->bams[i].data);
3259
0
        }
3260
0
        free(b->bams);
3261
0
    }
3262
0
    free(b);
3263
0
}
3264
3265
// Destroys the state produce by sam_state_create.
3266
39.7k
int sam_state_destroy(htsFile *fp) {
3267
39.7k
    int ret = 0;
3268
3269
39.7k
    if (!fp->state)
3270
39.7k
        return 0;
3271
3272
0
    SAM_state *fd = fp->state;
3273
0
    if (fd->p) {
3274
0
        if (fd->h) {
3275
            // Notify sam_dispatcher we're closing
3276
0
            pthread_mutex_lock(&fd->command_m);
3277
0
            if (fd->command != SAM_CLOSE_DONE)
3278
0
                fd->command = SAM_CLOSE;
3279
0
            pthread_cond_signal(&fd->command_c);
3280
0
            ret = -fd->errcode;
3281
0
            if (fd->q)
3282
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3283
3284
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3285
0
                for (;;) {
3286
                    // Avoid deadlocks with dispatcher
3287
0
                    if (fd->command == SAM_CLOSE_DONE)
3288
0
                        break;
3289
0
                    hts_tpool_wake_dispatch(fd->q);
3290
0
                    pthread_mutex_unlock(&fd->command_m);
3291
0
                    usleep(10000);
3292
0
                    pthread_mutex_lock(&fd->command_m);
3293
0
                }
3294
0
            }
3295
0
            pthread_mutex_unlock(&fd->command_m);
3296
3297
0
            if (fp->is_write) {
3298
                // Dispatch the last partial block.
3299
0
                sp_bams *gb = fd->curr_bam;
3300
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3301
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3302
3303
                // Flush and drain output
3304
0
                if (fd->q)
3305
0
                    hts_tpool_process_flush(fd->q);
3306
0
                pthread_mutex_lock(&fd->command_m);
3307
0
                if (!ret) ret = -fd->errcode;
3308
0
                pthread_mutex_unlock(&fd->command_m);
3309
3310
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3311
0
                    usleep(10000);
3312
0
                    pthread_mutex_lock(&fd->command_m);
3313
0
                    ret = -fd->errcode;
3314
                    // not empty but shutdown implies error
3315
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3316
0
                        ret = EIO;
3317
0
                    pthread_mutex_unlock(&fd->command_m);
3318
0
                }
3319
0
                if (fd->q)
3320
0
                    hts_tpool_process_shutdown(fd->q);
3321
0
            }
3322
3323
            // Wait for it to acknowledge
3324
0
            if (fd->dispatcher_set)
3325
0
                pthread_join(fd->dispatcher, NULL);
3326
0
            if (!ret) ret = -fd->errcode;
3327
0
        }
3328
3329
        // Tidy up memory
3330
0
        if (fd->q)
3331
0
            hts_tpool_process_destroy(fd->q);
3332
3333
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3334
0
            hts_tpool_destroy(fd->p);
3335
0
            fd->p = NULL;
3336
0
        }
3337
0
        pthread_mutex_destroy(&fd->lines_m);
3338
0
        pthread_mutex_destroy(&fd->command_m);
3339
0
        pthread_cond_destroy(&fd->command_c);
3340
3341
0
        sp_lines *l = fd->lines;
3342
0
        while (l) {
3343
0
            sp_lines *n = l->next;
3344
0
            free(l->data);
3345
0
            free(l);
3346
0
            l = n;
3347
0
        }
3348
3349
0
        sp_bams *b = fd->bams;
3350
0
        while (b) {
3351
0
            if (fd->curr_bam == b)
3352
0
                fd->curr_bam = NULL;
3353
0
            sp_bams *n = b->next;
3354
0
            sam_free_sp_bams(b);
3355
0
            b = n;
3356
0
        }
3357
3358
0
        if (fd->curr_bam)
3359
0
            sam_free_sp_bams(fd->curr_bam);
3360
3361
        // Decrement counter by one, maybe destroying too.
3362
        // This is to permit the caller using bam_hdr_destroy
3363
        // before sam_close without triggering decode errors
3364
        // in the background threads.
3365
0
        bam_hdr_destroy(fd->h);
3366
0
    }
3367
3368
0
    free(fp->state);
3369
0
    fp->state = NULL;
3370
0
    return ret;
3371
39.7k
}
3372
3373
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3374
0
static void cleanup_sp_lines(void *arg) {
3375
0
    sp_lines *gl = (sp_lines *)arg;
3376
0
    if (!gl) return;
3377
3378
    // Should always be true for lines passed to / from thread workers.
3379
0
    assert(gl->next == NULL);
3380
3381
0
    free(gl->data);
3382
0
    sam_free_sp_bams(gl->bams);
3383
0
    free(gl);
3384
0
}
3385
3386
// Run from one of the worker threads.
3387
// Convert a passed in array of lines to array of BAMs, returning
3388
// the result back to the thread queue.
3389
0
static void *sam_parse_worker(void *arg) {
3390
0
    sp_lines *gl = (sp_lines *)arg;
3391
0
    sp_bams *gb = NULL;
3392
0
    char *lines = gl->data;
3393
0
    int i;
3394
0
    bam1_t *b;
3395
0
    SAM_state *fd = gl->fd;
3396
3397
    // Use a block of BAM structs we had earlier if available.
3398
0
    pthread_mutex_lock(&fd->lines_m);
3399
0
    if (fd->bams) {
3400
0
        gb = fd->bams;
3401
0
        fd->bams = gb->next;
3402
0
    }
3403
0
    pthread_mutex_unlock(&fd->lines_m);
3404
3405
0
    if (gb == NULL) {
3406
0
        gb = calloc(1, sizeof(*gb));
3407
0
        if (!gb) {
3408
0
            return NULL;
3409
0
        }
3410
0
        gb->abams = 100;
3411
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3412
0
        if (!gb->bams) {
3413
0
            sam_state_err(fd, ENOMEM);
3414
0
            goto err;
3415
0
        }
3416
0
        gb->nbams = 0;
3417
0
        gb->bam_mem = 0;
3418
0
    }
3419
0
    gb->serial = gl->serial;
3420
0
    gb->next = NULL;
3421
3422
0
    b = (bam1_t *)gb->bams;
3423
0
    if (!b) {
3424
0
        sam_state_err(fd, ENOMEM);
3425
0
        goto err;
3426
0
    }
3427
3428
0
    i = 0;
3429
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3430
0
    while (cp < cp_end) {
3431
0
        if (i >= gb->abams) {
3432
0
            int old_abams = gb->abams;
3433
0
            gb->abams *= 2;
3434
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3435
0
            if (!b) {
3436
0
                gb->abams /= 2;
3437
0
                sam_state_err(fd, ENOMEM);
3438
0
                goto err;
3439
0
            }
3440
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3441
0
            gb->bams = b;
3442
0
        }
3443
3444
        // Ideally we'd get sam_parse1 to return the number of
3445
        // bytes decoded and to be able to stop on newline as
3446
        // well as \0.
3447
        //
3448
        // We can then avoid the additional strchr loop.
3449
        // It's around 6% of our CPU cost, albeit threadable.
3450
        //
3451
        // However this is an API change so for now we copy.
3452
3453
0
        char *nl = strchr(cp, '\n');
3454
0
        char *line_end;
3455
0
        if (nl) {
3456
0
            line_end = nl;
3457
0
            if (line_end > cp && *(line_end - 1) == '\r')
3458
0
                line_end--;
3459
0
            nl++;
3460
0
        } else {
3461
0
            nl = line_end = cp_end;
3462
0
        }
3463
0
        *line_end = '\0';
3464
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3465
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3466
0
            sam_state_err(fd, errno ? errno : EIO);
3467
0
            cleanup_sp_lines(gl);
3468
0
            goto err;
3469
0
        }
3470
3471
0
        cp = nl;
3472
0
        i++;
3473
0
    }
3474
0
    gb->nbams = i;
3475
3476
0
    pthread_mutex_lock(&fd->lines_m);
3477
0
    gl->next = fd->lines;
3478
0
    fd->lines = gl;
3479
0
    pthread_mutex_unlock(&fd->lines_m);
3480
0
    return gb;
3481
3482
0
 err:
3483
0
    sam_free_sp_bams(gb);
3484
0
    return NULL;
3485
0
}
3486
3487
0
static void *sam_parse_eof(void *arg) {
3488
0
    return NULL;
3489
0
}
3490
3491
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3492
0
static void cleanup_sp_bams(void *arg) {
3493
0
    sam_free_sp_bams((sp_bams *) arg);
3494
0
}
3495
3496
// Runs in its own thread.
3497
// Reads a block of text (SAM) and sends a new job to the thread queue to
3498
// translate this to BAM.
3499
0
static void *sam_dispatcher_read(void *vp) {
3500
0
    htsFile *fp = vp;
3501
0
    kstring_t line = {0};
3502
0
    int line_frag = 0;
3503
0
    SAM_state *fd = fp->state;
3504
0
    sp_lines *l = NULL;
3505
3506
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3507
    // matter as it will grow if necessary).
3508
0
    if (ks_resize(&line, 1000) < 0)
3509
0
        goto err;
3510
3511
0
    for (;;) {
3512
        // Check for command
3513
0
        pthread_mutex_lock(&fd->command_m);
3514
0
        switch (fd->command) {
3515
3516
0
        case SAM_CLOSE:
3517
0
            pthread_cond_signal(&fd->command_c);
3518
0
            pthread_mutex_unlock(&fd->command_m);
3519
0
            hts_tpool_process_shutdown(fd->q);
3520
0
            goto tidyup;
3521
3522
0
        default:
3523
0
            break;
3524
0
        }
3525
0
        pthread_mutex_unlock(&fd->command_m);
3526
3527
0
        pthread_mutex_lock(&fd->lines_m);
3528
0
        if (fd->lines) {
3529
            // reuse existing line buffer
3530
0
            l = fd->lines;
3531
0
            fd->lines = l->next;
3532
0
        }
3533
0
        pthread_mutex_unlock(&fd->lines_m);
3534
3535
0
        if (l == NULL) {
3536
            // none to reuse, to create a new one
3537
0
            l = calloc(1, sizeof(*l));
3538
0
            if (!l)
3539
0
                goto err;
3540
0
            l->alloc = SAM_NBYTES;
3541
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3542
0
            if (!l->data) {
3543
0
                free(l);
3544
0
                l = NULL;
3545
0
                goto err;
3546
0
            }
3547
0
            l->fd = fd;
3548
0
        }
3549
0
        l->next = NULL;
3550
3551
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3552
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3553
0
            if (!rp)
3554
0
                goto err;
3555
0
            l->alloc = line_frag+SAM_NBYTES/2;
3556
0
            l->data = rp;
3557
0
        }
3558
0
        memcpy(l->data, line.s, line_frag);
3559
3560
0
        l->data_size = line_frag;
3561
0
        ssize_t nbytes;
3562
0
    longer_line:
3563
0
        if (fp->is_bgzf)
3564
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3565
0
        else
3566
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3567
0
        if (nbytes < 0) {
3568
0
            sam_state_err(fd, errno ? errno : EIO);
3569
0
            goto err;
3570
0
        } else if (nbytes == 0)
3571
0
            break; // EOF
3572
0
        l->data_size += nbytes;
3573
3574
        // trim to last \n. Maybe \r\n, but that's still fine
3575
0
        if (nbytes == l->alloc - line_frag) {
3576
0
            char *cp_end = l->data + l->data_size;
3577
0
            char *cp = cp_end-1;
3578
3579
0
            while (cp > (char *)l->data && *cp != '\n')
3580
0
                cp--;
3581
3582
            // entire buffer is part of a single line
3583
0
            if (cp == l->data) {
3584
0
                line_frag = l->data_size;
3585
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3586
0
                if (!rp)
3587
0
                    goto err;
3588
0
                l->alloc *= 2;
3589
0
                l->data = rp;
3590
0
                assert(l->alloc >= l->data_size);
3591
0
                assert(l->alloc >= line_frag);
3592
0
                assert(l->alloc >= l->alloc - line_frag);
3593
0
                goto longer_line;
3594
0
            }
3595
0
            cp++;
3596
3597
            // line holds the remainder of our line.
3598
0
            if (ks_resize(&line, cp_end - cp) < 0)
3599
0
                goto err;
3600
0
            memcpy(line.s, cp, cp_end - cp);
3601
0
            line_frag = cp_end - cp;
3602
0
            l->data_size = l->alloc - line_frag;
3603
0
        } else {
3604
            // out of buffer
3605
0
            line_frag = 0;
3606
0
        }
3607
3608
0
        l->serial = fd->serial++;
3609
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3610
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3611
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3612
0
            goto err;
3613
0
        pthread_mutex_lock(&fd->command_m);
3614
0
        if (fd->command == SAM_CLOSE) {
3615
0
            pthread_mutex_unlock(&fd->command_m);
3616
0
            l = NULL;
3617
0
            goto tidyup;
3618
0
        }
3619
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3620
0
        pthread_mutex_unlock(&fd->command_m);
3621
0
    }
3622
3623
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3624
0
        goto err;
3625
3626
    // At EOF, wait for close request.
3627
    // (In future if we add support for seek, this is where we need to catch it.)
3628
0
    for (;;) {
3629
0
        pthread_mutex_lock(&fd->command_m);
3630
0
        if (fd->command == SAM_NONE)
3631
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3632
0
        switch (fd->command) {
3633
0
        case SAM_CLOSE:
3634
0
            pthread_cond_signal(&fd->command_c);
3635
0
            pthread_mutex_unlock(&fd->command_m);
3636
0
            hts_tpool_process_shutdown(fd->q);
3637
0
            goto tidyup;
3638
3639
0
        default:
3640
0
            pthread_mutex_unlock(&fd->command_m);
3641
0
            break;
3642
0
        }
3643
0
    }
3644
3645
0
 tidyup:
3646
0
    pthread_mutex_lock(&fd->command_m);
3647
0
    fd->command = SAM_CLOSE_DONE;
3648
0
    pthread_cond_signal(&fd->command_c);
3649
0
    pthread_mutex_unlock(&fd->command_m);
3650
3651
0
    if (l) {
3652
0
        pthread_mutex_lock(&fd->lines_m);
3653
0
        l->next = fd->lines;
3654
0
        fd->lines = l;
3655
0
        pthread_mutex_unlock(&fd->lines_m);
3656
0
    }
3657
0
    free(line.s);
3658
3659
0
    return NULL;
3660
3661
0
 err:
3662
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3663
0
    hts_tpool_process_shutdown(fd->q);
3664
0
    goto tidyup;
3665
0
}
3666
3667
// Runs in its own thread.
3668
// Takes encoded blocks of SAM off the thread results queue and writes them
3669
// to our output stream.
3670
0
static void *sam_dispatcher_write(void *vp) {
3671
0
    htsFile *fp = vp;
3672
0
    SAM_state *fd = fp->state;
3673
0
    hts_tpool_result *r;
3674
3675
    // Iterates until result queue is shutdown, where it returns NULL.
3676
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3677
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3678
0
        if (!gl) {
3679
0
            sam_state_err(fd, ENOMEM);
3680
0
            goto err;
3681
0
        }
3682
3683
0
        if (fp->idx) {
3684
0
            sp_bams *gb = gl->bams;
3685
0
            int i = 0, count = 0;
3686
0
            while (i < gl->data_size) {
3687
0
                int j = i;
3688
0
                while (i < gl->data_size && gl->data[i] != '\n')
3689
0
                    i++;
3690
0
                if (i < gl->data_size)
3691
0
                    i++;
3692
3693
0
                if (fp->is_bgzf) {
3694
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3695
0
                        goto err;
3696
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3697
0
                        goto err;
3698
0
                } else {
3699
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3700
0
                        goto err;
3701
0
                }
3702
3703
0
                bam1_t *b = &gb->bams[count++];
3704
0
                if (fp->format.compression == bgzf) {
3705
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3706
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3707
0
                                      bgzf_tell(fp->fp.bgzf),
3708
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3709
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3710
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3711
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3712
0
                        goto err;
3713
0
                    }
3714
0
                } else {
3715
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3716
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3717
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3718
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3719
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3720
0
                        goto err;
3721
0
                    }
3722
0
                }
3723
0
            }
3724
3725
0
            assert(count == gb->nbams);
3726
3727
            // Add bam array to free-list
3728
0
            pthread_mutex_lock(&fd->lines_m);
3729
0
            gb->next = fd->bams;
3730
0
            fd->bams = gl->bams;
3731
0
            gl->bams = NULL;
3732
0
            pthread_mutex_unlock(&fd->lines_m);
3733
0
        } else {
3734
0
            if (fp->is_bgzf) {
3735
                // We keep track of how much in the current block we have
3736
                // remaining => R.  We look for the last newline in input
3737
                // [i] to [i+R], backwards => position N.
3738
                //
3739
                // If we find a newline, we write out bytes i to N.
3740
                // We know we cannot fit the next record in this bgzf block,
3741
                // so we flush what we have and copy input N to i+R into
3742
                // the start of a new block, and recompute a new R for that.
3743
                //
3744
                // If we don't find a newline (i==N) then we cannot extend
3745
                // the current block at all, so flush whatever is in it now
3746
                // if it ends on a newline.
3747
                // We still copy i(==N) to i+R to the next block and
3748
                // continue as before with a new R.
3749
                //
3750
                // The only exception on the flush is when we run out of
3751
                // data in the input.  In that case we skip it as we don't
3752
                // yet know if the next record will fit.
3753
                //
3754
                // Both conditions share the same code here:
3755
                // - Look for newline (pos N)
3756
                // - Write i to N (which maybe 0)
3757
                // - Flush if block ends on newline and not end of input
3758
                // - write N to i+R
3759
3760
0
                int i = 0;
3761
0
                BGZF *fb = fp->fp.bgzf;
3762
0
                while (i < gl->data_size) {
3763
                    // remaining space in block
3764
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3765
0
                    int eod = 0;
3766
0
                    if (R > gl->data_size-i)
3767
0
                        R = gl->data_size-i, eod = 1;
3768
3769
                    // Find last newline in input data
3770
0
                    int N = i + R;
3771
0
                    while (--N > i) {
3772
0
                        if (gl->data[N] == '\n')
3773
0
                            break;
3774
0
                    }
3775
3776
0
                    if (N != i) {
3777
                        // Found a newline
3778
0
                        N++;
3779
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3780
0
                            goto err;
3781
0
                    }
3782
3783
                    // Flush bgzf block
3784
0
                    int b_off = fb->block_offset;
3785
0
                    if (!eod && b_off &&
3786
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3787
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3788
0
                            goto err;
3789
3790
                    // Copy from N onwards into next block
3791
0
                    if (i+R > N)
3792
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3793
0
                            != i+R - N)
3794
0
                            goto err;
3795
3796
0
                    i = i+R;
3797
0
                }
3798
0
            } else {
3799
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3800
0
                    goto err;
3801
0
            }
3802
0
        }
3803
3804
0
        hts_tpool_delete_result(r, 0);
3805
3806
        // Also updated by main thread
3807
0
        pthread_mutex_lock(&fd->lines_m);
3808
0
        gl->next = fd->lines;
3809
0
        fd->lines = gl;
3810
0
        pthread_mutex_unlock(&fd->lines_m);
3811
0
    }
3812
3813
0
    sam_state_err(fd, 0); // success
3814
0
    hts_tpool_process_shutdown(fd->q);
3815
0
    return NULL;
3816
3817
0
 err:
3818
0
    sam_state_err(fd, errno ? errno : EIO);
3819
0
    return (void *)-1;
3820
0
}
3821
3822
// Run from one of the worker threads.
3823
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3824
// of text SAM records (sp_lines).
3825
0
static void *sam_format_worker(void *arg) {
3826
0
    sp_bams *gb = (sp_bams *)arg;
3827
0
    sp_lines *gl = NULL;
3828
0
    int i;
3829
0
    SAM_state *fd = gb->fd;
3830
0
    htsFile *fp = fd->fp;
3831
3832
    // Use a block of SAM strings we had earlier if available.
3833
0
    pthread_mutex_lock(&fd->lines_m);
3834
0
    if (fd->lines) {
3835
0
        gl = fd->lines;
3836
0
        fd->lines = gl->next;
3837
0
    }
3838
0
    pthread_mutex_unlock(&fd->lines_m);
3839
3840
0
    if (gl == NULL) {
3841
0
        gl = calloc(1, sizeof(*gl));
3842
0
        if (!gl) {
3843
0
            sam_state_err(fd, ENOMEM);
3844
0
            return NULL;
3845
0
        }
3846
0
        gl->alloc = gl->data_size = 0;
3847
0
        gl->data = NULL;
3848
0
    }
3849
0
    gl->serial = gb->serial;
3850
0
    gl->next = NULL;
3851
3852
0
    kstring_t ks = {0, gl->alloc, gl->data};
3853
3854
0
    for (i = 0; i < gb->nbams; i++) {
3855
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3856
0
            sam_state_err(fd, errno ? errno : EIO);
3857
0
            goto err;
3858
0
        }
3859
0
        kputc('\n', &ks);
3860
0
    }
3861
3862
0
    pthread_mutex_lock(&fd->lines_m);
3863
0
    gl->data_size = ks.l;
3864
0
    gl->alloc = ks.m;
3865
0
    gl->data = ks.s;
3866
3867
0
    if (fp->idx) {
3868
        // Keep hold of the bam array a little longer as
3869
        // sam_dispatcher_write needs to use them for building the index.
3870
0
        gl->bams = gb;
3871
0
    } else {
3872
        // Add bam array to free-list
3873
0
        gb->next = fd->bams;
3874
0
        fd->bams = gb;
3875
0
    }
3876
0
    pthread_mutex_unlock(&fd->lines_m);
3877
3878
0
    return gl;
3879
3880
0
 err:
3881
    // Possible race between this and fd->curr_bam.
3882
    // Easier to not free and leave it on the input list so it
3883
    // gets freed there instead?
3884
    // sam_free_sp_bams(gb);
3885
0
    if (gl) {
3886
0
        free(gl->data);
3887
0
        free(gl);
3888
0
    }
3889
0
    return NULL;
3890
0
}
3891
3892
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3893
0
    if (fp->state)
3894
0
        return 0;
3895
3896
0
    if (!(fp->state = sam_state_create(fp)))
3897
0
        return -1;
3898
0
    SAM_state *fd = (SAM_state *)fp->state;
3899
3900
0
    pthread_mutex_init(&fd->lines_m, NULL);
3901
0
    pthread_mutex_init(&fd->command_m, NULL);
3902
0
    pthread_cond_init(&fd->command_c, NULL);
3903
0
    fd->p = p->pool;
3904
0
    int qsize = p->qsize;
3905
0
    if (!qsize)
3906
0
        qsize = 2*hts_tpool_size(fd->p);
3907
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3908
0
    if (!fd->q) {
3909
0
        sam_state_destroy(fp);
3910
0
        return -1;
3911
0
    }
3912
3913
0
    if (fp->format.compression == bgzf)
3914
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3915
3916
0
    return 0;
3917
0
}
3918
3919
0
int sam_set_threads(htsFile *fp, int nthreads) {
3920
0
    if (nthreads <= 0)
3921
0
        return 0;
3922
3923
0
    htsThreadPool p;
3924
0
    p.pool = hts_tpool_init(nthreads);
3925
0
    p.qsize = nthreads*2;
3926
3927
0
    int ret = sam_set_thread_pool(fp, &p);
3928
0
    if (ret < 0)
3929
0
        return ret;
3930
3931
0
    SAM_state *fd = (SAM_state *)fp->state;
3932
0
    fd->own_pool = 1;
3933
3934
0
    return 0;
3935
0
}
3936
3937
typedef struct {
3938
    kstring_t name;
3939
    kstring_t comment; // NB: pointer into name, do not free
3940
    kstring_t seq;
3941
    kstring_t qual;
3942
    int casava;
3943
    int aux;
3944
    int rnum;
3945
    char BC[3];         // aux tag ID for barcode
3946
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3947
    char nprefix;
3948
    int sra_names;
3949
} fastq_state;
3950
3951
// Initialise fastq state.
3952
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3953
3.49k
static fastq_state *fastq_state_init(int name_char) {
3954
3.49k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3955
3.49k
    if (!x)
3956
0
        return NULL;
3957
3.49k
    strcpy(x->BC, "BC");
3958
3.49k
    x->nprefix = name_char;
3959
3960
3.49k
    return x;
3961
3.49k
}
3962
3963
4.65k
void fastq_state_destroy(htsFile *fp) {
3964
4.65k
    if (fp->state) {
3965
3.49k
        fastq_state *x = (fastq_state *)fp->state;
3966
3.49k
        if (x->tags)
3967
3.49k
            kh_destroy(tag, x->tags);
3968
3.49k
        ks_free(&x->name);
3969
3.49k
        ks_free(&x->seq);
3970
3.49k
        ks_free(&x->qual);
3971
3.49k
        free(fp->state);
3972
3.49k
    }
3973
4.65k
}
3974
3975
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3976
0
    va_list args;
3977
3978
0
    if (!fp)
3979
0
        return -1;
3980
0
    if (!fp->state)
3981
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3982
0
                                           ? '@' : '>')))
3983
0
            return -1;
3984
3985
0
    fastq_state *x = (fastq_state *)fp->state;
3986
3987
0
    switch (opt) {
3988
0
    case FASTQ_OPT_CASAVA:
3989
0
        x->casava = 1;
3990
0
        break;
3991
3992
0
    case FASTQ_OPT_NAME2:
3993
0
        x->sra_names = 1;
3994
0
        break;
3995
3996
0
    case FASTQ_OPT_AUX: {
3997
0
        va_start(args, opt);
3998
0
        x->aux = 1;
3999
0
        char *tag = va_arg(args, char *);
4000
0
        va_end(args);
4001
0
        if (tag && strcmp(tag, "1") != 0) {
4002
0
            if (!x->tags)
4003
0
                if (!(x->tags = kh_init(tag)))
4004
0
                    return -1;
4005
4006
0
            size_t i, tlen = strlen(tag);
4007
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
4008
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
4009
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
4010
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
4011
0
                    break;
4012
0
                }
4013
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
4014
0
                kh_put(tag, x->tags, tcode, &ret);
4015
0
                if (ret < 0)
4016
0
                    return -1;
4017
0
            }
4018
0
        }
4019
0
        break;
4020
0
    }
4021
4022
0
    case FASTQ_OPT_BARCODE: {
4023
0
        va_start(args, opt);
4024
0
        char *bc = va_arg(args, char *);
4025
0
        va_end(args);
4026
0
        strncpy(x->BC, bc, 2);
4027
0
        x->BC[2] = 0;
4028
0
        break;
4029
0
    }
4030
4031
0
    case FASTQ_OPT_RNUM:
4032
0
        x->rnum = 1;
4033
0
        break;
4034
4035
0
    default:
4036
0
        break;
4037
0
    }
4038
0
    return 0;
4039
0
}
4040
4041
55.8M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
4042
55.8M
    fastq_state *x = (fastq_state *)fp->state;
4043
55.8M
    size_t i, l;
4044
55.8M
    int ret = 0;
4045
4046
55.8M
    if (fp->format.format == fasta_format && fp->line.s) {
4047
        // For FASTA we've already read the >name line; steal it
4048
        // Not the most efficient, but we don't optimise for fasta reading.
4049
55.8M
        if (fp->line.l == 0)
4050
1.65k
            return -1; // EOF
4051
4052
55.8M
        free(x->name.s);
4053
55.8M
        x->name = fp->line;
4054
55.8M
        fp->line.l = fp->line.m = 0;
4055
55.8M
        fp->line.s = NULL;
4056
55.8M
    } else {
4057
        // Read a FASTQ format entry.
4058
5.86k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
4059
5.86k
        if (ret == -1)
4060
51
            return -1;  // EOF
4061
5.81k
        else if (ret < -1)
4062
156
            return ret; // ERR
4063
5.86k
    }
4064
4065
    // Name
4066
55.8M
    if (*x->name.s != x->nprefix)
4067
78
        return -2;
4068
4069
    // Reverse the SRA strangeness of putting the run_name.number before
4070
    // the read name.
4071
55.8M
    i = 0;
4072
55.8M
    char *name = x->name.s+1;
4073
55.8M
    if (x->sra_names) {
4074
0
        char *cp = strpbrk(x->name.s, " \t");
4075
0
        if (cp) {
4076
0
            while (*cp == ' ' || *cp == '\t')
4077
0
                cp++;
4078
0
            *--cp = '@';
4079
0
            i = cp - x->name.s;
4080
0
            name = cp+1;
4081
0
        }
4082
0
    }
4083
4084
55.8M
    l = x->name.l;
4085
55.8M
    char *s = x->name.s;
4086
147M
    while (i < l && !isspace_c(s[i]))
4087
92.0M
        i++;
4088
55.8M
    if (i < l) {
4089
455k
        s[i] = 0;
4090
455k
        x->name.l = i++;
4091
455k
    }
4092
4093
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
4094
58.3M
    while (i < l && isspace_c(s[i]))
4095
2.44M
        i++;
4096
55.8M
    x->comment.s = s+i;
4097
55.8M
    x->comment.l = l - i;
4098
4099
    // Seq
4100
55.8M
    x->seq.l = 0;
4101
184M
    for (;;) {
4102
184M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
4103
3.02k
            if (fp->format.format == fastq_format || ret < -1)
4104
1.34k
                return -2;
4105
184M
        if (ret == -1 ||
4106
184M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
4107
55.8M
            break;
4108
128M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
4109
0
            return -2;
4110
128M
    }
4111
4112
    // Qual
4113
55.8M
    if (fp->format.format == fastq_format) {
4114
2.47k
        size_t remainder = x->seq.l;
4115
2.47k
        x->qual.l = 0;
4116
12.0k
        do {
4117
12.0k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
4118
45
                return -2;
4119
12.0k
            if (fp->line.l > remainder)
4120
75
                return -2;
4121
11.9k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
4122
0
                return -2;
4123
11.9k
            remainder -= fp->line.l;
4124
11.9k
        } while (remainder > 0);
4125
4126
        // Decr qual
4127
9.06k
        for (i = 0; i < x->qual.l; i++)
4128
6.70k
            x->qual.s[i] -= '!';
4129
2.35k
    }
4130
4131
55.8M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4132
55.8M
    if (x->name.l > 2 &&
4133
55.8M
        x->name.s[x->name.l-2] == '/' &&
4134
55.8M
        isdigit_c(x->name.s[x->name.l-1])) {
4135
8.99k
        switch(x->name.s[x->name.l-1]) {
4136
1.74k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4137
2.52k
        case '2': flag |= BAM_FREAD2 | pflag; break;
4138
4.73k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4139
8.99k
        }
4140
8.99k
        x->name.s[x->name.l-=2] = 0;
4141
8.99k
    }
4142
4143
    // Convert to BAM
4144
55.8M
    ret = bam_set1(b,
4145
55.8M
                   x->name.s + x->name.l - name, name,
4146
55.8M
                   flag,
4147
55.8M
                   -1, -1, 0, // ref '*', pos, mapq,
4148
55.8M
                   0, NULL,     // no cigar,
4149
55.8M
                   -1, -1, 0,    // mate
4150
55.8M
                   x->seq.l, x->seq.s, x->qual.s,
4151
55.8M
                   0);
4152
4153
    // Identify Illumina CASAVA strings.
4154
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4155
55.8M
    char *barcode = NULL;
4156
55.8M
    int barcode_len = 0;
4157
55.8M
    kstring_t *kc = &x->comment;
4158
55.8M
    char *endptr;
4159
55.8M
    if (x->casava &&
4160
        // \d:[YN]:\d+:[ACGTN]+
4161
55.8M
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4162
55.8M
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4163
55.8M
        && *endptr == ':') {
4164
4165
        // read num
4166
0
        switch(kc->s[0]) {
4167
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4168
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4169
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4170
0
        }
4171
4172
0
        if (kc->s[2] == 'Y')
4173
0
            b->core.flag |= BAM_FQCFAIL;
4174
4175
        // Barcode, maybe numeric in which case we skip it
4176
0
        if (!isdigit_c(endptr[1])) {
4177
0
            barcode = endptr+1;
4178
0
            for (i = barcode - kc->s; i < kc->l; i++)
4179
0
                if (isspace_c(kc->s[i]))
4180
0
                    break;
4181
4182
0
            kc->s[i] = 0;
4183
0
            barcode_len = i+1-(barcode - kc->s);
4184
0
        }
4185
0
    }
4186
4187
55.8M
    if (ret >= 0 && barcode_len)
4188
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4189
0
            ret = -2;
4190
4191
55.8M
    if (!x->aux)
4192
55.8M
        return ret;
4193
4194
    // Identify any SAM style aux tags in comments too.
4195
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4196
0
        ret = -2;
4197
4198
0
    return ret;
4199
55.8M
}
4200
4201
// Internal component of sam_read1 below
4202
4.56k
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4203
4.56k
    int ret = bam_read1(fp->fp.bgzf, b);
4204
4.56k
    if (h && ret >= 0) {
4205
3.12k
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4206
3.12k
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4207
316
            errno = ERANGE;
4208
316
            return -3;
4209
316
        }
4210
3.12k
    }
4211
4.25k
    return ret;
4212
4.56k
}
4213
4214
// Internal component of sam_read1 below
4215
8.50k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4216
8.50k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4217
8.50k
    if (ret < 0)
4218
8.50k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4219
4220
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4221
0
        return -2;
4222
4223
0
    return ret;
4224
0
}
4225
4226
// Internal component of sam_read1 below
4227
354k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4228
354k
    int ret;
4229
4230
    // Consume 1st line after header parsing as it wasn't using peek
4231
354k
    if (fp->line.l != 0) {
4232
39
        ret = sam_parse1(&fp->line, h, b);
4233
39
        fp->line.l = 0;
4234
39
        return ret;
4235
39
    }
4236
4237
354k
    if (fp->state) {
4238
0
        SAM_state *fd = (SAM_state *)fp->state;
4239
4240
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4241
            // We don't support multi-threaded SAM parsing with seeks yet.
4242
0
            int ret;
4243
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4244
0
                errno = -ret;
4245
0
                return -2;
4246
0
            }
4247
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4248
0
                return -1;
4249
0
            fp->fp.bgzf->seeked = 0;
4250
0
            goto err_recover;
4251
0
        }
4252
4253
0
        if (!fd->h) {
4254
0
            fd->h = h;
4255
0
            fd->h->ref_count++;
4256
            // Ensure hrecs is initialised now as we don't want multiple
4257
            // threads trying to do this simultaneously.
4258
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4259
0
                return -2;
4260
4261
            // We can only do this once we've got a header
4262
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4263
0
                               fp) != 0)
4264
0
                return -2;
4265
0
            fd->dispatcher_set = 1;
4266
0
        }
4267
4268
0
        if (fd->h != h) {
4269
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4270
0
            return -1;
4271
0
        }
4272
4273
0
        sp_bams *gb = fd->curr_bam;
4274
0
        if (!gb) {
4275
0
            if (fd->errcode) {
4276
                // In case reader failed
4277
0
                errno = fd->errcode;
4278
0
                return -2;
4279
0
            }
4280
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4281
0
            if (!r)
4282
0
                return -2;
4283
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4284
0
            hts_tpool_delete_result(r, 0);
4285
0
        }
4286
0
        if (!gb)
4287
0
            return fd->errcode ? -2 : -1;
4288
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4289
0
        if (fd->curr_idx < gb->nbams)
4290
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4291
0
                return -2;
4292
0
        if (fd->curr_idx == gb->nbams) {
4293
0
            pthread_mutex_lock(&fd->lines_m);
4294
0
            gb->next = fd->bams;
4295
0
            fd->bams = gb;
4296
0
            pthread_mutex_unlock(&fd->lines_m);
4297
4298
0
            fd->curr_bam = NULL;
4299
0
            fd->curr_idx = 0;
4300
0
        }
4301
4302
0
        ret = 0;
4303
4304
354k
    } else  {
4305
354k
    err_recover:
4306
354k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4307
354k
        if (ret < 0) return ret;
4308
4309
343k
        ret = sam_parse1(&fp->line, h, b);
4310
343k
        fp->line.l = 0;
4311
343k
        if (ret < 0) {
4312
8.30k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4313
8.30k
            if (h && h->ignore_sam_err) goto err_recover;
4314
8.30k
        }
4315
343k
    }
4316
4317
343k
    return ret;
4318
354k
}
4319
4320
// Returns 0 on success,
4321
//        -1 on EOF,
4322
//       <-1 on error
4323
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4324
56.2M
{
4325
56.2M
    int ret, pass_filter;
4326
4327
56.2M
    do {
4328
56.2M
        switch (fp->format.format) {
4329
4.56k
        case bam:
4330
4.56k
            ret = sam_read1_bam(fp, h, b);
4331
4.56k
            break;
4332
4333
8.50k
        case cram:
4334
8.50k
            ret = sam_read1_cram(fp, h, &b);
4335
8.50k
            break;
4336
4337
354k
        case sam:
4338
354k
            ret = sam_read1_sam(fp, h, b);
4339
354k
            break;
4340
4341
55.8M
        case fasta_format:
4342
55.8M
        case fastq_format: {
4343
55.8M
            fastq_state *x = (fastq_state *)fp->state;
4344
55.8M
            if (!x) {
4345
3.49k
                if (!(fp->state = fastq_state_init(fp->format.format
4346
3.49k
                                                   == fastq_format ? '@' : '>')))
4347
0
                    return -2;
4348
3.49k
            }
4349
4350
55.8M
            return fastq_parse1(fp, b);
4351
55.8M
        }
4352
4353
0
        case empty_format:
4354
0
            errno = EPIPE;
4355
0
            return -3;
4356
4357
0
        default:
4358
0
            errno = EFTYPE;
4359
0
            return -3;
4360
56.2M
        }
4361
4362
367k
        pass_filter = (ret >= 0 && fp->filter)
4363
367k
            ? sam_passes_filter(h, b, fp->filter)
4364
367k
            : 1;
4365
367k
    } while (pass_filter == 0);
4366
4367
367k
    return pass_filter < 0 ? -2 : ret;
4368
56.2M
}
4369
4370
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4371
// this code isn't vectorised and runs far slower than is necessary (even
4372
// with the restrict keyword being used).
4373
static inline void HTS_OPT3
4374
1.23k
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4375
1.23k
    uint32_t i;
4376
4.91k
    for (i = 0; i < len; i++)
4377
3.67k
        a[i] = b[i]+33;
4378
1.23k
}
4379
4380
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4381
18.7M
{
4382
18.7M
    int i, r = 0;
4383
18.7M
    uint8_t *s, *end;
4384
18.7M
    const bam1_core_t *c = &b->core;
4385
4386
18.7M
    if (c->l_qname == 0)
4387
0
        return -1;
4388
18.7M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4389
18.7M
    r |= kputc_('\t', str); // query name
4390
18.7M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4391
18.7M
    if (c->tid >= 0) { // chr
4392
60.4k
        r |= kputs(h->target_name[c->tid] , str);
4393
60.4k
        r |= kputc_('\t', str);
4394
18.6M
    } else r |= kputsn_("*\t", 2, str);
4395
18.7M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4396
18.7M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4397
18.7M
    if (c->n_cigar) { // cigar
4398
102k
        uint32_t *cigar = bam_get_cigar(b);
4399
7.48M
        for (i = 0; i < c->n_cigar; ++i) {
4400
7.37M
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4401
7.37M
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4402
7.37M
        }
4403
18.6M
    } else r |= kputc_('*', str);
4404
18.7M
    r |= kputc_('\t', str);
4405
18.7M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4406
4.26k
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4407
2.63k
    else {
4408
2.63k
        r |= kputs(h->target_name[c->mtid], str);
4409
2.63k
        r |= kputc_('\t', str);
4410
2.63k
    }
4411
18.7M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4412
18.7M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4413
18.7M
    if (c->l_qseq) { // seq and qual
4414
729k
        uint8_t *s = bam_get_seq(b);
4415
729k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4416
729k
        char *cp = str->s + str->l;
4417
4418
        // Sequence, 2 bases at a time
4419
729k
        nibble2base(s, cp, c->l_qseq);
4420
729k
        cp[c->l_qseq] = '\t';
4421
729k
        cp += c->l_qseq+1;
4422
4423
        // Quality
4424
729k
        s = bam_get_qual(b);
4425
729k
        i = 0;
4426
729k
        if (s[0] == 0xff) {
4427
728k
            cp[i++] = '*';
4428
728k
        } else {
4429
1.23k
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4430
1.23k
            i = c->l_qseq;
4431
1.23k
        }
4432
729k
        cp[i] = 0;
4433
729k
        cp += i;
4434
729k
        str->l = cp - str->s;
4435
18.0M
    } else r |= kputsn_("*\t*", 3, str);
4436
4437
18.7M
    s = bam_get_aux(b); // aux
4438
18.7M
    end = b->data + b->l_data;
4439
4440
21.9M
    while (end - s >= 4) {
4441
3.23M
        r |= kputc_('\t', str);
4442
3.23M
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4443
287
            goto bad_aux;
4444
3.23M
    }
4445
18.7M
    r |= kputsn("", 0, str); // nul terminate
4446
18.7M
    if (r < 0) goto mem_err;
4447
4448
18.7M
    return str->l;
4449
4450
287
 bad_aux:
4451
287
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4452
287
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4453
287
    errno = EINVAL;
4454
287
    return -1;
4455
4456
0
 mem_err:
4457
0
    hts_log_error("Out of memory");
4458
0
    errno = ENOMEM;
4459
0
    return -1;
4460
18.7M
}
4461
4462
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4463
18.7M
{
4464
18.7M
    str->l = 0;
4465
18.7M
    return sam_format1_append(h, b, str);
4466
18.7M
}
4467
4468
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4469
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4470
0
{
4471
0
    unsigned flag = b->core.flag;
4472
0
    int i, e = 0, len = b->core.l_qseq;
4473
0
    uint8_t *seq, *qual;
4474
4475
0
    str->l = 0;
4476
4477
    // Name
4478
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4479
0
        return -1;
4480
4481
    // /1 or /2 suffix
4482
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4483
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4484
0
        if (r12 == BAM_FREAD1) {
4485
0
            if (kputs("/1", str) == EOF)
4486
0
                return -1;
4487
0
        } else if (r12 == BAM_FREAD2) {
4488
0
            if (kputs("/2", str) == EOF)
4489
0
                return -1;
4490
0
        }
4491
0
    }
4492
4493
    // Illumina CASAVA tag.
4494
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4495
0
    if (x && x->casava) {
4496
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4497
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4498
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4499
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4500
0
                     bc ? (char *)bc+1 : "0") < 0)
4501
0
            return -1;
4502
4503
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4504
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4505
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4506
0
            str->s[str->l-1] = '0';
4507
0
            str->s[str->l] = 0;
4508
0
            bc = NULL;
4509
0
        }
4510
4511
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4512
0
        if (bc) {
4513
0
            int l = strlen((char *)bc+1);
4514
0
            char *c = (char *)str->s + str->l - l;
4515
0
            for (i = 0; i < l; i++) {
4516
0
                if (!isalpha_c(c[i]))
4517
0
                    c[i] = '+';
4518
0
                else if (islower_c(c[i]))
4519
0
                    c[i] = toupper_c(c[i]);
4520
0
            }
4521
0
        }
4522
0
    }
4523
4524
    // Aux tags
4525
0
    if (x && x->aux) {
4526
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4527
0
        while (s && end - s >= 4) {
4528
0
            int tt = s[0]*256 + s[1];
4529
0
            if (x->tags == NULL ||
4530
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4531
0
                e |= kputc_('\t', str) < 0;
4532
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4533
0
                    return -1;
4534
0
            } else {
4535
0
                s = skip_aux(s+2, end);
4536
0
            }
4537
0
        }
4538
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4539
0
    }
4540
4541
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4542
0
    e |= kputc_('\n', str) < 0;
4543
4544
    // Seq line
4545
0
    seq = bam_get_seq(b);
4546
0
    if (flag & BAM_FREVERSE)
4547
0
        for (i = len-1; i >= 0; i--)
4548
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4549
0
    else
4550
0
        for (i = 0; i < len; i++)
4551
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4552
4553
4554
    // Qual line
4555
0
    if (x->nprefix == '@') {
4556
0
        kputsn("\n+\n", 3, str);
4557
0
        qual = bam_get_qual(b);
4558
0
        if (qual[0] == 0xff)
4559
0
            for (i = 0; i < len; i++)
4560
0
                e |= kputc_('B', str) < 0;
4561
0
        else if (flag & BAM_FREVERSE)
4562
0
            for (i = len-1; i >= 0; i--)
4563
0
                e |= kputc_(33 + qual[i], str) < 0;
4564
0
        else
4565
0
            for (i = 0; i < len; i++)
4566
0
                e |= kputc_(33 + qual[i], str) < 0;
4567
4568
0
    }
4569
0
    e |= kputc('\n', str) < 0;
4570
4571
0
    return e ? -1 : str->l;
4572
0
}
4573
4574
// Sadly we need to be able to modify the bam_hdr here so we can
4575
// reference count the structure.
4576
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4577
56.2M
{
4578
56.2M
    switch (fp->format.format) {
4579
0
    case binary_format:
4580
0
        fp->format.category = sequence_data;
4581
0
        fp->format.format = bam;
4582
        /* fall-through */
4583
18.7M
    case bam:
4584
18.7M
        return bam_write_idx1(fp, h, b);
4585
4586
18.7M
    case cram:
4587
18.7M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4588
4589
0
    case text_format:
4590
0
        fp->format.category = sequence_data;
4591
0
        fp->format.format = sam;
4592
        /* fall-through */
4593
18.7M
    case sam:
4594
18.7M
        if (fp->state) {
4595
0
            SAM_state *fd = (SAM_state *)fp->state;
4596
4597
            // Threaded output
4598
0
            if (!fd->h) {
4599
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4600
                // just data pointed to by it (which is a bit weasely still),
4601
                // but out cached pointer must be non-const as we want to
4602
                // destroy it later on and sam_hdr_destroy takes non-const.
4603
                //
4604
                // We do this because some tools do sam_hdr_destroy; sam_close
4605
                // while others do sam_close; sam_hdr_destroy.  The former is
4606
                // an issue as we need the header still when flushing.
4607
0
                fd->h = (sam_hdr_t *)h;
4608
0
                fd->h->ref_count++;
4609
4610
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4611
0
                                   fp) != 0)
4612
0
                    return -2;
4613
0
                fd->dispatcher_set = 1;
4614
0
            }
4615
4616
0
            if (fd->h != h) {
4617
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4618
0
                return -2;
4619
0
            }
4620
4621
            // Find a suitable BAM array to copy to
4622
0
            sp_bams *gb = fd->curr_bam;
4623
0
            if (!gb) {
4624
0
                pthread_mutex_lock(&fd->lines_m);
4625
0
                if (fd->bams) {
4626
0
                    fd->curr_bam = gb = fd->bams;
4627
0
                    fd->bams = gb->next;
4628
0
                    gb->next = NULL;
4629
0
                    gb->nbams = 0;
4630
0
                    gb->bam_mem = 0;
4631
0
                    pthread_mutex_unlock(&fd->lines_m);
4632
0
                } else {
4633
0
                    pthread_mutex_unlock(&fd->lines_m);
4634
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4635
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4636
0
                        free(gb);
4637
0
                        return -1;
4638
0
                    }
4639
0
                    gb->nbams = 0;
4640
0
                    gb->abams = SAM_NBAM;
4641
0
                    gb->bam_mem = 0;
4642
0
                    gb->fd = fd;
4643
0
                    fd->curr_idx = 0;
4644
0
                    fd->curr_bam = gb;
4645
0
                }
4646
0
            }
4647
4648
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4649
0
                return -2;
4650
0
            gb->bam_mem += b->l_data + sizeof(*b);
4651
4652
            // Dispatch if full
4653
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4654
0
                gb->serial = fd->serial++;
4655
0
                pthread_mutex_lock(&fd->command_m);
4656
0
                if (fd->errcode != 0) {
4657
0
                    pthread_mutex_unlock(&fd->command_m);
4658
0
                    return -fd->errcode;
4659
0
                }
4660
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4661
0
                                        cleanup_sp_bams,
4662
0
                                        cleanup_sp_lines, 0) < 0) {
4663
0
                    pthread_mutex_unlock(&fd->command_m);
4664
0
                    return -1;
4665
0
                }
4666
0
                pthread_mutex_unlock(&fd->command_m);
4667
0
                fd->curr_bam = NULL;
4668
0
            }
4669
4670
            // Dummy value as we don't know how long it really is.
4671
            // We could track file sizes via a SAM_state field, but I don't think
4672
            // it is necessary.
4673
0
            return 1;
4674
18.7M
        } else {
4675
18.7M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4676
18.7M
            kputc('\n', &fp->line);
4677
18.7M
            if (fp->is_bgzf) {
4678
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4679
0
                    return -1;
4680
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4681
18.7M
            } else {
4682
18.7M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4683
18.7M
            }
4684
4685
18.7M
            if (fp->idx) {
4686
0
                if (fp->format.compression == bgzf) {
4687
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4688
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4689
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4690
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4691
0
                        return -1;
4692
0
                    }
4693
0
                } else {
4694
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4695
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4696
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4697
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4698
0
                        return -1;
4699
0
                    }
4700
0
                }
4701
0
            }
4702
4703
18.7M
            return fp->line.l;
4704
18.7M
        }
4705
4706
4707
0
    case fasta_format:
4708
0
    case fastq_format: {
4709
0
        fastq_state *x = (fastq_state *)fp->state;
4710
0
        if (!x) {
4711
0
            if (!(fp->state = fastq_state_init(fp->format.format
4712
0
                                               == fastq_format ? '@' : '>')))
4713
0
                return -2;
4714
0
        }
4715
4716
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4717
0
            return -1;
4718
0
        if (fp->is_bgzf) {
4719
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4720
0
                return -1;
4721
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4722
0
                return -1;
4723
0
        } else {
4724
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4725
0
                return -1;
4726
0
        }
4727
0
        return fp->line.l;
4728
0
    }
4729
4730
0
    default:
4731
0
        errno = EBADF;
4732
0
        return -1;
4733
56.2M
    }
4734
56.2M
}
4735
4736
/************************
4737
 *** Auxiliary fields ***
4738
 ************************/
4739
#ifndef HTS_LITTLE_ENDIAN
4740
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4741
    int tsz = aux_type2size(type);
4742
4743
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4744
4745
    switch (tsz) {
4746
        case 'H': case 'Z': case 1:  // Trivial
4747
            memcpy(out, in, len);
4748
            break;
4749
4750
#define aux_val_to_le(type_t, store_le) do {                            \
4751
        type_t v;                                                       \
4752
        size_t i;                                                       \
4753
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4754
            memcpy(&v, in + i, sizeof(type_t));                         \
4755
            store_le(v, out);                                           \
4756
        }                                                               \
4757
    } while (0)
4758
4759
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4760
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4761
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4762
4763
#undef aux_val_to_le
4764
4765
        case 'B': { // Recurse!
4766
            uint32_t n;
4767
            if (len < 5) return -1;
4768
            memcpy(&n, in + 1, 4);
4769
            out[0] = in[0];
4770
            u32_to_le(n, out + 1);
4771
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4772
        }
4773
4774
        default: // Unknown type code
4775
            return -1;
4776
    }
4777
4778
4779
4780
    return 0;
4781
}
4782
#endif
4783
4784
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4785
0
{
4786
0
    uint32_t new_len;
4787
4788
0
    assert(b->l_data >= 0);
4789
0
    new_len = b->l_data + 3 + len;
4790
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4791
4792
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4793
4794
0
    b->data[b->l_data] = tag[0];
4795
0
    b->data[b->l_data + 1] = tag[1];
4796
0
    b->data[b->l_data + 2] = type;
4797
4798
0
#ifdef HTS_LITTLE_ENDIAN
4799
0
    memcpy(b->data + b->l_data + 3, data, len);
4800
#else
4801
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4802
        errno = EINVAL;
4803
        return -1;
4804
    }
4805
#endif
4806
4807
0
    b->l_data = new_len;
4808
4809
0
    return 0;
4810
4811
0
 nomem:
4812
0
    errno = ENOMEM;
4813
0
    return -1;
4814
0
}
4815
4816
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4817
3.79M
{
4818
3.79M
    int size;
4819
3.79M
    uint32_t n;
4820
3.79M
    if (s >= end) return end;
4821
3.79M
    size = aux_type2size(*s); ++s; // skip type
4822
3.79M
    switch (size) {
4823
134k
    case 'Z':
4824
163k
    case 'H':
4825
12.4M
        while (s < end && *s) ++s;
4826
163k
        return s < end ? s + 1 : end;
4827
45.8k
    case 'B':
4828
45.8k
        if (end - s < 5) return NULL;
4829
45.8k
        size = aux_type2size(*s); ++s;
4830
45.8k
        n = le_to_u32(s);
4831
45.8k
        s += 4;
4832
45.8k
        if (size == 0 || end - s < size * n) return NULL;
4833
45.8k
        return s + size * n;
4834
350
    case 0:
4835
350
        return NULL;
4836
3.58M
    default:
4837
3.58M
        if (end - s < size) return NULL;
4838
3.58M
        return s + size;
4839
3.79M
    }
4840
3.79M
}
4841
4842
uint8_t *bam_aux_first(const bam1_t *b)