Coverage Report

Created: 2026-01-10 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/header.c
Line
Count
Source
1
/*
2
Copyright (c) 2018-2020, 2023, 2025 Genome Research Ltd.
3
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <string.h>
35
#include <assert.h>
36
#include <errno.h>
37
#include "textutils_internal.h"
38
#include "header.h"
39
#include "htslib/bgzf.h"
40
#include "htslib/hfile.h"
41
#include "htslib/kseq.h"
42
43
// Hash table for removing multiple lines from the header
44
KHASH_SET_INIT_STR(rm)
45
// Used for long refs in SAM files
46
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
47
48
typedef khash_t(rm) rmhash_t;
49
50
static int sam_hdr_link_pg(sam_hdr_t *bh);
51
52
static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
53
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
54
55
56
434
#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
57
229
static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) {
58
229
    int j;
59
60
229
    if (len > MAX_ERROR_QUOTE)
61
205
        len = MAX_ERROR_QUOTE;
62
66.4k
    for (j = 0; j < len && line[j] != '\n'; j++)
63
66.1k
        ;
64
229
    hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line);
65
229
}
66
67
/* ==== Static methods ==== */
68
69
13.0k
static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) {
70
13.0k
    if (!hrecs)
71
0
        return -1;
72
73
13.0k
    if (!type_list) {
74
13.0k
        hrecs->type_count = 5;
75
13.0k
        hrecs->type_order = calloc(hrecs->type_count, 3);
76
13.0k
        if (!hrecs->type_order)
77
0
            return -1;
78
13.0k
        memcpy(hrecs->type_order[0], "HD", 2);
79
13.0k
        memcpy(hrecs->type_order[1], "SQ", 2);
80
13.0k
        memcpy(hrecs->type_order[2], "RG", 2);
81
13.0k
        memcpy(hrecs->type_order[3], "PG", 2);
82
13.0k
        memcpy(hrecs->type_order[4], "CO", 2);
83
13.0k
    }
84
85
13.0k
    return 0;
86
13.0k
}
87
88
9.79k
static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) {
89
9.79k
    const char *token;
90
9.79k
    ks_tokaux_t aux;
91
92
9.79k
    if (!list)
93
9.59k
        return 0;
94
95
234k
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
96
233k
        if (aux.p == token)
97
228k
            continue;
98
99
5.12k
        char *name = string_ndup(hrecs->str_pool, token, aux.p - token);
100
5.12k
        if (!name)
101
0
            return -1;
102
5.12k
        int r;
103
5.12k
        khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r);
104
5.12k
        if (r < 0) return -1;
105
106
5.12k
        if (r > 0)
107
1.22k
            kh_val(hrecs->ref_hash, k) = nref;
108
3.90k
        else if (kh_val(hrecs->ref_hash, k) != nref)
109
0
            hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name);
110
5.12k
    }
111
112
198
    return 0;
113
198
}
114
115
0
static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) {
116
0
    const char *token, *sn;
117
0
    ks_tokaux_t aux;
118
0
    kstring_t str = KS_INITIALIZE;
119
120
0
    if (expected < 0 || expected >= hrecs->nref)
121
0
        return;
122
0
    sn = hrecs->ref[expected].name;
123
124
0
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
125
0
        kputsn(token, aux.p - token, ks_clear(&str));
126
0
        khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s);
127
0
        if (k != kh_end(hrecs->ref_hash)
128
0
            && kh_val(hrecs->ref_hash, k) == expected
129
0
            && strcmp(sn, str.s) != 0)
130
0
            kh_del(m_s2i, hrecs->ref_hash, k);
131
0
    }
132
133
0
    free(str.s);
134
0
}
135
136
/* Updates the hash tables in the sam_hrecs_t structure.
137
 *
138
 * Returns 0 on success;
139
 *        -1 on failure
140
 */
141
static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
142
                                   khint32_t type,
143
1.20M
                                   sam_hrec_type_t *h_type) {
144
    /* Add to reference hash? */
145
1.20M
    if (type == TYPEKEY("SQ")) {
146
9.98k
        sam_hrec_tag_t *tag = h_type->tag;
147
9.98k
        int nref = hrecs->nref;
148
9.98k
        const char *name = NULL;
149
9.98k
        const char *altnames = NULL;
150
9.98k
        hts_pos_t len = -1;
151
9.98k
        int r, invLN = 0;
152
9.98k
        khint_t k;
153
154
504k
        while (tag) {
155
494k
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
156
160k
                assert(tag->len >= 3);
157
160k
                name = tag->str+3;
158
334k
            } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
159
164k
                assert(tag->len >= 3);
160
164k
                hts_pos_t tmp = len;
161
164k
                len = strtoll(tag->str+3, NULL, 10);
162
164k
                if (tmp != -1 && tmp != len) {  //duplicate and different LN
163
129
                    invLN = 1;
164
129
                }
165
169k
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
166
151k
                assert(tag->len >= 3);
167
151k
                altnames = tag->str+3;
168
151k
            }
169
494k
            tag = tag->next;
170
494k
        }
171
172
9.98k
        if (!name) {
173
6
            hts_log_error("Header includes @SQ line with no SN: tag");
174
6
            return -1; // SN should be present, according to spec.
175
6
        }
176
177
9.97k
        if (len == -1) {
178
9
            hts_log_error("Header includes @SQ line \"%s\" with no LN: tag",
179
9
                          name);
180
9
            return -1; // LN should be present, according to spec.
181
9
        }
182
183
9.96k
        if (invLN) {
184
129
            hts_log_error("Header includes @SQ line \"%s\" with multiple LN:"
185
129
            " tag with different values.", name);
186
129
            return -1; // LN should not be duplicated or be same
187
129
        }
188
189
        // Seen already?
190
9.84k
        k = kh_get(m_s2i, hrecs->ref_hash, name);
191
9.84k
        if (k < kh_end(hrecs->ref_hash)) {
192
2.95k
            nref = kh_val(hrecs->ref_hash, k);
193
2.95k
            int ref_changed_flag = 0;
194
195
            // Check for hash entry added by sam_hrecs_refs_from_targets_array()
196
2.95k
            if (hrecs->ref[nref].ty == NULL) {
197
                // Attach header line to existing stub entry.
198
2.91k
                hrecs->ref[nref].ty = h_type;
199
                // Check lengths match; correct if not.
200
2.91k
                if (len != hrecs->ref[nref].len) {
201
559
                    char tmp[32];
202
559
                    snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
203
559
                             hrecs->ref[nref].len);
204
559
                    if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
205
0
                        return -1;
206
559
                    ref_changed_flag = 1;
207
559
                }
208
2.91k
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
209
0
                    return -1;
210
211
2.91k
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
212
552
                    hrecs->refs_changed = nref;
213
2.91k
                return 0;
214
2.91k
            }
215
216
            // Check to see if an existing entry is being updated
217
43
            if (hrecs->ref[nref].ty == h_type) {
218
0
                if (hrecs->ref[nref].len != len) {
219
0
                    hrecs->ref[nref].len = len;
220
0
                    ref_changed_flag = 1;
221
0
                }
222
0
                if (!hrecs->ref[nref].name || strcmp(hrecs->ref[nref].name, name)) {
223
0
                    hrecs->ref[nref].name = name;
224
0
                    ref_changed_flag = 1;
225
0
                }
226
0
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
227
0
                    return -1;
228
229
0
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
230
0
                    hrecs->refs_changed = nref;
231
0
                return 0;
232
0
            }
233
234
            // If here, the name is a duplicate.
235
            // Check to see if it matches the SN: tag from the earlier record.
236
43
            if (strcmp(hrecs->ref[nref].name, name) == 0) {
237
43
                hts_log_error("Duplicate entry \"%s\" in sam header",
238
43
                                name);
239
43
                return -1;
240
43
            }
241
242
            // Clash with an already-seen altname
243
            // As SN: should be preferred to AN: add this as a new
244
            // record and update the hash entry to point to it.
245
0
            hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name);
246
0
            nref = hrecs->nref;
247
0
        }
248
249
6.88k
        if (nref == hrecs->ref_sz) {
250
0
            size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32;
251
0
            sam_hrec_sq_t *new_ref = realloc(hrecs->ref, sizeof(*hrecs->ref) * new_sz);
252
0
            if (!new_ref)
253
0
                return -1;
254
0
            hrecs->ref = new_ref;
255
0
            hrecs->ref_sz = new_sz;
256
0
        }
257
258
6.88k
        hrecs->ref[nref].name = name;
259
6.88k
        hrecs->ref[nref].len  = len;
260
6.88k
        hrecs->ref[nref].ty = h_type;
261
262
6.88k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r);
263
6.88k
        if (-1 == r) return -1;
264
6.88k
        kh_val(hrecs->ref_hash, k) = nref;
265
266
6.88k
        if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
267
0
            return -1;
268
269
6.88k
        if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref)
270
4.95k
            hrecs->refs_changed = hrecs->nref;
271
6.88k
        hrecs->nref++;
272
6.88k
    }
273
274
    /* Add to read-group hash? */
275
1.20M
    if (type == TYPEKEY("RG")) {
276
1.18M
        sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL);
277
1.18M
        int nrg = hrecs->nrg, r;
278
1.18M
        khint_t k;
279
280
1.18M
        if (!tag) {
281
27
            hts_log_error("Header includes @RG line with no ID: tag");
282
27
            return -1;  // ID should be present, according to spec.
283
27
        }
284
1.18M
        assert(tag->str && tag->len >= 3);
285
286
        // Seen already?
287
1.18M
        k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3);
288
1.18M
        if (k < kh_end(hrecs->rg_hash)) {
289
1.17M
            nrg = kh_val(hrecs->rg_hash, k);
290
1.17M
            assert(hrecs->rg[nrg].ty != NULL);
291
1.17M
            if (hrecs->rg[nrg].ty != h_type) {
292
1.17M
                hts_log_warning("Duplicate entry \"%s\" in sam header",
293
1.17M
                                tag->str + 3);
294
1.17M
            } else {
295
0
                hrecs->rg[nrg].name = tag->str + 3;
296
0
                hrecs->rg[nrg].name_len = tag->len - 3;
297
0
            }
298
1.17M
            return 0;
299
1.17M
        }
300
301
8.01k
        if (nrg == hrecs->rg_sz) {
302
3.04k
            size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4;
303
3.04k
            sam_hrec_rg_t *new_rg = realloc(hrecs->rg, sizeof(*hrecs->rg) * new_sz);
304
3.04k
            if (!new_rg)
305
0
                return -1;
306
3.04k
            hrecs->rg = new_rg;
307
3.04k
            hrecs->rg_sz = new_sz;
308
3.04k
        }
309
310
8.01k
        hrecs->rg[nrg].name = tag->str + 3;
311
8.01k
        hrecs->rg[nrg].name_len = tag->len - 3;
312
8.01k
        hrecs->rg[nrg].ty   = h_type;
313
8.01k
        hrecs->rg[nrg].id   = nrg;
314
315
8.01k
        k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r);
316
8.01k
        if (-1 == r) return -1;
317
8.01k
        kh_val(hrecs->rg_hash, k) = nrg;
318
319
8.01k
        hrecs->nrg++;
320
8.01k
    }
321
322
    /* Add to program hash? */
323
31.0k
    if (type == TYPEKEY("PG")) {
324
9.44k
        sam_hrec_tag_t *tag;
325
9.44k
        sam_hrec_pg_t *new_pg;
326
9.44k
        int npg = hrecs->npg;
327
328
9.44k
        if (npg == hrecs->pg_sz) {
329
2.79k
            size_t new_sz = hrecs->pg_sz >= 4 ? hrecs->pg_sz + hrecs->pg_sz / 4 : 4;
330
2.79k
            new_pg = realloc(hrecs->pg, sizeof(*hrecs->pg) * new_sz);
331
2.79k
            if (!new_pg)
332
0
                return -1;
333
2.79k
            hrecs->pg = new_pg;
334
2.79k
            hrecs->pg_sz = new_sz;
335
2.79k
        }
336
337
9.44k
        tag = h_type->tag;
338
9.44k
        hrecs->pg[npg].name = NULL;
339
9.44k
        hrecs->pg[npg].name_len = 0;
340
9.44k
        hrecs->pg[npg].ty  = h_type;
341
9.44k
        hrecs->pg[npg].id   = npg;
342
9.44k
        hrecs->pg[npg].prev_id = -1;
343
344
47.4k
        while (tag) {
345
38.0k
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
346
                /* Avoid duplicate ID tags coming from other applications */
347
14.8k
                if (!hrecs->pg[npg].name) {
348
9.43k
                    assert(tag->len >= 3);
349
9.43k
                    hrecs->pg[npg].name = tag->str + 3;
350
9.43k
                    hrecs->pg[npg].name_len = tag->len - 3;
351
9.43k
                } else {
352
5.42k
                    hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name);
353
5.42k
                }
354
23.1k
            } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
355
                // Resolve later if needed
356
11.0k
                khint_t k;
357
11.0k
                k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
358
359
11.0k
                if (k != kh_end(hrecs->pg_hash)) {
360
1.62k
                    int p_id = kh_val(hrecs->pg_hash, k);
361
1.62k
                    hrecs->pg[npg].prev_id = hrecs->pg[p_id].id;
362
363
                    /* Unmark previous entry as a PG termination */
364
1.62k
                    if (hrecs->npg_end > 0 &&
365
1.24k
                        hrecs->pg_end[hrecs->npg_end-1] == p_id) {
366
231
                        hrecs->npg_end--;
367
1.39k
                    } else {
368
1.39k
                        int i;
369
5.74k
                        for (i = 0; i < hrecs->npg_end; i++) {
370
4.35k
                            if (hrecs->pg_end[i] == p_id) {
371
64
                                memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1],
372
64
                                        (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end));
373
64
                                hrecs->npg_end--;
374
64
                            }
375
4.35k
                        }
376
1.39k
                    }
377
9.46k
                } else {
378
9.46k
                    hrecs->pg[npg].prev_id = -1;
379
9.46k
                }
380
11.0k
            }
381
38.0k
            tag = tag->next;
382
38.0k
        }
383
384
9.44k
        if (hrecs->pg[npg].name) {
385
9.43k
            khint_t k;
386
9.43k
            int r;
387
9.43k
            k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r);
388
9.43k
            if (-1 == r) return -1;
389
9.43k
            kh_val(hrecs->pg_hash, k) = npg;
390
9.43k
        } else {
391
12
            return -1; // ID should be present, according to spec.
392
12
        }
393
394
        /* Add to npg_end[] array. Remove later if we find a PP line */
395
9.43k
        if (hrecs->npg_end >= hrecs->npg_end_alloc) {
396
1.53k
            int *new_pg_end;
397
1.53k
            int  new_alloc = hrecs->npg_end_alloc ? hrecs->npg_end_alloc*2 : 4;
398
399
1.53k
            new_pg_end = realloc(hrecs->pg_end, new_alloc * sizeof(int));
400
1.53k
            if (!new_pg_end)
401
0
                return -1;
402
1.53k
            hrecs->npg_end_alloc = new_alloc;
403
1.53k
            hrecs->pg_end = new_pg_end;
404
1.53k
        }
405
9.43k
        hrecs->pg_end[hrecs->npg_end++] = npg;
406
407
9.43k
        hrecs->npg++;
408
9.43k
    }
409
410
30.9k
    return 0;
411
31.0k
}
412
413
0
static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, khint32_t type, sam_hrec_type_t *h_type) {
414
0
    if (!hrecs || !h_type)
415
0
        return -1;
416
417
0
    sam_hrec_tag_t *tag;
418
0
    const char *key = NULL;
419
0
    khint_t k;
420
421
    /* Remove name and any alternative names from reference hash */
422
0
    if (type == TYPEKEY("SQ")) {
423
0
        const char *altnames = NULL;
424
425
0
        tag = h_type->tag;
426
427
0
        while (tag) {
428
0
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
429
0
                assert(tag->len >= 3);
430
0
                key = tag->str + 3;
431
0
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
432
0
                assert(tag->len >= 3);
433
0
                altnames = tag->str + 3;
434
0
            }
435
0
            tag = tag->next;
436
0
        }
437
438
0
        if (key) {
439
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
440
0
            if (k != kh_end(hrecs->ref_hash)) {
441
0
                int idx = kh_val(hrecs->ref_hash, k);
442
0
                if (idx + 1 < hrecs->nref)
443
0
                    memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
444
0
                            sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
445
0
                if (altnames)
446
0
                    sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
447
0
                kh_del(m_s2i, hrecs->ref_hash, k);
448
0
                hrecs->nref--;
449
0
                if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
450
0
                    hrecs->refs_changed = idx;
451
0
                for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
452
0
                    if (kh_exist(hrecs->ref_hash, k)
453
0
                        && kh_value(hrecs->ref_hash, k) > idx) {
454
0
                        kh_value(hrecs->ref_hash, k)--;
455
0
                    }
456
0
                }
457
0
            }
458
0
        }
459
0
    }
460
461
    /* Remove from read-group hash */
462
0
    if (type == TYPEKEY("RG")) {
463
0
        tag = h_type->tag;
464
465
0
        while (tag) {
466
0
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
467
0
                assert(tag->len >= 3);
468
0
                key = tag->str + 3;
469
0
                k = kh_get(m_s2i, hrecs->rg_hash, key);
470
0
                if (k != kh_end(hrecs->rg_hash)) {
471
0
                    int idx = kh_val(hrecs->rg_hash, k);
472
0
                    if (idx + 1 < hrecs->nrg)
473
0
                        memmove(&hrecs->rg[idx], &hrecs->rg[idx+1], sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
474
0
                    kh_del(m_s2i, hrecs->rg_hash, k);
475
0
                    hrecs->nrg--;
476
0
                    for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
477
0
                        if (kh_exist(hrecs->rg_hash, k)
478
0
                            && kh_value(hrecs->rg_hash, k) > idx) {
479
0
                            kh_value(hrecs->rg_hash, k)--;
480
0
                        }
481
0
                    }
482
0
                }
483
0
                break;
484
0
            }
485
0
            tag = tag->next;
486
0
        }
487
0
    }
488
489
0
    return 0;
490
0
}
491
492
/** Add a header record to the global line ordering
493
 *
494
 * If @p after is not NULL, the new record will be inserted after this one,
495
 * otherwise it will go at the end.
496
 *
497
 * An exception is an HD record, which will always be put first unless
498
 * one is already present.
499
 */
500
static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
501
                                      sam_hrec_type_t *h_type,
502
1.93M
                                      sam_hrec_type_t *after) {
503
1.93M
    const khint32_t hd_type = TYPEKEY("HD");
504
1.93M
    int update_first_line = 0;
505
506
    // First line seen
507
1.93M
    if (!hrecs->first_line) {
508
7.76k
        hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
509
7.76k
        return;
510
7.76k
    }
511
512
    // @HD goes at the top (unless there's one already)
513
1.92M
    if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
514
12
        after = hrecs->first_line->global_prev;
515
12
        update_first_line = 1;
516
12
    }
517
518
    // If no instructions given, put it at the end
519
1.92M
    if (!after)
520
1.92M
        after = hrecs->first_line->global_prev;
521
522
1.92M
    h_type->global_prev = after;
523
1.92M
    h_type->global_next = after->global_next;
524
1.92M
    h_type->global_prev->global_next = h_type;
525
1.92M
    h_type->global_next->global_prev = h_type;
526
527
1.92M
    if (update_first_line)
528
12
        hrecs->first_line = h_type;
529
1.92M
}
530
531
/*! Add header record with a va_list interface.
532
 *
533
 * Adds a single record to a SAM header.
534
 *
535
 * This takes a header record type, a va_list argument and one or more
536
 * key,value pairs, ending with the NULL key.
537
 *
538
 * Eg. sam_hrecs_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
539
 *
540
 * The purpose of the additional va_list parameter is to permit other
541
 * varargs functions to call this while including their own additional
542
 * parameters; an example is in sam_hdr_add_pg().
543
 *
544
 * Note: this function invokes va_arg at least once, making the value
545
 * of ap indeterminate after the return. The caller should call
546
 * va_start/va_end before/after calling this function or use va_copy.
547
 *
548
 * @return
549
 * Returns >= 0 on success;
550
 *        -1 on failure
551
 */
552
48
static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
553
48
    va_list args;
554
48
    sam_hrec_type_t *h_type;
555
48
    sam_hrec_tag_t *h_tag, *last=NULL;
556
48
    int new;
557
48
    khint32_t type_i = TYPEKEY(type), k;
558
559
48
    if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
560
0
        return sam_hrecs_vupdate(hrecs, h_type, ap);
561
562
48
    if (!(h_type = pool_alloc(hrecs->type_pool)))
563
0
        return -1;
564
48
    k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
565
48
    if (new < 0)
566
0
        return -1;
567
568
48
    h_type->type = type_i;
569
570
    // Form the ring, either with self or other lines of this type
571
48
    if (!new) {
572
0
        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
573
0
        p = t->prev;
574
575
0
        assert(p->next == t);
576
0
        p->next = h_type;
577
0
        h_type->prev = p;
578
579
0
        t->prev = h_type;
580
0
        h_type->next = t;
581
48
    } else {
582
48
        kh_val(hrecs->h, k) = h_type;
583
48
        h_type->prev = h_type->next = h_type;
584
48
    }
585
48
    h_type->tag = NULL;
586
587
    // Add to global line ordering after any existing line of the same type,
588
    // or at the end if no line of this type exists yet.
589
48
    sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
590
591
    // Check linked-list invariants
592
48
    assert(h_type->prev->next == h_type);
593
48
    assert(h_type->next->prev == h_type);
594
48
    assert(h_type->global_prev->global_next == h_type);
595
48
    assert(h_type->global_next->global_prev == h_type);
596
597
    // Any ... varargs
598
48
    va_start(args, ap);
599
48
    for (;;) {
600
48
        char *key, *val = NULL, *str;
601
602
48
        if (!(key = (char *)va_arg(args, char *)))
603
48
            break;
604
0
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
605
0
            break;
606
0
        if (*val == '\0')
607
0
            continue;
608
609
0
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
610
0
            return -1;
611
612
0
        if (strncmp(type, "CO", 2)) {
613
0
            h_tag->len = 3 + strlen(val);
614
0
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
615
0
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
616
0
                return -1;
617
0
            h_tag->str = str;
618
0
        } else {
619
0
            h_tag->len = strlen(key);
620
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
621
0
            if (!h_tag->str)
622
0
                return -1;
623
0
        }
624
625
0
        h_tag->next = NULL;
626
0
        if (last)
627
0
            last->next = h_tag;
628
0
        else
629
0
            h_type->tag = h_tag;
630
631
0
        last = h_tag;
632
0
    }
633
48
    va_end(args);
634
635
    // Plus the specified va_list params
636
144
    for (;;) {
637
144
        char *key, *val = NULL, *str;
638
639
144
        if (!(key = (char *)va_arg(ap, char *)))
640
48
            break;
641
96
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
642
0
            break;
643
644
96
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
645
0
            return -1;
646
647
96
        if (strncmp(type, "CO", 2)) {
648
96
            h_tag->len = 3 + strlen(val);
649
96
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
650
96
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
651
0
                return -1;
652
96
            h_tag->str = str;
653
96
        } else {
654
0
            h_tag->len = strlen(key);
655
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
656
0
            if (!h_tag->str)
657
0
                return -1;
658
0
        }
659
660
96
        h_tag->next = NULL;
661
96
        if (last)
662
48
            last->next = h_tag;
663
48
        else
664
48
            h_type->tag = h_tag;
665
666
96
        last = h_tag;
667
96
    }
668
669
48
    if (-1 == sam_hrecs_update_hashes(hrecs, TYPEKEY(type), h_type))
670
0
        return -1;
671
672
48
    if (!strncmp(type, "PG", 2))
673
0
        hrecs->pgs_changed = 1;
674
675
48
    hrecs->dirty = 1;
676
677
48
    return 0;
678
48
}
679
680
// As sam_hrecs_vadd(), but without the extra va_list parameter
681
48
static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...) {
682
48
    va_list args;
683
48
    int res;
684
48
    va_start(args, type);
685
48
    res = sam_hrecs_vadd(hrecs, type, args, NULL);
686
48
    va_end(args);
687
48
    return res;
688
48
}
689
690
/*
691
 * Function for deallocating a list of tags
692
 */
693
694
2.14M
static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
695
2.14M
    if (!hrecs || !tag)
696
15
        return;
697
2.14M
    if (tag->next)
698
2.14M
        sam_hrecs_free_tags(hrecs, tag->next);
699
700
2.14M
    pool_free(hrecs->tag_pool, tag);
701
2.14M
}
702
703
0
static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found, int remove_hash) {
704
0
    if (!hrecs || !type_name || !type_found)
705
0
        return -1;
706
707
0
    khint32_t itype = TYPEKEY(type_name);
708
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
709
0
    if (k == kh_end(hrecs->h))
710
0
        return -1;
711
712
    // Remove from global list (remembering it could be the only line)
713
0
    if (hrecs->first_line == type_found) {
714
0
        hrecs->first_line = (type_found->global_next != type_found
715
0
                             ? type_found->global_next : NULL);
716
0
    }
717
0
    type_found->global_next->global_prev = type_found->global_prev;
718
0
    type_found->global_prev->global_next = type_found->global_next;
719
720
    /* single element in the list */
721
0
    if (type_found->prev == type_found || type_found->next == type_found) {
722
0
        kh_del(sam_hrecs_t, hrecs->h, k);
723
0
    } else {
724
0
        type_found->prev->next = type_found->next;
725
0
        type_found->next->prev = type_found->prev;
726
0
        if (kh_val(hrecs->h, k) == type_found) { //first element
727
0
            kh_val(hrecs->h, k) = type_found->next;
728
0
        }
729
0
    }
730
731
0
    if (remove_hash && (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2)))
732
0
        sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
733
734
0
    sam_hrecs_free_tags(hrecs, type_found->tag);
735
0
    pool_free(hrecs->type_pool, type_found);
736
737
0
    hrecs->dirty = 1;
738
739
0
    return 0;
740
0
}
741
742
// Paste together a line from the parsed data structures
743
1.79M
static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
744
1.79M
    sam_hrec_tag_t *tag;
745
1.79M
    int r = 0;
746
1.79M
    char c[2]= { ty->type >> 8, ty->type & 0xff };
747
748
1.79M
    r |= (kputc_('@', ks) == EOF);
749
1.79M
    r |= (kputsn(c, 2, ks) == EOF);
750
4.92M
    for (tag = ty->tag; tag; tag = tag->next) {
751
3.12M
        r |= (kputc_('\t', ks) == EOF);
752
3.12M
        r |= (kputsn(tag->str, tag->len, ks) == EOF);
753
3.12M
    }
754
755
1.79M
    return r;
756
1.79M
}
757
758
11.0k
static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
759
11.0k
    const sam_hrec_type_t *t1, *t2;
760
761
11.0k
    if (!hrecs->first_line)
762
0
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
763
764
11.0k
    t1 = t2 = hrecs->first_line;
765
1.79M
    do {
766
1.79M
        if (build_header_line(t1, ks) != 0)
767
0
            return -1;
768
1.79M
        if (kputc('\n', ks) < 0)
769
0
            return -1;
770
771
1.79M
        t1 = t1->global_next;
772
1.79M
    } while (t1 != t2);
773
774
11.0k
    return 0;
775
11.0k
}
776
777
// Print warnings about known log messages that may have been inadvertently
778
// merged into SAM files.
779
780
3
static void known_stderr(const char *tool, const char *advice) {
781
3
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
782
3
    hts_log_warning("%s", advice);
783
3
}
784
785
// Look for known log messages that may have been inadvertently merged into SAM
786
// files, and print advice on how to avoid the problem if one is spotted.
787
788
228
static void warn_if_known_stderr(const char *line, size_t len) {
789
228
    int ilen = len < INT_MAX ? len : INT_MAX;
790
228
    if (kmemmem(line, ilen, "M::bwa_idx_load_from_disk", 25, NULL) != NULL)
791
0
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`");
792
228
    else if (kmemmem(line, ilen, "M::mem_pestat", 13, NULL) != NULL)
793
3
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`");
794
225
    else if (kmemmem(line, ilen, "loaded/built the index", 22, NULL) != NULL)
795
0
        known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`");
796
228
}
797
798
// Parse a CO line
799
// Returns 0 on failure; line length (>= 1) on success
800
static size_t parse_comment_line(const char *hdr, size_t len, size_t lno,
801
10.1k
                                 sam_hrecs_t *hrecs, sam_hrec_type_t *h_type) {
802
10.1k
    size_t j;
803
10.1k
    sam_hrec_tag_t *h_tag;
804
805
10.1k
    if (len == 0 || hdr[0] != '\t') {
806
0
        sam_hrecs_error("Missing tab", hdr, len, lno);
807
0
        warn_if_known_stderr(hdr, len);
808
0
        return 0;
809
0
    }
810
811
535k
    for (j = 1; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
812
524k
        ;
813
814
10.1k
    if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
815
0
        return -1;
816
10.1k
    h_tag->str = string_ndup(hrecs->str_pool, hdr + 1, j - 1);
817
10.1k
    h_tag->len = j - 1;
818
10.1k
    h_tag->next = NULL;
819
10.1k
    if (!h_tag->str)
820
0
        return 0;
821
10.1k
    return j;
822
10.1k
}
823
824
// Parse header lines other than CO
825
// Returns 0 on failure; line length (>= 1) on success
826
static size_t parse_noncomment_line(const char *hdr, size_t len, size_t lno,
827
                                    sam_hrecs_t *hrecs, sam_hrec_type_t *h_type,
828
1.92M
                                    khint_t *sq_count, khint_t *an_count) {
829
1.92M
    size_t i = 0;
830
1.92M
    sam_hrec_tag_t *h_tag, *last = NULL;
831
1.92M
    int is_sq = h_type->type == TYPEKEY("SQ");
832
833
    // Count reference name
834
1.92M
    if (is_sq)
835
63.1k
        (*sq_count)++;
836
837
5.31M
    do {
838
5.31M
        size_t j;
839
840
5.31M
        if (i == len || hdr[i] != '\t') {
841
0
            sam_hrecs_error("Missing tab", hdr, len, lno);
842
0
            warn_if_known_stderr(hdr, len);
843
0
            return 0;
844
0
        }
845
846
5.31M
        for (j = ++i;
847
896M
             j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t';
848
891M
             j++)
849
891M
            ;
850
851
5.31M
        if (j - i < 3 || hdr[i + 2] != ':') {
852
219
            sam_hrecs_error("Malformed key:value pair", hdr, len, lno);
853
219
            warn_if_known_stderr(hdr, len);
854
219
            return 0;
855
219
        }
856
857
        // Count alternate names
858
5.31M
        if (is_sq && hdr[i] == 'A' && hdr[i + 1] == 'N') {
859
866k
            size_t k;
860
866k
            (*an_count)++;
861
5.37M
            for (k = i + 2; k < j; k++)
862
4.50M
                (*an_count) += hdr[k] == ',';
863
866k
        }
864
865
5.31M
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
866
0
            return 0;
867
5.31M
        h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
868
5.31M
        h_tag->len = j-i;
869
5.31M
        h_tag->next = NULL;
870
5.31M
        if (!h_tag->str)
871
0
            return 0;
872
873
5.31M
        if (last)
874
3.38M
            last->next = h_tag;
875
1.92M
        else
876
1.92M
            h_type->tag = h_tag;
877
878
5.31M
        last = h_tag;
879
5.31M
        i = j;
880
5.31M
    } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
881
882
1.92M
    return i;
883
1.92M
}
884
885
/** Parse a single header line.
886
 * @param         hrecs        Destination sam_hrecs_t struct
887
 * @param         hdr          Data to parse
888
 * @param         len          Size of @p hdr
889
 * @param         lno          Line number, for error / warning messages
890
 * @param[out]    line_len_out Returned length of parsed line
891
 * @param[in,out] sq_count     Count of SQ lines seen so far
892
 * @param[in,out] an_count     Count of alternate names seen so far
893
 *
894
 * Used by sam_hrecs_parse_lines() and sam_hdr_build_from_sam_file(),
895
 * so has to be able to deal with lines that are not NUL terminated.
896
 * A single line pointed to by @ph hdr will be parsed into @p hrecs.
897
 * @p len gives the size of @p hdr, which may include more than one line
898
 * if called from sam_hrecs_parse_lines(). The length of the parsed line
899
 * is returned in @p line_len_out.
900
 * @p sq_count and @p an_count are pointers to the number of SQ lines
901
 * (i.e. references), and the number of alternate names.  These are used
902
 * by the caller to set the sizes of the sam_hrecs_t::ref array and
903
 * sam_hrecs_t::ref_hash table once all the lines have been parsed.
904
 */
905
906
static sam_hrec_type_t * sam_hrecs_parse_single_line(sam_hrecs_t *hrecs,
907
                                                     const char *hdr,
908
                                                     size_t len, size_t lno,
909
                                                     size_t *line_len_out,
910
                                                     khint_t *sq_count,
911
1.93M
                                                     khint_t *an_count) {
912
1.93M
    int new;
913
1.93M
    khint32_t type;
914
1.93M
    khint_t k;
915
1.93M
    sam_hrec_type_t *h_type;
916
1.93M
    size_t line_len = 0;
917
918
1.93M
    if (hdr[0] != '@') {
919
1
        sam_hrecs_error("Header line does not start with '@'",
920
1
                        hdr, len, lno);
921
1
        return NULL;
922
1
    }
923
924
1.93M
    if (!isalpha_c(hdr[1]) || !isalpha_c(hdr[2])) {
925
0
        sam_hrecs_error("Header line does not have a two character key",
926
0
                        hdr, len, lno);
927
0
        return NULL;
928
0
    }
929
1.93M
    type = TYPEKEY(&hdr[1]);
930
931
1.93M
    if (len < 3 || hdr[3] == '\n')
932
0
        return NULL;
933
934
    // Add the header line type
935
1.93M
    if (!(h_type = pool_alloc(hrecs->type_pool)))
936
0
        return NULL;
937
938
1.93M
    h_type->tag = NULL;
939
940
1.93M
    k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
941
1.93M
    if (new < 0)
942
0
        goto error;
943
944
1.93M
    h_type->type = type;
945
946
    // Parse the tags on this line
947
1.93M
    if (type == TYPEKEY("CO")) {
948
949
10.1k
        line_len = parse_comment_line(hdr + 3, len - 3, lno,
950
10.1k
                                      hrecs, h_type);
951
10.1k
        if (line_len == 0) // Failed
952
0
            goto error;
953
954
1.92M
    } else {
955
956
1.92M
        line_len = parse_noncomment_line(hdr + 3, len - 3, lno,
957
1.92M
                                         hrecs, h_type,
958
1.92M
                                         sq_count, an_count);
959
1.92M
        if (line_len == 0) // Failed
960
219
            goto error;
961
1.92M
    }
962
963
    // Add to end of global list
964
1.93M
    sam_hrecs_global_list_add(hrecs, h_type, NULL);
965
966
    // Form the ring, either with self or other lines of this type
967
1.93M
    if (!new) {
968
1.92M
        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
969
1.92M
        p = t->prev;
970
971
1.92M
        assert(p->next == t);
972
1.92M
        p->next = h_type;
973
1.92M
        h_type->prev = p;
974
975
1.92M
        t->prev = h_type;
976
1.92M
        h_type->next = t;
977
1.92M
    } else {
978
10.6k
        kh_val(hrecs->h, k) = h_type;
979
10.6k
        h_type->prev = h_type->next = h_type;
980
10.6k
    }
981
982
1.93M
    if (line_len_out)
983
302k
        *line_len_out = line_len + 3;
984
1.93M
    return h_type;
985
986
219
 error:
987
    // Tidy up as best we can
988
219
    sam_hrecs_free_tags(hrecs, h_type->tag);
989
219
    pool_free(hrecs->type_pool, h_type);
990
219
    return NULL;
991
1.93M
}
992
993
// Parse complete header into a sam_hrecs_t structure
994
995
2.81k
static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
996
2.81k
    size_t i, lno;
997
998
2.81k
    if (!hrecs || len > SSIZE_MAX)
999
0
        return -1;
1000
1001
2.81k
    if (!len)
1002
0
        len = strlen(hdr);
1003
1004
2.81k
    if (len < 3) {
1005
52
        if (len == 0 || *hdr == '\0') return 0;
1006
0
        sam_hrecs_error("Header line too short", hdr, len, 1);
1007
0
        return -1;
1008
52
    }
1009
1010
2.76k
    sam_hrec_type_t *first_h_type = NULL;
1011
2.76k
    sam_hrec_type_t *last_h_type = NULL;
1012
2.76k
    khint_t sq_count = 0, an_count = 0;
1013
2.76k
    size_t line_len = 0;
1014
1015
305k
    for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; lno++) {
1016
302k
        sam_hrec_type_t *h_type;
1017
302k
        h_type = sam_hrecs_parse_single_line(hrecs, &hdr[i], len - i,
1018
302k
                                             lno, &line_len,
1019
302k
                                             &sq_count, &an_count);
1020
302k
        if (!h_type)
1021
1
            return -1;
1022
1023
302k
        i += line_len + 1;
1024
1025
        // Keep track of first / last new record so they can be passed to
1026
        // sam_hrecs_update_hashes() below.  Ignore HD as it may be stored
1027
        // out of order (it always goes at the start) and
1028
        // sam_hrecs_update_hashes() does nothing with HD.
1029
302k
        if (h_type->type != TYPEKEY("HD")) {
1030
301k
            if (!first_h_type)
1031
2.74k
                first_h_type = h_type;
1032
301k
            last_h_type = h_type;
1033
301k
        }
1034
302k
    }
1035
1036
    // Reserve space in hash tables and refs array
1037
2.76k
    if (sq_count) {
1038
        // It's possible that some entries are already in the hash table,
1039
        // e.g. via sam_hrecs_refs_from_targets_array(), so only reserve
1040
        // space beyond what's already there.
1041
2.59k
        khint_t ref_name_count = sq_count + an_count;
1042
2.59k
        khint_t sz = kh_size(hrecs->ref_hash);
1043
2.59k
        if (ref_name_count > sz
1044
620
            && kh_grow_to_fit(m_s2i, hrecs->ref_hash, ref_name_count - sz) < 0)
1045
0
            return -1;
1046
1047
2.59k
        if (hrecs->ref_sz < sq_count) {
1048
571
            sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1049
571
                                             sizeof(*hrecs->ref) * sq_count);
1050
571
            if (!new_ref)
1051
0
                return -1;
1052
571
            hrecs->ref = new_ref;
1053
571
            hrecs->ref_sz = sq_count;
1054
571
        }
1055
2.59k
    }
1056
1057
    // Update RG/SQ/PG hashes
1058
2.76k
    if (first_h_type) {
1059
2.74k
        sam_hrec_type_t *h_type = first_h_type->global_prev;
1060
301k
        do {
1061
301k
            h_type = h_type->global_next;
1062
301k
            if (-1 == sam_hrecs_update_hashes(hrecs, h_type->type, h_type))
1063
121
                return -1;
1064
301k
        } while (h_type != last_h_type);
1065
2.74k
    }
1066
2.64k
    return 0;
1067
2.76k
}
1068
1069
/*! Update sam_hdr_t target_name and target_len arrays
1070
 *
1071
 *  @return 0 on success; -1 on failure
1072
 */
1073
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
1074
18.0k
                                 int refs_changed) {
1075
18.0k
    if (!bh || !hrecs)
1076
0
        return -1;
1077
1078
18.0k
    if (refs_changed < 0)
1079
0
        return 0;
1080
1081
    // Grow arrays if necessary
1082
18.0k
    if (bh->n_targets < hrecs->nref) {
1083
11.5k
        char **new_names = realloc(bh->target_name,
1084
11.5k
                                   hrecs->nref * sizeof(*new_names));
1085
11.5k
        if (!new_names)
1086
0
            return -1;
1087
11.5k
        bh->target_name = new_names;
1088
11.5k
        uint32_t *new_lens = realloc(bh->target_len,
1089
11.5k
                                     hrecs->nref * sizeof(*new_lens));
1090
11.5k
        if (!new_lens)
1091
0
            return -1;
1092
11.5k
        bh->target_len = new_lens;
1093
11.5k
    }
1094
1095
    // Update names and lengths where changed
1096
    // hrecs->refs_changed is the first ref that has been updated, so ones
1097
    // before that can be skipped.
1098
18.0k
    int i;
1099
18.0k
    khint_t k;
1100
18.0k
    khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1101
34.5k
    for (i = refs_changed; i < hrecs->nref; i++) {
1102
16.5k
        if (i >= bh->n_targets
1103
15.9k
            || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
1104
15.9k
            if (i < bh->n_targets)
1105
0
                free(bh->target_name[i]);
1106
15.9k
            bh->target_name[i] = strdup(hrecs->ref[i].name);
1107
15.9k
            if (!bh->target_name[i])
1108
0
                return -1;
1109
15.9k
        }
1110
16.5k
        if (hrecs->ref[i].len < UINT32_MAX) {
1111
12.5k
            bh->target_len[i] = hrecs->ref[i].len;
1112
1113
12.5k
            if (!long_refs)
1114
12.0k
                continue;
1115
1116
            // Check if we have an old length, if so remove it.
1117
510
            k = kh_get(s2i, long_refs, bh->target_name[i]);
1118
510
            if (k < kh_end(long_refs))
1119
0
                kh_del(s2i, long_refs, k);
1120
3.93k
        } else {
1121
3.93k
            bh->target_len[i] = UINT32_MAX;
1122
3.93k
            if (bh->hrecs != hrecs) {
1123
                // Called from sam_hdr_dup; need to add sdict entries
1124
2.21k
                if (!long_refs) {
1125
890
                    if (!(bh->sdict = long_refs = kh_init(s2i)))
1126
0
                        return -1;
1127
890
                }
1128
1129
                // Add / update length
1130
2.21k
                int absent;
1131
2.21k
                k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
1132
2.21k
                if (absent < 0)
1133
0
                    return -1;
1134
2.21k
                kh_val(long_refs, k) = hrecs->ref[i].len;
1135
2.21k
            }
1136
3.93k
        }
1137
16.5k
    }
1138
1139
    // Free up any names that have been removed
1140
18.0k
    for (; i < bh->n_targets; i++) {
1141
0
        if (long_refs) {
1142
0
            k = kh_get(s2i, long_refs, bh->target_name[i]);
1143
0
            if (k < kh_end(long_refs))
1144
0
                kh_del(s2i, long_refs, k);
1145
0
        }
1146
0
        free(bh->target_name[i]);
1147
0
    }
1148
1149
18.0k
    bh->n_targets = hrecs->nref;
1150
18.0k
    return 0;
1151
18.0k
}
1152
1153
1.12k
static int rebuild_target_arrays(sam_hdr_t *bh) {
1154
1.12k
    if (!bh || !bh->hrecs)
1155
0
        return -1;
1156
1157
1.12k
    sam_hrecs_t *hrecs = bh->hrecs;
1158
1.12k
    if (hrecs->refs_changed < 0)
1159
0
        return 0;
1160
1161
1.12k
    if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
1162
0
        return -1;
1163
1164
1.12k
    hrecs->refs_changed = -1;
1165
1.12k
    return 0;
1166
1.12k
}
1167
1168
/// Populate hrecs refs array from header target_name, target_len arrays
1169
/**
1170
 * @return 0 on success; -1 on failure
1171
 *
1172
 * Pre-fills the refs hash from the target arrays.  For BAM files this
1173
 * will ensure that they are in the correct order as the target arrays
1174
 * are the canonical source for converting target ids to names and lengths.
1175
 *
1176
 * The added entries do not link to a header line. sam_hrecs_update_hashes()
1177
 * will add the links later for lines found in the text header.
1178
 *
1179
 * This should be called before the text header is parsed.
1180
 */
1181
static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
1182
2.06k
                                             const sam_hdr_t *bh) {
1183
2.06k
    int32_t tid = 0;
1184
1185
2.06k
    if (!hrecs || !bh)
1186
0
        return -1;
1187
1188
    // This should always be called before parsing the text header
1189
    // so the ref array should start off empty, and we don't have to try
1190
    // to reconcile any existing data.
1191
2.06k
    if (hrecs->nref > 0) {
1192
0
        hts_log_error("Called with non-empty ref array");
1193
0
        return -1;
1194
0
    }
1195
1196
2.06k
    if (hrecs->ref_sz < bh->n_targets) {
1197
2.06k
        sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1198
2.06k
                                         bh->n_targets * sizeof(*new_ref));
1199
2.06k
        if (!new_ref)
1200
0
            return -1;
1201
1202
2.06k
        hrecs->ref = new_ref;
1203
2.06k
        hrecs->ref_sz = bh->n_targets;
1204
2.06k
    }
1205
1206
2.06k
    if (kh_grow_to_fit(m_s2i, hrecs->ref_hash, bh->n_targets) != 0)
1207
0
        goto fail;
1208
1209
5.09k
    for (tid = 0; tid < bh->n_targets; tid++) {
1210
3.03k
        khint_t k;
1211
3.03k
        int r;
1212
3.03k
        hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
1213
3.03k
        if (!hrecs->ref[tid].name) goto fail;
1214
3.03k
        if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
1215
2.09k
            hrecs->ref[tid].len  = bh->target_len[tid];
1216
2.09k
        } else {
1217
941
            khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1218
941
            k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
1219
941
            if (k < kh_end(long_refs)) {
1220
757
                hrecs->ref[tid].len = kh_val(long_refs, k);
1221
757
            } else {
1222
184
                hrecs->ref[tid].len = UINT32_MAX;
1223
184
            }
1224
941
        }
1225
3.03k
        hrecs->ref[tid].ty   = NULL;
1226
3.03k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
1227
3.03k
        if (r < 0) goto fail;
1228
3.03k
        if (r == 0) {
1229
0
            hts_log_error("Duplicate entry \"%s\" in target list",
1230
0
                            hrecs->ref[tid].name);
1231
0
            return -1;
1232
3.03k
        } else {
1233
3.03k
            kh_val(hrecs->ref_hash, k) = tid;
1234
3.03k
        }
1235
3.03k
    }
1236
2.06k
    hrecs->nref = bh->n_targets;
1237
2.06k
    return 0;
1238
1239
0
 fail: {
1240
0
        int32_t i;
1241
0
        hts_log_error("%s", strerror(errno));
1242
0
        for (i = 0; i < tid; i++) {
1243
0
            khint_t k;
1244
0
            if (!hrecs->ref[i].name) continue;
1245
0
            k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name);
1246
0
            if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
1247
0
        }
1248
0
        hrecs->nref = 0;
1249
0
        return -1;
1250
2.06k
    }
1251
2.06k
}
1252
1253
/*
1254
 * Add SQ header records for any references in the hrecs->ref array that
1255
 * were added by sam_hrecs_refs_from_targets_array() but have not
1256
 * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
1257
 *
1258
 * This may be needed either because:
1259
 *
1260
 *   - A bam file was read that had entries in its refs list with no
1261
 *     corresponding @SQ line.
1262
 *
1263
 *   - A program constructed a sam_hdr_t which has target_name and target_len
1264
 *     array entries with no corresponding @SQ line in text.
1265
 */
1266
7.76k
static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
1267
7.76k
    int tid;
1268
7.76k
    char len[32];
1269
1270
10.5k
    for (tid = 0; tid < hrecs->nref; tid++) {
1271
2.79k
        if (hrecs->ref[tid].ty == NULL) {
1272
48
            snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
1273
48
            if (sam_hrecs_add(hrecs, "SQ",
1274
48
                              "SN", hrecs->ref[tid].name,
1275
48
                              "LN", len, NULL) != 0)
1276
0
                return -1;
1277
1278
            // Check that the stub has actually been filled
1279
48
            if(hrecs->ref[tid].ty == NULL) {
1280
0
                hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
1281
0
                        tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
1282
0
                return -1;
1283
0
            }
1284
48
        }
1285
2.79k
    }
1286
7.76k
    return 0;
1287
7.76k
}
1288
1289
7.87k
int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
1290
7.87k
    sam_hrecs_t *hrecs = sam_hrecs_new();
1291
1292
7.87k
    if (!hrecs)
1293
0
        return -1;
1294
1295
7.87k
    if (bh->target_name && bh->target_len && bh->n_targets > 0) {
1296
2.06k
        if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
1297
0
            sam_hrecs_free(hrecs);
1298
0
            return -1;
1299
0
        }
1300
2.06k
    }
1301
1302
    // Parse existing header text
1303
7.87k
    if (bh->text && bh->l_text > 0) {
1304
2.18k
        if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
1305
118
            sam_hrecs_free(hrecs);
1306
118
            return -1;
1307
118
        }
1308
2.18k
    }
1309
1310
7.76k
    if (add_stub_ref_sq_lines(hrecs) < 0) {
1311
0
        sam_hrecs_free(hrecs);
1312
0
        return -1;
1313
0
    }
1314
1315
7.76k
    bh->hrecs = hrecs;
1316
1317
7.76k
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1318
0
        return -1;
1319
1320
7.76k
    return 0;
1321
7.76k
}
1322
1323
// Check header type is valid according to the SAM specification
1324
1325
1.63M
static int valid_sam_header_type(const char *s) {
1326
1.63M
    if (s[0] != '@') return 0;
1327
1.63M
    switch (s[1]) {
1328
282
    case 'H':
1329
282
        return s[2] == 'D' && s[3] == '\t';
1330
59.0k
    case 'S':
1331
59.0k
        return s[2] == 'Q' && s[3] == '\t';
1332
1.55M
    case 'R':
1333
1.56M
    case 'P':
1334
1.56M
        return s[2] == 'G' && s[3] == '\t';
1335
8.54k
    case 'C':
1336
8.54k
        return s[2] == 'O';
1337
1.63M
    }
1338
0
    return 0;
1339
1.63M
}
1340
1341
/** Populate sam_hdr_t from SAM file content
1342
 *
1343
 * @param hdr empty header struct to be filled
1344
 * @param fp  File to read from
1345
 * @return 0 on success
1346
 *        -1 on failure
1347
 *
1348
 * This function is used to build the header structure when reading SAM files.
1349
 * The lines read from the file are used to create sam_hrecs_t structures.
1350
 * The function also populates sam_hdr_t::text, sam_hdr_t::l_text,
1351
 * sam_hdr_t::target_name and sam_hdr_t::target_len.
1352
 */
1353
5.16k
int sam_hdr_build_from_sam_file(sam_hdr_t *hdr, htsFile* fp) {
1354
5.16k
    kstring_t str = { 0, 0, NULL };
1355
5.16k
    int ret;
1356
5.16k
    int next_c = '@';
1357
5.16k
    sam_hrec_type_t *first_h_type = NULL;
1358
5.16k
    sam_hrec_type_t *last_h_type = NULL;
1359
5.16k
    sam_hrecs_t *hrecs = sam_hrecs_new();
1360
5.16k
    size_t lno = 0;
1361
5.16k
    khint_t sq_count = 0, an_count = 0;
1362
1363
5.16k
    if (!hrecs)
1364
0
        return -1;
1365
1366
5.16k
    hdr->hrecs = hrecs;
1367
1368
1.63M
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1369
1.63M
        sam_hrec_type_t *h_type;
1370
1371
1.63M
        lno++;
1372
1.63M
        if (!fp->line.s || fp->line.s[0] != '@')
1373
0
            break;
1374
1375
1.63M
        if (fp->line.l < 3
1376
1.63M
            || !isalpha_c(fp->line.s[1]) || !isalpha_c(fp->line.s[2])) {
1377
9
            sam_hrecs_error("Header line does not have a two character key",
1378
9
                            fp->line.s, fp->line.l, lno);
1379
9
            goto error;
1380
9
        }
1381
1382
1.63M
        if (fp->line.l == 3 && strncmp(fp->line.s, "@CO", 3) == 0) {
1383
            // HTSlib's original SAM reader let you get away with an untabbed
1384
            // @CO line
1385
5.14k
            kputc('\t', &fp->line);
1386
5.14k
        }
1387
1388
1.63M
        if (!valid_sam_header_type(fp->line.s)) {
1389
9
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
1390
9
            warn_if_known_stderr(fp->line.s, fp->line.l);
1391
9
            goto error;
1392
9
        }
1393
1394
1.63M
        h_type = sam_hrecs_parse_single_line(hrecs, fp->line.s, fp->line.l,
1395
1.63M
                                             lno, NULL, &sq_count, &an_count);
1396
1.63M
        if (!h_type)
1397
219
            goto error;
1398
1399
1.63M
        if (h_type->type != TYPEKEY("HD")) {
1400
1.62M
            if (!first_h_type)
1401
4.95k
                first_h_type = h_type;
1402
1.62M
            last_h_type = h_type;
1403
1.62M
        }
1404
1405
1.63M
        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1406
0
            goto error;
1407
1408
1.63M
        if (kputc('\n', &str) < 0)
1409
0
            goto error;
1410
1411
1.63M
        if (fp->is_bgzf) {
1412
1.57M
            next_c = bgzf_peek(fp->fp.bgzf);
1413
1.57M
        } else {
1414
50.9k
            unsigned char nc;
1415
50.9k
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
1416
50.9k
            next_c = pret > 0 ? nc : pret - 1;
1417
50.9k
        }
1418
1.63M
        if (next_c < -1)
1419
6
            goto error;
1420
1.63M
    }
1421
4.92k
    if (next_c != '@')
1422
4.89k
        fp->line.l = 0;
1423
1424
4.92k
    if (ret < -1)
1425
30
        goto error;
1426
1427
4.89k
    if (sq_count == 0 && fp->fn_aux) {
1428
        // No @SQ lines found.  Try to get them from the fasta index.
1429
0
        kstring_t line = { 0, 0, NULL };
1430
1431
        /* The reference index (.fai) is actually needed here */
1432
0
        char *fai_fn = fp->fn_aux;
1433
0
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
1434
0
        if (fn_delim)
1435
0
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
1436
1437
0
        hFILE* f = hopen(fai_fn, "r");
1438
0
        int e = 0;
1439
0
        if (f == NULL)
1440
0
            goto error;
1441
1442
0
        while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
1443
0
            char* tab = strchr(line.s, '\t');
1444
0
            size_t l_start = str.l;
1445
0
            sam_hrec_type_t *h_type;
1446
0
            hts_pos_t ln;
1447
1448
0
            if (tab == NULL)
1449
0
                continue;
1450
1451
0
            ln = strtoll(tab, NULL, 10);
1452
1453
0
            e |= kputs("@SQ\tSN:", &str) < 0;
1454
0
            e |= kputsn(line.s, tab - line.s, &str) < 0;
1455
0
            e |= kputs("\tLN:", &str) < 0;
1456
0
            e |= kputll(ln, &str) < 0;
1457
0
            e |= kputc('\n', &str) < 0;
1458
0
            if (e)
1459
0
                break;
1460
1461
0
            h_type = sam_hrecs_parse_single_line(hrecs, &str.s[l_start],
1462
0
                                                 str.l - l_start,
1463
0
                                                 lno, NULL,
1464
0
                                                 &sq_count, &an_count);
1465
0
            if (!h_type)
1466
0
                goto error;
1467
1468
0
            if (!first_h_type)
1469
0
                first_h_type = h_type;
1470
0
            last_h_type = h_type;
1471
0
        }
1472
1473
0
        ks_free(&line);
1474
0
        if (hclose(f) != 0) {
1475
0
            hts_log_error("Error on closing %s", fai_fn);
1476
0
            e = 1;
1477
0
        }
1478
0
        if (e)
1479
0
            goto error;
1480
0
    }
1481
1482
    // Reserve space in hash tables
1483
4.89k
    if (sq_count) {
1484
        // Hash table should be empty at the moment
1485
4.40k
        if (kh_grow_to_fit(m_s2i, hrecs->ref_hash, sq_count + an_count) < 0)
1486
0
            goto error;
1487
1488
4.40k
        if (hrecs->ref_sz < sq_count) {
1489
4.40k
            sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1490
4.40k
                                             sizeof(*hrecs->ref) * sq_count);
1491
4.40k
            if (!new_ref)
1492
0
                goto error;
1493
4.40k
            hrecs->ref = new_ref;
1494
4.40k
            hrecs->ref_sz = sq_count;
1495
4.40k
        }
1496
4.40k
    }
1497
1498
    // Update RG/SQ/PG hashes
1499
4.89k
    if (first_h_type) {
1500
4.88k
        sam_hrec_type_t *h_type = first_h_type->global_prev;
1501
906k
        do {
1502
906k
            h_type = h_type->global_next;
1503
906k
            if (-1 == sam_hrecs_update_hashes(hrecs, h_type->type, h_type))
1504
105
                goto error;
1505
906k
        } while (h_type != last_h_type);
1506
4.88k
    }
1507
1508
4.78k
    if (str.l == 0)
1509
0
        kputsn("", 0, &str);
1510
4.78k
    hdr->l_text = str.l;
1511
4.78k
    hdr->text = ks_release(&str);
1512
1513
4.78k
    if (sam_hdr_update_target_arrays(hdr, hrecs, 0) < 0)
1514
0
        goto error;
1515
1516
4.78k
    return 0;
1517
1518
378
 error:
1519
378
    ks_free(&str);
1520
378
    return -1;
1521
4.78k
}
1522
1523
/** Remove outdated header text
1524
1525
    @param bh     BAM header
1526
1527
    This is called when API functions have changed the header so that the
1528
    text version is no longer valid.
1529
 */
1530
624
static void redact_header_text(sam_hdr_t *bh) {
1531
624
    assert(bh->hrecs && bh->hrecs->dirty);
1532
624
    bh->l_text = 0;
1533
624
    free(bh->text);
1534
624
    bh->text = NULL;
1535
624
}
1536
1537
/** Find nth header record of a given type
1538
1539
    @param type   Header type (SQ, RG etc.)
1540
    @param idx    0-based index
1541
1542
    @return sam_hrec_type_t pointer to the record on success
1543
            NULL if no record exists with the given type and index
1544
 */
1545
1546
static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
1547
0
                                                const char *type, int idx) {
1548
0
    sam_hrec_type_t *first, *itr;
1549
1550
0
    if (idx < 0)
1551
0
        return NULL;
1552
1553
0
    if (type[0] == 'S' && type[1] == 'Q')
1554
0
        return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
1555
1556
0
    if (type[0] == 'R' && type[1] == 'G')
1557
0
        return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
1558
1559
0
    if (type[0] == 'P' && type[1] == 'G')
1560
0
        return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
1561
1562
0
    first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
1563
0
    if (!first)
1564
0
        return NULL;
1565
1566
0
    while (idx > 0) {
1567
0
        itr = itr->next;
1568
0
        if (itr == first)
1569
0
            break;
1570
0
        --idx;
1571
0
    }
1572
1573
0
    return idx == 0 ? itr : NULL;
1574
0
}
1575
1576
/* ==== Public methods ==== */
1577
1578
2.61k
size_t sam_hdr_length(sam_hdr_t *bh) {
1579
2.61k
    if (!bh || -1 == sam_hdr_rebuild(bh))
1580
0
        return SIZE_MAX;
1581
1582
2.61k
    return bh->l_text;
1583
2.61k
}
1584
1585
1.63k
const char *sam_hdr_str(sam_hdr_t *bh) {
1586
1.63k
    if (!bh || -1 == sam_hdr_rebuild(bh))
1587
0
        return NULL;
1588
1589
1.63k
    return bh->text;
1590
1.63k
}
1591
1592
0
int sam_hdr_nref(const sam_hdr_t *bh) {
1593
0
    if (!bh)
1594
0
        return -1;
1595
1596
0
    return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
1597
0
}
1598
1599
/*
1600
 * Reconstructs the text representation from the header hash table.
1601
 * Returns 0 on success
1602
 *        -1 on failure
1603
 */
1604
4.24k
int sam_hdr_rebuild(sam_hdr_t *bh) {
1605
4.24k
    sam_hrecs_t *hrecs;
1606
4.24k
    if (!bh)
1607
0
        return -1;
1608
1609
4.24k
    if (!(hrecs = bh->hrecs))
1610
0
        return bh->text ? 0 : -1;
1611
1612
4.24k
    if (hrecs->refs_changed >= 0) {
1613
0
        if (rebuild_target_arrays(bh) < 0) {
1614
0
            hts_log_error("Header target array rebuild has failed");
1615
0
            return -1;
1616
0
        }
1617
0
    }
1618
1619
    /* If header text wasn't changed or header is empty, don't rebuild it. */
1620
4.24k
    if (!hrecs->dirty)
1621
4.05k
        return 0;
1622
1623
189
    if (hrecs->pgs_changed && sam_hdr_link_pg(bh) < 0) {
1624
0
        hts_log_error("Linking @PG lines has failed");
1625
0
        return -1;
1626
0
    }
1627
1628
189
    kstring_t ks = KS_INITIALIZE;
1629
189
    if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
1630
0
        ks_free(&ks);
1631
0
        hts_log_error("Header text rebuild has failed");
1632
0
        return -1;
1633
0
    }
1634
1635
189
    hrecs->dirty = 0;
1636
1637
    /* Sync */
1638
189
    free(bh->text);
1639
189
    bh->l_text = ks_len(&ks);
1640
189
    bh->text = ks_release(&ks);
1641
1642
189
    return 0;
1643
189
}
1644
1645
/*
1646
 * Appends a formatted line to an existing SAM header.
1647
 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
1648
 * optional new-line. If it contains more than 1 line then multiple lines
1649
 * will be added in order.
1650
 *
1651
 * Input text is of maximum length len or as terminated earlier by a NUL.
1652
 * len may be 0 if unknown, in which case lines must be NUL-terminated.
1653
 *
1654
 * Returns 0 on success
1655
 *        -1 on failure
1656
 */
1657
1.75k
int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
1658
1.75k
    sam_hrecs_t *hrecs;
1659
1660
1.75k
    if (!bh || !lines)
1661
0
        return -1;
1662
1663
1.75k
    if (len == 0 && *lines == '\0')
1664
1.12k
        return 0;
1665
1666
628
    if (!(hrecs = bh->hrecs)) {
1667
628
        if (sam_hdr_fill_hrecs(bh) != 0)
1668
0
            return -1;
1669
628
        hrecs = bh->hrecs;
1670
628
    }
1671
1672
628
    if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
1673
4
        return -1;
1674
1675
624
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1676
0
        return -1;
1677
1678
624
    hrecs->dirty = 1;
1679
624
    redact_header_text(bh);
1680
1681
624
    return 0;
1682
624
}
1683
1684
/*
1685
 * Adds a single line to a SAM header.
1686
 * Specify type and one or more key,value pairs, ending with the NULL key.
1687
 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
1688
 *
1689
 * Returns 0 on success
1690
 *        -1 on failure
1691
 */
1692
0
int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
1693
0
    va_list args;
1694
0
    sam_hrecs_t *hrecs;
1695
1696
0
    if (!bh || !type)
1697
0
        return -1;
1698
1699
0
    if (!(hrecs = bh->hrecs)) {
1700
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1701
0
            return -1;
1702
0
        hrecs = bh->hrecs;
1703
0
    }
1704
1705
0
    va_start(args, type);
1706
0
    int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
1707
0
    va_end(args);
1708
1709
0
    if (ret == 0) {
1710
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1711
0
            return -1;
1712
1713
0
        if (hrecs->dirty)
1714
0
            redact_header_text(bh);
1715
0
    }
1716
1717
0
    return ret;
1718
0
}
1719
1720
/*
1721
 * Returns a complete line of formatted text for a specific head type/ID
1722
 * combination. If ID_key is NULL then it returns the first line of the specified
1723
 * type.
1724
 */
1725
int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
1726
0
                      const char *ID_key, const char *ID_val, kstring_t *ks) {
1727
0
    sam_hrecs_t *hrecs;
1728
0
    if (!bh || !type)
1729
0
        return -2;
1730
1731
0
    if (!(hrecs = bh->hrecs)) {
1732
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1733
0
            return -2;
1734
0
        hrecs = bh->hrecs;
1735
0
    }
1736
1737
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
1738
0
    if (!ty)
1739
0
        return -1;
1740
1741
0
    ks->l = 0;
1742
0
    if (build_header_line(ty, ks) < 0) {
1743
0
        return -2;
1744
0
    }
1745
1746
0
    return 0;
1747
0
}
1748
1749
int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
1750
0
                          int pos, kstring_t *ks) {
1751
0
    sam_hrecs_t *hrecs;
1752
0
    if (!bh || !type)
1753
0
        return -2;
1754
1755
0
    if (!(hrecs = bh->hrecs)) {
1756
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1757
0
            return -2;
1758
0
        hrecs = bh->hrecs;
1759
0
    }
1760
1761
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1762
0
    if (!ty)
1763
0
        return -1;
1764
1765
0
    ks->l = 0;
1766
0
    if (build_header_line(ty, ks) < 0) {
1767
0
        return -2;
1768
0
    }
1769
1770
0
    return 0;
1771
0
}
1772
1773
/*
1774
 * Remove a line from the header by specifying a tag:value that uniquely
1775
 * identifies a line, i.e. the @SQ line containing "SN:ref1".
1776
 * @SQ line is uniquely identified by SN tag.
1777
 * @RG line is uniquely identified by ID tag.
1778
 * @PG line is uniquely identified by ID tag.
1779
 *
1780
 * Returns 0 on success and -1 on error
1781
 */
1782
1783
0
int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1784
0
    sam_hrecs_t *hrecs;
1785
0
    if (!bh || !type)
1786
0
        return -1;
1787
1788
0
    if (!(hrecs = bh->hrecs)) {
1789
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1790
0
            return -1;
1791
0
        hrecs = bh->hrecs;
1792
0
    }
1793
1794
0
    if (!strncmp(type, "PG", 2)) {
1795
0
        hts_log_warning("Removing PG lines is not supported!");
1796
0
        return -1;
1797
0
    }
1798
1799
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1800
0
    if (!type_found)
1801
0
        return 0;
1802
1803
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
1804
0
    if (ret == 0) {
1805
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1806
0
            return -1;
1807
1808
0
        if (hrecs->dirty)
1809
0
            redact_header_text(bh);
1810
0
    }
1811
1812
0
    return ret;
1813
0
}
1814
1815
/*
1816
 * Remove a line from the header by specifying the position in the type
1817
 * group, i.e. 3rd @SQ line.
1818
 *
1819
 * Returns 0 on success and -1 on error
1820
 */
1821
1822
0
int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
1823
0
    sam_hrecs_t *hrecs;
1824
0
    if (!bh || !type || position < 0)
1825
0
        return -1;
1826
1827
0
    if (!(hrecs = bh->hrecs)) {
1828
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1829
0
            return -1;
1830
0
        hrecs = bh->hrecs;
1831
0
    }
1832
1833
0
    if (!strncmp(type, "PG", 2)) {
1834
0
        hts_log_warning("Removing PG lines is not supported!");
1835
0
        return -1;
1836
0
    }
1837
1838
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
1839
0
                                                          position);
1840
0
    if (!type_found)
1841
0
        return -1;
1842
1843
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
1844
0
    if (ret == 0) {
1845
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1846
0
            return -1;
1847
1848
0
        if (hrecs->dirty)
1849
0
            redact_header_text(bh);
1850
0
    }
1851
1852
0
    return ret;
1853
0
}
1854
1855
/*
1856
 * Check if sam_hdr_update_line() is being used to change the name of
1857
 * a record, and if the new name is going to clash with an existing one.
1858
 *
1859
 * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
1860
 * will go through them all and leave the final one in place.
1861
 *
1862
 * Returns 0 if the name does not change
1863
 *         1 if the name changes but does not clash
1864
 *        -1 if the name changes and the new one is already in use
1865
 */
1866
static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
1867
                                 va_list ap, const char **old_name,
1868
                                 const char **new_name,
1869
                                 char id_tag_out[3],
1870
0
                                 khash_t(m_s2i) **hash_out) {
1871
0
    char *key, *val;
1872
0
    const char *id_tag;
1873
0
    sam_hrec_tag_t *tag, *prev;
1874
0
    khash_t(m_s2i) *hash;
1875
0
    khint_t k;
1876
0
    int ret = 0;
1877
1878
0
    if        (rec->type == TYPEKEY("SQ")) {
1879
0
        id_tag = "SN"; hash = hrecs->ref_hash;
1880
0
    } else if (rec->type == TYPEKEY("RG")) {
1881
0
        id_tag = "ID"; hash = hrecs->rg_hash;
1882
0
    } else if (rec->type == TYPEKEY("PG")) {
1883
0
        id_tag = "ID"; hash = hrecs->pg_hash;
1884
0
    } else {
1885
0
        return 0;
1886
0
    }
1887
1888
0
    memcpy(id_tag_out, id_tag, 3);
1889
0
    *hash_out = hash;
1890
1891
0
    tag = sam_hrecs_find_key(rec, id_tag, &prev);
1892
0
    if (!tag)
1893
0
        return 0;
1894
0
    assert(tag->len >= 3);
1895
0
    *old_name = tag->str + 3;
1896
1897
0
    while ((key = va_arg(ap, char *)) != NULL) {
1898
0
        val = va_arg(ap, char *);
1899
0
        if (!val) val = "";
1900
0
        if (strcmp(key, id_tag) != 0) continue;
1901
0
        if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
1902
0
        k = kh_get(m_s2i, hash, val);
1903
0
        ret = k < kh_end(hash) ? -1 : 1;
1904
0
        *new_name = val;
1905
0
    }
1906
0
    return ret;
1907
0
}
1908
1909
int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
1910
0
        const char *ID_key, const char *ID_value, ...) {
1911
0
    sam_hrecs_t *hrecs;
1912
0
    if (!bh)
1913
0
        return -1;
1914
1915
0
    if (!(hrecs = bh->hrecs)) {
1916
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1917
0
            return -1;
1918
0
        hrecs = bh->hrecs;
1919
0
    }
1920
1921
0
    int ret, rename;
1922
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1923
0
    if (!ty)
1924
0
        return -1;
1925
1926
0
    va_list args;
1927
0
    const char *old_name = "?", *new_name = "?";
1928
0
    char id_tag[3];
1929
0
    khash_t(m_s2i) *hash = NULL;
1930
0
    va_start(args, ID_value);
1931
0
    rename = check_for_name_update(hrecs, ty, args,
1932
0
                                   &old_name, &new_name, id_tag, &hash);
1933
0
    va_end(args);
1934
0
    if (rename < 0) {
1935
0
        hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
1936
0
                      type, old_name, new_name);
1937
0
        return -1;
1938
0
    }
1939
0
    if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
1940
        // This is just too complicated
1941
0
        hts_log_error("Renaming @PG records is not supported");
1942
0
        return -1;
1943
0
    }
1944
0
    va_start(args, ID_value);
1945
0
    ret = sam_hrecs_vupdate(hrecs, ty, args);
1946
0
    va_end(args);
1947
1948
0
    if (ret)
1949
0
        return ret;
1950
1951
    // TODO Account for @SQ-AN altnames
1952
1953
0
    if (rename) {
1954
        // Adjust the hash table to point to the new name
1955
        // sam_hrecs_update_hashes() should sort out everything else
1956
0
        khint_t k = kh_get(m_s2i, hash, old_name);
1957
0
        sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
1958
0
        int r, pos;
1959
0
        assert(k < kh_end(hash));        // Or we wouldn't have found it earlier
1960
0
        assert(new_tag && new_tag->str); // id_tag should exist
1961
0
        assert(new_tag->len > 3);
1962
0
        pos = kh_val(hash, k);
1963
0
        kh_del(m_s2i, hash, k);
1964
0
        k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
1965
0
        if (r < 1) {
1966
0
            hts_log_error("Failed to rename item in hash table");
1967
0
            return -1;
1968
0
        }
1969
0
        kh_val(hash, k) = pos;
1970
0
    }
1971
1972
0
    ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
1973
1974
0
    if (!ret && hrecs->refs_changed >= 0)
1975
0
        ret = rebuild_target_arrays(bh);
1976
1977
0
    if (!ret && hrecs->dirty)
1978
0
        redact_header_text(bh);
1979
1980
0
    return ret;
1981
0
}
1982
1983
0
static int rebuild_hash(sam_hrecs_t *hrecs, const char *type) {
1984
0
    sam_hrec_type_t *head, *step;
1985
0
    khiter_t k;
1986
1987
0
    if (strncmp(type, "SQ", 2) == 0) {
1988
0
        hrecs->nref = 0;
1989
0
        kh_clear(m_s2i, hrecs->ref_hash);
1990
0
    } else if (strncmp(type, "RG", 2) == 0) {
1991
0
        hrecs->nrg = 0;
1992
0
        kh_clear(m_s2i, hrecs->rg_hash);
1993
0
    }
1994
1995
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1996
1997
0
    if (k != kh_end(hrecs->h)) { // something to rebuild
1998
0
        head = kh_val(hrecs->h, k);
1999
0
        step = head;
2000
2001
0
        do {
2002
0
            if (sam_hrecs_update_hashes(hrecs, TYPEKEY(type), step) == -1) {
2003
0
                hts_log_error("Unable to rebuild hashes");
2004
0
                return -1;
2005
0
            }
2006
2007
0
            step = step->next;
2008
0
        } while (step != head);
2009
0
    }
2010
2011
0
    return 0;
2012
0
}
2013
2014
0
int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
2015
0
    sam_hrecs_t *hrecs;
2016
0
    if (!bh || !type)
2017
0
        return -1;
2018
2019
0
    if (!(hrecs = bh->hrecs)) {
2020
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2021
0
            return -1;
2022
0
        hrecs = bh->hrecs;
2023
0
    }
2024
2025
0
    sam_hrec_type_t *step;
2026
0
    int ret = 1, remove_all = (ID_key == NULL);
2027
2028
0
    if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
2029
0
        hts_log_warning("Removing PG or CO lines is not supported!");
2030
0
        return -1;
2031
0
    }
2032
2033
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
2034
0
    if (!type_found) { // remove all line of this type
2035
0
        khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2036
0
        if (k == kh_end(hrecs->h))
2037
0
            return 0;
2038
0
        type_found =  kh_val(hrecs->h, k);
2039
0
        if (!type_found)
2040
0
            return 0;
2041
0
        remove_all = 1;
2042
0
    }
2043
2044
0
    step = type_found->next;
2045
0
    while (step != type_found) {
2046
0
        sam_hrec_type_t *to_remove = step;
2047
0
        step = step->next;
2048
0
        ret &= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
2049
0
    }
2050
2051
0
    if (remove_all)
2052
0
        ret &= sam_hrecs_remove_line(hrecs, type, type_found, 0);
2053
2054
    /* if RG or SQ, delete then rebuild the hashes (as it is faster
2055
       to rebuild than delete one by one).
2056
    */
2057
2058
0
    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
2059
0
        if (rebuild_hash(hrecs, type)) {
2060
0
            return -1;
2061
0
        }
2062
0
    }
2063
2064
0
    if (!ret && hrecs->dirty)
2065
0
        redact_header_text(bh);
2066
2067
0
    return 0;
2068
0
}
2069
2070
0
int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *vrh) {
2071
0
    sam_hrecs_t *hrecs;
2072
0
    rmhash_t *rh = (rmhash_t *)vrh;
2073
2074
0
    if (!bh || !type)
2075
0
        return -1;
2076
0
    if (!rh) // remove all lines
2077
0
        return sam_hdr_remove_except(bh, type, NULL, NULL);
2078
0
    if (!id)
2079
0
        return -1;
2080
2081
0
    if (!(hrecs = bh->hrecs)) {
2082
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2083
0
            return -1;
2084
0
        hrecs = bh->hrecs;
2085
0
    }
2086
2087
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2088
0
    if (k == kh_end(hrecs->h)) // nothing to remove from
2089
0
        return 0;
2090
2091
0
    sam_hrec_type_t *head = kh_val(hrecs->h, k);
2092
0
    if (!head) {
2093
0
        hts_log_error("Header inconsistency");
2094
0
        return -1;
2095
0
    }
2096
2097
0
    int ret = 0;
2098
0
    sam_hrec_type_t *step = head->next;
2099
0
    while (step != head) {
2100
0
        sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
2101
0
        if (tag && tag->str && tag->len >= 3) {
2102
0
           k = kh_get(rm, rh, tag->str+3);
2103
0
           if (k == kh_end(rh)) { // value is not in the hash table, so remove
2104
0
               sam_hrec_type_t *to_remove = step;
2105
0
               step = step->next;
2106
0
               ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
2107
0
           } else {
2108
0
               step = step->next;
2109
0
           }
2110
0
        } else { // tag is not on the line, so skip to next line
2111
0
            step = step->next;
2112
0
        }
2113
0
    }
2114
2115
    // process the first line
2116
0
    sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL);
2117
0
    if (tag && tag->str && tag->len >= 3) {
2118
0
       k = kh_get(rm, rh, tag->str+3);
2119
0
       if (k == kh_end(rh)) { // value is not in the hash table, so remove
2120
0
           sam_hrec_type_t *to_remove = head;
2121
0
           head = head->next;
2122
0
           ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
2123
0
       }
2124
0
    }
2125
2126
    /* if RG or SQ, delete then rebuild the hashes (as it is faster
2127
       to rebuild than delete one by one).
2128
    */
2129
2130
0
    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
2131
0
        if (rebuild_hash(hrecs, type)) {
2132
0
            return -1;
2133
0
        }
2134
0
    }
2135
2136
0
    if (!ret && hrecs->dirty)
2137
0
        redact_header_text(bh);
2138
2139
0
    return ret;
2140
0
}
2141
2142
8.18k
int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
2143
8.18k
    int count;
2144
8.18k
    sam_hrec_type_t *first_ty, *itr_ty;
2145
2146
8.18k
    if (!bh || !type)
2147
0
        return -1;
2148
2149
8.18k
    if (!bh->hrecs) {
2150
3.39k
        if (sam_hdr_fill_hrecs(bh) != 0)
2151
0
            return -1;
2152
3.39k
    }
2153
2154
    // Deal with types that have counts
2155
8.18k
    switch (type[0]) {
2156
8.18k
    case 'S':
2157
8.18k
        if (type[1] == 'Q')
2158
8.18k
            return bh->hrecs->nref;
2159
0
        break;
2160
0
    case 'R':
2161
0
        if (type[1] == 'G')
2162
0
            return bh->hrecs->nrg;
2163
0
        break;
2164
0
    case 'P':
2165
0
        if (type[1] == 'G')
2166
0
            return bh->hrecs->npg;
2167
0
        break;
2168
0
    default:
2169
0
        break;
2170
8.18k
    }
2171
2172
0
    first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL);
2173
0
    if (!first_ty)
2174
0
        return 0;
2175
2176
0
    count = 1;
2177
0
    for (itr_ty = first_ty->next;
2178
0
         itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) {
2179
0
        count++;
2180
0
    }
2181
2182
0
    return count;
2183
0
}
2184
2185
int sam_hdr_line_index(sam_hdr_t *bh,
2186
                       const char *type,
2187
0
                       const char *key) {
2188
0
    sam_hrecs_t *hrecs;
2189
0
    if (!bh || !type || !key)
2190
0
        return -2;
2191
2192
0
    if (!(hrecs = bh->hrecs)) {
2193
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2194
0
            return -2;
2195
0
        hrecs = bh->hrecs;
2196
0
    }
2197
2198
0
    khint_t k;
2199
0
    int idx = -1;
2200
0
    switch (type[0]) {
2201
0
    case 'S':
2202
0
        if (type[1] == 'Q') {
2203
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
2204
0
            if (k != kh_end(hrecs->ref_hash))
2205
0
                idx = kh_val(hrecs->ref_hash, k);
2206
0
        } else {
2207
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2208
0
        }
2209
0
        break;
2210
0
    case 'R':
2211
0
        if (type[1] == 'G') {
2212
0
            k = kh_get(m_s2i, hrecs->rg_hash, key);
2213
0
            if (k != kh_end(hrecs->rg_hash))
2214
0
                idx = kh_val(hrecs->rg_hash, k);
2215
0
        } else {
2216
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2217
0
        }
2218
0
        break;
2219
0
    case 'P':
2220
0
        if (type[1] == 'G') {
2221
0
            k = kh_get(m_s2i, hrecs->pg_hash, key);
2222
0
            if (k != kh_end(hrecs->pg_hash))
2223
0
                idx = kh_val(hrecs->pg_hash, k);
2224
0
        } else {
2225
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2226
0
        }
2227
0
        break;
2228
0
    default:
2229
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2230
0
    }
2231
2232
0
    return idx;
2233
0
}
2234
2235
const char *sam_hdr_line_name(sam_hdr_t *bh,
2236
                              const char *type,
2237
0
                              int pos) {
2238
0
    sam_hrecs_t *hrecs;
2239
0
    if (!bh || !type || pos < 0)
2240
0
        return NULL;
2241
2242
0
    if (!(hrecs = bh->hrecs)) {
2243
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2244
0
            return NULL;
2245
0
        hrecs = bh->hrecs;
2246
0
    }
2247
2248
0
    switch (type[0]) {
2249
0
    case 'S':
2250
0
        if (type[1] == 'Q') {
2251
0
            if (pos < hrecs->nref)
2252
0
                return hrecs->ref[pos].name;
2253
0
        } else {
2254
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2255
0
        }
2256
0
        break;
2257
0
    case 'R':
2258
0
        if (type[1] == 'G') {
2259
0
            if (pos < hrecs->nrg)
2260
0
                return hrecs->rg[pos].name;
2261
0
        } else {
2262
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2263
0
        }
2264
0
        break;
2265
0
    case 'P':
2266
0
        if (type[1] == 'G') {
2267
0
            if (pos < hrecs->npg)
2268
0
                return hrecs->pg[pos].name;
2269
0
        } else {
2270
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2271
0
        }
2272
0
        break;
2273
0
    default:
2274
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2275
0
    }
2276
2277
0
    return NULL;
2278
0
}
2279
2280
/* ==== Key:val level methods ==== */
2281
2282
int sam_hdr_find_tag_id(sam_hdr_t *bh,
2283
                     const char *type,
2284
                     const char *ID_key,
2285
                     const char *ID_value,
2286
                     const char *key,
2287
0
                     kstring_t *ks) {
2288
0
    sam_hrecs_t *hrecs;
2289
0
    if (!bh || !type || !key)
2290
0
        return -2;
2291
2292
0
    if (!(hrecs = bh->hrecs)) {
2293
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2294
0
            return -2;
2295
0
        hrecs = bh->hrecs;
2296
0
    }
2297
2298
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
2299
0
    if (!ty)
2300
0
        return -1;
2301
2302
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
2303
0
    if (!tag || !tag->str || tag->len < 4)
2304
0
        return -1;
2305
2306
0
    ks->l = 0;
2307
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
2308
0
        return -2;
2309
0
    }
2310
2311
0
    return 0;
2312
0
}
2313
2314
int sam_hdr_find_tag_pos(sam_hdr_t *bh,
2315
                     const char *type,
2316
                     int pos,
2317
                     const char *key,
2318
0
                     kstring_t *ks) {
2319
0
    sam_hrecs_t *hrecs;
2320
0
    if (!bh || !type || !key)
2321
0
        return -2;
2322
2323
0
    if (!(hrecs = bh->hrecs)) {
2324
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2325
0
            return -2;
2326
0
        hrecs = bh->hrecs;
2327
0
    }
2328
2329
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
2330
0
    if (!ty)
2331
0
        return -1;
2332
2333
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
2334
0
    if (!tag || !tag->str || tag->len < 4)
2335
0
        return -1;
2336
2337
0
    ks->l = 0;
2338
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
2339
0
        return -2;
2340
0
    }
2341
2342
0
    return 0;
2343
0
}
2344
2345
int sam_hdr_remove_tag_id(sam_hdr_t *bh,
2346
        const char *type,
2347
        const char *ID_key,
2348
        const char *ID_value,
2349
0
        const char *key) {
2350
0
    sam_hrecs_t *hrecs;
2351
0
    if (!bh || !type || !key)
2352
0
        return -1;
2353
2354
0
    if (!(hrecs = bh->hrecs)) {
2355
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2356
0
            return -1;
2357
0
        hrecs = bh->hrecs;
2358
0
    }
2359
2360
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
2361
0
    if (!ty)
2362
0
        return -1;
2363
2364
0
    int ret = sam_hrecs_remove_key(hrecs, ty, key);
2365
0
    if (!ret && hrecs->dirty)
2366
0
        redact_header_text(bh);
2367
2368
0
    return ret;
2369
0
}
2370
2371
/*
2372
 * Reconstructs a kstring from the header hash table.
2373
 * Returns 0 on success
2374
 *        -1 on failure
2375
 */
2376
17.7k
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) {
2377
17.7k
    ks->l = 0;
2378
2379
17.7k
    if (!hrecs->h || !hrecs->h->size) {
2380
6.73k
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
2381
6.73k
    }
2382
11.0k
    if (sam_hrecs_rebuild_lines(hrecs, ks) != 0)
2383
0
        return -1;
2384
2385
11.0k
    return 0;
2386
11.0k
}
2387
2388
/*
2389
 * Looks up a reference sequence by name and returns the numerical ID.
2390
 * Returns -1 if unknown reference; -2 if header could not be parsed.
2391
 */
2392
728k
int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) {
2393
728k
    sam_hrecs_t *hrecs;
2394
728k
    khint_t k;
2395
2396
728k
    if (!bh)
2397
0
        return -1;
2398
2399
728k
    if (!(hrecs = bh->hrecs)) {
2400
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2401
0
            return -2;
2402
0
        hrecs = bh->hrecs;
2403
0
    }
2404
2405
728k
    if (!hrecs->ref_hash)
2406
0
        return -1;
2407
2408
728k
    k = kh_get(m_s2i, hrecs->ref_hash, ref);
2409
728k
    return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k);
2410
728k
}
2411
2412
0
const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
2413
0
    sam_hrecs_t *hrecs;
2414
2415
0
    if (!h || tid < 0)
2416
0
        return NULL;
2417
2418
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2419
0
        return hrecs->ref[tid].name;
2420
0
    } else {
2421
0
        if (tid < h->n_targets)
2422
0
            return h->target_name[tid];
2423
0
    }
2424
2425
0
    return NULL;
2426
0
}
2427
2428
0
hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
2429
0
    sam_hrecs_t *hrecs;
2430
2431
0
    if (!h || tid < 0)
2432
0
        return 0;
2433
2434
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2435
0
        return hrecs->ref[tid].len;
2436
0
    } else {
2437
0
        if (tid < h->n_targets) {
2438
0
            if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
2439
0
                return h->target_len[tid];
2440
0
            } else {
2441
0
                khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
2442
0
                khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
2443
0
                if (k < kh_end(long_refs)) {
2444
0
                    return kh_val(long_refs, k);
2445
0
                } else {
2446
0
                    return UINT32_MAX;
2447
0
                }
2448
0
            }
2449
0
        }
2450
0
    }
2451
2452
0
    return 0;
2453
0
}
2454
2455
/*
2456
 * Fixes any PP links in @PG headers.
2457
 * If the entries are in order then this doesn't need doing, but in case
2458
 * our header is out of order this goes through the hrecs->pg[] array
2459
 * setting the prev_id field.
2460
 *
2461
 * Note we can have multiple complete chains. This code should identify the
2462
 * tails of these chains as these are the entries we have to link to in
2463
 * subsequent PP records.
2464
 *
2465
 * Returns 0 on success
2466
 *        -1 on failure (indicating broken PG/PP records)
2467
 */
2468
0
static int sam_hdr_link_pg(sam_hdr_t *bh) {
2469
0
    sam_hrecs_t *hrecs;
2470
0
    int i, j, ret = 0, *new_pg_end;
2471
2472
0
    if (!bh)
2473
0
        return -1;
2474
2475
0
    if (!(hrecs = bh->hrecs)) {
2476
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2477
0
            return -1;
2478
0
        hrecs = bh->hrecs;
2479
0
    }
2480
2481
0
    if (!hrecs->pgs_changed || !hrecs->npg)
2482
0
        return 0;
2483
2484
0
    hrecs->npg_end_alloc = hrecs->npg;
2485
0
    new_pg_end = realloc(hrecs->pg_end, hrecs->npg * sizeof(*new_pg_end));
2486
0
    if (!new_pg_end)
2487
0
        return -1;
2488
0
    hrecs->pg_end = new_pg_end;
2489
0
    int *chain_size = calloc(hrecs->npg, sizeof(int));
2490
0
    if (!chain_size)
2491
0
        return -1;
2492
2493
0
    for (i = 0; i < hrecs->npg; i++)
2494
0
        hrecs->pg_end[i] = i;
2495
2496
0
    for (i = 0; i < hrecs->npg; i++) {
2497
0
        khint_t k;
2498
0
        sam_hrec_tag_t *tag;
2499
2500
0
        assert(hrecs->pg[i].ty != NULL);
2501
0
        for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) {
2502
0
            if (tag->str[0] == 'P' && tag->str[1] == 'P')
2503
0
                break;
2504
0
        }
2505
0
        if (!tag) {
2506
            /* Chain start points */
2507
0
            continue;
2508
0
        }
2509
2510
0
        k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
2511
2512
0
        if (k == kh_end(hrecs->pg_hash)) {
2513
0
            hts_log_warning("PG line with ID:%s has a PP link to missing program '%s'",
2514
0
                    hrecs->pg[i].name, tag->str+3);
2515
0
            continue;
2516
0
        }
2517
2518
0
        int pp_idx = kh_val(hrecs->pg_hash, k);
2519
0
        if (pp_idx == i) {
2520
0
            hts_log_warning("PG line with ID:%s has a PP link to itself",
2521
0
                            hrecs->pg[i].name);
2522
0
            continue;
2523
0
        }
2524
2525
0
        hrecs->pg[i].prev_id = hrecs->pg[pp_idx].id;
2526
0
        hrecs->pg_end[pp_idx] = -1;
2527
0
        chain_size[i] = chain_size[pp_idx]+1;
2528
0
    }
2529
2530
0
    int last_end = -1;
2531
0
    for (i = j = 0; i < hrecs->npg; i++) {
2532
0
        if (hrecs->pg_end[i] != -1) {
2533
0
            last_end = hrecs->pg_end[i];
2534
0
            if (chain_size[i] > 0)
2535
0
                hrecs->pg_end[j++] = hrecs->pg_end[i];
2536
0
        }
2537
0
    }
2538
    /* Only leafs? Choose the last one! */
2539
0
    if (!j && hrecs->npg_end > 0 && last_end >= 0) {
2540
0
        hrecs->pg_end[0] = last_end;
2541
0
        j = 1;
2542
0
    }
2543
2544
0
    hrecs->npg_end = j;
2545
0
    hrecs->pgs_changed = 0;
2546
2547
    /* mark as dirty or empty for rebuild */
2548
0
    hrecs->dirty = 1;
2549
0
    redact_header_text(bh);
2550
0
    free(chain_size);
2551
2552
0
    return ret;
2553
0
}
2554
2555
/*
2556
 * Returns a unique ID from a base name.
2557
 *
2558
 * The value returned is valid until the next call to
2559
 * this function.
2560
 */
2561
0
const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) {
2562
0
    sam_hrecs_t *hrecs;
2563
0
    size_t name_len;
2564
0
    const size_t name_extra = 17;
2565
0
    if (!bh || !name)
2566
0
        return NULL;
2567
2568
0
    if (!(hrecs = bh->hrecs)) {
2569
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2570
0
            return NULL;
2571
0
        hrecs = bh->hrecs;
2572
0
    }
2573
2574
0
    khint_t k = kh_get(m_s2i, hrecs->pg_hash, name);
2575
0
    if (k == kh_end(hrecs->pg_hash))
2576
0
        return name;
2577
2578
0
    name_len = strlen(name);
2579
0
    if (name_len > 1000) name_len = 1000;
2580
0
    if (hrecs->ID_buf_sz < name_len + name_extra) {
2581
0
        char *new_ID_buf = realloc(hrecs->ID_buf, name_len + name_extra);
2582
0
        if (new_ID_buf == NULL)
2583
0
            return NULL;
2584
0
        hrecs->ID_buf = new_ID_buf;
2585
0
        hrecs->ID_buf_sz = name_len + name_extra;
2586
0
    }
2587
2588
0
    do {
2589
0
        snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++);
2590
0
        k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf);
2591
0
    } while (k != kh_end(hrecs->pg_hash));
2592
2593
0
    return hrecs->ID_buf;
2594
0
}
2595
2596
/*
2597
 * Add an @PG line.
2598
 *
2599
 * If we wish complete control over this use sam_hdr_add_line() directly. This
2600
 * function uses that, but attempts to do a lot of tedious house work for
2601
 * you too.
2602
 *
2603
 * - It will generate a suitable ID if the supplied one clashes.
2604
 * - It will generate multiple @PG records if we have multiple PG chains.
2605
 *
2606
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
2607
 * in NULL.
2608
 *
2609
 * Returns 0 on success
2610
 *        -1 on failure
2611
 */
2612
0
int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
2613
0
    sam_hrecs_t *hrecs;
2614
0
    const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL;
2615
0
    const char *key, *val;
2616
0
    if (!bh)
2617
0
        return -1;
2618
2619
0
    if (!(hrecs = bh->hrecs)) {
2620
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2621
0
            return -1;
2622
0
        hrecs = bh->hrecs;
2623
0
    }
2624
2625
0
    bh->hrecs->pgs_changed = 1;
2626
0
    if (sam_hdr_link_pg(bh) < 0) {
2627
0
        hts_log_error("Error linking @PG lines");
2628
0
        return -1;
2629
0
    }
2630
2631
0
    va_list args;
2632
    // Check for ID / PN / PP tags in varargs list
2633
0
    va_start(args, name);
2634
0
    while ((key = va_arg(args, const char *)) != NULL) {
2635
0
        val = va_arg(args, const char *);
2636
0
        if (!val) break;
2637
0
        if (strcmp(key, "PN") == 0 && *val != '\0')
2638
0
            specified_pn = val;
2639
0
        else if (strcmp(key, "PP") == 0 && *val != '\0')
2640
0
            specified_pp = val;
2641
0
        else if (strcmp(key, "ID") == 0 && *val != '\0')
2642
0
            specified_id = val;
2643
0
    }
2644
0
    va_end(args);
2645
2646
0
    if (specified_id && hrecs->pg_hash) {
2647
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id);
2648
0
        if (k != kh_end(hrecs->pg_hash)) {
2649
0
            hts_log_error("Header @PG ID:%s already present", specified_id);
2650
0
            return -1;
2651
0
        }
2652
0
    }
2653
2654
0
    if (specified_pp && hrecs->pg_hash) {
2655
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp);
2656
0
        if (k == kh_end(hrecs->pg_hash)) {
2657
0
            hts_log_error("Header @PG ID:%s referred to by PP tag not present",
2658
0
                          specified_pp);
2659
0
            return -1;
2660
0
        }
2661
0
    }
2662
2663
0
    if (!specified_pp && hrecs->npg_end) {
2664
        /* Copy ends array to avoid us looping while modifying it */
2665
0
        int *end = malloc(hrecs->npg_end * sizeof(int));
2666
0
        int i, nends = hrecs->npg_end;
2667
2668
0
        if (!end)
2669
0
            return -1;
2670
2671
0
        memcpy(end, hrecs->pg_end, nends * sizeof(*end));
2672
2673
0
        for (i = 0; i < nends; i++) {
2674
0
            const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2675
0
            if (!id) {
2676
0
                free(end);
2677
0
                return -1;
2678
0
            }
2679
0
            assert(end[i] >= 0 && end[i] < hrecs->npg);
2680
0
            va_start(args, name);
2681
0
            if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2682
0
                                     "ID", id,
2683
0
                                     "PN", !specified_pn ? name : "",
2684
0
                                     "PP", hrecs->pg[end[i]].name,
2685
0
                                     NULL)) {
2686
0
                free(end);
2687
0
                return  -1;
2688
0
            }
2689
0
            va_end(args);
2690
0
        }
2691
2692
0
        free(end);
2693
0
    } else {
2694
0
        const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2695
0
        if (!id)
2696
0
            return -1;
2697
0
        va_start(args, name);
2698
0
        if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2699
0
                                 "ID", id,
2700
0
                                 "PN", !specified_pn ? name : "",
2701
0
                                 NULL))
2702
0
            return -1;
2703
0
        va_end(args);
2704
0
    }
2705
2706
0
    hrecs->dirty = 1;
2707
0
    redact_header_text(bh);
2708
2709
0
    return 0;
2710
0
}
2711
2712
/*! Increments a reference count on bh.
2713
 *
2714
 * This permits multiple files to share the same header, all calling
2715
 * sam_hdr_destroy when done, without causing errors for other open files.
2716
 */
2717
1.41k
void sam_hdr_incr_ref(sam_hdr_t *bh) {
2718
1.41k
    if (!bh)
2719
0
        return;
2720
1.41k
    bh->ref_count++;
2721
1.41k
}
2722
2723
/* ==== Internal methods ==== */
2724
2725
/*
2726
 * Creates an empty SAM header.  Allocates space for the SAM header
2727
 * structures (hash tables) ready to be populated.
2728
 *
2729
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
2730
 *         NULL on failure
2731
 */
2732
13.0k
sam_hrecs_t *sam_hrecs_new(void) {
2733
13.0k
    sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
2734
2735
13.0k
    if (!hrecs)
2736
0
        return NULL;
2737
2738
13.0k
    hrecs->h = kh_init(sam_hrecs_t);
2739
13.0k
    if (!hrecs->h)
2740
0
        goto err;
2741
2742
13.0k
    hrecs->ID_cnt = 1;
2743
2744
13.0k
    hrecs->nref = 0;
2745
13.0k
    hrecs->ref_sz = 0;
2746
13.0k
    hrecs->ref  = NULL;
2747
13.0k
    if (!(hrecs->ref_hash = kh_init(m_s2i)))
2748
0
        goto err;
2749
13.0k
    hrecs->refs_changed = -1;
2750
2751
13.0k
    hrecs->nrg = 0;
2752
13.0k
    hrecs->rg_sz = 0;
2753
13.0k
    hrecs->rg  = NULL;
2754
13.0k
    if (!(hrecs->rg_hash = kh_init(m_s2i)))
2755
0
        goto err;
2756
2757
13.0k
    hrecs->npg = 0;
2758
13.0k
    hrecs->pg_sz = 0;
2759
13.0k
    hrecs->pg  = NULL;
2760
13.0k
    hrecs->npg_end = hrecs->npg_end_alloc = 0;
2761
13.0k
    hrecs->pg_end = NULL;
2762
13.0k
    if (!(hrecs->pg_hash = kh_init(m_s2i)))
2763
0
        goto err;
2764
2765
13.0k
    if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t))))
2766
0
        goto err;
2767
2768
13.0k
    if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t))))
2769
0
        goto err;
2770
2771
13.0k
    if (!(hrecs->str_pool = string_pool_create(65536)))
2772
0
        goto err;
2773
2774
13.0k
    if (sam_hrecs_init_type_order(hrecs, NULL))
2775
0
        goto err;
2776
2777
13.0k
    return hrecs;
2778
2779
0
err:
2780
0
    if (hrecs->h)
2781
0
        kh_destroy(sam_hrecs_t, hrecs->h);
2782
2783
0
    if (hrecs->tag_pool)
2784
0
        pool_destroy(hrecs->tag_pool);
2785
2786
0
    if (hrecs->type_pool)
2787
0
        pool_destroy(hrecs->type_pool);
2788
2789
0
    if (hrecs->str_pool)
2790
0
        string_pool_destroy(hrecs->str_pool);
2791
2792
0
    free(hrecs);
2793
2794
0
    return NULL;
2795
13.0k
}
2796
#if 0
2797
/*
2798
 * Produces a duplicate copy of source and returns it.
2799
 * Returns NULL on failure
2800
 */
2801
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) {
2802
        return NULL;
2803
}
2804
#endif
2805
/*! Deallocates all storage used by a sam_hrecs_t struct.
2806
 *
2807
 * This also decrements the header reference count. If after decrementing
2808
 * it is still non-zero then the header is assumed to be in use by another
2809
 * caller and the free is not done.
2810
 *
2811
 */
2812
13.0k
void sam_hrecs_free(sam_hrecs_t *hrecs) {
2813
13.0k
    if (!hrecs)
2814
0
        return;
2815
2816
13.0k
    if (hrecs->h)
2817
13.0k
        kh_destroy(sam_hrecs_t, hrecs->h);
2818
2819
13.0k
    if (hrecs->ref_hash)
2820
13.0k
        kh_destroy(m_s2i, hrecs->ref_hash);
2821
2822
13.0k
    if (hrecs->ref)
2823
7.04k
        free(hrecs->ref);
2824
2825
13.0k
    if (hrecs->rg_hash)
2826
13.0k
        kh_destroy(m_s2i, hrecs->rg_hash);
2827
2828
13.0k
    if (hrecs->rg)
2829
529
        free(hrecs->rg);
2830
2831
13.0k
    if (hrecs->pg_hash)
2832
13.0k
        kh_destroy(m_s2i, hrecs->pg_hash);
2833
2834
13.0k
    if (hrecs->pg)
2835
971
        free(hrecs->pg);
2836
2837
13.0k
    if (hrecs->pg_end)
2838
968
        free(hrecs->pg_end);
2839
2840
13.0k
    if (hrecs->type_pool)
2841
13.0k
        pool_destroy(hrecs->type_pool);
2842
2843
13.0k
    if (hrecs->tag_pool)
2844
13.0k
        pool_destroy(hrecs->tag_pool);
2845
2846
13.0k
    if (hrecs->str_pool)
2847
13.0k
        string_pool_destroy(hrecs->str_pool);
2848
2849
13.0k
    if (hrecs->type_order)
2850
13.0k
        free(hrecs->type_order);
2851
2852
13.0k
    if (hrecs->ID_buf)
2853
0
        free(hrecs->ID_buf);
2854
2855
13.0k
    free(hrecs);
2856
13.0k
}
2857
2858
/*
2859
 * Internal method already used by the CRAM code
2860
 * Returns the first header item matching 'type'. If ID is non-NULL it checks
2861
 * for the tag ID: and compares against the specified ID.
2862
 *
2863
 * Returns NULL if no type/ID is found
2864
 */
2865
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
2866
6.18k
                                     const char *ID_key, const char *ID_value) {
2867
6.18k
    if (!hrecs || !type)
2868
0
        return NULL;
2869
6.18k
    sam_hrec_type_t *t1, *t2;
2870
6.18k
    khint_t k;
2871
2872
    /* Special case for types we have prebuilt hashes on */
2873
6.18k
    if (ID_key) {
2874
6.18k
        if (!ID_value)
2875
0
            return NULL;
2876
2877
6.18k
        if (type[0]   == 'S' && type[1]   == 'Q' &&
2878
6.18k
            ID_key[0] == 'S' && ID_key[1] == 'N') {
2879
6.18k
            k = kh_get(m_s2i, hrecs->ref_hash, ID_value);
2880
6.18k
            return k != kh_end(hrecs->ref_hash)
2881
6.18k
                ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty
2882
6.18k
                : NULL;
2883
6.18k
        }
2884
2885
0
        if (type[0]   == 'R' && type[1]   == 'G' &&
2886
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2887
0
            k = kh_get(m_s2i, hrecs->rg_hash, ID_value);
2888
0
            return k != kh_end(hrecs->rg_hash)
2889
0
                ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty
2890
0
                : NULL;
2891
0
        }
2892
2893
0
        if (type[0]   == 'P' && type[1]   == 'G' &&
2894
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2895
0
            k = kh_get(m_s2i, hrecs->pg_hash, ID_value);
2896
0
            return k != kh_end(hrecs->pg_hash)
2897
0
                ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty
2898
0
                : NULL;
2899
0
        }
2900
0
    }
2901
2902
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2903
0
    if (k == kh_end(hrecs->h))
2904
0
        return NULL;
2905
2906
0
    if (!ID_key)
2907
0
        return kh_val(hrecs->h, k);
2908
2909
0
    t1 = t2 = kh_val(hrecs->h, k);
2910
0
    do {
2911
0
        sam_hrec_tag_t *tag;
2912
0
        for (tag = t1->tag; tag; tag = tag->next) {
2913
0
            if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
2914
0
                const char *cp1 = tag->str+3;
2915
0
                const char *cp2 = ID_value;
2916
0
                while (*cp1 && *cp1 == *cp2)
2917
0
                    cp1++, cp2++;
2918
0
                if (*cp2 || *cp1)
2919
0
                    continue;
2920
0
                return t1;
2921
0
            }
2922
0
        }
2923
0
        t1 = t1->next;
2924
0
    } while (t1 != t2);
2925
2926
0
    return NULL;
2927
0
}
2928
2929
/*
2930
 * Adds or updates tag key,value pairs in a header line.
2931
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2932
 * @HD line.
2933
 *
2934
 * va_list contains multiple key,value pairs ending in NULL.
2935
 *
2936
 * Returns 0 on success
2937
 *        -1 on failure
2938
 */
2939
559
int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
2940
559
    if (!hrecs)
2941
0
        return -1;
2942
2943
1.11k
    for (;;) {
2944
1.11k
        char *k, *v, *str;
2945
1.11k
        sam_hrec_tag_t *tag, *prev = NULL;
2946
2947
1.11k
        if (!(k = (char *)va_arg(ap, char *)))
2948
559
            break;
2949
559
        if (!(v = va_arg(ap, char *)))
2950
0
            v = "";
2951
2952
559
        tag = sam_hrecs_find_key(type, k, &prev);
2953
559
        if (!tag) {
2954
0
            if (!(tag = pool_alloc(hrecs->tag_pool)))
2955
0
                return -1;
2956
0
            if (prev)
2957
0
                prev->next = tag;
2958
0
            else
2959
0
                type->tag = tag;
2960
2961
0
            tag->next = NULL;
2962
0
        }
2963
2964
559
        tag->len = 3 + strlen(v);
2965
559
        str = string_alloc(hrecs->str_pool, tag->len+1);
2966
559
        if (!str)
2967
0
            return -1;
2968
2969
559
        if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0)
2970
0
            return -1;
2971
2972
559
        tag->str = str;
2973
559
    }
2974
2975
559
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2976
2977
559
    return 0;
2978
559
}
2979
2980
/*
2981
 * Adds or updates tag key,value pairs in a header line.
2982
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2983
 * @HD line.
2984
 *
2985
 * Specify multiple key,value pairs ending in NULL.
2986
 *
2987
 * Returns 0 on success
2988
 *        -1 on failure
2989
 */
2990
559
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2991
559
    va_list args;
2992
559
    int res;
2993
559
    va_start(args, type);
2994
559
    res = sam_hrecs_vupdate(hrecs, type, args);
2995
559
    va_end(args);
2996
559
    return res;
2997
559
}
2998
2999
/*
3000
 * Looks for a specific key in a single sam header line identified by *type.
3001
 * If prev is non-NULL it also fills this out with the previous tag, to
3002
 * permit use in key removal. *prev is set to NULL when the tag is the first
3003
 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
3004
 * tag in the existing list.
3005
 *
3006
 * Returns the tag pointer on success
3007
 *         NULL on failure
3008
 */
3009
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
3010
                                   const char *key,
3011
1.19M
                                   sam_hrec_tag_t **prev) {
3012
1.19M
    sam_hrec_tag_t *tag, *p = NULL;
3013
1.19M
    if (!type)
3014
0
        return NULL;
3015
3016
1.66M
    for (tag = type->tag; tag; p = tag, tag = tag->next) {
3017
1.65M
        if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
3018
1.18M
            if (prev)
3019
559
                *prev = p;
3020
1.18M
            return tag;
3021
1.18M
        }
3022
1.65M
    }
3023
3024
6.67k
    if (prev)
3025
0
        *prev = p;
3026
3027
6.67k
    return NULL;
3028
1.19M
}
3029
3030
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
3031
                         sam_hrec_type_t *type,
3032
0
                         const char *key) {
3033
0
    sam_hrec_tag_t *tag, *prev;
3034
0
    if (!hrecs)
3035
0
        return -1;
3036
0
    tag = sam_hrecs_find_key(type, key, &prev);
3037
0
    if (!tag)
3038
0
        return 0; // Not there anyway
3039
3040
0
    if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') {
3041
0
        assert(tag->len >= 3);
3042
0
        sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL);
3043
0
        if (sn_tag) {
3044
0
            assert(sn_tag->len >= 3);
3045
0
            khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3);
3046
0
            if (k != kh_end(hrecs->ref_hash))
3047
0
                sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3);
3048
0
        }
3049
0
    }
3050
3051
0
    if (!prev) { //first tag
3052
0
        type->tag = tag->next;
3053
0
    } else {
3054
0
        prev->next = tag->next;
3055
0
    }
3056
0
    pool_free(hrecs->tag_pool, tag);
3057
0
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
3058
3059
0
    return 1;
3060
0
}
3061
3062
/*
3063
 * Looks up a read-group by name and returns a pointer to the start of the
3064
 * associated tag list.
3065
 *
3066
 * Returns NULL on failure
3067
 */
3068
647
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) {
3069
647
    khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg);
3070
647
    return k == kh_end(hrecs->rg_hash)
3071
647
        ? NULL
3072
647
        : &hrecs->rg[kh_val(hrecs->rg_hash, k)];
3073
647
}
3074
3075
#if DEBUG_HEADER
3076
void sam_hrecs_dump(sam_hrecs_t *hrecs) {
3077
    khint_t k;
3078
    int i;
3079
3080
    printf("===DUMP===\n");
3081
    for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) {
3082
        sam_hrec_type_t *t1, *t2;
3083
        char c[2];
3084
        int idx = 0;
3085
3086
        if (!kh_exist(hrecs->h, k))
3087
            continue;
3088
3089
        t1 = t2 = kh_val(hrecs->h, k);
3090
        c[0] = kh_key(hrecs->h, k)>>8;
3091
        c[1] = kh_key(hrecs->h, k)&0xff;
3092
        printf("Type %.2s\n", c);
3093
3094
        do {
3095
            sam_hrec_tag_t *tag;
3096
            printf(">>>%d ", idx++);
3097
            for (tag = t1->tag; tag; tag=tag->next) {
3098
                if (strncmp(c, "CO", 2))
3099
                    printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3);
3100
                else
3101
                    printf("%s", tag->str);
3102
            }
3103
            putchar('\n');
3104
            t1 = t1->next;
3105
        } while (t1 != t2);
3106
    }
3107
3108
    /* Dump out PG chains */
3109
    printf("\n@PG chains:\n");
3110
    for (i = 0; i < hrecs->npg_end; i++) {
3111
        int j;
3112
        printf("  %d:", i);
3113
        for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) {
3114
            printf("%s%d(%.*s)",
3115
                   j == hrecs->pg_end[i] ? " " : "->",
3116
                   j, hrecs->pg[j].name_len, hrecs->pg[j].name);
3117
        }
3118
        printf("\n");
3119
    }
3120
3121
    puts("===END DUMP===");
3122
}
3123
#endif
3124
3125
/*
3126
 * Returns the sort order:
3127
 */
3128
0
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) {
3129
0
    khint_t k;
3130
0
    enum sam_sort_order so;
3131
3132
0
    so = ORDER_UNKNOWN;
3133
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
3134
0
    if (k != kh_end(hrecs->h)) {
3135
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
3136
0
        sam_hrec_tag_t *tag;
3137
0
        for (tag = ty->tag; tag; tag = tag->next) {
3138
0
            if (tag->str[0] == 'S' && tag->str[1] == 'O') {
3139
0
                if (strcmp(tag->str+3, "unsorted") == 0)
3140
0
                    so = ORDER_UNSORTED;
3141
0
                else if (strcmp(tag->str+3, "queryname") == 0)
3142
0
                    so = ORDER_NAME;
3143
0
                else if (strcmp(tag->str+3, "coordinate") == 0)
3144
0
                    so = ORDER_COORD;
3145
0
                else if (strcmp(tag->str+3, "unknown") != 0)
3146
0
                    hts_log_error("Unknown sort order field: %s", tag->str+3);
3147
0
            }
3148
0
        }
3149
0
    }
3150
3151
0
    return so;
3152
0
}
3153
3154
0
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) {
3155
0
    khint_t k;
3156
0
    enum sam_group_order go;
3157
3158
0
    go = ORDER_NONE;
3159
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
3160
0
    if (k != kh_end(hrecs->h)) {
3161
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
3162
0
        sam_hrec_tag_t *tag;
3163
0
        for (tag = ty->tag; tag; tag = tag->next) {
3164
0
            if (tag->str[0] == 'G' && tag->str[1] == 'O') {
3165
0
                if (strcmp(tag->str+3, "query") == 0)
3166
0
                    go = ORDER_QUERY;
3167
0
                else if (strcmp(tag->str+3, "reference") == 0)
3168
0
                    go = ORDER_REFERENCE;
3169
0
            }
3170
0
        }
3171
0
    }
3172
3173
0
    return go;
3174
0
}