Coverage Report

Created: 2026-06-10 06:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/header.c
Line
Count
Source
1
/*
2
Copyright (c) 2018-2020, 2023, 2025 Genome Research Ltd.
3
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <string.h>
35
#include <assert.h>
36
#include <errno.h>
37
#include "textutils_internal.h"
38
#include "header.h"
39
#include "htslib/bgzf.h"
40
#include "htslib/hfile.h"
41
#include "htslib/hts_alloc.h"
42
#include "htslib/kseq.h"
43
44
// Hash table for removing multiple lines from the header
45
KHASH_SET_INIT_STR(rm)
46
// Used for long refs in SAM files
47
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
48
49
typedef khash_t(rm) rmhash_t;
50
51
static int sam_hdr_link_pg(sam_hdr_t *bh);
52
53
static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
54
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
55
56
57
743
#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
58
579
static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) {
59
579
    int j;
60
61
579
    if (len > MAX_ERROR_QUOTE)
62
164
        len = MAX_ERROR_QUOTE;
63
63.4k
    for (j = 0; j < len && line[j] != '\n'; j++)
64
62.8k
        ;
65
579
    hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line);
66
579
}
67
68
/* ==== Static methods ==== */
69
70
38.9k
static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) {
71
38.9k
    if (!hrecs)
72
0
        return -1;
73
74
38.9k
    if (!type_list) {
75
38.9k
        hrecs->type_count = 5;
76
38.9k
        hrecs->type_order = calloc(hrecs->type_count, 3);
77
38.9k
        if (!hrecs->type_order)
78
0
            return -1;
79
38.9k
        memcpy(hrecs->type_order[0], "HD", 2);
80
38.9k
        memcpy(hrecs->type_order[1], "SQ", 2);
81
38.9k
        memcpy(hrecs->type_order[2], "RG", 2);
82
38.9k
        memcpy(hrecs->type_order[3], "PG", 2);
83
38.9k
        memcpy(hrecs->type_order[4], "CO", 2);
84
38.9k
    }
85
86
38.9k
    return 0;
87
38.9k
}
88
89
2.43k
static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) {
90
2.43k
    const char *token;
91
2.43k
    ks_tokaux_t aux;
92
93
2.43k
    if (!list)
94
2.39k
        return 0;
95
96
262
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
97
216
        if (aux.p == token)
98
9
            continue;
99
100
207
        char *name = string_ndup(hrecs->str_pool, token, aux.p - token);
101
207
        if (!name)
102
0
            return -1;
103
207
        int r;
104
207
        khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r);
105
207
        if (r < 0) return -1;
106
107
207
        if (r > 0)
108
153
            kh_val(hrecs->ref_hash, k) = nref;
109
54
        else if (kh_val(hrecs->ref_hash, k) != nref)
110
50
            hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name);
111
207
    }
112
113
46
    return 0;
114
46
}
115
116
0
static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) {
117
0
    const char *token, *sn;
118
0
    ks_tokaux_t aux;
119
0
    kstring_t str = KS_INITIALIZE;
120
121
0
    if (expected < 0 || expected >= hrecs->nref)
122
0
        return;
123
0
    sn = hrecs->ref[expected].name;
124
125
0
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
126
0
        kputsn(token, aux.p - token, ks_clear(&str));
127
0
        khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s);
128
0
        if (k != kh_end(hrecs->ref_hash)
129
0
            && kh_val(hrecs->ref_hash, k) == expected
130
0
            && strcmp(sn, str.s) != 0)
131
0
            kh_del(m_s2i, hrecs->ref_hash, k);
132
0
    }
133
134
0
    free(str.s);
135
0
}
136
137
/* Updates the hash tables in the sam_hrecs_t structure.
138
 *
139
 * Returns 0 on success;
140
 *        -1 on failure
141
 */
142
static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
143
                                   khint32_t type,
144
25.8k
                                   sam_hrec_type_t *h_type) {
145
    /* Add to reference hash? */
146
25.8k
    if (type == TYPEKEY("SQ")) {
147
11.3k
        sam_hrec_tag_t *tag = h_type->tag;
148
11.3k
        int nref = hrecs->nref;
149
11.3k
        const char *name = NULL;
150
11.3k
        const char *altnames = NULL;
151
11.3k
        hts_pos_t len = -1;
152
11.3k
        int r, invLN = 0;
153
11.3k
        khint_t k;
154
155
321k
        while (tag) {
156
310k
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
157
106k
                assert(tag->len >= 3);
158
106k
                name = tag->str+3;
159
204k
            } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
160
106k
                assert(tag->len >= 3);
161
106k
                hts_pos_t tmp = len;
162
106k
                len = strtoll(tag->str+3, NULL, 10);
163
106k
                if (tmp != -1 && tmp != len) {  //duplicate and different LN
164
339
                    invLN = 1;
165
339
                }
166
106k
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
167
78.0k
                assert(tag->len >= 3);
168
78.0k
                altnames = tag->str+3;
169
78.0k
            }
170
310k
            tag = tag->next;
171
310k
        }
172
173
11.3k
        if (!name) {
174
93
            hts_log_error("Header includes @SQ line with no SN: tag");
175
93
            return -1; // SN should be present, according to spec.
176
93
        }
177
178
11.2k
        if (len <= 0) {
179
8.74k
            hts_log_error("Header includes @SQ line \"%s\" with invalid or no LN: tag",
180
8.74k
                          name);
181
8.74k
            return -1; // LN should be present, according to spec.
182
8.74k
        }
183
184
2.49k
        if (invLN) {
185
47
            hts_log_error("Header includes @SQ line \"%s\" with multiple LN:"
186
47
            " tag with different values.", name);
187
47
            return -1; // LN should not be duplicated or be same
188
47
        }
189
190
        // Seen already?
191
2.45k
        k = kh_get(m_s2i, hrecs->ref_hash, name);
192
2.45k
        if (k < kh_end(hrecs->ref_hash)) {
193
1.23k
            nref = kh_val(hrecs->ref_hash, k);
194
1.23k
            int ref_changed_flag = 0;
195
196
            // Check for hash entry added by sam_hrecs_refs_from_targets_array()
197
1.23k
            if (hrecs->ref[nref].ty == NULL) {
198
                // Attach header line to existing stub entry.
199
1.22k
                hrecs->ref[nref].ty = h_type;
200
                // Check lengths match; correct if not.
201
1.22k
                if (len != hrecs->ref[nref].len) {
202
0
                    char tmp[32];
203
0
                    snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
204
0
                             hrecs->ref[nref].len);
205
0
                    if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
206
0
                        return -1;
207
0
                    ref_changed_flag = 1;
208
0
                }
209
1.22k
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
210
0
                    return -1;
211
212
1.22k
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
213
0
                    hrecs->refs_changed = nref;
214
1.22k
                return 0;
215
1.22k
            }
216
217
            // Check to see if an existing entry is being updated
218
16
            if (hrecs->ref[nref].ty == h_type) {
219
0
                if (hrecs->ref[nref].len != len) {
220
0
                    hrecs->ref[nref].len = len;
221
0
                    ref_changed_flag = 1;
222
0
                }
223
0
                if (!hrecs->ref[nref].name || strcmp(hrecs->ref[nref].name, name)) {
224
0
                    hrecs->ref[nref].name = name;
225
0
                    ref_changed_flag = 1;
226
0
                }
227
0
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
228
0
                    return -1;
229
230
0
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
231
0
                    hrecs->refs_changed = nref;
232
0
                return 0;
233
0
            }
234
235
            // If here, the name is a duplicate.
236
            // Check to see if it matches the SN: tag from the earlier record.
237
16
            if (strcmp(hrecs->ref[nref].name, name) == 0) {
238
16
                hts_log_error("Duplicate entry \"%s\" in sam header",
239
16
                                name);
240
16
                return -1;
241
16
            }
242
243
            // Clash with an already-seen altname
244
            // As SN: should be preferred to AN: add this as a new
245
            // record and update the hash entry to point to it.
246
0
            hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name);
247
0
            nref = hrecs->nref;
248
0
        }
249
250
1.21k
        if (nref == hrecs->ref_sz) {
251
17
            size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32;
252
17
            sam_hrec_sq_t *new_ref = hts_realloc_p(hrecs->ref, sizeof(*hrecs->ref), new_sz);
253
17
            if (!new_ref)
254
0
                return -1;
255
17
            hrecs->ref = new_ref;
256
17
            hrecs->ref_sz = new_sz;
257
17
        }
258
259
1.21k
        hrecs->ref[nref].name = name;
260
1.21k
        hrecs->ref[nref].len  = len;
261
1.21k
        hrecs->ref[nref].ty = h_type;
262
263
1.21k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r);
264
1.21k
        if (-1 == r) return -1;
265
1.21k
        kh_val(hrecs->ref_hash, k) = nref;
266
267
1.21k
        if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
268
0
            return -1;
269
270
1.21k
        if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref)
271
571
            hrecs->refs_changed = hrecs->nref;
272
1.21k
        hrecs->nref++;
273
1.21k
    }
274
275
    /* Add to read-group hash? */
276
15.7k
    if (type == TYPEKEY("RG")) {
277
3.02k
        sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL);
278
3.02k
        int nrg = hrecs->nrg, r;
279
3.02k
        khint_t k;
280
281
3.02k
        if (!tag) {
282
61
            hts_log_error("Header includes @RG line with no ID: tag");
283
61
            return -1;  // ID should be present, according to spec.
284
61
        }
285
3.02k
        assert(tag->str && tag->len >= 3);
286
287
        // Seen already?
288
2.96k
        k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3);
289
2.96k
        if (k < kh_end(hrecs->rg_hash)) {
290
1.28k
            nrg = kh_val(hrecs->rg_hash, k);
291
1.28k
            assert(hrecs->rg[nrg].ty != NULL);
292
1.28k
            if (hrecs->rg[nrg].ty != h_type) {
293
1.28k
                hts_log_warning("Duplicate entry \"%s\" in sam header",
294
1.28k
                                tag->str + 3);
295
1.28k
            } else {
296
0
                hrecs->rg[nrg].name = tag->str + 3;
297
0
                hrecs->rg[nrg].name_len = tag->len - 3;
298
0
            }
299
1.28k
            return 0;
300
1.28k
        }
301
302
1.68k
        if (nrg == hrecs->rg_sz) {
303
883
            size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4;
304
883
            sam_hrec_rg_t *new_rg = hts_realloc_p(hrecs->rg, sizeof(*hrecs->rg), new_sz);
305
883
            if (!new_rg)
306
0
                return -1;
307
883
            hrecs->rg = new_rg;
308
883
            hrecs->rg_sz = new_sz;
309
883
        }
310
311
1.68k
        hrecs->rg[nrg].name = tag->str + 3;
312
1.68k
        hrecs->rg[nrg].name_len = tag->len - 3;
313
1.68k
        hrecs->rg[nrg].ty   = h_type;
314
1.68k
        hrecs->rg[nrg].id   = nrg;
315
316
1.68k
        k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r);
317
1.68k
        if (-1 == r) return -1;
318
1.68k
        kh_val(hrecs->rg_hash, k) = nrg;
319
320
1.68k
        hrecs->nrg++;
321
1.68k
    }
322
323
    /* Add to program hash? */
324
14.4k
    if (type == TYPEKEY("PG")) {
325
3.82k
        sam_hrec_tag_t *tag;
326
3.82k
        sam_hrec_pg_t *new_pg;
327
3.82k
        int npg = hrecs->npg;
328
329
3.82k
        if (npg == hrecs->pg_sz) {
330
1.38k
            size_t new_sz = hrecs->pg_sz >= 4 ? hrecs->pg_sz + hrecs->pg_sz / 4 : 4;
331
1.38k
            new_pg = hts_realloc_p(hrecs->pg, sizeof(*hrecs->pg), new_sz);
332
1.38k
            if (!new_pg)
333
0
                return -1;
334
1.38k
            hrecs->pg = new_pg;
335
1.38k
            hrecs->pg_sz = new_sz;
336
1.38k
        }
337
338
3.82k
        tag = h_type->tag;
339
3.82k
        hrecs->pg[npg].name = NULL;
340
3.82k
        hrecs->pg[npg].name_len = 0;
341
3.82k
        hrecs->pg[npg].ty  = h_type;
342
3.82k
        hrecs->pg[npg].id   = npg;
343
3.82k
        hrecs->pg[npg].prev_id = -1;
344
345
12.7k
        while (tag) {
346
8.88k
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
347
                /* Avoid duplicate ID tags coming from other applications */
348
4.90k
                if (!hrecs->pg[npg].name) {
349
3.70k
                    assert(tag->len >= 3);
350
3.70k
                    hrecs->pg[npg].name = tag->str + 3;
351
3.70k
                    hrecs->pg[npg].name_len = tag->len - 3;
352
3.70k
                } else {
353
1.20k
                    hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name);
354
1.20k
                }
355
4.90k
            } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
356
                // Resolve later if needed
357
2.31k
                khint_t k;
358
2.31k
                k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
359
360
2.31k
                if (k != kh_end(hrecs->pg_hash)) {
361
1.43k
                    int p_id = kh_val(hrecs->pg_hash, k);
362
1.43k
                    hrecs->pg[npg].prev_id = hrecs->pg[p_id].id;
363
364
                    /* Unmark previous entry as a PG termination */
365
1.43k
                    if (hrecs->npg_end > 0 &&
366
1.38k
                        hrecs->pg_end[hrecs->npg_end-1] == p_id) {
367
178
                        hrecs->npg_end--;
368
1.25k
                    } else {
369
1.25k
                        int i;
370
13.1k
                        for (i = 0; i < hrecs->npg_end; i++) {
371
11.8k
                            if (hrecs->pg_end[i] == p_id) {
372
205
                                memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1],
373
205
                                        (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end));
374
205
                                hrecs->npg_end--;
375
205
                            }
376
11.8k
                        }
377
1.25k
                    }
378
1.43k
                } else {
379
878
                    hrecs->pg[npg].prev_id = -1;
380
878
                }
381
2.31k
            }
382
8.88k
            tag = tag->next;
383
8.88k
        }
384
385
3.82k
        if (hrecs->pg[npg].name) {
386
3.70k
            khint_t k;
387
3.70k
            int r;
388
3.70k
            k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r);
389
3.70k
            if (-1 == r) return -1;
390
3.70k
            kh_val(hrecs->pg_hash, k) = npg;
391
3.70k
        } else {
392
123
            return -1; // ID should be present, according to spec.
393
123
        }
394
395
        /* Add to npg_end[] array. Remove later if we find a PP line */
396
3.70k
        if (hrecs->npg_end >= hrecs->npg_end_alloc) {
397
651
            int *new_pg_end;
398
651
            int  new_alloc = hrecs->npg_end_alloc ? hrecs->npg_end_alloc*2 : 4;
399
400
651
            new_pg_end = hts_realloc_p(hrecs->pg_end, sizeof(int), new_alloc);
401
651
            if (!new_pg_end)
402
0
                return -1;
403
651
            hrecs->npg_end_alloc = new_alloc;
404
651
            hrecs->pg_end = new_pg_end;
405
651
        }
406
3.70k
        hrecs->pg_end[hrecs->npg_end++] = npg;
407
408
3.70k
        hrecs->npg++;
409
3.70k
    }
410
411
14.3k
    return 0;
412
14.4k
}
413
414
0
static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, khint32_t type, sam_hrec_type_t *h_type) {
415
0
    if (!hrecs || !h_type)
416
0
        return -1;
417
418
0
    sam_hrec_tag_t *tag;
419
0
    const char *key = NULL;
420
0
    khint_t k;
421
422
    /* Remove name and any alternative names from reference hash */
423
0
    if (type == TYPEKEY("SQ")) {
424
0
        const char *altnames = NULL;
425
426
0
        tag = h_type->tag;
427
428
0
        while (tag) {
429
0
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
430
0
                assert(tag->len >= 3);
431
0
                key = tag->str + 3;
432
0
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
433
0
                assert(tag->len >= 3);
434
0
                altnames = tag->str + 3;
435
0
            }
436
0
            tag = tag->next;
437
0
        }
438
439
0
        if (key) {
440
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
441
0
            if (k != kh_end(hrecs->ref_hash)) {
442
0
                int idx = kh_val(hrecs->ref_hash, k);
443
0
                if (idx + 1 < hrecs->nref)
444
0
                    memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
445
0
                            sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
446
0
                if (altnames)
447
0
                    sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
448
0
                kh_del(m_s2i, hrecs->ref_hash, k);
449
0
                hrecs->nref--;
450
0
                if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
451
0
                    hrecs->refs_changed = idx;
452
0
                for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
453
0
                    if (kh_exist(hrecs->ref_hash, k)
454
0
                        && kh_value(hrecs->ref_hash, k) > idx) {
455
0
                        kh_value(hrecs->ref_hash, k)--;
456
0
                    }
457
0
                }
458
0
            }
459
0
        }
460
0
    }
461
462
    /* Remove from read-group hash */
463
0
    if (type == TYPEKEY("RG")) {
464
0
        tag = h_type->tag;
465
466
0
        while (tag) {
467
0
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
468
0
                assert(tag->len >= 3);
469
0
                key = tag->str + 3;
470
0
                k = kh_get(m_s2i, hrecs->rg_hash, key);
471
0
                if (k != kh_end(hrecs->rg_hash)) {
472
0
                    int idx = kh_val(hrecs->rg_hash, k);
473
0
                    if (idx + 1 < hrecs->nrg)
474
0
                        memmove(&hrecs->rg[idx], &hrecs->rg[idx+1], sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
475
0
                    kh_del(m_s2i, hrecs->rg_hash, k);
476
0
                    hrecs->nrg--;
477
0
                    for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
478
0
                        if (kh_exist(hrecs->rg_hash, k)
479
0
                            && kh_value(hrecs->rg_hash, k) > idx) {
480
0
                            kh_value(hrecs->rg_hash, k)--;
481
0
                        }
482
0
                    }
483
0
                }
484
0
                break;
485
0
            }
486
0
            tag = tag->next;
487
0
        }
488
0
    }
489
490
0
    return 0;
491
0
}
492
493
/** Add a header record to the global line ordering
494
 *
495
 * If @p after is not NULL, the new record will be inserted after this one,
496
 * otherwise it will go at the end.
497
 *
498
 * An exception is an HD record, which will always be put first unless
499
 * one is already present.
500
 */
501
static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
502
                                      sam_hrec_type_t *h_type,
503
2.25M
                                      sam_hrec_type_t *after) {
504
2.25M
    const khint32_t hd_type = TYPEKEY("HD");
505
2.25M
    int update_first_line = 0;
506
507
    // First line seen
508
2.25M
    if (!hrecs->first_line) {
509
16.2k
        hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
510
16.2k
        return;
511
16.2k
    }
512
513
    // @HD goes at the top (unless there's one already)
514
2.24M
    if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
515
38
        after = hrecs->first_line->global_prev;
516
38
        update_first_line = 1;
517
38
    }
518
519
    // If no instructions given, put it at the end
520
2.24M
    if (!after)
521
2.24M
        after = hrecs->first_line->global_prev;
522
523
2.24M
    h_type->global_prev = after;
524
2.24M
    h_type->global_next = after->global_next;
525
2.24M
    h_type->global_prev->global_next = h_type;
526
2.24M
    h_type->global_next->global_prev = h_type;
527
528
2.24M
    if (update_first_line)
529
38
        hrecs->first_line = h_type;
530
2.24M
}
531
532
/*! Add header record with a va_list interface.
533
 *
534
 * Adds a single record to a SAM header.
535
 *
536
 * This takes a header record type, a va_list argument and one or more
537
 * key,value pairs, ending with the NULL key.
538
 *
539
 * Eg. sam_hrecs_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
540
 *
541
 * The purpose of the additional va_list parameter is to permit other
542
 * varargs functions to call this while including their own additional
543
 * parameters; an example is in sam_hdr_add_pg().
544
 *
545
 * Note: this function invokes va_arg at least once, making the value
546
 * of ap indeterminate after the return. The caller should call
547
 * va_start/va_end before/after calling this function or use va_copy.
548
 *
549
 * @return
550
 * Returns >= 0 on success;
551
 *        -1 on failure
552
 */
553
619
static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
554
619
    va_list args;
555
619
    sam_hrec_type_t *h_type;
556
619
    sam_hrec_tag_t *h_tag, *last=NULL;
557
619
    int new;
558
619
    khint32_t type_i = TYPEKEY(type), k;
559
560
619
    if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
561
0
        return sam_hrecs_vupdate(hrecs, h_type, ap);
562
563
619
    if (!(h_type = pool_alloc(hrecs->type_pool)))
564
0
        return -1;
565
619
    k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
566
619
    if (new < 0)
567
0
        return -1;
568
569
619
    h_type->type = type_i;
570
571
    // Form the ring, either with self or other lines of this type
572
619
    if (!new) {
573
41
        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
574
41
        p = t->prev;
575
576
41
        assert(p->next == t);
577
41
        p->next = h_type;
578
41
        h_type->prev = p;
579
580
41
        t->prev = h_type;
581
41
        h_type->next = t;
582
578
    } else {
583
578
        kh_val(hrecs->h, k) = h_type;
584
578
        h_type->prev = h_type->next = h_type;
585
578
    }
586
619
    h_type->tag = NULL;
587
588
    // Add to global line ordering after any existing line of the same type,
589
    // or at the end if no line of this type exists yet.
590
619
    sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
591
592
    // Check linked-list invariants
593
619
    assert(h_type->prev->next == h_type);
594
619
    assert(h_type->next->prev == h_type);
595
619
    assert(h_type->global_prev->global_next == h_type);
596
619
    assert(h_type->global_next->global_prev == h_type);
597
598
    // Any ... varargs
599
619
    va_start(args, ap);
600
619
    for (;;) {
601
619
        char *key, *val = NULL, *str;
602
603
619
        if (!(key = (char *)va_arg(args, char *)))
604
619
            break;
605
0
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
606
0
            break;
607
0
        if (*val == '\0')
608
0
            continue;
609
610
0
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
611
0
            return -1;
612
613
0
        if (strncmp(type, "CO", 2)) {
614
0
            h_tag->len = 3 + strlen(val);
615
0
            str = string_alloc(hrecs->str_pool, hts_add_sat2(h_tag->len, 1));
616
0
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
617
0
                return -1;
618
0
            h_tag->str = str;
619
0
        } else {
620
0
            h_tag->len = strlen(key);
621
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
622
0
            if (!h_tag->str)
623
0
                return -1;
624
0
        }
625
626
0
        h_tag->next = NULL;
627
0
        if (last)
628
0
            last->next = h_tag;
629
0
        else
630
0
            h_type->tag = h_tag;
631
632
0
        last = h_tag;
633
0
    }
634
619
    va_end(args);
635
636
    // Plus the specified va_list params
637
1.85k
    for (;;) {
638
1.85k
        char *key, *val = NULL, *str;
639
640
1.85k
        if (!(key = (char *)va_arg(ap, char *)))
641
619
            break;
642
1.23k
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
643
0
            break;
644
645
1.23k
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
646
0
            return -1;
647
648
1.23k
        if (strncmp(type, "CO", 2)) {
649
1.23k
            h_tag->len = 3 + strlen(val);
650
1.23k
            str = string_alloc(hrecs->str_pool, hts_add_sat2(h_tag->len, 1));
651
1.23k
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
652
0
                return -1;
653
1.23k
            h_tag->str = str;
654
1.23k
        } else {
655
0
            h_tag->len = strlen(key);
656
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
657
0
            if (!h_tag->str)
658
0
                return -1;
659
0
        }
660
661
1.23k
        h_tag->next = NULL;
662
1.23k
        if (last)
663
619
            last->next = h_tag;
664
619
        else
665
619
            h_type->tag = h_tag;
666
667
1.23k
        last = h_tag;
668
1.23k
    }
669
670
619
    if (-1 == sam_hrecs_update_hashes(hrecs, TYPEKEY(type), h_type))
671
8
        return -1;
672
673
611
    if (!strncmp(type, "PG", 2))
674
0
        hrecs->pgs_changed = 1;
675
676
611
    hrecs->dirty = 1;
677
678
611
    return 0;
679
619
}
680
681
// As sam_hrecs_vadd(), but without the extra va_list parameter
682
619
static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...) {
683
619
    va_list args;
684
619
    int res;
685
619
    va_start(args, type);
686
619
    res = sam_hrecs_vadd(hrecs, type, args, NULL);
687
619
    va_end(args);
688
619
    return res;
689
619
}
690
691
/*
692
 * Function for deallocating a list of tags
693
 */
694
695
796k
static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
696
796k
    if (!hrecs || !tag)
697
192
        return;
698
796k
    if (tag->next)
699
796k
        sam_hrecs_free_tags(hrecs, tag->next);
700
701
796k
    pool_free(hrecs->tag_pool, tag);
702
796k
}
703
704
0
static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found, int remove_hash) {
705
0
    if (!hrecs || !type_name || !type_found)
706
0
        return -1;
707
708
0
    khint32_t itype = TYPEKEY(type_name);
709
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
710
0
    if (k == kh_end(hrecs->h))
711
0
        return -1;
712
713
    // Remove from global list (remembering it could be the only line)
714
0
    if (hrecs->first_line == type_found) {
715
0
        hrecs->first_line = (type_found->global_next != type_found
716
0
                             ? type_found->global_next : NULL);
717
0
    }
718
0
    type_found->global_next->global_prev = type_found->global_prev;
719
0
    type_found->global_prev->global_next = type_found->global_next;
720
721
    /* single element in the list */
722
0
    if (type_found->prev == type_found || type_found->next == type_found) {
723
0
        kh_del(sam_hrecs_t, hrecs->h, k);
724
0
    } else {
725
0
        type_found->prev->next = type_found->next;
726
0
        type_found->next->prev = type_found->prev;
727
0
        if (kh_val(hrecs->h, k) == type_found) { //first element
728
0
            kh_val(hrecs->h, k) = type_found->next;
729
0
        }
730
0
    }
731
732
0
    if (remove_hash && (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2)))
733
0
        sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
734
735
0
    sam_hrecs_free_tags(hrecs, type_found->tag);
736
0
    pool_free(hrecs->type_pool, type_found);
737
738
0
    hrecs->dirty = 1;
739
740
0
    return 0;
741
0
}
742
743
// Paste together a line from the parsed data structures
744
23.9k
static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
745
23.9k
    sam_hrec_tag_t *tag;
746
23.9k
    int r = 0;
747
23.9k
    char c[2]= { ty->type >> 8, ty->type & 0xff };
748
749
23.9k
    r |= (kputc_('@', ks) == EOF);
750
23.9k
    r |= (kputsn(c, 2, ks) == EOF);
751
63.3k
    for (tag = ty->tag; tag; tag = tag->next) {
752
39.4k
        r |= (kputc_('\t', ks) == EOF);
753
39.4k
        r |= (kputsn(tag->str, tag->len, ks) == EOF);
754
39.4k
    }
755
756
23.9k
    return r;
757
23.9k
}
758
759
10.3k
static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
760
10.3k
    const sam_hrec_type_t *t1, *t2;
761
762
10.3k
    if (!hrecs->first_line)
763
0
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
764
765
10.3k
    t1 = t2 = hrecs->first_line;
766
23.9k
    do {
767
23.9k
        if (build_header_line(t1, ks) != 0)
768
0
            return -1;
769
23.9k
        if (kputc('\n', ks) < 0)
770
0
            return -1;
771
772
23.9k
        t1 = t1->global_next;
773
23.9k
    } while (t1 != t2);
774
775
10.3k
    return 0;
776
10.3k
}
777
778
// Print warnings about known log messages that may have been inadvertently
779
// merged into SAM files.
780
781
15
static void known_stderr(const char *tool, const char *advice) {
782
15
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
783
15
    hts_log_warning("%s", advice);
784
15
}
785
786
// Look for known log messages that may have been inadvertently merged into SAM
787
// files, and print advice on how to avoid the problem if one is spotted.
788
789
445
static void warn_if_known_stderr(const char *line, size_t len) {
790
445
    int ilen = len < INT_MAX ? len : INT_MAX;
791
445
    if (kmemmem(line, ilen, "M::bwa_idx_load_from_disk", 25, NULL) != NULL)
792
4
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`");
793
441
    else if (kmemmem(line, ilen, "M::mem_pestat", 13, NULL) != NULL)
794
7
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`");
795
434
    else if (kmemmem(line, ilen, "loaded/built the index", 22, NULL) != NULL)
796
4
        known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`");
797
445
}
798
799
// Parse a CO line
800
// Returns 0 on failure; line length (>= 1) on success
801
static size_t parse_comment_line(const char *hdr, size_t len, size_t lno,
802
8.52k
                                 sam_hrecs_t *hrecs, sam_hrec_type_t *h_type) {
803
8.52k
    size_t j;
804
8.52k
    sam_hrec_tag_t *h_tag;
805
806
8.52k
    if (len == 0 || hdr[0] != '\t') {
807
43
        sam_hrecs_error("Missing tab", hdr, len, lno);
808
43
        warn_if_known_stderr(hdr, len);
809
43
        return 0;
810
43
    }
811
812
336k
    for (j = 1; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
813
327k
        ;
814
815
8.48k
    if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
816
0
        return -1;
817
8.48k
    h_tag->str = string_ndup(hrecs->str_pool, hdr + 1, j - 1);
818
8.48k
    h_tag->len = j - 1;
819
8.48k
    h_tag->next = NULL;
820
8.48k
    if (!h_tag->str)
821
0
        return 0;
822
8.48k
    return j;
823
8.48k
}
824
825
// Parse header lines other than CO
826
// Returns 0 on failure; line length (>= 1) on success
827
static size_t parse_noncomment_line(const char *hdr, size_t len, size_t lno,
828
                                    sam_hrecs_t *hrecs, sam_hrec_type_t *h_type,
829
2.25M
                                    khint_t *sq_count, khint_t *an_count) {
830
2.25M
    size_t i = 0;
831
2.25M
    sam_hrec_tag_t *h_tag, *last = NULL;
832
2.25M
    int is_sq = h_type->type == TYPEKEY("SQ");
833
834
    // Count reference name
835
2.25M
    if (is_sq)
836
13.4k
        (*sq_count)++;
837
838
4.12M
    do {
839
4.12M
        size_t j;
840
841
4.12M
        if (i == len || hdr[i] != '\t') {
842
38
            sam_hrecs_error("Missing tab", hdr, len, lno);
843
38
            warn_if_known_stderr(hdr, len);
844
38
            return 0;
845
38
        }
846
847
4.12M
        for (j = ++i;
848
322M
             j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t';
849
318M
             j++)
850
318M
            ;
851
852
4.12M
        if (j - i < 3 || hdr[i + 2] != ':') {
853
331
            sam_hrecs_error("Malformed key:value pair", hdr, len, lno);
854
331
            warn_if_known_stderr(hdr, len);
855
331
            return 0;
856
331
        }
857
858
        // Count alternate names
859
4.12M
        if (is_sq && hdr[i] == 'A' && hdr[i + 1] == 'N') {
860
333k
            size_t k;
861
333k
            (*an_count)++;
862
2.99M
            for (k = i + 2; k < j; k++)
863
2.66M
                (*an_count) += hdr[k] == ',';
864
333k
        }
865
866
4.12M
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
867
0
            return 0;
868
4.12M
        h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
869
4.12M
        h_tag->len = j-i;
870
4.12M
        h_tag->next = NULL;
871
4.12M
        if (!h_tag->str)
872
0
            return 0;
873
874
4.12M
        if (last)
875
1.87M
            last->next = h_tag;
876
2.24M
        else
877
2.24M
            h_type->tag = h_tag;
878
879
4.12M
        last = h_tag;
880
4.12M
        i = j;
881
4.12M
    } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
882
883
2.24M
    return i;
884
2.25M
}
885
886
/** Parse a single header line.
887
 * @param         hrecs        Destination sam_hrecs_t struct
888
 * @param         hdr          Data to parse
889
 * @param         len          Size of @p hdr
890
 * @param         lno          Line number, for error / warning messages
891
 * @param[out]    line_len_out Returned length of parsed line
892
 * @param[in,out] sq_count     Count of SQ lines seen so far
893
 * @param[in,out] an_count     Count of alternate names seen so far
894
 *
895
 * Used by sam_hrecs_parse_lines() and sam_hdr_build_from_sam_file(),
896
 * so has to be able to deal with lines that are not NUL terminated.
897
 * A single line pointed to by @ph hdr will be parsed into @p hrecs.
898
 * @p len gives the size of @p hdr, which may include more than one line
899
 * if called from sam_hrecs_parse_lines(). The length of the parsed line
900
 * is returned in @p line_len_out.
901
 * @p sq_count and @p an_count are pointers to the number of SQ lines
902
 * (i.e. references), and the number of alternate names.  These are used
903
 * by the caller to set the sizes of the sam_hrecs_t::ref array and
904
 * sam_hrecs_t::ref_hash table once all the lines have been parsed.
905
 */
906
907
static sam_hrec_type_t * sam_hrecs_parse_single_line(sam_hrecs_t *hrecs,
908
                                                     const char *hdr,
909
                                                     size_t len, size_t lno,
910
                                                     size_t *line_len_out,
911
                                                     khint_t *sq_count,
912
2.25M
                                                     khint_t *an_count) {
913
2.25M
    int new;
914
2.25M
    khint32_t type;
915
2.25M
    khint_t k;
916
2.25M
    sam_hrec_type_t *h_type;
917
2.25M
    size_t line_len = 0;
918
919
2.25M
    if (hdr[0] != '@') {
920
27
        sam_hrecs_error("Header line does not start with '@'",
921
27
                        hdr, len, lno);
922
27
        return NULL;
923
27
    }
924
925
2.25M
    if (!isalpha_c(hdr[1]) || !isalpha_c(hdr[2])) {
926
49
        sam_hrecs_error("Header line does not have a two character key",
927
49
                        hdr, len, lno);
928
49
        return NULL;
929
49
    }
930
2.25M
    type = TYPEKEY(&hdr[1]);
931
932
2.25M
    if (len < 3 || hdr[3] == '\n')
933
37
        return NULL;
934
935
    // Add the header line type
936
2.25M
    if (!(h_type = pool_alloc(hrecs->type_pool)))
937
0
        return NULL;
938
939
2.25M
    h_type->tag = NULL;
940
941
2.25M
    k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
942
2.25M
    if (new < 0)
943
0
        goto error;
944
945
2.25M
    h_type->type = type;
946
947
    // Parse the tags on this line
948
2.25M
    if (type == TYPEKEY("CO")) {
949
950
8.52k
        line_len = parse_comment_line(hdr + 3, len - 3, lno,
951
8.52k
                                      hrecs, h_type);
952
8.52k
        if (line_len == 0) // Failed
953
43
            goto error;
954
955
2.25M
    } else {
956
957
2.25M
        line_len = parse_noncomment_line(hdr + 3, len - 3, lno,
958
2.25M
                                         hrecs, h_type,
959
2.25M
                                         sq_count, an_count);
960
2.25M
        if (line_len == 0) // Failed
961
369
            goto error;
962
2.25M
    }
963
964
    // Add to end of global list
965
2.25M
    sam_hrecs_global_list_add(hrecs, h_type, NULL);
966
967
    // Form the ring, either with self or other lines of this type
968
2.25M
    if (!new) {
969
2.24M
        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
970
2.24M
        p = t->prev;
971
972
2.24M
        assert(p->next == t);
973
2.24M
        p->next = h_type;
974
2.24M
        h_type->prev = p;
975
976
2.24M
        t->prev = h_type;
977
2.24M
        h_type->next = t;
978
2.24M
    } else {
979
17.4k
        kh_val(hrecs->h, k) = h_type;
980
17.4k
        h_type->prev = h_type->next = h_type;
981
17.4k
    }
982
983
2.25M
    if (line_len_out)
984
6.05k
        *line_len_out = line_len + 3;
985
2.25M
    return h_type;
986
987
412
 error:
988
    // Tidy up as best we can
989
412
    sam_hrecs_free_tags(hrecs, h_type->tag);
990
412
    pool_free(hrecs->type_pool, h_type);
991
412
    return NULL;
992
2.25M
}
993
994
// Parse complete header into a sam_hrecs_t structure
995
996
2.79k
static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
997
2.79k
    size_t i, lno;
998
999
2.79k
    if (!hrecs || len > SSIZE_MAX)
1000
0
        return -1;
1001
1002
2.79k
    if (!len)
1003
0
        len = strlen(hdr);
1004
1005
2.79k
    if (len < 3) {
1006
364
        if (len == 0 || *hdr == '\0') return 0;
1007
28
        sam_hrecs_error("Header line too short", hdr, len, 1);
1008
28
        return -1;
1009
364
    }
1010
1011
2.43k
    sam_hrec_type_t *first_h_type = NULL;
1012
2.43k
    sam_hrec_type_t *last_h_type = NULL;
1013
2.43k
    khint_t sq_count = 0, an_count = 0;
1014
2.43k
    size_t line_len = 0;
1015
1016
8.48k
    for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; lno++) {
1017
6.24k
        sam_hrec_type_t *h_type;
1018
6.24k
        h_type = sam_hrecs_parse_single_line(hrecs, &hdr[i], len - i,
1019
6.24k
                                             lno, &line_len,
1020
6.24k
                                             &sq_count, &an_count);
1021
6.24k
        if (!h_type)
1022
192
            return -1;
1023
1024
6.05k
        i += line_len + 1;
1025
1026
        // Keep track of first / last new record so they can be passed to
1027
        // sam_hrecs_update_hashes() below.  Ignore HD as it may be stored
1028
        // out of order (it always goes at the start) and
1029
        // sam_hrecs_update_hashes() does nothing with HD.
1030
6.05k
        if (h_type->type != TYPEKEY("HD")) {
1031
5.74k
            if (!first_h_type)
1032
2.13k
                first_h_type = h_type;
1033
5.74k
            last_h_type = h_type;
1034
5.74k
        }
1035
6.05k
    }
1036
1037
    // Reserve space in hash tables and refs array
1038
2.24k
    if (sq_count) {
1039
        // It's possible that some entries are already in the hash table,
1040
        // e.g. via sam_hrecs_refs_from_targets_array(), so only reserve
1041
        // space beyond what's already there.
1042
603
        khint_t ref_name_count = sq_count + an_count;
1043
603
        khint_t sz = kh_size(hrecs->ref_hash);
1044
603
        if (ref_name_count > sz
1045
292
            && kh_grow_to_fit(m_s2i, hrecs->ref_hash, ref_name_count - sz) < 0)
1046
0
            return -1;
1047
1048
603
        if (hrecs->ref_sz < sq_count) {
1049
272
            sam_hrec_sq_t *new_ref = hts_realloc_p(hrecs->ref,
1050
272
                                                   sizeof(*hrecs->ref), sq_count);
1051
272
            if (!new_ref)
1052
0
                return -1;
1053
272
            hrecs->ref = new_ref;
1054
272
            hrecs->ref_sz = sq_count;
1055
272
        }
1056
603
    }
1057
1058
    // Update RG/SQ/PG hashes
1059
2.24k
    if (first_h_type) {
1060
2.09k
        sam_hrec_type_t *h_type = first_h_type->global_prev;
1061
5.14k
        do {
1062
5.14k
            h_type = h_type->global_next;
1063
5.14k
            if (-1 == sam_hrecs_update_hashes(hrecs, h_type->type, h_type))
1064
240
                return -1;
1065
5.14k
        } while (h_type != last_h_type);
1066
2.09k
    }
1067
2.00k
    return 0;
1068
2.24k
}
1069
1070
/*! Update sam_hdr_t target_name and target_len arrays
1071
 *
1072
 *  @return 0 on success; -1 on failure
1073
 */
1074
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
1075
31.9k
                                 int refs_changed) {
1076
31.9k
    if (!bh || !hrecs)
1077
0
        return -1;
1078
1079
31.9k
    if (refs_changed < 0)
1080
0
        return 0;
1081
1082
    // Grow arrays if necessary
1083
31.9k
    if (bh->n_targets < hrecs->nref) {
1084
1.64k
        char **new_names = hts_realloc_p(bh->target_name, sizeof(*new_names),
1085
1.64k
                                         hrecs->nref);
1086
1.64k
        if (!new_names)
1087
0
            return -1;
1088
1.64k
        bh->target_name = new_names;
1089
1.64k
        uint32_t *new_lens = hts_realloc_p(bh->target_len, sizeof(*new_lens),
1090
1.64k
                                           hrecs->nref);
1091
1.64k
        if (!new_lens)
1092
0
            return -1;
1093
1.64k
        bh->target_len = new_lens;
1094
1.64k
    }
1095
1096
    // Update names and lengths where changed
1097
    // hrecs->refs_changed is the first ref that has been updated, so ones
1098
    // before that can be skipped.
1099
31.9k
    int i;
1100
31.9k
    khint_t k;
1101
31.9k
    khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1102
35.0k
    for (i = refs_changed; i < hrecs->nref; i++) {
1103
3.10k
        if (i >= bh->n_targets
1104
3.10k
            || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
1105
3.10k
            if (i < bh->n_targets)
1106
0
                free(bh->target_name[i]);
1107
3.10k
            bh->target_name[i] = strdup(hrecs->ref[i].name);
1108
3.10k
            if (!bh->target_name[i])
1109
0
                return -1;
1110
3.10k
        }
1111
3.10k
        if (hrecs->ref[i].len < UINT32_MAX) {
1112
1.52k
            bh->target_len[i] = hrecs->ref[i].len;
1113
1114
1.52k
            if (!long_refs)
1115
1.44k
                continue;
1116
1117
            // Check if we have an old length, if so remove it.
1118
83
            k = kh_get(s2i, long_refs, bh->target_name[i]);
1119
83
            if (k < kh_end(long_refs))
1120
0
                kh_del(s2i, long_refs, k);
1121
1.57k
        } else {
1122
1.57k
            bh->target_len[i] = UINT32_MAX;
1123
1.57k
            if (bh->hrecs != hrecs) {
1124
                // Called from sam_hdr_dup; need to add sdict entries
1125
936
                if (!long_refs) {
1126
260
                    if (!(bh->sdict = long_refs = kh_init(s2i)))
1127
0
                        return -1;
1128
260
                }
1129
1130
                // Add / update length
1131
936
                int absent;
1132
936
                k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
1133
936
                if (absent < 0)
1134
0
                    return -1;
1135
936
                kh_val(long_refs, k) = hrecs->ref[i].len;
1136
936
            }
1137
1.57k
        }
1138
3.10k
    }
1139
1140
    // Free up any names that have been removed
1141
31.9k
    for (; i < bh->n_targets; i++) {
1142
0
        if (long_refs) {
1143
0
            k = kh_get(s2i, long_refs, bh->target_name[i]);
1144
0
            if (k < kh_end(long_refs))
1145
0
                kh_del(s2i, long_refs, k);
1146
0
        }
1147
0
        free(bh->target_name[i]);
1148
0
    }
1149
1150
31.9k
    bh->n_targets = hrecs->nref;
1151
31.9k
    return 0;
1152
31.9k
}
1153
1154
65
static int rebuild_target_arrays(sam_hdr_t *bh) {
1155
65
    if (!bh || !bh->hrecs)
1156
0
        return -1;
1157
1158
65
    sam_hrecs_t *hrecs = bh->hrecs;
1159
65
    if (hrecs->refs_changed < 0)
1160
0
        return 0;
1161
1162
65
    if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
1163
0
        return -1;
1164
1165
65
    hrecs->refs_changed = -1;
1166
65
    return 0;
1167
65
}
1168
1169
/// Populate hrecs refs array from header target_name, target_len arrays
1170
/**
1171
 * @return 0 on success; -1 on failure
1172
 *
1173
 * Pre-fills the refs hash from the target arrays.  For BAM files this
1174
 * will ensure that they are in the correct order as the target arrays
1175
 * are the canonical source for converting target ids to names and lengths.
1176
 *
1177
 * The added entries do not link to a header line. sam_hrecs_update_hashes()
1178
 * will add the links later for lines found in the text header.
1179
 *
1180
 * This should be called before the text header is parsed.
1181
 */
1182
static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
1183
1.02k
                                             const sam_hdr_t *bh) {
1184
1.02k
    int32_t tid = 0;
1185
1186
1.02k
    if (!hrecs || !bh)
1187
0
        return -1;
1188
1189
    // This should always be called before parsing the text header
1190
    // so the ref array should start off empty, and we don't have to try
1191
    // to reconcile any existing data.
1192
1.02k
    if (hrecs->nref > 0) {
1193
0
        hts_log_error("Called with non-empty ref array");
1194
0
        return -1;
1195
0
    }
1196
1197
1.02k
    if (hrecs->ref_sz < bh->n_targets) {
1198
1.02k
        sam_hrec_sq_t *new_ref = hts_realloc_p(hrecs->ref, sizeof(*new_ref),
1199
1.02k
                                               bh->n_targets);
1200
1.02k
        if (!new_ref)
1201
0
            return -1;
1202
1203
1.02k
        hrecs->ref = new_ref;
1204
1.02k
        hrecs->ref_sz = bh->n_targets;
1205
1.02k
    }
1206
1207
1.02k
    if (kh_grow_to_fit(m_s2i, hrecs->ref_hash, bh->n_targets) != 0)
1208
0
        goto fail;
1209
1210
2.38k
    for (tid = 0; tid < bh->n_targets; tid++) {
1211
1.42k
        khint_t k;
1212
1.42k
        int r;
1213
1.42k
        hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
1214
1.42k
        if (!hrecs->ref[tid].name) goto fail;
1215
1.42k
        if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
1216
1.09k
            hrecs->ref[tid].len  = bh->target_len[tid];
1217
1.09k
        } else {
1218
331
            khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1219
331
            k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
1220
331
            if (k < kh_end(long_refs)) {
1221
331
                hrecs->ref[tid].len = kh_val(long_refs, k);
1222
331
            } else {
1223
0
                hrecs->ref[tid].len = UINT32_MAX;
1224
0
            }
1225
331
        }
1226
1.42k
        hrecs->ref[tid].ty   = NULL;
1227
1.42k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
1228
1.42k
        if (r < 0) goto fail;
1229
1.42k
        if (r == 0) {
1230
60
            hts_log_error("Duplicate entry \"%s\" in target list",
1231
60
                            hrecs->ref[tid].name);
1232
60
            return -1;
1233
1.36k
        } else {
1234
1.36k
            kh_val(hrecs->ref_hash, k) = tid;
1235
1.36k
        }
1236
1.42k
    }
1237
964
    hrecs->nref = bh->n_targets;
1238
964
    return 0;
1239
1240
0
 fail: {
1241
0
        int32_t i;
1242
0
        hts_log_error("%s", strerror(errno));
1243
0
        for (i = 0; i < tid; i++) {
1244
0
            khint_t k;
1245
0
            if (!hrecs->ref[i].name) continue;
1246
0
            k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name);
1247
0
            if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
1248
0
        }
1249
0
        hrecs->nref = 0;
1250
0
        return -1;
1251
1.02k
    }
1252
1.02k
}
1253
1254
/*
1255
 * Add SQ header records for any references in the hrecs->ref array that
1256
 * were added by sam_hrecs_refs_from_targets_array() but have not
1257
 * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
1258
 *
1259
 * This may be needed either because:
1260
 *
1261
 *   - A bam file was read that had entries in its refs list with no
1262
 *     corresponding @SQ line.
1263
 *
1264
 *   - A program constructed a sam_hdr_t which has target_name and target_len
1265
 *     array entries with no corresponding @SQ line in text.
1266
 */
1267
24.8k
static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
1268
24.8k
    int tid;
1269
24.8k
    char len[32];
1270
1271
26.0k
    for (tid = 0; tid < hrecs->nref; tid++) {
1272
1.24k
        if (hrecs->ref[tid].ty == NULL) {
1273
619
            snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
1274
619
            if (sam_hrecs_add(hrecs, "SQ",
1275
619
                              "SN", hrecs->ref[tid].name,
1276
619
                              "LN", len, NULL) != 0)
1277
8
                return -1;
1278
1279
            // Check that the stub has actually been filled
1280
611
            if(hrecs->ref[tid].ty == NULL) {
1281
0
                hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
1282
0
                        tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
1283
0
                return -1;
1284
0
            }
1285
611
        }
1286
1.24k
    }
1287
24.7k
    return 0;
1288
24.8k
}
1289
1290
25.0k
int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
1291
25.0k
    sam_hrecs_t *hrecs = sam_hrecs_new();
1292
1293
25.0k
    if (!hrecs)
1294
0
        return -1;
1295
1296
25.0k
    if (bh->target_name && bh->target_len && bh->n_targets > 0) {
1297
1.02k
        if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
1298
60
            sam_hrecs_free(hrecs);
1299
60
            return -1;
1300
60
        }
1301
1.02k
    }
1302
1303
    // Parse existing header text
1304
24.9k
    if (bh->text && bh->l_text > 0) {
1305
2.11k
        if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
1306
186
            sam_hrecs_free(hrecs);
1307
186
            return -1;
1308
186
        }
1309
2.11k
    }
1310
1311
24.8k
    if (add_stub_ref_sq_lines(hrecs) < 0) {
1312
8
        sam_hrecs_free(hrecs);
1313
8
        return -1;
1314
8
    }
1315
1316
24.7k
    bh->hrecs = hrecs;
1317
1318
24.7k
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1319
0
        return -1;
1320
1321
24.7k
    return 0;
1322
24.7k
}
1323
1324
// Check header type is valid according to the SAM specification
1325
1326
2.25M
static int valid_sam_header_type(const char *s) {
1327
2.25M
    if (s[0] != '@') return 0;
1328
2.25M
    switch (s[1]) {
1329
963
    case 'H':
1330
963
        return s[2] == 'D' && s[3] == '\t';
1331
11.8k
    case 'S':
1332
11.8k
        return s[2] == 'Q' && s[3] == '\t';
1333
2.22M
    case 'R':
1334
2.23M
    case 'P':
1335
2.23M
        return s[2] == 'G' && s[3] == '\t';
1336
6.66k
    case 'C':
1337
6.66k
        return s[2] == 'O';
1338
2.25M
    }
1339
6
    return 0;
1340
2.25M
}
1341
1342
/** Populate sam_hdr_t from SAM file content
1343
 *
1344
 * @param hdr empty header struct to be filled
1345
 * @param fp  File to read from
1346
 * @return 0 on success
1347
 *        -1 on failure
1348
 *
1349
 * This function is used to build the header structure when reading SAM files.
1350
 * The lines read from the file are used to create sam_hrecs_t structures.
1351
 * The function also populates sam_hdr_t::text, sam_hdr_t::l_text,
1352
 * sam_hdr_t::target_name and sam_hdr_t::target_len.
1353
 */
1354
13.8k
int sam_hdr_build_from_sam_file(sam_hdr_t *hdr, htsFile* fp) {
1355
13.8k
    kstring_t str = { 0, 0, NULL };
1356
13.8k
    int ret;
1357
13.8k
    int next_c = '@';
1358
13.8k
    sam_hrec_type_t *first_h_type = NULL;
1359
13.8k
    sam_hrec_type_t *last_h_type = NULL;
1360
13.8k
    sam_hrecs_t *hrecs = sam_hrecs_new();
1361
13.8k
    size_t lno = 0;
1362
13.8k
    khint_t sq_count = 0, an_count = 0;
1363
1364
13.8k
    if (!hrecs)
1365
0
        return -1;
1366
1367
13.8k
    hdr->hrecs = hrecs;
1368
1369
2.26M
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1370
2.25M
        sam_hrec_type_t *h_type;
1371
1372
2.25M
        lno++;
1373
2.25M
        if (!fp->line.s || fp->line.s[0] != '@')
1374
57
            break;
1375
1376
2.25M
        if (fp->line.l < 3
1377
2.25M
            || !isalpha_c(fp->line.s[1]) || !isalpha_c(fp->line.s[2])) {
1378
63
            sam_hrecs_error("Header line does not have a two character key",
1379
63
                            fp->line.s, fp->line.l, lno);
1380
63
            goto error;
1381
63
        }
1382
1383
2.25M
        if (fp->line.l == 3 && strncmp(fp->line.s, "@CO", 3) == 0) {
1384
            // HTSlib's original SAM reader let you get away with an untabbed
1385
            // @CO line
1386
246
            kputc('\t', &fp->line);
1387
246
        }
1388
1389
2.25M
        if (!valid_sam_header_type(fp->line.s)) {
1390
33
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
1391
33
            warn_if_known_stderr(fp->line.s, fp->line.l);
1392
33
            goto error;
1393
33
        }
1394
1395
2.25M
        h_type = sam_hrecs_parse_single_line(hrecs, fp->line.s, fp->line.l,
1396
2.25M
                                             lno, NULL, &sq_count, &an_count);
1397
2.25M
        if (!h_type)
1398
333
            goto error;
1399
1400
2.25M
        if (h_type->type != TYPEKEY("HD")) {
1401
2.25M
            if (!first_h_type)
1402
13.4k
                first_h_type = h_type;
1403
2.25M
            last_h_type = h_type;
1404
2.25M
        }
1405
1406
2.25M
        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1407
0
            goto error;
1408
1409
2.25M
        if (kputc('\n', &str) < 0)
1410
0
            goto error;
1411
1412
2.25M
        if (fp->is_bgzf) {
1413
2.21M
            next_c = bgzf_peek(fp->fp.bgzf);
1414
2.21M
        } else {
1415
32.6k
            unsigned char nc;
1416
32.6k
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
1417
32.6k
            next_c = pret > 0 ? nc : pret - 1;
1418
32.6k
        }
1419
2.25M
        if (next_c < -1)
1420
3
            goto error;
1421
2.25M
    }
1422
13.4k
    if (next_c != '@')
1423
13.3k
        fp->line.l = 0;
1424
1425
13.4k
    if (ret < -1)
1426
36
        goto error;
1427
1428
13.4k
    if (sq_count == 0 && fp->fn_aux) {
1429
        // No @SQ lines found.  Try to get them from the fasta index.
1430
0
        kstring_t line = { 0, 0, NULL };
1431
1432
        /* The reference index (.fai) is actually needed here */
1433
0
        char *fai_fn = fp->fn_aux;
1434
0
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
1435
0
        if (fn_delim)
1436
0
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
1437
1438
0
        hFILE* f = hopen(fai_fn, "r");
1439
0
        int e = 0;
1440
0
        if (f == NULL)
1441
0
            goto error;
1442
1443
0
        while (line.l = 0, khgetline(&line, f) >= 0) {
1444
0
            char* tab = strchr(line.s, '\t');
1445
0
            size_t l_start = str.l;
1446
0
            sam_hrec_type_t *h_type;
1447
0
            hts_pos_t ln;
1448
1449
0
            if (tab == NULL)
1450
0
                continue;
1451
1452
0
            ln = strtoll(tab, NULL, 10);
1453
1454
0
            e |= kputs("@SQ\tSN:", &str) < 0;
1455
0
            e |= kputsn(line.s, tab - line.s, &str) < 0;
1456
0
            e |= kputs("\tLN:", &str) < 0;
1457
0
            e |= kputll(ln, &str) < 0;
1458
0
            e |= kputc('\n', &str) < 0;
1459
0
            if (e)
1460
0
                break;
1461
1462
0
            h_type = sam_hrecs_parse_single_line(hrecs, &str.s[l_start],
1463
0
                                                 str.l - l_start,
1464
0
                                                 lno, NULL,
1465
0
                                                 &sq_count, &an_count);
1466
0
            if (!h_type)
1467
0
                goto error;
1468
1469
0
            if (!first_h_type)
1470
0
                first_h_type = h_type;
1471
0
            last_h_type = h_type;
1472
0
        }
1473
1474
0
        ks_free(&line);
1475
0
        if (hclose(f) != 0) {
1476
0
            hts_log_error("Error on closing %s", fai_fn);
1477
0
            e = 1;
1478
0
        }
1479
0
        if (e)
1480
0
            goto error;
1481
0
    }
1482
1483
    // Reserve space in hash tables
1484
13.4k
    if (sq_count) {
1485
        // Hash table should be empty at the moment
1486
8.99k
        if (kh_grow_to_fit(m_s2i, hrecs->ref_hash, sq_count + an_count) < 0)
1487
0
            goto error;
1488
1489
8.99k
        if (hrecs->ref_sz < sq_count) {
1490
8.99k
            sam_hrec_sq_t *new_ref = hts_realloc_p(hrecs->ref,
1491
8.99k
                                                   sizeof(*hrecs->ref), sq_count);
1492
8.99k
            if (!new_ref)
1493
0
                goto error;
1494
8.99k
            hrecs->ref = new_ref;
1495
8.99k
            hrecs->ref_sz = sq_count;
1496
8.99k
        }
1497
8.99k
    }
1498
1499
    // Update RG/SQ/PG hashes
1500
13.4k
    if (first_h_type) {
1501
13.3k
        sam_hrec_type_t *h_type = first_h_type->global_prev;
1502
20.1k
        do {
1503
20.1k
            h_type = h_type->global_next;
1504
20.1k
            if (-1 == sam_hrecs_update_hashes(hrecs, h_type->type, h_type))
1505
8.84k
                goto error;
1506
20.1k
        } while (h_type != last_h_type);
1507
13.3k
    }
1508
1509
4.57k
    if (str.l == 0)
1510
57
        kputsn("", 0, &str);
1511
4.57k
    hdr->l_text = str.l;
1512
4.57k
    hdr->text = ks_release(&str);
1513
1514
4.57k
    if (sam_hdr_update_target_arrays(hdr, hrecs, 0) < 0)
1515
0
        goto error;
1516
1517
4.57k
    return 0;
1518
1519
9.30k
 error:
1520
9.30k
    ks_free(&str);
1521
9.30k
    return -1;
1522
4.57k
}
1523
1524
/** Remove outdated header text
1525
1526
    @param bh     BAM header
1527
1528
    This is called when API functions have changed the header so that the
1529
    text version is no longer valid.
1530
 */
1531
408
static void redact_header_text(sam_hdr_t *bh) {
1532
408
    assert(bh->hrecs && bh->hrecs->dirty);
1533
408
    bh->l_text = 0;
1534
408
    free(bh->text);
1535
408
    bh->text = NULL;
1536
408
}
1537
1538
/** Find nth header record of a given type
1539
1540
    @param type   Header type (SQ, RG etc.)
1541
    @param idx    0-based index
1542
1543
    @return sam_hrec_type_t pointer to the record on success
1544
            NULL if no record exists with the given type and index
1545
 */
1546
1547
static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
1548
0
                                                const char *type, int idx) {
1549
0
    sam_hrec_type_t *first, *itr;
1550
1551
0
    if (idx < 0)
1552
0
        return NULL;
1553
1554
0
    if (type[0] == 'S' && type[1] == 'Q')
1555
0
        return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
1556
1557
0
    if (type[0] == 'R' && type[1] == 'G')
1558
0
        return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
1559
1560
0
    if (type[0] == 'P' && type[1] == 'G')
1561
0
        return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
1562
1563
0
    first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
1564
0
    if (!first)
1565
0
        return NULL;
1566
1567
0
    while (idx > 0) {
1568
0
        itr = itr->next;
1569
0
        if (itr == first)
1570
0
            break;
1571
0
        --idx;
1572
0
    }
1573
1574
0
    return idx == 0 ? itr : NULL;
1575
0
}
1576
1577
/* ==== Public methods ==== */
1578
1579
5.44k
size_t sam_hdr_length(sam_hdr_t *bh) {
1580
5.44k
    if (!bh || -1 == sam_hdr_rebuild(bh))
1581
0
        return SIZE_MAX;
1582
1583
5.44k
    return bh->l_text;
1584
5.44k
}
1585
1586
1.70k
const char *sam_hdr_str(sam_hdr_t *bh) {
1587
1.70k
    if (!bh || -1 == sam_hdr_rebuild(bh))
1588
0
        return NULL;
1589
1590
1.70k
    return bh->text;
1591
1.70k
}
1592
1593
0
int sam_hdr_nref(const sam_hdr_t *bh) {
1594
0
    if (!bh)
1595
0
        return -1;
1596
1597
0
    return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
1598
0
}
1599
1600
/*
1601
 * Reconstructs the text representation from the header hash table.
1602
 * Returns 0 on success
1603
 *        -1 on failure
1604
 */
1605
7.14k
int sam_hdr_rebuild(sam_hdr_t *bh) {
1606
7.14k
    sam_hrecs_t *hrecs;
1607
7.14k
    if (!bh)
1608
0
        return -1;
1609
1610
7.14k
    if (!(hrecs = bh->hrecs))
1611
0
        return bh->text ? 0 : -1;
1612
1613
7.14k
    if (hrecs->refs_changed >= 0) {
1614
0
        if (rebuild_target_arrays(bh) < 0) {
1615
0
            hts_log_error("Header target array rebuild has failed");
1616
0
            return -1;
1617
0
        }
1618
0
    }
1619
1620
    /* If header text wasn't changed or header is empty, don't rebuild it. */
1621
7.14k
    if (!hrecs->dirty)
1622
7.12k
        return 0;
1623
1624
17
    if (hrecs->pgs_changed && sam_hdr_link_pg(bh) < 0) {
1625
0
        hts_log_error("Linking @PG lines has failed");
1626
0
        return -1;
1627
0
    }
1628
1629
17
    kstring_t ks = KS_INITIALIZE;
1630
17
    if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
1631
0
        ks_free(&ks);
1632
0
        hts_log_error("Header text rebuild has failed");
1633
0
        return -1;
1634
0
    }
1635
1636
17
    hrecs->dirty = 0;
1637
1638
    /* Sync */
1639
17
    free(bh->text);
1640
17
    bh->l_text = ks_len(&ks);
1641
17
    bh->text = ks_release(&ks);
1642
1643
17
    return 0;
1644
17
}
1645
1646
/*
1647
 * Appends a formatted line to an existing SAM header.
1648
 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
1649
 * optional new-line. If it contains more than 1 line then multiple lines
1650
 * will be added in order.
1651
 *
1652
 * Input text is of maximum length len or as terminated earlier by a NUL.
1653
 * len may be 0 if unknown, in which case lines must be NUL-terminated.
1654
 *
1655
 * Returns 0 on success
1656
 *        -1 on failure
1657
 */
1658
7.53k
int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
1659
7.53k
    sam_hrecs_t *hrecs;
1660
1661
7.53k
    if (!bh || !lines)
1662
0
        return -1;
1663
1664
7.53k
    if (len == 0 && *lines == '\0')
1665
6.85k
        return 0;
1666
1667
682
    if (!(hrecs = bh->hrecs)) {
1668
682
        if (sam_hdr_fill_hrecs(bh) != 0)
1669
0
            return -1;
1670
682
        hrecs = bh->hrecs;
1671
682
    }
1672
1673
682
    if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
1674
274
        return -1;
1675
1676
408
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1677
0
        return -1;
1678
1679
408
    hrecs->dirty = 1;
1680
408
    redact_header_text(bh);
1681
1682
408
    return 0;
1683
408
}
1684
1685
/*
1686
 * Adds a single line to a SAM header.
1687
 * Specify type and one or more key,value pairs, ending with the NULL key.
1688
 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
1689
 *
1690
 * Returns 0 on success
1691
 *        -1 on failure
1692
 */
1693
0
int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
1694
0
    va_list args;
1695
0
    sam_hrecs_t *hrecs;
1696
1697
0
    if (!bh || !type)
1698
0
        return -1;
1699
1700
0
    if (!(hrecs = bh->hrecs)) {
1701
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1702
0
            return -1;
1703
0
        hrecs = bh->hrecs;
1704
0
    }
1705
1706
0
    va_start(args, type);
1707
0
    int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
1708
0
    va_end(args);
1709
1710
0
    if (ret == 0) {
1711
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1712
0
            return -1;
1713
1714
0
        if (hrecs->dirty)
1715
0
            redact_header_text(bh);
1716
0
    }
1717
1718
0
    return ret;
1719
0
}
1720
1721
/*
1722
 * Returns a complete line of formatted text for a specific head type/ID
1723
 * combination. If ID_key is NULL then it returns the first line of the specified
1724
 * type.
1725
 */
1726
int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
1727
0
                      const char *ID_key, const char *ID_val, kstring_t *ks) {
1728
0
    sam_hrecs_t *hrecs;
1729
0
    if (!bh || !type)
1730
0
        return -2;
1731
1732
0
    if (!(hrecs = bh->hrecs)) {
1733
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1734
0
            return -2;
1735
0
        hrecs = bh->hrecs;
1736
0
    }
1737
1738
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
1739
0
    if (!ty)
1740
0
        return -1;
1741
1742
0
    ks->l = 0;
1743
0
    if (build_header_line(ty, ks) < 0) {
1744
0
        return -2;
1745
0
    }
1746
1747
0
    return 0;
1748
0
}
1749
1750
int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
1751
0
                          int pos, kstring_t *ks) {
1752
0
    sam_hrecs_t *hrecs;
1753
0
    if (!bh || !type)
1754
0
        return -2;
1755
1756
0
    if (!(hrecs = bh->hrecs)) {
1757
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1758
0
            return -2;
1759
0
        hrecs = bh->hrecs;
1760
0
    }
1761
1762
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1763
0
    if (!ty)
1764
0
        return -1;
1765
1766
0
    ks->l = 0;
1767
0
    if (build_header_line(ty, ks) < 0) {
1768
0
        return -2;
1769
0
    }
1770
1771
0
    return 0;
1772
0
}
1773
1774
/*
1775
 * Remove a line from the header by specifying a tag:value that uniquely
1776
 * identifies a line, i.e. the @SQ line containing "SN:ref1".
1777
 * @SQ line is uniquely identified by SN tag.
1778
 * @RG line is uniquely identified by ID tag.
1779
 * @PG line is uniquely identified by ID tag.
1780
 *
1781
 * Returns 0 on success and -1 on error
1782
 */
1783
1784
0
int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1785
0
    sam_hrecs_t *hrecs;
1786
0
    if (!bh || !type)
1787
0
        return -1;
1788
1789
0
    if (!(hrecs = bh->hrecs)) {
1790
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1791
0
            return -1;
1792
0
        hrecs = bh->hrecs;
1793
0
    }
1794
1795
0
    if (!strncmp(type, "PG", 2)) {
1796
0
        hts_log_warning("Removing PG lines is not supported!");
1797
0
        return -1;
1798
0
    }
1799
1800
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1801
0
    if (!type_found)
1802
0
        return 0;
1803
1804
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
1805
0
    if (ret == 0) {
1806
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1807
0
            return -1;
1808
1809
0
        if (hrecs->dirty)
1810
0
            redact_header_text(bh);
1811
0
    }
1812
1813
0
    return ret;
1814
0
}
1815
1816
/*
1817
 * Remove a line from the header by specifying the position in the type
1818
 * group, i.e. 3rd @SQ line.
1819
 *
1820
 * Returns 0 on success and -1 on error
1821
 */
1822
1823
0
int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
1824
0
    sam_hrecs_t *hrecs;
1825
0
    if (!bh || !type || position < 0)
1826
0
        return -1;
1827
1828
0
    if (!(hrecs = bh->hrecs)) {
1829
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1830
0
            return -1;
1831
0
        hrecs = bh->hrecs;
1832
0
    }
1833
1834
0
    if (!strncmp(type, "PG", 2)) {
1835
0
        hts_log_warning("Removing PG lines is not supported!");
1836
0
        return -1;
1837
0
    }
1838
1839
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
1840
0
                                                          position);
1841
0
    if (!type_found)
1842
0
        return -1;
1843
1844
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
1845
0
    if (ret == 0) {
1846
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1847
0
            return -1;
1848
1849
0
        if (hrecs->dirty)
1850
0
            redact_header_text(bh);
1851
0
    }
1852
1853
0
    return ret;
1854
0
}
1855
1856
/*
1857
 * Check if sam_hdr_update_line() is being used to change the name of
1858
 * a record, and if the new name is going to clash with an existing one.
1859
 *
1860
 * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
1861
 * will go through them all and leave the final one in place.
1862
 *
1863
 * Returns 0 if the name does not change
1864
 *         1 if the name changes but does not clash
1865
 *        -1 if the name changes and the new one is already in use
1866
 */
1867
static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
1868
                                 va_list ap, const char **old_name,
1869
                                 const char **new_name,
1870
                                 char id_tag_out[3],
1871
0
                                 khash_t(m_s2i) **hash_out) {
1872
0
    char *key, *val;
1873
0
    const char *id_tag;
1874
0
    sam_hrec_tag_t *tag, *prev;
1875
0
    khash_t(m_s2i) *hash;
1876
0
    khint_t k;
1877
0
    int ret = 0;
1878
1879
0
    if        (rec->type == TYPEKEY("SQ")) {
1880
0
        id_tag = "SN"; hash = hrecs->ref_hash;
1881
0
    } else if (rec->type == TYPEKEY("RG")) {
1882
0
        id_tag = "ID"; hash = hrecs->rg_hash;
1883
0
    } else if (rec->type == TYPEKEY("PG")) {
1884
0
        id_tag = "ID"; hash = hrecs->pg_hash;
1885
0
    } else {
1886
0
        return 0;
1887
0
    }
1888
1889
0
    memcpy(id_tag_out, id_tag, 3);
1890
0
    *hash_out = hash;
1891
1892
0
    tag = sam_hrecs_find_key(rec, id_tag, &prev);
1893
0
    if (!tag)
1894
0
        return 0;
1895
0
    assert(tag->len >= 3);
1896
0
    *old_name = tag->str + 3;
1897
1898
0
    while ((key = va_arg(ap, char *)) != NULL) {
1899
0
        val = va_arg(ap, char *);
1900
0
        if (!val) val = "";
1901
0
        if (strcmp(key, id_tag) != 0) continue;
1902
0
        if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
1903
0
        k = kh_get(m_s2i, hash, val);
1904
0
        ret = k < kh_end(hash) ? -1 : 1;
1905
0
        *new_name = val;
1906
0
    }
1907
0
    return ret;
1908
0
}
1909
1910
int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
1911
0
        const char *ID_key, const char *ID_value, ...) {
1912
0
    sam_hrecs_t *hrecs;
1913
0
    if (!bh)
1914
0
        return -1;
1915
1916
0
    if (!(hrecs = bh->hrecs)) {
1917
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1918
0
            return -1;
1919
0
        hrecs = bh->hrecs;
1920
0
    }
1921
1922
0
    int ret, rename;
1923
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1924
0
    if (!ty)
1925
0
        return -1;
1926
1927
0
    va_list args;
1928
0
    const char *old_name = "?", *new_name = "?";
1929
0
    char id_tag[3];
1930
0
    khash_t(m_s2i) *hash = NULL;
1931
0
    va_start(args, ID_value);
1932
0
    rename = check_for_name_update(hrecs, ty, args,
1933
0
                                   &old_name, &new_name, id_tag, &hash);
1934
0
    va_end(args);
1935
0
    if (rename < 0) {
1936
0
        hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
1937
0
                      type, old_name, new_name);
1938
0
        return -1;
1939
0
    }
1940
0
    if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
1941
        // This is just too complicated
1942
0
        hts_log_error("Renaming @PG records is not supported");
1943
0
        return -1;
1944
0
    }
1945
0
    va_start(args, ID_value);
1946
0
    ret = sam_hrecs_vupdate(hrecs, ty, args);
1947
0
    va_end(args);
1948
1949
0
    if (ret)
1950
0
        return ret;
1951
1952
    // TODO Account for @SQ-AN altnames
1953
1954
0
    if (rename) {
1955
        // Adjust the hash table to point to the new name
1956
        // sam_hrecs_update_hashes() should sort out everything else
1957
0
        khint_t k = kh_get(m_s2i, hash, old_name);
1958
0
        sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
1959
0
        int r, pos;
1960
0
        assert(k < kh_end(hash));        // Or we wouldn't have found it earlier
1961
0
        assert(new_tag && new_tag->str); // id_tag should exist
1962
0
        assert(new_tag->len > 3);
1963
0
        pos = kh_val(hash, k);
1964
0
        kh_del(m_s2i, hash, k);
1965
0
        k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
1966
0
        if (r < 1) {
1967
0
            hts_log_error("Failed to rename item in hash table");
1968
0
            return -1;
1969
0
        }
1970
0
        kh_val(hash, k) = pos;
1971
0
    }
1972
1973
0
    ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
1974
1975
0
    if (!ret && hrecs->refs_changed >= 0)
1976
0
        ret = rebuild_target_arrays(bh);
1977
1978
0
    if (!ret && hrecs->dirty)
1979
0
        redact_header_text(bh);
1980
1981
0
    return ret;
1982
0
}
1983
1984
0
static int rebuild_hash(sam_hrecs_t *hrecs, const char *type) {
1985
0
    sam_hrec_type_t *head, *step;
1986
0
    khiter_t k;
1987
1988
0
    if (strncmp(type, "SQ", 2) == 0) {
1989
0
        hrecs->nref = 0;
1990
0
        kh_clear(m_s2i, hrecs->ref_hash);
1991
0
    } else if (strncmp(type, "RG", 2) == 0) {
1992
0
        hrecs->nrg = 0;
1993
0
        kh_clear(m_s2i, hrecs->rg_hash);
1994
0
    }
1995
1996
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1997
1998
0
    if (k != kh_end(hrecs->h)) { // something to rebuild
1999
0
        head = kh_val(hrecs->h, k);
2000
0
        step = head;
2001
2002
0
        do {
2003
0
            if (sam_hrecs_update_hashes(hrecs, TYPEKEY(type), step) == -1) {
2004
0
                hts_log_error("Unable to rebuild hashes");
2005
0
                return -1;
2006
0
            }
2007
2008
0
            step = step->next;
2009
0
        } while (step != head);
2010
0
    }
2011
2012
0
    return 0;
2013
0
}
2014
2015
0
int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
2016
0
    sam_hrecs_t *hrecs;
2017
0
    if (!bh || !type)
2018
0
        return -1;
2019
2020
0
    if (!(hrecs = bh->hrecs)) {
2021
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2022
0
            return -1;
2023
0
        hrecs = bh->hrecs;
2024
0
    }
2025
2026
0
    sam_hrec_type_t *step;
2027
0
    int ret = 1, remove_all = (ID_key == NULL);
2028
2029
0
    if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
2030
0
        hts_log_warning("Removing PG or CO lines is not supported!");
2031
0
        return -1;
2032
0
    }
2033
2034
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
2035
0
    if (!type_found) { // remove all line of this type
2036
0
        khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2037
0
        if (k == kh_end(hrecs->h))
2038
0
            return 0;
2039
0
        type_found =  kh_val(hrecs->h, k);
2040
0
        if (!type_found)
2041
0
            return 0;
2042
0
        remove_all = 1;
2043
0
    }
2044
2045
0
    step = type_found->next;
2046
0
    while (step != type_found) {
2047
0
        sam_hrec_type_t *to_remove = step;
2048
0
        step = step->next;
2049
0
        ret &= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
2050
0
    }
2051
2052
0
    if (remove_all)
2053
0
        ret &= sam_hrecs_remove_line(hrecs, type, type_found, 0);
2054
2055
    /* if RG or SQ, delete then rebuild the hashes (as it is faster
2056
       to rebuild than delete one by one).
2057
    */
2058
2059
0
    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
2060
0
        if (rebuild_hash(hrecs, type)) {
2061
0
            return -1;
2062
0
        }
2063
0
    }
2064
2065
0
    if (!ret && hrecs->dirty)
2066
0
        redact_header_text(bh);
2067
2068
0
    return 0;
2069
0
}
2070
2071
0
int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *vrh) {
2072
0
    sam_hrecs_t *hrecs;
2073
0
    rmhash_t *rh = (rmhash_t *)vrh;
2074
2075
0
    if (!bh || !type)
2076
0
        return -1;
2077
0
    if (!rh) // remove all lines
2078
0
        return sam_hdr_remove_except(bh, type, NULL, NULL);
2079
0
    if (!id)
2080
0
        return -1;
2081
2082
0
    if (!(hrecs = bh->hrecs)) {
2083
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2084
0
            return -1;
2085
0
        hrecs = bh->hrecs;
2086
0
    }
2087
2088
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2089
0
    if (k == kh_end(hrecs->h)) // nothing to remove from
2090
0
        return 0;
2091
2092
0
    sam_hrec_type_t *head = kh_val(hrecs->h, k);
2093
0
    if (!head) {
2094
0
        hts_log_error("Header inconsistency");
2095
0
        return -1;
2096
0
    }
2097
2098
0
    int ret = 0;
2099
0
    sam_hrec_type_t *step = head->next;
2100
0
    while (step != head) {
2101
0
        sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
2102
0
        if (tag && tag->str && tag->len >= 3) {
2103
0
           k = kh_get(rm, rh, tag->str+3);
2104
0
           if (k == kh_end(rh)) { // value is not in the hash table, so remove
2105
0
               sam_hrec_type_t *to_remove = step;
2106
0
               step = step->next;
2107
0
               ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
2108
0
           } else {
2109
0
               step = step->next;
2110
0
           }
2111
0
        } else { // tag is not on the line, so skip to next line
2112
0
            step = step->next;
2113
0
        }
2114
0
    }
2115
2116
    // process the first line
2117
0
    sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL);
2118
0
    if (tag && tag->str && tag->len >= 3) {
2119
0
       k = kh_get(rm, rh, tag->str+3);
2120
0
       if (k == kh_end(rh)) { // value is not in the hash table, so remove
2121
0
           sam_hrec_type_t *to_remove = head;
2122
0
           head = head->next;
2123
0
           ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
2124
0
       }
2125
0
    }
2126
2127
    /* if RG or SQ, delete then rebuild the hashes (as it is faster
2128
       to rebuild than delete one by one).
2129
    */
2130
2131
0
    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
2132
0
        if (rebuild_hash(hrecs, type)) {
2133
0
            return -1;
2134
0
        }
2135
0
    }
2136
2137
0
    if (!ret && hrecs->dirty)
2138
0
        redact_header_text(bh);
2139
2140
0
    return ret;
2141
0
}
2142
2143
16.5k
int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
2144
16.5k
    int count;
2145
16.5k
    sam_hrec_type_t *first_ty, *itr_ty;
2146
2147
16.5k
    if (!bh || !type)
2148
0
        return -1;
2149
2150
16.5k
    if (!bh->hrecs) {
2151
11.9k
        if (sam_hdr_fill_hrecs(bh) != 0)
2152
177
            return -1;
2153
11.9k
    }
2154
2155
    // Deal with types that have counts
2156
16.3k
    switch (type[0]) {
2157
16.3k
    case 'S':
2158
16.3k
        if (type[1] == 'Q')
2159
16.3k
            return bh->hrecs->nref;
2160
0
        break;
2161
0
    case 'R':
2162
0
        if (type[1] == 'G')
2163
0
            return bh->hrecs->nrg;
2164
0
        break;
2165
0
    case 'P':
2166
0
        if (type[1] == 'G')
2167
0
            return bh->hrecs->npg;
2168
0
        break;
2169
0
    default:
2170
0
        break;
2171
16.3k
    }
2172
2173
0
    first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL);
2174
0
    if (!first_ty)
2175
0
        return 0;
2176
2177
0
    count = 1;
2178
0
    for (itr_ty = first_ty->next;
2179
0
         itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) {
2180
0
        count++;
2181
0
    }
2182
2183
0
    return count;
2184
0
}
2185
2186
int sam_hdr_line_index(sam_hdr_t *bh,
2187
                       const char *type,
2188
0
                       const char *key) {
2189
0
    sam_hrecs_t *hrecs;
2190
0
    if (!bh || !type || !key)
2191
0
        return -2;
2192
2193
0
    if (!(hrecs = bh->hrecs)) {
2194
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2195
0
            return -2;
2196
0
        hrecs = bh->hrecs;
2197
0
    }
2198
2199
0
    khint_t k;
2200
0
    int idx = -1;
2201
0
    switch (type[0]) {
2202
0
    case 'S':
2203
0
        if (type[1] == 'Q') {
2204
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
2205
0
            if (k != kh_end(hrecs->ref_hash))
2206
0
                idx = kh_val(hrecs->ref_hash, k);
2207
0
        } else {
2208
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2209
0
        }
2210
0
        break;
2211
0
    case 'R':
2212
0
        if (type[1] == 'G') {
2213
0
            k = kh_get(m_s2i, hrecs->rg_hash, key);
2214
0
            if (k != kh_end(hrecs->rg_hash))
2215
0
                idx = kh_val(hrecs->rg_hash, k);
2216
0
        } else {
2217
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2218
0
        }
2219
0
        break;
2220
0
    case 'P':
2221
0
        if (type[1] == 'G') {
2222
0
            k = kh_get(m_s2i, hrecs->pg_hash, key);
2223
0
            if (k != kh_end(hrecs->pg_hash))
2224
0
                idx = kh_val(hrecs->pg_hash, k);
2225
0
        } else {
2226
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2227
0
        }
2228
0
        break;
2229
0
    default:
2230
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2231
0
    }
2232
2233
0
    return idx;
2234
0
}
2235
2236
const char *sam_hdr_line_name(sam_hdr_t *bh,
2237
                              const char *type,
2238
0
                              int pos) {
2239
0
    sam_hrecs_t *hrecs;
2240
0
    if (!bh || !type || pos < 0)
2241
0
        return NULL;
2242
2243
0
    if (!(hrecs = bh->hrecs)) {
2244
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2245
0
            return NULL;
2246
0
        hrecs = bh->hrecs;
2247
0
    }
2248
2249
0
    switch (type[0]) {
2250
0
    case 'S':
2251
0
        if (type[1] == 'Q') {
2252
0
            if (pos < hrecs->nref)
2253
0
                return hrecs->ref[pos].name;
2254
0
        } else {
2255
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2256
0
        }
2257
0
        break;
2258
0
    case 'R':
2259
0
        if (type[1] == 'G') {
2260
0
            if (pos < hrecs->nrg)
2261
0
                return hrecs->rg[pos].name;
2262
0
        } else {
2263
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2264
0
        }
2265
0
        break;
2266
0
    case 'P':
2267
0
        if (type[1] == 'G') {
2268
0
            if (pos < hrecs->npg)
2269
0
                return hrecs->pg[pos].name;
2270
0
        } else {
2271
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2272
0
        }
2273
0
        break;
2274
0
    default:
2275
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
2276
0
    }
2277
2278
0
    return NULL;
2279
0
}
2280
2281
/* ==== Key:val level methods ==== */
2282
2283
int sam_hdr_find_tag_id(sam_hdr_t *bh,
2284
                     const char *type,
2285
                     const char *ID_key,
2286
                     const char *ID_value,
2287
                     const char *key,
2288
0
                     kstring_t *ks) {
2289
0
    sam_hrecs_t *hrecs;
2290
0
    if (!bh || !type || !key)
2291
0
        return -2;
2292
2293
0
    if (!(hrecs = bh->hrecs)) {
2294
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2295
0
            return -2;
2296
0
        hrecs = bh->hrecs;
2297
0
    }
2298
2299
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
2300
0
    if (!ty)
2301
0
        return -1;
2302
2303
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
2304
0
    if (!tag || !tag->str || tag->len < 4)
2305
0
        return -1;
2306
2307
0
    ks->l = 0;
2308
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
2309
0
        return -2;
2310
0
    }
2311
2312
0
    return 0;
2313
0
}
2314
2315
int sam_hdr_find_tag_pos(sam_hdr_t *bh,
2316
                     const char *type,
2317
                     int pos,
2318
                     const char *key,
2319
0
                     kstring_t *ks) {
2320
0
    sam_hrecs_t *hrecs;
2321
0
    if (!bh || !type || !key)
2322
0
        return -2;
2323
2324
0
    if (!(hrecs = bh->hrecs)) {
2325
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2326
0
            return -2;
2327
0
        hrecs = bh->hrecs;
2328
0
    }
2329
2330
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
2331
0
    if (!ty)
2332
0
        return -1;
2333
2334
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
2335
0
    if (!tag || !tag->str || tag->len < 4)
2336
0
        return -1;
2337
2338
0
    ks->l = 0;
2339
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
2340
0
        return -2;
2341
0
    }
2342
2343
0
    return 0;
2344
0
}
2345
2346
int sam_hdr_remove_tag_id(sam_hdr_t *bh,
2347
        const char *type,
2348
        const char *ID_key,
2349
        const char *ID_value,
2350
0
        const char *key) {
2351
0
    sam_hrecs_t *hrecs;
2352
0
    if (!bh || !type || !key)
2353
0
        return -1;
2354
2355
0
    if (!(hrecs = bh->hrecs)) {
2356
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2357
0
            return -1;
2358
0
        hrecs = bh->hrecs;
2359
0
    }
2360
2361
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
2362
0
    if (!ty)
2363
0
        return -1;
2364
2365
0
    int ret = sam_hrecs_remove_key(hrecs, ty, key);
2366
0
    if (!ret && hrecs->dirty)
2367
0
        redact_header_text(bh);
2368
2369
0
    return ret;
2370
0
}
2371
2372
/*
2373
 * Reconstructs a kstring from the header hash table.
2374
 * Returns 0 on success
2375
 *        -1 on failure
2376
 */
2377
38.2k
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) {
2378
38.2k
    ks->l = 0;
2379
2380
38.2k
    if (!hrecs->h || !hrecs->h->size) {
2381
27.8k
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
2382
27.8k
    }
2383
10.3k
    if (sam_hrecs_rebuild_lines(hrecs, ks) != 0)
2384
0
        return -1;
2385
2386
10.3k
    return 0;
2387
10.3k
}
2388
2389
/*
2390
 * Looks up a reference sequence by name and returns the numerical ID.
2391
 * Returns -1 if unknown reference; -2 if header could not be parsed.
2392
 */
2393
9.61k
int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) {
2394
9.61k
    sam_hrecs_t *hrecs;
2395
9.61k
    khint_t k;
2396
2397
9.61k
    if (!bh)
2398
0
        return -1;
2399
2400
9.61k
    if (!(hrecs = bh->hrecs)) {
2401
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2402
0
            return -2;
2403
0
        hrecs = bh->hrecs;
2404
0
    }
2405
2406
9.61k
    if (!hrecs->ref_hash)
2407
0
        return -1;
2408
2409
9.61k
    k = kh_get(m_s2i, hrecs->ref_hash, ref);
2410
9.61k
    return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k);
2411
9.61k
}
2412
2413
0
const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
2414
0
    sam_hrecs_t *hrecs;
2415
2416
0
    if (!h || tid < 0)
2417
0
        return NULL;
2418
2419
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2420
0
        return hrecs->ref[tid].name;
2421
0
    } else {
2422
0
        if (tid < h->n_targets)
2423
0
            return h->target_name[tid];
2424
0
    }
2425
2426
0
    return NULL;
2427
0
}
2428
2429
0
hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
2430
0
    sam_hrecs_t *hrecs;
2431
2432
0
    if (!h || tid < 0)
2433
0
        return 0;
2434
2435
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2436
0
        return hrecs->ref[tid].len;
2437
0
    } else {
2438
0
        if (tid < h->n_targets) {
2439
0
            if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
2440
0
                return h->target_len[tid];
2441
0
            } else {
2442
0
                khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
2443
0
                khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
2444
0
                if (k < kh_end(long_refs)) {
2445
0
                    return kh_val(long_refs, k);
2446
0
                } else {
2447
0
                    return UINT32_MAX;
2448
0
                }
2449
0
            }
2450
0
        }
2451
0
    }
2452
2453
0
    return 0;
2454
0
}
2455
2456
/*
2457
 * Fixes any PP links in @PG headers.
2458
 * If the entries are in order then this doesn't need doing, but in case
2459
 * our header is out of order this goes through the hrecs->pg[] array
2460
 * setting the prev_id field.
2461
 *
2462
 * Note we can have multiple complete chains. This code should identify the
2463
 * tails of these chains as these are the entries we have to link to in
2464
 * subsequent PP records.
2465
 *
2466
 * Returns 0 on success
2467
 *        -1 on failure (indicating broken PG/PP records)
2468
 */
2469
0
static int sam_hdr_link_pg(sam_hdr_t *bh) {
2470
0
    sam_hrecs_t *hrecs;
2471
0
    int i, j, ret = 0, *new_pg_end;
2472
2473
0
    if (!bh)
2474
0
        return -1;
2475
2476
0
    if (!(hrecs = bh->hrecs)) {
2477
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2478
0
            return -1;
2479
0
        hrecs = bh->hrecs;
2480
0
    }
2481
2482
0
    if (!hrecs->pgs_changed || !hrecs->npg)
2483
0
        return 0;
2484
2485
0
    hrecs->npg_end_alloc = hrecs->npg;
2486
0
    new_pg_end = hts_realloc_p(hrecs->pg_end, sizeof(*new_pg_end), hrecs->npg);
2487
0
    if (!new_pg_end)
2488
0
        return -1;
2489
0
    hrecs->pg_end = new_pg_end;
2490
0
    int *chain_size = calloc(hrecs->npg, sizeof(int));
2491
0
    if (!chain_size)
2492
0
        return -1;
2493
2494
0
    for (i = 0; i < hrecs->npg; i++)
2495
0
        hrecs->pg_end[i] = i;
2496
2497
0
    for (i = 0; i < hrecs->npg; i++) {
2498
0
        khint_t k;
2499
0
        sam_hrec_tag_t *tag;
2500
2501
0
        assert(hrecs->pg[i].ty != NULL);
2502
0
        for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) {
2503
0
            if (tag->str[0] == 'P' && tag->str[1] == 'P')
2504
0
                break;
2505
0
        }
2506
0
        if (!tag) {
2507
            /* Chain start points */
2508
0
            continue;
2509
0
        }
2510
2511
0
        k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
2512
2513
0
        if (k == kh_end(hrecs->pg_hash)) {
2514
0
            hts_log_warning("PG line with ID:%s has a PP link to missing program '%s'",
2515
0
                    hrecs->pg[i].name, tag->str+3);
2516
0
            continue;
2517
0
        }
2518
2519
0
        int pp_idx = kh_val(hrecs->pg_hash, k);
2520
0
        if (pp_idx == i) {
2521
0
            hts_log_warning("PG line with ID:%s has a PP link to itself",
2522
0
                            hrecs->pg[i].name);
2523
0
            continue;
2524
0
        }
2525
2526
0
        hrecs->pg[i].prev_id = hrecs->pg[pp_idx].id;
2527
0
        hrecs->pg_end[pp_idx] = -1;
2528
0
        chain_size[i] = chain_size[pp_idx]+1;
2529
0
    }
2530
2531
0
    int last_end = -1;
2532
0
    for (i = j = 0; i < hrecs->npg; i++) {
2533
0
        if (hrecs->pg_end[i] != -1) {
2534
0
            last_end = hrecs->pg_end[i];
2535
0
            if (chain_size[i] > 0)
2536
0
                hrecs->pg_end[j++] = hrecs->pg_end[i];
2537
0
        }
2538
0
    }
2539
    /* Only leafs? Choose the last one! */
2540
0
    if (!j && hrecs->npg_end > 0 && last_end >= 0) {
2541
0
        hrecs->pg_end[0] = last_end;
2542
0
        j = 1;
2543
0
    }
2544
2545
0
    hrecs->npg_end = j;
2546
0
    hrecs->pgs_changed = 0;
2547
2548
    /* mark as dirty or empty for rebuild */
2549
0
    hrecs->dirty = 1;
2550
0
    redact_header_text(bh);
2551
0
    free(chain_size);
2552
2553
0
    return ret;
2554
0
}
2555
2556
/*
2557
 * Returns a unique ID from a base name.
2558
 *
2559
 * The value returned is valid until the next call to
2560
 * this function.
2561
 */
2562
0
const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) {
2563
0
    sam_hrecs_t *hrecs;
2564
0
    size_t name_len;
2565
0
    const size_t name_extra = 17;
2566
0
    if (!bh || !name)
2567
0
        return NULL;
2568
2569
0
    if (!(hrecs = bh->hrecs)) {
2570
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2571
0
            return NULL;
2572
0
        hrecs = bh->hrecs;
2573
0
    }
2574
2575
0
    khint_t k = kh_get(m_s2i, hrecs->pg_hash, name);
2576
0
    if (k == kh_end(hrecs->pg_hash))
2577
0
        return name;
2578
2579
0
    name_len = strlen(name);
2580
0
    if (name_len > 1000) name_len = 1000;
2581
0
    if (hrecs->ID_buf_sz < name_len + name_extra) {
2582
0
        char *new_ID_buf = realloc(hrecs->ID_buf,
2583
0
                                   hts_add_sat2(name_len, name_extra));
2584
0
        if (new_ID_buf == NULL)
2585
0
            return NULL;
2586
0
        hrecs->ID_buf = new_ID_buf;
2587
0
        hrecs->ID_buf_sz = name_len + name_extra;
2588
0
    }
2589
2590
0
    do {
2591
0
        snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++);
2592
0
        k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf);
2593
0
    } while (k != kh_end(hrecs->pg_hash));
2594
2595
0
    return hrecs->ID_buf;
2596
0
}
2597
2598
/*
2599
 * Add an @PG line.
2600
 *
2601
 * If we wish complete control over this use sam_hdr_add_line() directly. This
2602
 * function uses that, but attempts to do a lot of tedious house work for
2603
 * you too.
2604
 *
2605
 * - It will generate a suitable ID if the supplied one clashes.
2606
 * - It will generate multiple @PG records if we have multiple PG chains.
2607
 *
2608
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
2609
 * in NULL.
2610
 *
2611
 * Returns 0 on success
2612
 *        -1 on failure
2613
 */
2614
0
int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
2615
0
    sam_hrecs_t *hrecs;
2616
0
    const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL;
2617
0
    const char *key, *val;
2618
0
    if (!bh)
2619
0
        return -1;
2620
2621
0
    if (!(hrecs = bh->hrecs)) {
2622
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2623
0
            return -1;
2624
0
        hrecs = bh->hrecs;
2625
0
    }
2626
2627
0
    bh->hrecs->pgs_changed = 1;
2628
0
    if (sam_hdr_link_pg(bh) < 0) {
2629
0
        hts_log_error("Error linking @PG lines");
2630
0
        return -1;
2631
0
    }
2632
2633
0
    va_list args;
2634
    // Check for ID / PN / PP tags in varargs list
2635
0
    va_start(args, name);
2636
0
    while ((key = va_arg(args, const char *)) != NULL) {
2637
0
        val = va_arg(args, const char *);
2638
0
        if (!val) break;
2639
0
        if (strcmp(key, "PN") == 0 && *val != '\0')
2640
0
            specified_pn = val;
2641
0
        else if (strcmp(key, "PP") == 0 && *val != '\0')
2642
0
            specified_pp = val;
2643
0
        else if (strcmp(key, "ID") == 0 && *val != '\0')
2644
0
            specified_id = val;
2645
0
    }
2646
0
    va_end(args);
2647
2648
0
    if (specified_id && hrecs->pg_hash) {
2649
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id);
2650
0
        if (k != kh_end(hrecs->pg_hash)) {
2651
0
            hts_log_error("Header @PG ID:%s already present", specified_id);
2652
0
            return -1;
2653
0
        }
2654
0
    }
2655
2656
0
    if (specified_pp && hrecs->pg_hash) {
2657
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp);
2658
0
        if (k == kh_end(hrecs->pg_hash)) {
2659
0
            hts_log_error("Header @PG ID:%s referred to by PP tag not present",
2660
0
                          specified_pp);
2661
0
            return -1;
2662
0
        }
2663
0
    }
2664
2665
0
    if (!specified_pp && hrecs->npg_end) {
2666
        /* Copy ends array to avoid us looping while modifying it */
2667
0
        int *end = hts_malloc_p(sizeof(int), hrecs->npg_end);
2668
0
        int i, nends = hrecs->npg_end;
2669
2670
0
        if (!end)
2671
0
            return -1;
2672
2673
0
        memcpy(end, hrecs->pg_end, nends * sizeof(*end));
2674
2675
0
        for (i = 0; i < nends; i++) {
2676
0
            const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2677
0
            if (!id) {
2678
0
                free(end);
2679
0
                return -1;
2680
0
            }
2681
0
            assert(end[i] >= 0 && end[i] < hrecs->npg);
2682
0
            va_start(args, name);
2683
0
            if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2684
0
                                     "ID", id,
2685
0
                                     "PN", !specified_pn ? name : "",
2686
0
                                     "PP", hrecs->pg[end[i]].name,
2687
0
                                     NULL)) {
2688
0
                free(end);
2689
0
                return  -1;
2690
0
            }
2691
0
            va_end(args);
2692
0
        }
2693
2694
0
        free(end);
2695
0
    } else {
2696
0
        const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2697
0
        if (!id)
2698
0
            return -1;
2699
0
        va_start(args, name);
2700
0
        if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2701
0
                                 "ID", id,
2702
0
                                 "PN", !specified_pn ? name : "",
2703
0
                                 NULL))
2704
0
            return -1;
2705
0
        va_end(args);
2706
0
    }
2707
2708
0
    hrecs->dirty = 1;
2709
0
    redact_header_text(bh);
2710
2711
0
    return 0;
2712
0
}
2713
2714
/*! Increments a reference count on bh.
2715
 *
2716
 * This permits multiple files to share the same header, all calling
2717
 * sam_hdr_destroy when done, without causing errors for other open files.
2718
 */
2719
7.14k
void sam_hdr_incr_ref(sam_hdr_t *bh) {
2720
7.14k
    if (!bh)
2721
0
        return;
2722
7.14k
    bh->ref_count++;
2723
7.14k
}
2724
2725
/* ==== Internal methods ==== */
2726
2727
/*
2728
 * Creates an empty SAM header.  Allocates space for the SAM header
2729
 * structures (hash tables) ready to be populated.
2730
 *
2731
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
2732
 *         NULL on failure
2733
 */
2734
38.9k
sam_hrecs_t *sam_hrecs_new(void) {
2735
38.9k
    sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
2736
2737
38.9k
    if (!hrecs)
2738
0
        return NULL;
2739
2740
38.9k
    hrecs->h = kh_init(sam_hrecs_t);
2741
38.9k
    if (!hrecs->h)
2742
0
        goto err;
2743
2744
38.9k
    hrecs->ID_cnt = 1;
2745
2746
38.9k
    hrecs->nref = 0;
2747
38.9k
    hrecs->ref_sz = 0;
2748
38.9k
    hrecs->ref  = NULL;
2749
38.9k
    if (!(hrecs->ref_hash = kh_init(m_s2i)))
2750
0
        goto err;
2751
38.9k
    hrecs->refs_changed = -1;
2752
2753
38.9k
    hrecs->nrg = 0;
2754
38.9k
    hrecs->rg_sz = 0;
2755
38.9k
    hrecs->rg  = NULL;
2756
38.9k
    if (!(hrecs->rg_hash = kh_init(m_s2i)))
2757
0
        goto err;
2758
2759
38.9k
    hrecs->npg = 0;
2760
38.9k
    hrecs->pg_sz = 0;
2761
38.9k
    hrecs->pg  = NULL;
2762
38.9k
    hrecs->npg_end = hrecs->npg_end_alloc = 0;
2763
38.9k
    hrecs->pg_end = NULL;
2764
38.9k
    if (!(hrecs->pg_hash = kh_init(m_s2i)))
2765
0
        goto err;
2766
2767
38.9k
    if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t))))
2768
0
        goto err;
2769
2770
38.9k
    if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t))))
2771
0
        goto err;
2772
2773
38.9k
    if (!(hrecs->str_pool = string_pool_create(65536)))
2774
0
        goto err;
2775
2776
38.9k
    if (sam_hrecs_init_type_order(hrecs, NULL))
2777
0
        goto err;
2778
2779
38.9k
    return hrecs;
2780
2781
0
err:
2782
0
    if (hrecs->h)
2783
0
        kh_destroy(sam_hrecs_t, hrecs->h);
2784
2785
0
    if (hrecs->tag_pool)
2786
0
        pool_destroy(hrecs->tag_pool);
2787
2788
0
    if (hrecs->type_pool)
2789
0
        pool_destroy(hrecs->type_pool);
2790
2791
0
    if (hrecs->str_pool)
2792
0
        string_pool_destroy(hrecs->str_pool);
2793
2794
0
    free(hrecs);
2795
2796
0
    return NULL;
2797
38.9k
}
2798
#if 0
2799
/*
2800
 * Produces a duplicate copy of source and returns it.
2801
 * Returns NULL on failure
2802
 */
2803
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) {
2804
        return NULL;
2805
}
2806
#endif
2807
/*! Deallocates all storage used by a sam_hrecs_t struct.
2808
 *
2809
 * This also decrements the header reference count. If after decrementing
2810
 * it is still non-zero then the header is assumed to be in use by another
2811
 * caller and the free is not done.
2812
 *
2813
 */
2814
38.9k
void sam_hrecs_free(sam_hrecs_t *hrecs) {
2815
38.9k
    if (!hrecs)
2816
0
        return;
2817
2818
38.9k
    if (hrecs->h)
2819
38.9k
        kh_destroy(sam_hrecs_t, hrecs->h);
2820
2821
38.9k
    if (hrecs->ref_hash)
2822
38.9k
        kh_destroy(m_s2i, hrecs->ref_hash);
2823
2824
38.9k
    if (hrecs->ref)
2825
10.2k
        free(hrecs->ref);
2826
2827
38.9k
    if (hrecs->rg_hash)
2828
38.9k
        kh_destroy(m_s2i, hrecs->rg_hash);
2829
2830
38.9k
    if (hrecs->rg)
2831
549
        free(hrecs->rg);
2832
2833
38.9k
    if (hrecs->pg_hash)
2834
38.9k
        kh_destroy(m_s2i, hrecs->pg_hash);
2835
2836
38.9k
    if (hrecs->pg)
2837
413
        free(hrecs->pg);
2838
2839
38.9k
    if (hrecs->pg_end)
2840
369
        free(hrecs->pg_end);
2841
2842
38.9k
    if (hrecs->type_pool)
2843
38.9k
        pool_destroy(hrecs->type_pool);
2844
2845
38.9k
    if (hrecs->tag_pool)
2846
38.9k
        pool_destroy(hrecs->tag_pool);
2847
2848
38.9k
    if (hrecs->str_pool)
2849
38.9k
        string_pool_destroy(hrecs->str_pool);
2850
2851
38.9k
    if (hrecs->type_order)
2852
38.9k
        free(hrecs->type_order);
2853
2854
38.9k
    if (hrecs->ID_buf)
2855
0
        free(hrecs->ID_buf);
2856
2857
38.9k
    free(hrecs);
2858
38.9k
}
2859
2860
/*
2861
 * Internal method already used by the CRAM code
2862
 * Returns the first header item matching 'type'. If ID is non-NULL it checks
2863
 * for the tag ID: and compares against the specified ID.
2864
 *
2865
 * Returns NULL if no type/ID is found
2866
 */
2867
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
2868
1.33k
                                     const char *ID_key, const char *ID_value) {
2869
1.33k
    if (!hrecs || !type)
2870
0
        return NULL;
2871
1.33k
    sam_hrec_type_t *t1, *t2;
2872
1.33k
    khint_t k;
2873
2874
    /* Special case for types we have prebuilt hashes on */
2875
1.33k
    if (ID_key) {
2876
1.33k
        if (!ID_value)
2877
0
            return NULL;
2878
2879
1.33k
        if (type[0]   == 'S' && type[1]   == 'Q' &&
2880
1.33k
            ID_key[0] == 'S' && ID_key[1] == 'N') {
2881
1.33k
            k = kh_get(m_s2i, hrecs->ref_hash, ID_value);
2882
1.33k
            return k != kh_end(hrecs->ref_hash)
2883
1.33k
                ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty
2884
1.33k
                : NULL;
2885
1.33k
        }
2886
2887
0
        if (type[0]   == 'R' && type[1]   == 'G' &&
2888
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2889
0
            k = kh_get(m_s2i, hrecs->rg_hash, ID_value);
2890
0
            return k != kh_end(hrecs->rg_hash)
2891
0
                ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty
2892
0
                : NULL;
2893
0
        }
2894
2895
0
        if (type[0]   == 'P' && type[1]   == 'G' &&
2896
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2897
0
            k = kh_get(m_s2i, hrecs->pg_hash, ID_value);
2898
0
            return k != kh_end(hrecs->pg_hash)
2899
0
                ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty
2900
0
                : NULL;
2901
0
        }
2902
0
    }
2903
2904
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2905
0
    if (k == kh_end(hrecs->h))
2906
0
        return NULL;
2907
2908
0
    if (!ID_key)
2909
0
        return kh_val(hrecs->h, k);
2910
2911
0
    t1 = t2 = kh_val(hrecs->h, k);
2912
0
    do {
2913
0
        sam_hrec_tag_t *tag;
2914
0
        for (tag = t1->tag; tag; tag = tag->next) {
2915
0
            if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
2916
0
                const char *cp1 = tag->str+3;
2917
0
                const char *cp2 = ID_value;
2918
0
                while (*cp1 && *cp1 == *cp2)
2919
0
                    cp1++, cp2++;
2920
0
                if (*cp2 || *cp1)
2921
0
                    continue;
2922
0
                return t1;
2923
0
            }
2924
0
        }
2925
0
        t1 = t1->next;
2926
0
    } while (t1 != t2);
2927
2928
0
    return NULL;
2929
0
}
2930
2931
/*
2932
 * Adds or updates tag key,value pairs in a header line.
2933
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2934
 * @HD line.
2935
 *
2936
 * va_list contains multiple key,value pairs ending in NULL.
2937
 *
2938
 * Returns 0 on success
2939
 *        -1 on failure
2940
 */
2941
0
int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
2942
0
    if (!hrecs)
2943
0
        return -1;
2944
2945
0
    for (;;) {
2946
0
        char *k, *v, *str;
2947
0
        sam_hrec_tag_t *tag, *prev = NULL;
2948
2949
0
        if (!(k = (char *)va_arg(ap, char *)))
2950
0
            break;
2951
0
        if (!(v = va_arg(ap, char *)))
2952
0
            v = "";
2953
2954
0
        tag = sam_hrecs_find_key(type, k, &prev);
2955
0
        if (!tag) {
2956
0
            if (!(tag = pool_alloc(hrecs->tag_pool)))
2957
0
                return -1;
2958
0
            if (prev)
2959
0
                prev->next = tag;
2960
0
            else
2961
0
                type->tag = tag;
2962
2963
0
            tag->next = NULL;
2964
0
        }
2965
2966
0
        tag->len = 3 + strlen(v);
2967
0
        str = string_alloc(hrecs->str_pool, hts_add_sat2(tag->len, 1));
2968
0
        if (!str)
2969
0
            return -1;
2970
2971
0
        if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0)
2972
0
            return -1;
2973
2974
0
        tag->str = str;
2975
0
    }
2976
2977
0
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2978
2979
0
    return 0;
2980
0
}
2981
2982
/*
2983
 * Adds or updates tag key,value pairs in a header line.
2984
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2985
 * @HD line.
2986
 *
2987
 * Specify multiple key,value pairs ending in NULL.
2988
 *
2989
 * Returns 0 on success
2990
 *        -1 on failure
2991
 */
2992
0
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2993
0
    va_list args;
2994
0
    int res;
2995
0
    va_start(args, type);
2996
0
    res = sam_hrecs_vupdate(hrecs, type, args);
2997
0
    va_end(args);
2998
0
    return res;
2999
0
}
3000
3001
/*
3002
 * Looks for a specific key in a single sam header line identified by *type.
3003
 * If prev is non-NULL it also fills this out with the previous tag, to
3004
 * permit use in key removal. *prev is set to NULL when the tag is the first
3005
 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
3006
 * tag in the existing list.
3007
 *
3008
 * Returns the tag pointer on success
3009
 *         NULL on failure
3010
 */
3011
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
3012
                                   const char *key,
3013
5.32k
                                   sam_hrec_tag_t **prev) {
3014
5.32k
    sam_hrec_tag_t *tag, *p = NULL;
3015
5.32k
    if (!type)
3016
0
        return NULL;
3017
3018
11.1k
    for (tag = type->tag; tag; p = tag, tag = tag->next) {
3019
9.67k
        if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
3020
3.85k
            if (prev)
3021
0
                *prev = p;
3022
3.85k
            return tag;
3023
3.85k
        }
3024
9.67k
    }
3025
3026
1.47k
    if (prev)
3027
0
        *prev = p;
3028
3029
1.47k
    return NULL;
3030
5.32k
}
3031
3032
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
3033
                         sam_hrec_type_t *type,
3034
0
                         const char *key) {
3035
0
    sam_hrec_tag_t *tag, *prev;
3036
0
    if (!hrecs)
3037
0
        return -1;
3038
0
    tag = sam_hrecs_find_key(type, key, &prev);
3039
0
    if (!tag)
3040
0
        return 0; // Not there anyway
3041
3042
0
    if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') {
3043
0
        assert(tag->len >= 3);
3044
0
        sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL);
3045
0
        if (sn_tag) {
3046
0
            assert(sn_tag->len >= 3);
3047
0
            khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3);
3048
0
            if (k != kh_end(hrecs->ref_hash))
3049
0
                sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3);
3050
0
        }
3051
0
    }
3052
3053
0
    if (!prev) { //first tag
3054
0
        type->tag = tag->next;
3055
0
    } else {
3056
0
        prev->next = tag->next;
3057
0
    }
3058
0
    pool_free(hrecs->tag_pool, tag);
3059
0
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
3060
3061
0
    return 1;
3062
0
}
3063
3064
/*
3065
 * Looks up a read-group by name and returns a pointer to the start of the
3066
 * associated tag list.
3067
 *
3068
 * Returns NULL on failure
3069
 */
3070
544
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) {
3071
544
    khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg);
3072
544
    return k == kh_end(hrecs->rg_hash)
3073
544
        ? NULL
3074
544
        : &hrecs->rg[kh_val(hrecs->rg_hash, k)];
3075
544
}
3076
3077
#if DEBUG_HEADER
3078
void sam_hrecs_dump(sam_hrecs_t *hrecs) {
3079
    khint_t k;
3080
    int i;
3081
3082
    printf("===DUMP===\n");
3083
    for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) {
3084
        sam_hrec_type_t *t1, *t2;
3085
        char c[2];
3086
        int idx = 0;
3087
3088
        if (!kh_exist(hrecs->h, k))
3089
            continue;
3090
3091
        t1 = t2 = kh_val(hrecs->h, k);
3092
        c[0] = kh_key(hrecs->h, k)>>8;
3093
        c[1] = kh_key(hrecs->h, k)&0xff;
3094
        printf("Type %.2s\n", c);
3095
3096
        do {
3097
            sam_hrec_tag_t *tag;
3098
            printf(">>>%d ", idx++);
3099
            for (tag = t1->tag; tag; tag=tag->next) {
3100
                if (strncmp(c, "CO", 2))
3101
                    printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3);
3102
                else
3103
                    printf("%s", tag->str);
3104
            }
3105
            putchar('\n');
3106
            t1 = t1->next;
3107
        } while (t1 != t2);
3108
    }
3109
3110
    /* Dump out PG chains */
3111
    printf("\n@PG chains:\n");
3112
    for (i = 0; i < hrecs->npg_end; i++) {
3113
        int j;
3114
        printf("  %d:", i);
3115
        for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) {
3116
            printf("%s%d(%.*s)",
3117
                   j == hrecs->pg_end[i] ? " " : "->",
3118
                   j, hrecs->pg[j].name_len, hrecs->pg[j].name);
3119
        }
3120
        printf("\n");
3121
    }
3122
3123
    puts("===END DUMP===");
3124
}
3125
#endif
3126
3127
/*
3128
 * Returns the sort order:
3129
 */
3130
0
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) {
3131
0
    khint_t k;
3132
0
    enum sam_sort_order so;
3133
3134
0
    so = ORDER_UNKNOWN;
3135
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
3136
0
    if (k != kh_end(hrecs->h)) {
3137
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
3138
0
        sam_hrec_tag_t *tag;
3139
0
        for (tag = ty->tag; tag; tag = tag->next) {
3140
0
            if (tag->str[0] == 'S' && tag->str[1] == 'O') {
3141
0
                if (strcmp(tag->str+3, "unsorted") == 0)
3142
0
                    so = ORDER_UNSORTED;
3143
0
                else if (strcmp(tag->str+3, "queryname") == 0)
3144
0
                    so = ORDER_NAME;
3145
0
                else if (strcmp(tag->str+3, "coordinate") == 0)
3146
0
                    so = ORDER_COORD;
3147
0
                else if (strcmp(tag->str+3, "unknown") != 0)
3148
0
                    hts_log_error("Unknown sort order field: %s", tag->str+3);
3149
0
            }
3150
0
        }
3151
0
    }
3152
3153
0
    return so;
3154
0
}
3155
3156
0
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) {
3157
0
    khint_t k;
3158
0
    enum sam_group_order go;
3159
3160
0
    go = ORDER_NONE;
3161
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
3162
0
    if (k != kh_end(hrecs->h)) {
3163
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
3164
0
        sam_hrec_tag_t *tag;
3165
0
        for (tag = ty->tag; tag; tag = tag->next) {
3166
0
            if (tag->str[0] == 'G' && tag->str[1] == 'O') {
3167
0
                if (strcmp(tag->str+3, "query") == 0)
3168
0
                    go = ORDER_QUERY;
3169
0
                else if (strcmp(tag->str+3, "reference") == 0)
3170
0
                    go = ORDER_REFERENCE;
3171
0
            }
3172
0
        }
3173
0
    }
3174
3175
0
    return go;
3176
0
}